{% if include.truncate %} {% if page.content contains '' %} diff --git a/docs/_posts/2017-12-19-write-prepared-txn.markdown b/docs/_posts/2017-12-19-write-prepared-txn.markdown index d592b6f7b16..439b3f83cc4 100644 --- a/docs/_posts/2017-12-19-write-prepared-txn.markdown +++ b/docs/_posts/2017-12-19-write-prepared-txn.markdown @@ -7,8 +7,6 @@ category: blog RocksDB supports both optimistic and pessimistic concurrency controls. The pessimistic transactions make use of locks to provide isolation between the transactions. The default write policy in pessimistic transactions is _WriteCommitted_, which means that the data is written to the DB, i.e., the memtable, only after the transaction is committed. This policy simplified the implementation but came with some limitations in throughput, transaction size, and variety in supported isolation levels. Below, we explain these in detail and present the other write policies, _WritePrepared_ and _WriteUnprepared_. We then dive into the design of _WritePrepared_ transactions. -> _WritePrepared_ are to be announced as production-ready soon. - ### WriteCommitted, Pros and Cons With the _WriteCommitted_ write policy, the data is written to the memtable only after the transaction commits. This greatly simplifies the read path as any data that is read by other transactions can be assumed to be committed. This write policy, however, implies that the writes are buffered in memory in the meanwhile. This makes memory a bottleneck for large transactions. The delay of the commit phase in 2PC (two-phase commit) also becomes noticeable since most of the work, i.e., writing to the memtable, is done at the commit phase. When the commits of multiple transactions are done in a serial fashion, such as in the 2PC implementation of MySQL, the lengthy commit latency becomes a major contributor to lower throughput. Moreover, this write policy cannot provide weaker isolation levels, such as READ UNCOMMITTED, that could potentially provide higher throughput for some applications. @@ -28,10 +26,16 @@ With _WritePrepared_, a transaction still buffers the writes in a write batch ob The _CommitCache_ is a lock-free data structure that caches the recent commit entries. Looking up the entries in the cache should be enough for almost all the transactions that commit in a timely manner. When evicting the older entries from the cache, it still maintains some other data structures to cover the corner cases for transactions that take abnormally long to finish. We will cover them in the design details below. -### Preliminary Results -The full experimental results are to be reported soon. Here we present the improvement in tps observed in some preliminary experiments with MyRocks: -* sysbench update-noindex: 25% -* sysbench read-write: 7.6% -* linkbench: 3.7% +### Benchmark Results +Here we present the improvements observed in MyRocks with sysbench and linkbench:
+
+| benchmark | tps | p95 latency | cpu/query |
+|-----------|-----|-------------|-----------|
+| insert | 68% | | |
+| update-noindex | 30% | 38% | |
+| update-index | 61% | 28% | |
+| read-write | 6% | 3.5% | |
+| read-only | -1.2% | -1.8% | |
+| linkbench | 1.9% | | 0.6% (overall) |
+
+Here are also the detailed results for [In-Memory Sysbench](https://gist.github.com/maysamyabandeh/bdb868091b2929a6d938615fdcf58424) and [SSD Sysbench](https://gist.github.com/maysamyabandeh/ff94f378ab48925025c34c47eff99306) courtesy of [@mdcallag](https://github.com/mdcallag).
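To make the policy switch concrete, here is a minimal sketch of opening a pessimistic `TransactionDB` with the _WritePrepared_ policy and running a 2PC-style transaction. Status checks are mostly omitted, and the DB path and transaction name are illustrative only.

```cpp
#include <cassert>
#include <rocksdb/utilities/transaction_db.h>

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::TransactionDBOptions txn_db_options;
  // Select the WritePrepared policy instead of the default WriteCommitted.
  txn_db_options.write_policy = rocksdb::TxnDBWritePolicy::WRITE_PREPARED;

  rocksdb::TransactionDB* db = nullptr;
  rocksdb::Status s = rocksdb::TransactionDB::Open(
      options, txn_db_options, "/tmp/wp_example", &db);  // path is illustrative
  assert(s.ok());

  rocksdb::Transaction* txn = db->BeginTransaction(rocksdb::WriteOptions());
  txn->SetName("xid-1");     // a name is required before Prepare() in 2PC
  txn->Put("key", "value");  // buffered in the transaction's write batch
  txn->Prepare();            // with WritePrepared, data reaches the memtable here
  txn->Commit();             // commit only records the prepare/commit seq mapping

  delete txn;
  delete db;
  return 0;
}
```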
Learn more [here](https://github.com/facebook/rocksdb/wiki/WritePrepared-Transactions). diff --git a/docs/_posts/2018-11-21-delete-range.markdown b/docs/_posts/2018-11-21-delete-range.markdown new file mode 100644 index 00000000000..96fc3562d19 --- /dev/null +++ b/docs/_posts/2018-11-21-delete-range.markdown @@ -0,0 +1,292 @@ +--- +title: "DeleteRange: A New Native RocksDB Operation" +layout: post +author: +- abhimadan +- ajkr +category: blog +--- +## Motivation + +### Deletion patterns in LSM + +Deleting a range of keys is a common pattern in RocksDB. Most systems built on top of +RocksDB have multi-component key schemas, where keys sharing a common prefix are +logically related. Here are some examples. + +MyRocks is a MySQL fork using RocksDB as its storage engine. Each key's first +four bytes identify the table or index to which that key belongs. Thus dropping +a table or index involves deleting all the keys with that prefix. + +Rockssandra is a Cassandra variant that uses RocksDB as its storage engine. One +of its admin tool commands, `nodetool cleanup`, removes key-ranges that have been migrated +to other nodes in the cluster. + +Marketplace uses RocksDB to store product data. Its key begins with product ID, +and it stores various data associated with the product in separate keys. When a +product is removed, all these keys must be deleted. + +When we decide what to improve, we try to find a use case that's common across +users, since we want to build a generally useful system, not one that has many +one-off features for individual users. The range deletion pattern is common as +illustrated above, so from this perspective it's a good target for optimization. + +### Existing mechanisms: challenges and opportunities + +The most common pattern we see is scan-and-delete, i.e., advance an iterator +through the to-be-deleted range, and issue a `Delete` for each key. This is +slow (involves read I/O) so cannot be done in any critical path. Additionally, +it creates many tombstones, which slows down iterators and doesn't offer a deadline +for space reclamation. + +Another common pattern is using a custom compaction filter that drops keys in +the deleted range(s). This deletes the range asynchronously, so cannot be used +in cases where readers must not see keys in deleted ranges. Further, it has the +disadvantage of outputting tombstones to all but the bottom level. That's +because compaction cannot detect whether dropping a key would cause an older +version at a lower level to reappear. + +If space reclamation time is important, or it is important that the deleted +range not affect iterators, the user can trigger `CompactRange` on the deleted +range. This can involve arbitrarily long waits in the compaction queue, and +increases write-amp. By the time it's finished, however, the range is completely +gone from the LSM. + +`DeleteFilesInRange` can be used prior to compacting the deleted range as long +as snapshot readers do not need to access them. It drops files that are +completely contained in the deleted range. That saves write-amp because, in +`CompactRange`, the file data would have to be rewritten several times before it +reaches the bottom of the LSM, where tombstones can finally be dropped. + +In addition to the above approaches having various drawbacks, they are quite +complicated to reason about and implement. 
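To contrast the scan-and-delete workaround with a single native call, here is a hedged C++ sketch; statuses are unchecked and bytewise key ordering is assumed for the end-of-range comparison.

```cpp
#include <memory>
#include <rocksdb/db.h>
#include <rocksdb/write_batch.h>

// Workaround: iterate over [begin, end) and emit a point tombstone per key.
// Incurs read I/O and leaves one tombstone behind for every deleted key.
void ScanAndDelete(rocksdb::DB* db, const rocksdb::Slice& begin,
                   const rocksdb::Slice& end) {
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(rocksdb::ReadOptions()));
  rocksdb::WriteBatch batch;
  for (it->Seek(begin); it->Valid() && it->key().compare(end) < 0; it->Next()) {
    batch.Delete(it->key());
  }
  db->Write(rocksdb::WriteOptions(), &batch);
}

// Native operation: a single write that logically covers the whole range.
void NativeDeleteRange(rocksdb::DB* db, const rocksdb::Slice& begin,
                       const rocksdb::Slice& end) {
  db->DeleteRange(rocksdb::WriteOptions(), db->DefaultColumnFamily(), begin, end);
}
```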
In an ideal world, deleting a range +of keys would be (1) simple, i.e., a single API call; (2) synchronous, i.e., +when the call finishes, the keys are guaranteed to be wiped from the DB; (3) low +latency so it can be used in critical paths; and (4) a first-class operation +with all the guarantees of any other write, like atomicity, crash-recovery, etc. + +## v1: Getting it to work + +### Where to persist them? + +The first place we thought about storing them is inline with the data blocks. +We could not think of a good way to do it, however, since the start of a range +tombstone covering a key could be anywhere, making binary search impossible. +So, we decided to investigate segregated storage. + +A second solution we considered is appending to the manifest. This file is +append-only, periodically compacted, and stores metadata like the level to which +each SST belongs. This is tempting because it leverages an existing file, which +is maintained in the background and fully read when the DB is opened. However, +it conceptually violates the manifest's purpose, which is to store metadata. It +also has no way to detect when a range tombstone no longer covers anything and +is droppable. Further, it'd be possible for keys above a range tombstone to disappear +when they have their seqnums zeroed upon compaction to the bottommost level. + +A third candidate is using a separate column family. This has similar problems +to the manifest approach. That is, we cannot easily detect when a range +tombstone is obsolete, and seqnum zeroing can cause a key +to go from above a range tombstone to below, i.e., disappearing. The upside is +we can reuse logic for memory buffering, consistent reads/writes, etc. + +The problems with the second and third solutions indicate a need for range +tombstones to be aware of flush/compaction. An easy way to achieve this is put +them in the SST files themselves - but not in the data blocks, as explained for +the first solution. So, we introduced a separate meta-block for range tombstones. +This resolved the problem of when to obsolete range tombstones, as it's simple: +when they're compacted to the bottom level. We also reused the LSM invariants +that newer versions of a key are always in a higher level to prevent the seqnum +zeroing problem. This approach has the side benefit of constraining the range +tombstones seen during reads to ones in a similar key-range. + +![](/static/images/delrange/delrange_sst_blocks.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +*When there are range tombstones in an SST, they are segregated in a separate meta-block* +{: style="text-align: center"} + +![](/static/images/delrange/delrange_key_schema.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +*Logical range tombstones (left) and their corresponding physical key-value representation (right)* +{: style="text-align: center"} + +### Write path + +`WriteBatch` stores range tombstones in its buffer which are logged to the WAL and +then applied to a dedicated range tombstone memtable during `Write`. Later in +the background the range tombstone memtable and its corresponding data memtable +are flushed together into a single SST with a range tombstone meta-block. SSTs +periodically undergo compaction which rewrites SSTs with point data and range +tombstones dropped or merged wherever possible. + +We chose to use a dedicated memtable for range tombstones. 
The memtable representation is always skiplist in order to minimize overhead in the usual case, namely when the memtable contains zero or only a few range tombstones. The range tombstones are segregated into a separate memtable for the same reason we segregated range tombstones in SSTs: we did not know how to interleave a range tombstone with point data in a way that would let us find it for an arbitrary key it covers. ![](/static/images/delrange/delrange_write_path.png) {: style="display: block; margin-left: auto; margin-right: auto; width: 70%"} *Lifetime of point keys and range tombstones in RocksDB* {: style="text-align: center"} During flush and compaction, we chose to write out all non-obsolete range tombstones unsorted. Sorting by a single dimension is easy to implement, but doesn't bring asymptotic improvement to queries over range data. Ideally, we want to store skylines (see “Read Path” subsection below) computed over our ranges so we can binary search. However, two concerns make doing this in flush and compaction unsatisfactory: (1) we need to store multiple skylines, one for each snapshot, which further complicates the range tombstone meta-block encoding; and (2) even if we implement this, the range tombstone memtable still needs to be linearly scanned. Given these concerns, we decided to defer the collapsing work to the read side, hoping a good caching strategy could optimize it at some future point. ### Read path In point lookups, we aggregate range tombstones in an unordered vector as we search through the live memtable, immutable memtables, and then SSTs. When a key matching the lookup key is found, we scan through the vector to check whether the key is deleted. In iterators, we aggregate range tombstones into a skyline as we visit the live memtable, immutable memtables, and SSTs. The skyline is expensive to construct but makes it fast to determine whether a key is covered. The skyline keeps track of the most recent range tombstone found in order to optimize `Next` and `Prev`. |![](/static/images/delrange/delrange_uncollapsed.png) |![](/static/images/delrange/delrange_collapsed.png) | *([Image source: Leetcode](https://leetcode.com/problems/the-skyline-problem/description/)) The skyline problem involves taking building location/height data in the unsearchable form of A and converting it to the form of B, which is binary-searchable. With overlapping range tombstones, to achieve efficient searching we need to solve an analogous problem, where the x-axis is the key-space and the y-axis is the sequence number.* {: style="text-align: center"} ### Performance characteristics For the v1 implementation, writes are much faster than the scan-and-delete (optionally within a transaction) pattern. `DeleteRange` only logs to the WAL and applies to the memtable. Logging to the WAL always `fflush`es, and optionally `fsync`s or `fdatasync`s. Applying to the memtable is always an in-memory operation. Since range tombstones have a dedicated skiplist memtable, the complexity of inserting is O(log(T)), where T is the number of existing buffered range tombstones. Reading in the presence of v1 range tombstones, however, is much slower than reads in a database where scan-and-delete has happened, due to the linear scan over range tombstone memtables/meta-blocks. Iterating in a database with v1 range tombstones is usually slower than in a scan-and-delete database, although the gap lessens as iterations grow longer.
+When an iterator is first created and seeked, we construct a skyline over its +tombstones. This operation is O(T\*log(T)) where T is the number of tombstones +found across live memtable, immutable memtable, L0 files, and one file from each +of the L1+ levels. However, moving the iterator forwards or backwards is simply +a constant-time operation (excluding edge cases, e.g., many range tombstones +between consecutive point keys). + +## v2: Making it fast + +`DeleteRange`’s negative impact on read perf is a barrier to its adoption. The +root cause is range tombstones are not stored or cached in a format that can be +efficiently searched. We needed to design DeleteRange so that we could maintain +write performance while making read performance competitive with workarounds +used in production (e.g., scan-and-delete). + +### Representations + +The key idea of the redesign is that, instead of globally collapsing range tombstones, + we can locally “fragment” them for each SST file and memtable to guarantee that: + +* no range tombstones overlap; and +* range tombstones are ordered by start key. + +Combined, these properties make range tombstones binary searchable. This + fragmentation will happen on the read path, but unlike the previous design, we can + easily cache many of these range tombstone fragments on the read path. + +### Write path + +The write path remains unchanged. + +### Read path + +When an SST file is opened, its range tombstones are fragmented and cached. For point + lookups, we binary search each file's fragmented range tombstones for one that covers + the lookup key. Unlike the old design, once we find a tombstone, we no longer need to + search for the key in lower levels, since we know that any keys on those levels will be + covered (though we do still check the current level since there may be keys written after + the range tombstone). + +For range scans, we create iterators over all the fragmented range + tombstones and store them in a list, seeking each one to cover the start key of the range + scan (if possible), and query each encountered key in this structure as in the old design, + advancing range tombstone iterators as necessary. In effect, we implicitly create a skyline. + This requires significantly less work on iterator creation, but since each memtable/SST has +its own range tombstone iterator, querying range tombstones requires key comparisons (and +possibly iterator increments) for several iterators (as opposed to v1, where we had a global +collapsed representation of all range tombstones). As a result, very long range scans may become + slower than before, but short range scans are an order of magnitude faster, which are the + more common class of range scan. + +## Benchmarks + +To understand the performance of this new design, we used `db_bench` to compare point lookup, short range scan, + and long range scan performance across: + +* the v1 DeleteRange design, +* the scan-and-delete workaround, and +* the v2 DeleteRange design. + +In these benchmarks, we used a database with 5 million data keys, and 10000 range tombstones (ignoring +those dropped during compaction) that were written in regular intervals after 4.5 million data keys were written. +Writing the range tombstones ensures that most of them are not compacted away, and we have more tombstones +in higher levels that cover keys in lower levels, which allows the benchmarks to exercise more interesting behavior +when reading deleted keys. 
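Before the numbers, a self-contained, hedged sketch of the fragmentation idea from the v2 read path above may help; this is not RocksDB's actual fragmenter (which also tracks per-snapshot sequence numbers), just an illustration of splitting overlapping tombstones into disjoint, binary-searchable fragments.

```cpp
#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

struct RangeTombstone {
  std::string start, end;  // covers keys in [start, end)
  uint64_t seqno;          // assumed > 0 in this sketch
};

// Split possibly-overlapping tombstones at every start/end boundary into
// disjoint fragments sorted by start key, keeping only the newest seqno per
// fragment. Naive O(n*m) construction; good enough to show the invariants.
std::vector<RangeTombstone> Fragment(const std::vector<RangeTombstone>& ts) {
  std::vector<std::string> bounds;
  for (const auto& t : ts) { bounds.push_back(t.start); bounds.push_back(t.end); }
  std::sort(bounds.begin(), bounds.end());
  bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end());
  std::vector<RangeTombstone> frags;
  for (size_t i = 0; i + 1 < bounds.size(); ++i) {
    uint64_t max_seq = 0;
    for (const auto& t : ts) {
      if (t.start <= bounds[i] && bounds[i + 1] <= t.end) {
        max_seq = std::max(max_seq, t.seqno);
      }
    }
    if (max_seq > 0) frags.push_back({bounds[i], bounds[i + 1], max_seq});
  }
  return frags;  // disjoint and ordered by start key => binary searchable
}

// Binary search the fragments for one covering `key` that is visible at
// `read_seqno`; returns true if the key should be treated as deleted.
bool Covered(const std::vector<RangeTombstone>& frags, const std::string& key,
             uint64_t read_seqno) {
  auto it = std::upper_bound(frags.begin(), frags.end(), key,
                             [](const std::string& k, const RangeTombstone& f) {
                               return k < f.start;
                             });
  if (it == frags.begin()) return false;
  --it;  // last fragment whose start <= key
  return key < it->end && it->seqno <= read_seqno;
}
```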
+ +Point lookup benchmarks read 100000 keys from a database using `readwhilewriting`. Range scan benchmarks used +`seekrandomwhilewriting` and seeked 100000 times, and advanced up to 10 keys away from the seek position for short range scans, and advanced up to 1000 keys away from the seek position for long range scans. + +The results are summarized in the tables below, averaged over 10 runs (note the +different SHAs for v1 benchmarks are due to a new `db_bench` flag that was added in order to compare performance with databases with no tombstones; for brevity, those results are not reported here). Also note that the block cache was large enough to hold the entire db, so the large throughput is due to limited I/Os and little time spent on decompression. The range tombstone blocks are always pinned uncompressed in memory. We believe these setup details should not affect relative performance between versions. + +### Point Lookups + +|Name |SHA |avg micros/op |avg ops/sec | +|v1 |35cd754a6 |1.3179 |759,830.90 | +|scan-del |7528130e3 |0.6036 |1,667,237.70 | +|v2 |7528130e3 |0.6128 |1,634,633.40 | + +### Short Range Scans + +|Name |SHA |avg micros/op |avg ops/sec | +|v1 |0ed738fdd |6.23 |176,562.00 | +|scan-del |PR 4677 |2.6844 |377,313.00 | +|v2 |PR 4677 |2.8226 |361,249.70 | + +### Long Range scans + +|Name |SHA |avg micros/op |avg ops/sec | +|v1 |0ed738fdd |52.7066 |19,074.00 | +|scan-del |PR 4677 |38.0325 |26,648.60 | +|v2 |PR 4677 |41.2882 |24,714.70 | + +## Future Work + +Note that memtable range tombstones are fragmented every read; for now this is acceptable, + since we expect there to be relatively few range tombstones in memtables (and users can + enforce this by keeping track of the number of memtable range deletions and manually flushing + after it passes a threshold). In the future, a specialized data structure can be used for storing + range tombstones in memory to avoid this work. + +Another future optimization is to create a new format version that requires range tombstones to + be stored in a fragmented form. This would save time when opening SST files, and when `max_open_files` +is not -1 (i.e., files may be opened several times). + +## Acknowledgements + +Special thanks to Peter Mattis and Nikhil Benesch from Cockroach Labs, who were early users of +DeleteRange v1 in production, contributed the cleanest/most efficient v1 aggregation implementation, found and fixed bugs, and provided initial DeleteRange v2 design and continued help. + +Thanks to Huachao Huang and Jinpeng Zhang from PingCAP for early DeleteRange v1 adoption, bug reports, and fixes. 
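As a postscript to the Future Work note above: until memtable range tombstones get a specialized in-memory representation, users can keep their own count of memtable range deletions and flush once it passes a threshold. A hedged sketch of that workaround follows; the wrapper class, its name, and the threshold are illustrative, not a RocksDB API.

```cpp
#include <atomic>
#include <rocksdb/db.h>

// Counts DeleteRange calls and triggers a manual flush after a threshold so
// that per-read fragmentation of memtable range tombstones stays cheap.
class RangeDeleteTracker {
 public:
  RangeDeleteTracker(rocksdb::DB* db, int threshold)
      : db_(db), threshold_(threshold) {}

  rocksdb::Status DeleteRange(const rocksdb::Slice& begin,
                              const rocksdb::Slice& end) {
    rocksdb::Status s = db_->DeleteRange(rocksdb::WriteOptions(),
                                         db_->DefaultColumnFamily(), begin, end);
    if (s.ok() && ++pending_ >= threshold_) {
      pending_ = 0;
      s = db_->Flush(rocksdb::FlushOptions());  // move tombstones out of the memtable
    }
    return s;
  }

 private:
  rocksdb::DB* db_;
  const int threshold_;
  std::atomic<int> pending_{0};
};
```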
diff --git a/docs/_sass/_blog.scss b/docs/_sass/_blog.scss index 74335d10b41..12a73c1fcda 100644 --- a/docs/_sass/_blog.scss +++ b/docs/_sass/_blog.scss @@ -35,11 +35,13 @@ border-radius: 50%; height: 50px; left: 50%; - margin-left: -25px; + margin-left: auto; + margin-right: auto; + display: inline-block; overflow: hidden; - position: absolute; + position: static; top: -25px; width: 50px; } } -} \ No newline at end of file +} diff --git a/docs/static/images/delrange/delrange_collapsed.png b/docs/static/images/delrange/delrange_collapsed.png new file mode 100644 index 00000000000..52246c2c1d6 Binary files /dev/null and b/docs/static/images/delrange/delrange_collapsed.png differ diff --git a/docs/static/images/delrange/delrange_key_schema.png b/docs/static/images/delrange/delrange_key_schema.png new file mode 100644 index 00000000000..0a14d4a3a52 Binary files /dev/null and b/docs/static/images/delrange/delrange_key_schema.png differ diff --git a/docs/static/images/delrange/delrange_sst_blocks.png b/docs/static/images/delrange/delrange_sst_blocks.png new file mode 100644 index 00000000000..6003e42ae89 Binary files /dev/null and b/docs/static/images/delrange/delrange_sst_blocks.png differ diff --git a/docs/static/images/delrange/delrange_uncollapsed.png b/docs/static/images/delrange/delrange_uncollapsed.png new file mode 100644 index 00000000000..39c7097af96 Binary files /dev/null and b/docs/static/images/delrange/delrange_uncollapsed.png differ diff --git a/docs/static/images/delrange/delrange_write_path.png b/docs/static/images/delrange/delrange_write_path.png new file mode 100644 index 00000000000..229dfb349ac Binary files /dev/null and b/docs/static/images/delrange/delrange_write_path.png differ diff --git a/env/env.cc b/env/env.cc index 9b7f5e40ded..a41feaf00e6 100644 --- a/env/env.cc +++ b/env/env.cc @@ -43,7 +43,7 @@ uint64_t Env::GetThreadID() const { Status Env::ReuseWritableFile(const std::string& fname, const std::string& old_fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) { Status s = RenameFile(old_fname, fname); if (!s.ok()) { @@ -242,11 +242,11 @@ void Fatal(Logger* info_log, const char* format, ...) { va_end(ap); } -void LogFlush(const shared_ptr& info_log) { +void LogFlush(const std::shared_ptr& info_log) { LogFlush(info_log.get()); } -void Log(const InfoLogLevel log_level, const shared_ptr& info_log, +void Log(const InfoLogLevel log_level, const std::shared_ptr& info_log, const char* format, ...) { va_list ap; va_start(ap, format); @@ -254,49 +254,49 @@ void Log(const InfoLogLevel log_level, const shared_ptr& info_log, va_end(ap); } -void Header(const shared_ptr& info_log, const char* format, ...) { +void Header(const std::shared_ptr& info_log, const char* format, ...) { va_list ap; va_start(ap, format); Headerv(info_log.get(), format, ap); va_end(ap); } -void Debug(const shared_ptr& info_log, const char* format, ...) { +void Debug(const std::shared_ptr& info_log, const char* format, ...) { va_list ap; va_start(ap, format); Debugv(info_log.get(), format, ap); va_end(ap); } -void Info(const shared_ptr& info_log, const char* format, ...) { +void Info(const std::shared_ptr& info_log, const char* format, ...) { va_list ap; va_start(ap, format); Infov(info_log.get(), format, ap); va_end(ap); } -void Warn(const shared_ptr& info_log, const char* format, ...) { +void Warn(const std::shared_ptr& info_log, const char* format, ...) 
{ va_list ap; va_start(ap, format); Warnv(info_log.get(), format, ap); va_end(ap); } -void Error(const shared_ptr& info_log, const char* format, ...) { +void Error(const std::shared_ptr& info_log, const char* format, ...) { va_list ap; va_start(ap, format); Errorv(info_log.get(), format, ap); va_end(ap); } -void Fatal(const shared_ptr& info_log, const char* format, ...) { +void Fatal(const std::shared_ptr& info_log, const char* format, ...) { va_list ap; va_start(ap, format); Fatalv(info_log.get(), format, ap); va_end(ap); } -void Log(const shared_ptr& info_log, const char* format, ...) { +void Log(const std::shared_ptr& info_log, const char* format, ...) { va_list ap; va_start(ap, format); Logv(info_log.get(), format, ap); @@ -305,7 +305,7 @@ void Log(const shared_ptr& info_log, const char* format, ...) { Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname, bool should_sync) { - unique_ptr file; + std::unique_ptr file; EnvOptions soptions; Status s = env->NewWritableFile(fname, &file, soptions); if (!s.ok()) { @@ -324,7 +324,7 @@ Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname, Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { EnvOptions soptions; data->clear(); - unique_ptr file; + std::unique_ptr file; Status s = env->NewSequentialFile(fname, &file, soptions); if (!s.ok()) { return s; diff --git a/env/env_basic_test.cc b/env/env_basic_test.cc index e33c79f3a29..22983dbecdb 100644 --- a/env/env_basic_test.cc +++ b/env/env_basic_test.cc @@ -2,17 +2,14 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include #include #include #include +#include -#include "cloud/aws/aws_env.h" #include "env/mock_env.h" #include "rocksdb/env.h" #include "rocksdb/utilities/object_registry.h" -#include "util/logging.h" -#include "util/stderr_logger.h" #include "util/testharness.h" namespace rocksdb { @@ -63,7 +60,9 @@ class EnvBasicTestWithParam : public testing::Test, test_dir_ = test::PerThreadDBPath(env_, "env_basic_test"); } - void SetUp() { env_->CreateDirIfMissing(test_dir_); } + void SetUp() { + env_->CreateDirIfMissing(test_dir_); + } void TearDown() { std::vector files; @@ -92,55 +91,6 @@ static std::unique_ptr mock_env(new MockEnv(Env::Default())); INSTANTIATE_TEST_CASE_P(MockEnv, EnvBasicTestWithParam, ::testing::Values(mock_env.get())); #ifndef ROCKSDB_LITE - -#ifdef USE_AWS -// Register an AWS env -void CreateAwsEnv(const std::string& dbpath, - std::unique_ptr* result) { - std::shared_ptr info_log; - info_log.reset(new rocksdb::StderrLogger(rocksdb::InfoLogLevel::DEBUG_LEVEL)); - std::string aws_access_key_id; - std::string aws_secret_access_key; - std::string aws_region; - Status st = rocksdb::AwsEnv::GetTestCredentials( - &aws_access_key_id, &aws_secret_access_key, &aws_region); - if (!st.ok()) { - Log(InfoLogLevel::DEBUG_LEVEL, info_log, st.ToString().c_str()); - return; - } - rocksdb::CloudEnvOptions coptions; - coptions.credentials.access_key_id = aws_access_key_id; - coptions.credentials.secret_key = aws_secret_access_key; - rocksdb::CloudEnv* s; - ROCKS_LOG_INFO(info_log, "Created new aws env with path %s", dbpath.c_str()); - st = rocksdb::AwsEnv::NewAwsEnv( - Env::Default(), - "envtest." + AwsEnv::GetTestBucketSuffix(), dbpath, aws_region, - "envtest." 
+ AwsEnv::GetTestBucketSuffix(), dbpath, aws_region, - coptions, std::move(info_log), &s); - assert(st.ok()); - if (!st.ok()) { - Log(InfoLogLevel::DEBUG_LEVEL, info_log, st.ToString().c_str()); - return; - } - ((CloudEnvImpl*)s)->TEST_DisableCloudManifest(); - ((AwsEnv*)s)->TEST_SetFileDeletionDelay(std::chrono::seconds(0)); - // If we are keeping wal in cloud storage, then tail it as well. - // so that our unit tests can run to completion. - if (!coptions.keep_local_log_files) { - AwsEnv* aws = static_cast(s); - aws->StartTailingStream(); - } - result->reset(new NormalizingEnvWrapper(s)); -} -static rocksdb::Registrar s3_reg( - "s3://.*", - [](const std::string& uri, std::unique_ptr* env_guard) { - CreateAwsEnv(uri, env_guard); - return env_guard->get(); - }); -#endif /* USE_AWS */ - static std::unique_ptr mem_env(NewMemEnv(Env::Default())); INSTANTIATE_TEST_CASE_P(MemEnv, EnvBasicTestWithParam, ::testing::Values(mem_env.get())); @@ -183,7 +133,7 @@ INSTANTIATE_TEST_CASE_P(CustomEnv, EnvMoreTestWithParam, TEST_P(EnvBasicTestWithParam, Basics) { uint64_t file_size; - unique_ptr writable_file; + std::unique_ptr writable_file; std::vector children; // Check that the directory is empty. @@ -236,8 +186,8 @@ TEST_P(EnvBasicTestWithParam, Basics) { ASSERT_EQ(0U, file_size); // Check that opening non-existent file fails. - unique_ptr seq_file; - unique_ptr rand_file; + std::unique_ptr seq_file; + std::unique_ptr rand_file; ASSERT_TRUE(!env_->NewSequentialFile(test_dir_ + "/non_existent", &seq_file, soptions_) .ok()); @@ -258,22 +208,20 @@ TEST_P(EnvBasicTestWithParam, Basics) { } TEST_P(EnvBasicTestWithParam, ReadWrite) { - unique_ptr writable_file; - unique_ptr seq_file; - unique_ptr rand_file; + std::unique_ptr writable_file; + std::unique_ptr seq_file; + std::unique_ptr rand_file; Slice result; char scratch[100]; - std::string fname = "/100.sst"; - ASSERT_OK( - env_->NewWritableFile(test_dir_ + fname, &writable_file, soptions_)); + ASSERT_OK(env_->NewWritableFile(test_dir_ + "/f", &writable_file, soptions_)); ASSERT_OK(writable_file->Append("hello ")); ASSERT_OK(writable_file->Append("world")); ASSERT_OK(writable_file->Close()); writable_file.reset(); // Read sequentially. - ASSERT_OK(env_->NewSequentialFile(test_dir_ + fname, &seq_file, soptions_)); + ASSERT_OK(env_->NewSequentialFile(test_dir_ + "/f", &seq_file, soptions_)); ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello". ASSERT_EQ(0, result.compare("hello")); ASSERT_OK(seq_file->Skip(1)); @@ -286,8 +234,7 @@ TEST_P(EnvBasicTestWithParam, ReadWrite) { ASSERT_EQ(0U, result.size()); // Random reads. - ASSERT_OK( - env_->NewRandomAccessFile(test_dir_ + fname, &rand_file, soptions_)); + ASSERT_OK(env_->NewRandomAccessFile(test_dir_ + "/f", &rand_file, soptions_)); ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world". ASSERT_EQ(0, result.compare("world")); ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello". @@ -297,12 +244,10 @@ TEST_P(EnvBasicTestWithParam, ReadWrite) { // Too high offset. ASSERT_TRUE(rand_file->Read(1000, 5, &result, scratch).ok()); - // delete test file - ASSERT_TRUE(env_->DeleteFile(test_dir_ + fname).ok()); } TEST_P(EnvBasicTestWithParam, Misc) { - unique_ptr writable_file; + std::unique_ptr writable_file; ASSERT_OK(env_->NewWritableFile(test_dir_ + "/b", &writable_file, soptions_)); // These are no-ops, but we test they return success. 
@@ -313,7 +258,6 @@ TEST_P(EnvBasicTestWithParam, Misc) { } TEST_P(EnvBasicTestWithParam, LargeWrite) { - std::string fname = "/f.log"; const size_t kWriteSize = 300 * 1024; char* scratch = new char[kWriteSize * 2]; @@ -322,17 +266,16 @@ TEST_P(EnvBasicTestWithParam, LargeWrite) { write_data.append(1, static_cast(i)); } - unique_ptr writable_file; - ASSERT_OK( - env_->NewWritableFile(test_dir_ + fname, &writable_file, soptions_)); + std::unique_ptr writable_file; + ASSERT_OK(env_->NewWritableFile(test_dir_ + "/f", &writable_file, soptions_)); ASSERT_OK(writable_file->Append("foo")); ASSERT_OK(writable_file->Append(write_data)); ASSERT_OK(writable_file->Close()); writable_file.reset(); - unique_ptr seq_file; + std::unique_ptr seq_file; Slice result; - ASSERT_OK(env_->NewSequentialFile(test_dir_ + fname, &seq_file, soptions_)); + ASSERT_OK(env_->NewSequentialFile(test_dir_ + "/f", &seq_file, soptions_)); ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo". ASSERT_EQ(0, result.compare("foo")); @@ -344,7 +287,7 @@ TEST_P(EnvBasicTestWithParam, LargeWrite) { read += result.size(); } ASSERT_TRUE(write_data == read_data); - delete[] scratch; + delete [] scratch; } TEST_P(EnvMoreTestWithParam, GetModTime) { @@ -397,7 +340,7 @@ TEST_P(EnvMoreTestWithParam, GetChildren) { // if dir is a file, returns IOError ASSERT_OK(env_->CreateDir(test_dir_)); - unique_ptr writable_file; + std::unique_ptr writable_file; ASSERT_OK( env_->NewWritableFile(test_dir_ + "/file", &writable_file, soptions_)); ASSERT_OK(writable_file->Close()); diff --git a/env/env_chroot.cc b/env/env_chroot.cc index 6a1fda8a834..f6236c81b2c 100644 --- a/env/env_chroot.cc +++ b/env/env_chroot.cc @@ -50,7 +50,7 @@ class ChrootEnv : public EnvWrapper { } virtual Status NewRandomAccessFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { auto status_and_enc_path = EncodePathWithNewBasename(fname); if (!status_and_enc_path.first.ok()) { @@ -61,7 +61,7 @@ class ChrootEnv : public EnvWrapper { } virtual Status NewWritableFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { auto status_and_enc_path = EncodePathWithNewBasename(fname); if (!status_and_enc_path.first.ok()) { @@ -73,7 +73,7 @@ class ChrootEnv : public EnvWrapper { virtual Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { auto status_and_enc_path = EncodePathWithNewBasename(fname); if (!status_and_enc_path.first.ok()) { @@ -89,7 +89,7 @@ class ChrootEnv : public EnvWrapper { } virtual Status NewRandomRWFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { auto status_and_enc_path = EncodePathWithNewBasename(fname); if (!status_and_enc_path.first.ok()) { @@ -100,7 +100,7 @@ class ChrootEnv : public EnvWrapper { } virtual Status NewDirectory(const std::string& dir, - unique_ptr* result) override { + std::unique_ptr* result) override { auto status_and_enc_path = EncodePathWithNewBasename(dir); if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; @@ -238,7 +238,7 @@ class ChrootEnv : public EnvWrapper { } virtual Status NewLogger(const std::string& fname, - shared_ptr* result) override { + std::shared_ptr* result) override { auto status_and_enc_path = EncodePathWithNewBasename(fname); if (!status_and_enc_path.first.ok()) { return 
status_and_enc_path.first; diff --git a/env/env_encryption.cc b/env/env_encryption.cc index e80796fe0c7..e38693e3ce7 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -422,7 +422,7 @@ class EncryptedEnv : public EnvWrapper { // NewRandomAccessFile opens a file for random read access. virtual Status NewRandomAccessFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { result->reset(); if (options.use_mmap_reads) { @@ -456,10 +456,10 @@ class EncryptedEnv : public EnvWrapper { (*result) = std::unique_ptr(new EncryptedRandomAccessFile(underlying.release(), stream.release(), prefixLength)); return Status::OK(); } - + // NewWritableFile opens a file for sequential writing. virtual Status NewWritableFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { result->reset(); if (options.use_mmap_writes) { @@ -505,8 +505,8 @@ class EncryptedEnv : public EnvWrapper { // // The returned file will only be accessed by one thread at a time. virtual Status ReopenWritableFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) override { + std::unique_ptr* result, + const EnvOptions& options) override { result->reset(); if (options.use_mmap_writes) { return Status::InvalidArgument(); @@ -546,7 +546,7 @@ class EncryptedEnv : public EnvWrapper { // Reuse an existing file by renaming it and opening it as writable. virtual Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { result->reset(); if (options.use_mmap_writes) { @@ -590,7 +590,7 @@ class EncryptedEnv : public EnvWrapper { // // The returned file will only be accessed by one thread at a time. virtual Status NewRandomRWFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { result->reset(); if (options.use_mmap_reads || options.use_mmap_writes) { @@ -692,7 +692,7 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char *data, size_t auto blockSize = BlockSize(); uint64_t blockIndex = fileOffset / blockSize; size_t blockOffset = fileOffset % blockSize; - unique_ptr blockBuffer; + std::unique_ptr blockBuffer; std::string scratch; AllocateScratch(scratch); @@ -705,8 +705,8 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char *data, size_t // We're not encrypting a full block. // Copy data to blockBuffer if (!blockBuffer.get()) { - // Allocate buffer - blockBuffer = unique_ptr(new char[blockSize]); + // Allocate buffer + blockBuffer = std::unique_ptr(new char[blockSize]); } block = blockBuffer.get(); // Copy plain data to block buffer @@ -737,7 +737,7 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char *data, size_t auto blockSize = BlockSize(); uint64_t blockIndex = fileOffset / blockSize; size_t blockOffset = fileOffset % blockSize; - unique_ptr blockBuffer; + std::unique_ptr blockBuffer; std::string scratch; AllocateScratch(scratch); @@ -750,8 +750,8 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char *data, size_t // We're not decrypting a full block. 
// Copy data to blockBuffer if (!blockBuffer.get()) { - // Allocate buffer - blockBuffer = unique_ptr(new char[blockSize]); + // Allocate buffer + blockBuffer = std::unique_ptr(new char[blockSize]); } block = blockBuffer.get(); // Copy encrypted data to block buffer @@ -882,7 +882,9 @@ size_t CTREncryptionProvider::PopulateSecretPrefixPart(char* /*prefix*/, return 0; } -Status CTREncryptionProvider::CreateCipherStream(const std::string& fname, const EnvOptions& options, Slice &prefix, unique_ptr* result) { +Status CTREncryptionProvider::CreateCipherStream( + const std::string& fname, const EnvOptions& options, Slice& prefix, + std::unique_ptr* result) { // Read plain text part of prefix. auto blockSize = cipher_.BlockSize(); uint64_t initialCounter; @@ -905,8 +907,9 @@ Status CTREncryptionProvider::CreateCipherStream(const std::string& fname, const Status CTREncryptionProvider::CreateCipherStreamFromPrefix( const std::string& /*fname*/, const EnvOptions& /*options*/, uint64_t initialCounter, const Slice& iv, const Slice& /*prefix*/, - unique_ptr* result) { - (*result) = unique_ptr(new CTRCipherStream(cipher_, iv.data(), initialCounter)); + std::unique_ptr* result) { + (*result) = std::unique_ptr( + new CTRCipherStream(cipher_, iv.data(), initialCounter)); return Status::OK(); } diff --git a/env/env_hdfs.cc b/env/env_hdfs.cc index 1eaea3a1ce5..14fb902f0d4 100644 --- a/env/env_hdfs.cc +++ b/env/env_hdfs.cc @@ -381,7 +381,7 @@ const std::string HdfsEnv::pathsep = "/"; // open a file for sequential reading Status HdfsEnv::NewSequentialFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) { result->reset(); HdfsReadableFile* f = new HdfsReadableFile(fileSys_, fname); @@ -396,7 +396,7 @@ Status HdfsEnv::NewSequentialFile(const std::string& fname, // open a file for random reading Status HdfsEnv::NewRandomAccessFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) { result->reset(); HdfsReadableFile* f = new HdfsReadableFile(fileSys_, fname); @@ -411,7 +411,7 @@ Status HdfsEnv::NewRandomAccessFile(const std::string& fname, // create a new file for writing Status HdfsEnv::NewWritableFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) { result->reset(); Status s; @@ -437,7 +437,7 @@ class HdfsDirectory : public Directory { }; Status HdfsEnv::NewDirectory(const std::string& name, - unique_ptr* result) { + std::unique_ptr* result) { int value = hdfsExists(fileSys_, name.c_str()); switch (value) { case HDFS_EXISTS: @@ -581,7 +581,7 @@ Status HdfsEnv::UnlockFile(FileLock* lock) { } Status HdfsEnv::NewLogger(const std::string& fname, - shared_ptr* result) { + std::shared_ptr* result) { HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname); if (f == nullptr || !f->isValid()) { delete f; @@ -610,10 +610,10 @@ Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname) { // dummy placeholders used when HDFS is not available namespace rocksdb { Status HdfsEnv::NewSequentialFile(const std::string& /*fname*/, - unique_ptr* /*result*/, + std::unique_ptr* /*result*/, const EnvOptions& /*options*/) { return Status::NotSupported("Not compiled with hdfs support"); - } +} Status NewHdfsEnv(Env** /*hdfs_env*/, const std::string& /*fsname*/) { return Status::NotSupported("Not compiled with hdfs support"); diff --git a/env/env_posix.cc b/env/env_posix.cc index 34d49b9dc15..c2e456a6614 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -142,7 +142,7 
@@ class PosixEnv : public Env { } virtual Status NewSequentialFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { result->reset(); int fd = -1; @@ -192,7 +192,7 @@ class PosixEnv : public Env { } virtual Status NewRandomAccessFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { result->reset(); Status s; @@ -249,7 +249,7 @@ class PosixEnv : public Env { } virtual Status OpenWritableFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options, bool reopen = false) { result->reset(); @@ -333,20 +333,20 @@ class PosixEnv : public Env { } virtual Status NewWritableFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { return OpenWritableFile(fname, result, options, false); } virtual Status ReopenWritableFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { return OpenWritableFile(fname, result, options, true); } virtual Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { result->reset(); Status s; @@ -430,7 +430,7 @@ class PosixEnv : public Env { } virtual Status NewRandomRWFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { int fd = -1; int flags = cloexec_flags(O_RDWR, &options); @@ -455,7 +455,7 @@ class PosixEnv : public Env { virtual Status NewMemoryMappedFileBuffer( const std::string& fname, - unique_ptr* result) override { + std::unique_ptr* result) override { int fd = -1; Status status; int flags = cloexec_flags(O_RDWR, nullptr); @@ -497,7 +497,7 @@ class PosixEnv : public Env { } virtual Status NewDirectory(const std::string& name, - unique_ptr* result) override { + std::unique_ptr* result) override { result->reset(); int fd; int flags = cloexec_flags(0, nullptr); @@ -791,7 +791,7 @@ class PosixEnv : public Env { } virtual Status NewLogger(const std::string& fname, - shared_ptr* result) override { + std::shared_ptr* result) override { FILE* f; { IOSTATS_TIMER_GUARD(open_nanos); diff --git a/env/env_test.cc b/env/env_test.cc index eda6b9d5d76..36cbd735d7d 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -181,11 +181,11 @@ TEST_F(EnvPosixTest, DISABLED_FilePermission) { std::vector fileNames{ test::PerThreadDBPath(env_, "testfile"), test::PerThreadDBPath(env_, "testfile1")}; - unique_ptr wfile; + std::unique_ptr wfile; ASSERT_OK(env_->NewWritableFile(fileNames[0], &wfile, soptions)); ASSERT_OK(env_->NewWritableFile(fileNames[1], &wfile, soptions)); wfile.reset(); - unique_ptr rwfile; + std::unique_ptr rwfile; ASSERT_OK(env_->NewRandomRWFile(fileNames[1], &rwfile, soptions)); struct stat sb; @@ -217,7 +217,7 @@ TEST_F(EnvPosixTest, MemoryMappedFileBuffer) { std::string expected_data; std::string fname = test::PerThreadDBPath(env_, "testfile"); { - unique_ptr wfile; + std::unique_ptr wfile; const EnvOptions soptions; ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); @@ -812,7 +812,7 @@ class IoctlFriendlyTmpdir { #ifndef ROCKSDB_LITE TEST_F(EnvPosixTest, PositionedAppend) { - unique_ptr writable_file; + std::unique_ptr writable_file; EnvOptions options; options.use_direct_writes = true; options.use_mmap_writes = false; @@ -832,7 +832,7 @@ TEST_F(EnvPosixTest, PositionedAppend) { // The file now has 
1 sector worth of a followed by a page worth of b // Verify the above - unique_ptr seq_file; + std::unique_ptr seq_file; ASSERT_OK(env_->NewSequentialFile(ift.name() + "/f", &seq_file, options)); char scratch[kPageSize * 2]; Slice result; @@ -851,10 +851,10 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueID) { soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; IoctlFriendlyTmpdir ift; std::string fname = ift.name() + "/testfile"; - unique_ptr wfile; + std::unique_ptr wfile; ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); - unique_ptr file; + std::unique_ptr file; // Get Unique ID ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); @@ -921,7 +921,7 @@ TEST_P(EnvPosixTestWithParam, AllocateTest) { EnvOptions soptions; soptions.use_mmap_writes = false; soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; - unique_ptr wfile; + std::unique_ptr wfile; ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); // allocate 100 MB @@ -990,14 +990,14 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDConcurrent) { fnames.push_back(ift.name() + "/" + "testfile" + ToString(i)); // Create file. - unique_ptr wfile; + std::unique_ptr wfile; ASSERT_OK(env_->NewWritableFile(fnames[i], &wfile, soptions)); } // Collect and check whether the IDs are unique. std::unordered_set ids; for (const std::string fname : fnames) { - unique_ptr file; + std::unique_ptr file; std::string unique_id; ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); @@ -1033,14 +1033,14 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDDeletes) { for (int i = 0; i < 1000; ++i) { // Create file. { - unique_ptr wfile; + std::unique_ptr wfile; ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); } // Get Unique ID std::string unique_id; { - unique_ptr file; + std::unique_ptr file; ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); ASSERT_TRUE(id_size > 0); @@ -1076,7 +1076,7 @@ TEST_P(EnvPosixTestWithParam, InvalidateCache) { // Create file. 
{ - unique_ptr wfile; + std::unique_ptr wfile; #if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) if (soptions.use_direct_writes) { soptions.use_direct_writes = false; @@ -1090,7 +1090,7 @@ TEST_P(EnvPosixTestWithParam, InvalidateCache) { // Random Read { - unique_ptr file; + std::unique_ptr file; auto scratch = NewAligned(kSectorSize, 0); Slice result; #if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) @@ -1107,7 +1107,7 @@ TEST_P(EnvPosixTestWithParam, InvalidateCache) { // Sequential Read { - unique_ptr file; + std::unique_ptr file; auto scratch = NewAligned(kSectorSize, 0); Slice result; #if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) @@ -1252,7 +1252,7 @@ TEST_P(EnvPosixTestWithParam, LogBufferMaxSizeTest) { TEST_P(EnvPosixTestWithParam, Preallocation) { rocksdb::SyncPoint::GetInstance()->EnableProcessing(); const std::string src = test::PerThreadDBPath(env_, "testfile"); - unique_ptr srcfile; + std::unique_ptr srcfile; EnvOptions soptions; soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; #if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD) @@ -1315,7 +1315,7 @@ TEST_P(EnvPosixTestWithParam, ConsistentChildrenAttributes) { for (int i = 0; i < kNumChildren; ++i) { const std::string path = test::TmpDir(env_) + "/" + "testfile_" + std::to_string(i); - unique_ptr file; + std::unique_ptr file; #if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD) if (soptions.use_direct_writes) { rocksdb::SyncPoint::GetInstance()->SetCallBack( @@ -1368,50 +1368,110 @@ TEST_P(EnvPosixTestWithParam, WritableFileWrapper) { inc(1); return Status::OK(); } - Status Truncate(uint64_t /*size*/) override { return Status::OK(); } - Status Close() override { inc(2); return Status::OK(); } - Status Flush() override { inc(3); return Status::OK(); } - Status Sync() override { inc(4); return Status::OK(); } - Status Fsync() override { inc(5); return Status::OK(); } - void SetIOPriority(Env::IOPriority /*pri*/) override { inc(6); } - uint64_t GetFileSize() override { inc(7); return 0; } + + Status PositionedAppend(const Slice& /*data*/, + uint64_t /*offset*/) override { + inc(2); + return Status::OK(); + } + + Status Truncate(uint64_t /*size*/) override { + inc(3); + return Status::OK(); + } + + Status Close() override { + inc(4); + return Status::OK(); + } + + Status Flush() override { + inc(5); + return Status::OK(); + } + + Status Sync() override { + inc(6); + return Status::OK(); + } + + Status Fsync() override { + inc(7); + return Status::OK(); + } + + bool IsSyncThreadSafe() const override { + inc(8); + return true; + } + + bool use_direct_io() const override { + inc(9); + return true; + } + + size_t GetRequiredBufferAlignment() const override { + inc(10); + return 0; + } + + void SetIOPriority(Env::IOPriority /*pri*/) override { inc(11); } + + Env::IOPriority GetIOPriority() override { + inc(12); + return Env::IOPriority::IO_LOW; + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint /*hint*/) override { + inc(13); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + inc(14); + return Env::WriteLifeTimeHint::WLTH_NOT_SET; + } + + uint64_t GetFileSize() override { + inc(15); + return 0; + } + + void SetPreallocationBlockSize(size_t /*size*/) override { inc(16); } + void GetPreallocationStatus(size_t* 
/*block_size*/, size_t* /*last_allocated_block*/) override { - inc(8); + inc(17); } + size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override { - inc(9); + inc(18); return 0; } + Status InvalidateCache(size_t /*offset*/, size_t /*length*/) override { - inc(10); + inc(19); return Status::OK(); } - protected: - Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) override { - inc(11); + Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) override { + inc(20); return Status::OK(); } - Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) override { - inc(12); + + void PrepareWrite(size_t /*offset*/, size_t /*len*/) override { inc(21); } + + Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) override { + inc(22); return Status::OK(); } public: - ~Base() { - inc(13); - } + ~Base() { inc(23); } }; class Wrapper : public WritableFileWrapper { public: explicit Wrapper(WritableFile* target) : WritableFileWrapper(target) {} - - void CallProtectedMethods() { - Allocate(0, 0); - RangeSync(0, 0); - } }; int step = 0; @@ -1420,19 +1480,30 @@ TEST_P(EnvPosixTestWithParam, WritableFileWrapper) { Base b(&step); Wrapper w(&b); w.Append(Slice()); + w.PositionedAppend(Slice(), 0); + w.Truncate(0); w.Close(); w.Flush(); w.Sync(); w.Fsync(); + w.IsSyncThreadSafe(); + w.use_direct_io(); + w.GetRequiredBufferAlignment(); w.SetIOPriority(Env::IOPriority::IO_HIGH); + w.GetIOPriority(); + w.SetWriteLifeTimeHint(Env::WriteLifeTimeHint::WLTH_NOT_SET); + w.GetWriteLifeTimeHint(); w.GetFileSize(); + w.SetPreallocationBlockSize(0); w.GetPreallocationStatus(nullptr, nullptr); w.GetUniqueId(nullptr, 0); w.InvalidateCache(0, 0); - w.CallProtectedMethods(); + w.RangeSync(0, 0); + w.PrepareWrite(0, 0); + w.Allocate(0, 0); } - EXPECT_EQ(14, step); + EXPECT_EQ(24, step); } TEST_P(EnvPosixTestWithParam, PosixRandomRWFile) { @@ -1567,7 +1638,7 @@ TEST_P(EnvPosixTestWithParam, PosixRandomRWFileRandomized) { const std::string path = test::PerThreadDBPath(env_, "random_rw_file_rand"); env_->DeleteFile(path); - unique_ptr file; + std::unique_ptr file; #ifdef OS_LINUX // Cannot open non-existing file. @@ -1641,7 +1712,7 @@ class TestEnv : public EnvWrapper { int GetCloseCount() { return close_count; } virtual Status NewLogger(const std::string& /*fname*/, - shared_ptr* result) { + std::shared_ptr* result) { result->reset(new TestLogger(this)); return Status::OK(); } @@ -1685,8 +1756,8 @@ INSTANTIATE_TEST_CASE_P(DefaultEnvWithDirectIO, EnvPosixTestWithParam, #endif // !defined(ROCKSDB_LITE) #if !defined(ROCKSDB_LITE) && !defined(OS_WIN) -static unique_ptr chroot_env(NewChrootEnv(Env::Default(), - test::TmpDir(Env::Default()))); +static std::unique_ptr chroot_env( + NewChrootEnv(Env::Default(), test::TmpDir(Env::Default()))); INSTANTIATE_TEST_CASE_P( ChrootEnvWithoutDirectIO, EnvPosixTestWithParam, ::testing::Values(std::pair(chroot_env.get(), false))); diff --git a/env/mock_env.cc b/env/mock_env.cc index 12c096cefba..84b30607172 100644 --- a/env/mock_env.cc +++ b/env/mock_env.cc @@ -319,7 +319,7 @@ class TestMemLogger : public Logger { static const uint64_t flush_every_seconds_ = 5; std::atomic_uint_fast64_t last_flush_micros_; Env* env_; - bool flush_pending_; + std::atomic flush_pending_; public: TestMemLogger(std::unique_ptr f, Env* env, @@ -424,7 +424,7 @@ MockEnv::~MockEnv() { // Partial implementation of the Env interface. 
Status MockEnv::NewSequentialFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& /*soptions*/) { auto fn = NormalizePath(fname); MutexLock lock(&mutex_); @@ -441,7 +441,7 @@ Status MockEnv::NewSequentialFile(const std::string& fname, } Status MockEnv::NewRandomAccessFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& /*soptions*/) { auto fn = NormalizePath(fname); MutexLock lock(&mutex_); @@ -458,7 +458,7 @@ Status MockEnv::NewRandomAccessFile(const std::string& fname, } Status MockEnv::NewRandomRWFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& /*soptions*/) { auto fn = NormalizePath(fname); MutexLock lock(&mutex_); @@ -476,7 +476,7 @@ Status MockEnv::NewRandomRWFile(const std::string& fname, Status MockEnv::ReuseWritableFile(const std::string& fname, const std::string& old_fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) { auto s = RenameFile(old_fname, fname); if (!s.ok()) { @@ -487,7 +487,7 @@ Status MockEnv::ReuseWritableFile(const std::string& fname, } Status MockEnv::NewWritableFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& env_options) { auto fn = NormalizePath(fname); MutexLock lock(&mutex_); @@ -503,7 +503,7 @@ Status MockEnv::NewWritableFile(const std::string& fname, } Status MockEnv::NewDirectory(const std::string& /*name*/, - unique_ptr* result) { + std::unique_ptr* result) { result->reset(new MockEnvDirectory()); return Status::OK(); } @@ -660,7 +660,7 @@ Status MockEnv::LinkFile(const std::string& src, const std::string& dest) { } Status MockEnv::NewLogger(const std::string& fname, - shared_ptr* result) { + std::shared_ptr* result) { auto fn = NormalizePath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); diff --git a/env/mock_env.h b/env/mock_env.h index 816256ab08c..87b8deaf8c3 100644 --- a/env/mock_env.h +++ b/env/mock_env.h @@ -28,28 +28,28 @@ class MockEnv : public EnvWrapper { // Partial implementation of the Env interface. 
virtual Status NewSequentialFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& soptions) override; virtual Status NewRandomAccessFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& soptions) override; virtual Status NewRandomRWFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override; virtual Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override; virtual Status NewWritableFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& env_options) override; virtual Status NewDirectory(const std::string& name, - unique_ptr* result) override; + std::unique_ptr* result) override; virtual Status FileExists(const std::string& fname) override; @@ -81,7 +81,7 @@ class MockEnv : public EnvWrapper { const std::string& target) override; virtual Status NewLogger(const std::string& fname, - shared_ptr* result) override; + std::shared_ptr* result) override; virtual Status LockFile(const std::string& fname, FileLock** flock) override; diff --git a/env/mock_env_test.cc b/env/mock_env_test.cc index 19e259ccd85..abd5b89f0b7 100644 --- a/env/mock_env_test.cc +++ b/env/mock_env_test.cc @@ -29,7 +29,7 @@ TEST_F(MockEnvTest, Corrupt) { const std::string kGood = "this is a good string, synced to disk"; const std::string kCorrupted = "this part may be corrupted"; const std::string kFileName = "/dir/f"; - unique_ptr writable_file; + std::unique_ptr writable_file; ASSERT_OK(env_->NewWritableFile(kFileName, &writable_file, soptions_)); ASSERT_OK(writable_file->Append(kGood)); ASSERT_TRUE(writable_file->GetFileSize() == kGood.size()); @@ -37,7 +37,7 @@ TEST_F(MockEnvTest, Corrupt) { std::string scratch; scratch.resize(kGood.size() + kCorrupted.size() + 16); Slice result; - unique_ptr rand_file; + std::unique_ptr rand_file; ASSERT_OK(env_->NewRandomAccessFile(kFileName, &rand_file, soptions_)); ASSERT_OK(rand_file->Read(0, kGood.size(), &result, &(scratch[0]))); ASSERT_EQ(result.compare(kGood), 0); diff --git a/hdfs/env_hdfs.h b/hdfs/env_hdfs.h index b0c9e33fd78..a77c42e0af8 100644 --- a/hdfs/env_hdfs.h +++ b/hdfs/env_hdfs.h @@ -255,23 +255,24 @@ class HdfsEnv : public Env { } virtual Status NewSequentialFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override; - virtual Status NewRandomAccessFile(const std::string& /*fname*/, - unique_ptr* /*result*/, - const EnvOptions& /*options*/) override { + virtual Status NewRandomAccessFile( + const std::string& /*fname*/, + std::unique_ptr* /*result*/, + const EnvOptions& /*options*/) override { return notsup; } virtual Status NewWritableFile(const std::string& /*fname*/, - unique_ptr* /*result*/, + std::unique_ptr* /*result*/, const EnvOptions& /*options*/) override { return notsup; } virtual Status NewDirectory(const std::string& /*name*/, - unique_ptr* /*result*/) override { + std::unique_ptr* /*result*/) override { return notsup; } @@ -328,7 +329,7 @@ class HdfsEnv : public Env { virtual Status UnlockFile(FileLock* /*lock*/) override { return notsup; } virtual Status NewLogger(const std::string& /*fname*/, - shared_ptr* /*result*/) override { + std::shared_ptr* /*result*/) override { return notsup; } diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 940a6f6b74a..fe331482e26 100644 
--- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -413,6 +413,7 @@ struct AdvancedColumnFamilyOptions { // of the level. // At the same time max_bytes_for_level_multiplier and // max_bytes_for_level_multiplier_additional are still satisfied. + // (When L0 is too large, we make some adjustment. See below.) // // With this option on, from an empty DB, we make last level the base level, // which means merging L0 data into the last level, until it exceeds @@ -451,6 +452,29 @@ struct AdvancedColumnFamilyOptions { // max_bytes_for_level_base, for a more predictable LSM tree shape. It is // useful to limit worse case space amplification. // + // + // If compaction from L0 lags behind, a special mode is turned on that + // prioritizes write amplification over max_bytes_for_level_multiplier and + // max_bytes_for_level_base. Whether L0 compaction lags behind is determined + // from the number of L0 files and the total L0 size: if the number of L0 files + // is at least double level0_file_num_compaction_trigger, or the total size is + // at least max_bytes_for_level_base, this mode is turned on. The target of L1 + // then grows to the actual data size in L0, and the target of each level is + // determined so that every level has the same level multiplier. + // + // For example, suppose the L0 size is 100MB, the size of the last level is + // 1600MB, max_bytes_for_level_base = 80MB, and max_bytes_for_level_multiplier = 10. + // Since the L0 size is larger than max_bytes_for_level_base, this is the L0 + // compaction backlogged mode, so the L1 target is set to 100MB. + // Based on max_bytes_for_level_multiplier = 10, at least 3 non-0 levels will + // be needed. The level multiplier is then calculated to be 4, and the three + // levels' targets become [100MB, 400MB, 1600MB]. + // + // In this mode, the number of levels will be no more than in the normal mode, + // and the level multiplier will be lower. Write amplification is therefore + // likely to be reduced. + // + // // max_bytes_for_level_multiplier_additional is ignored with this flag on. // // Turning this feature on or off for an existing DB can cause unexpected @@ -478,19 +502,25 @@ struct AdvancedColumnFamilyOptions { // threshold. But it's not guaranteed. // Value 0 will be sanitized. // - // Default: result.target_file_size_base * 25 + // Default: target_file_size_base * 25 + // + // Dynamically changeable through SetOptions() API uint64_t max_compaction_bytes = 0; // All writes will be slowed down to at least delayed_write_rate if estimated // bytes needed to be compaction exceed this threshold. // // Default: 64GB + // + // Dynamically changeable through SetOptions() API uint64_t soft_pending_compaction_bytes_limit = 64 * 1073741824ull; // All writes are stopped if estimated bytes needed to be compaction exceed // this threshold. // // Default: 256GB + // + // Dynamically changeable through SetOptions() API uint64_t hard_pending_compaction_bytes_limit = 256 * 1073741824ull; // The compaction style. 
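Editor's note: to make the worked example in the comment above concrete, the following is an illustrative sketch (not part of this diff) of the arithmetic that derives the level count, the uniform multiplier, and the per-level targets in the L0-backlogged mode. The function name and rounding behavior are assumptions for exposition only; the real logic lives inside RocksDB's compaction code and may differ in detail.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Grow the base target to the L0 size, pick the smallest number of non-L0
// levels that still reaches the last level with the configured multiplier,
// then derive a smaller uniform multiplier between that base and the last
// level.
std::vector<uint64_t> SketchLevelTargets(uint64_t l0_size,
                                         uint64_t last_level_size,
                                         uint64_t max_bytes_for_level_base,
                                         double max_bytes_for_level_multiplier) {
  uint64_t base = std::max(l0_size, max_bytes_for_level_base);
  int num_levels = 1;
  double reach = static_cast<double>(base);
  while (reach < static_cast<double>(last_level_size)) {
    reach *= max_bytes_for_level_multiplier;
    ++num_levels;
  }
  double multiplier =
      num_levels > 1 ? std::pow(static_cast<double>(last_level_size) / base,
                                1.0 / (num_levels - 1))
                     : 1.0;
  std::vector<uint64_t> targets;
  double target = static_cast<double>(base);
  for (int i = 0; i < num_levels; ++i) {
    targets.push_back(static_cast<uint64_t>(target));
    target *= multiplier;
  }
  // With the numbers above (L0 = 100MB, last level = 1600MB, base = 80MB,
  // multiplier = 10) this yields 3 levels, a multiplier of 4, and targets
  // {100MB, 400MB, 1600MB}.
  return targets;
}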
Default: kCompactionStyleLevel @@ -502,13 +532,17 @@ struct AdvancedColumnFamilyOptions { CompactionPri compaction_pri = kByCompensatedSize; // The options needed to support Universal Style compactions + // + // Dynamically changeable through SetOptions() API + // Dynamic change example: + // SetOptions("compaction_options_universal", "{size_ratio=2;}") CompactionOptionsUniversal compaction_options_universal; // The options for FIFO compaction style // // Dynamically changeable through SetOptions() API // Dynamic change example: - // SetOption("compaction_options_fifo", "{max_table_files_size=100;ttl=2;}") + // SetOptions("compaction_options_fifo", "{max_table_files_size=100;ttl=2;}") CompactionOptionsFIFO compaction_options_fifo; // An iteration->Next() sequentially skips over keys with the same @@ -578,7 +612,10 @@ struct AdvancedColumnFamilyOptions { bool optimize_filters_for_hits = false; // After writing every SST file, reopen it and read all the keys. + // // Default: false + // + // Dynamically changeable through SetOptions() API bool paranoid_file_checks = false; // In debug mode, RocksDB run consistency checks on the LSM every time the LSM @@ -588,7 +625,10 @@ struct AdvancedColumnFamilyOptions { bool force_consistency_checks = false; // Measure IO stats in compactions and flushes, if true. + // // Default: false + // + // Dynamically changeable through SetOptions() API bool report_bg_io_stats = false; // Non-bottom-level files older than TTL will go through the compaction diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 0899ed62559..cf46054aa34 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -1422,6 +1422,10 @@ extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_smallestkey( const rocksdb_livefiles_t*, int index, size_t* size); extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_largestkey( const rocksdb_livefiles_t*, int index, size_t* size); +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_livefiles_entries( + const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_livefiles_deletions( + const rocksdb_livefiles_t*, int index); extern ROCKSDB_LIBRARY_API void rocksdb_livefiles_destroy( const rocksdb_livefiles_t*); diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index da3b934d830..190112b37e8 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -25,6 +25,7 @@ #include #include #include +#include "rocksdb/memory_allocator.h" #include "rocksdb/slice.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" @@ -58,13 +59,24 @@ struct LRUCacheOptions { // BlockBasedTableOptions::cache_index_and_filter_blocks_with_high_priority. double high_pri_pool_ratio = 0.0; + // If non-nullptr will use this allocator instead of system allocator when + // allocating memory for cache blocks. Call this method before you start using + // the cache! + // + // Caveat: when the cache is used as block cache, the memory allocator is + // ignored when dealing with compression libraries that allocate memory + // internally (currently only XPRESS). 
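Editor's note: several options above are newly documented as dynamically changeable. As an illustrative sketch (not part of this diff), this is how such options can be changed on a live DB through the SetOptions() API referenced in those comments. The function name is hypothetical, `db` and `cf` are assumed to be an open rocksdb::DB and a column family handle, and the values are examples only.

#include <cassert>

#include "rocksdb/db.h"

void TuneCompactionOnline(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf) {
  // Nested option structs take the brace syntax shown in the comments above.
  rocksdb::Status s = db->SetOptions(
      cf, {{"compaction_options_universal", "{size_ratio=2;}"},
           {"paranoid_file_checks", "true"}});
  assert(s.ok());
}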
+ std::shared_ptr memory_allocator; + LRUCacheOptions() {} LRUCacheOptions(size_t _capacity, int _num_shard_bits, - bool _strict_capacity_limit, double _high_pri_pool_ratio) + bool _strict_capacity_limit, double _high_pri_pool_ratio, + std::shared_ptr _memory_allocator = nullptr) : capacity(_capacity), num_shard_bits(_num_shard_bits), strict_capacity_limit(_strict_capacity_limit), - high_pri_pool_ratio(_high_pri_pool_ratio) {} + high_pri_pool_ratio(_high_pri_pool_ratio), + memory_allocator(std::move(_memory_allocator)) {} }; // Create a new cache with a fixed size capacity. The cache is sharded @@ -75,10 +87,10 @@ struct LRUCacheOptions { // high_pri_pool_pct. // num_shard_bits = -1 means it is automatically determined: every shard // will be at least 512KB and number of shard bits will not exceed 6. -extern std::shared_ptr NewLRUCache(size_t capacity, - int num_shard_bits = -1, - bool strict_capacity_limit = false, - double high_pri_pool_ratio = 0.0); +extern std::shared_ptr NewLRUCache( + size_t capacity, int num_shard_bits = -1, + bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.0, + std::shared_ptr memory_allocator = nullptr); extern std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts); @@ -97,7 +109,8 @@ class Cache { // likely to get evicted than low priority entries. enum class Priority { HIGH, LOW }; - Cache() {} + Cache(std::shared_ptr allocator = nullptr) + : memory_allocator_(std::move(allocator)) {} // Destroys all existing entries by calling the "deleter" // function that was passed via the Insert() function. @@ -228,10 +241,14 @@ class Cache { virtual void TEST_mark_as_data_block(const Slice& /*key*/, size_t /*charge*/) {} + MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); } + private: // No copying allowed Cache(const Cache&); Cache& operator=(const Cache&); + + std::shared_ptr memory_allocator_; }; } // namespace rocksdb diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index f1430bce83f..6a37084c52e 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -287,16 +287,12 @@ class DB { // a non-OK status on error. It is not an error if no keys exist in the range // ["begin_key", "end_key"). // - // This feature is currently an experimental performance optimization for - // deleting very large ranges of contiguous keys. Invoking it many times or on - // small ranges may severely degrade read performance; in particular, the - // resulting performance can be worse than calling Delete() for each key in - // the range. Note also the degraded read performance affects keys outside the - // deleted ranges, and affects database operations involving scans, like flush - // and compaction. - // - // Consider setting ReadOptions::ignore_range_deletions = true to speed - // up reads for key(s) that are known to be unaffected by range deletions. + // This feature is now usable in production, with the following caveats: + // 1) Accumulating many range tombstones in the memtable will degrade read + // performance; this can be avoided by manually flushing occasionally. + // 2) Limiting the maximum number of open files in the presence of range + // tombstones can degrade read performance. To avoid this problem, set + // max_open_files to -1 whenever possible. virtual Status DeleteRange(const WriteOptions& options, ColumnFamilyHandle* column_family, const Slice& begin_key, const Slice& end_key); @@ -572,6 +568,11 @@ class DB { // log files that should be kept. 
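Editor's note: the updated DeleteRange() comment above lists two caveats. As an illustrative sketch (not part of this diff), the first caveat can be handled by flushing once enough range tombstones have accumulated. The helper name, the `tombstones_since_flush` counter, and the threshold of 1000 are assumptions for the example.

#include "rocksdb/db.h"

rocksdb::Status DropKeyRange(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf,
                             const rocksdb::Slice& begin,
                             const rocksdb::Slice& end,
                             int* tombstones_since_flush) {
  rocksdb::Status s = db->DeleteRange(rocksdb::WriteOptions(), cf, begin, end);
  if (!s.ok()) {
    return s;
  }
  // Caveat 1: many range tombstones in the memtable degrade reads, so flush
  // occasionally. Caveat 2 is a configuration matter: prefer
  // max_open_files = -1 in DBOptions when range tombstones are common.
  if (++*tombstones_since_flush >= 1000) {
    s = db->Flush(rocksdb::FlushOptions(), cf);
    *tombstones_since_flush = 0;
  }
  return s;
}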
static const std::string kMinLogNumberToKeep; + // "rocksdb.min-obsolete-sst-number-to-keep" - return the minimum file + // number for an obsolete SST to be kept. The max value of `uint64_t` + // will be returned if all obsolete files can be deleted. + static const std::string kMinObsoleteSstNumberToKeep; + // "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST // files. // WARNING: may slow down online queries if there are too many files. @@ -670,6 +671,7 @@ class DB { // "rocksdb.current-super-version-number" // "rocksdb.estimate-live-data-size" // "rocksdb.min-log-number-to-keep" + // "rocksdb.min-obsolete-sst-number-to-keep" // "rocksdb.total-sst-files-size" // "rocksdb.live-sst-files-size" // "rocksdb.base-level" @@ -900,11 +902,22 @@ class DB { virtual DBOptions GetDBOptions() const = 0; // Flush all mem-table data. + // Flush a single column family, even when atomic flush is enabled. To flush + // multiple column families, use Flush(options, column_families). virtual Status Flush(const FlushOptions& options, ColumnFamilyHandle* column_family) = 0; virtual Status Flush(const FlushOptions& options) { return Flush(options, DefaultColumnFamily()); } + // Flushes multiple column families. + // If atomic flush is not enabled, Flush(options, column_families) is + // equivalent to calling Flush(options, column_family) multiple times. + // If atomic flush is enabled, Flush(options, column_families) will flush all + // column families specified in 'column_families' up to the latest sequence + // number at the time when flush is requested. + virtual Status Flush( + const FlushOptions& options, + const std::vector& column_families) = 0; // Flush the WAL memory buffer to the file. If sync is true, it calls SyncWAL // afterwards. @@ -979,9 +992,9 @@ class DB { // cleared aggressively and the iterator might keep getting invalid before // an update is read. virtual Status GetUpdatesSince( - SequenceNumber seq_number, unique_ptr* iter, - const TransactionLogIterator::ReadOptions& - read_options = TransactionLogIterator::ReadOptions()) = 0; + SequenceNumber seq_number, std::unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options = + TransactionLogIterator::ReadOptions()) = 0; // Windows API macro interference #undef DeleteFile diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 7558364614d..bc439ac1c4c 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -137,9 +137,8 @@ class Env { // // The returned file will only be accessed by one thread at a time. virtual Status NewSequentialFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) - = 0; + std::unique_ptr* result, + const EnvOptions& options) = 0; // Create a brand new random access read-only file with the // specified name. On success, stores a pointer to the new file in @@ -149,9 +148,8 @@ class Env { // // The returned file may be concurrently accessed by multiple threads. virtual Status NewRandomAccessFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) - = 0; + std::unique_ptr* result, + const EnvOptions& options) = 0; // These values match Linux definition // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56 enum WriteLifeTimeHint { @@ -171,7 +169,7 @@ class Env { // // The returned file will only be accessed by one thread at a time. 
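Editor's note: an illustrative sketch (not part of this diff) of the multi-column-family Flush() overload and the "rocksdb.min-obsolete-sst-number-to-keep" property documented above. `db`, `cf1`, and `cf2` are assumed to be an open DB and its column family handles; error handling is abbreviated.

#include <cstdint>
#include <vector>

#include "rocksdb/db.h"

void FlushBothAndInspect(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf1,
                         rocksdb::ColumnFamilyHandle* cf2) {
  // With DBOptions::atomic_flush = true (added later in this diff), both
  // column families are flushed atomically up to a single sequence number;
  // otherwise this behaves like two independent Flush() calls.
  std::vector<rocksdb::ColumnFamilyHandle*> cfs = {cf1, cf2};
  rocksdb::Status s = db->Flush(rocksdb::FlushOptions(), cfs);
  if (!s.ok()) {
    return;
  }
  // The new property reports the minimum file number an obsolete SST must
  // have in order to be kept; it is the max uint64_t value when every
  // obsolete file can be deleted.
  uint64_t min_obsolete_sst = 0;
  db->GetIntProperty(rocksdb::DB::Properties::kMinObsoleteSstNumberToKeep,
                     &min_obsolete_sst);
}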
virtual Status NewWritableFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) = 0; // Create an object that writes to a new file with the specified @@ -182,7 +180,7 @@ class Env { // // The returned file will only be accessed by one thread at a time. virtual Status ReopenWritableFile(const std::string& /*fname*/, - unique_ptr* /*result*/, + std::unique_ptr* /*result*/, const EnvOptions& /*options*/) { return Status::NotSupported(); } @@ -190,7 +188,7 @@ class Env { // Reuse an existing file by renaming it and opening it as writable. virtual Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options); // Open `fname` for random read and write, if file doesn't exist the file @@ -199,7 +197,7 @@ class Env { // // The returned file will only be accessed by one thread at a time. virtual Status NewRandomRWFile(const std::string& /*fname*/, - unique_ptr* /*result*/, + std::unique_ptr* /*result*/, const EnvOptions& /*options*/) { return Status::NotSupported("RandomRWFile is not implemented in this Env"); } @@ -209,7 +207,7 @@ class Env { // file in `*result`. The file must exist prior to this call. virtual Status NewMemoryMappedFileBuffer( const std::string& /*fname*/, - unique_ptr* /*result*/) { + std::unique_ptr* /*result*/) { return Status::NotSupported( "MemoryMappedFileBuffer is not implemented in this Env"); } @@ -222,7 +220,7 @@ class Env { // *result and returns OK. On failure stores nullptr in *result and // returns non-OK. virtual Status NewDirectory(const std::string& name, - unique_ptr* result) = 0; + std::unique_ptr* result) = 0; // Returns OK if the named file exists. // NotFound if the named file does not exist, @@ -370,7 +368,7 @@ class Env { // Create and return a log file for storing informational messages. virtual Status NewLogger(const std::string& fname, - shared_ptr* result) = 0; + std::shared_ptr* result) = 0; // Returns the number of micro-seconds since some fixed point in time. // It is often used as system time such as in GenericRateLimiter @@ -942,24 +940,32 @@ class FileLock { void operator=(const FileLock&); }; -extern void LogFlush(const shared_ptr& info_log); +extern void LogFlush(const std::shared_ptr& info_log); extern void Log(const InfoLogLevel log_level, - const shared_ptr& info_log, const char* format, ...); + const std::shared_ptr& info_log, const char* format, + ...); // a set of log functions with different log levels. -extern void Header(const shared_ptr& info_log, const char* format, ...); -extern void Debug(const shared_ptr& info_log, const char* format, ...); -extern void Info(const shared_ptr& info_log, const char* format, ...); -extern void Warn(const shared_ptr& info_log, const char* format, ...); -extern void Error(const shared_ptr& info_log, const char* format, ...); -extern void Fatal(const shared_ptr& info_log, const char* format, ...); +extern void Header(const std::shared_ptr& info_log, const char* format, + ...); +extern void Debug(const std::shared_ptr& info_log, const char* format, + ...); +extern void Info(const std::shared_ptr& info_log, const char* format, + ...); +extern void Warn(const std::shared_ptr& info_log, const char* format, + ...); +extern void Error(const std::shared_ptr& info_log, const char* format, + ...); +extern void Fatal(const std::shared_ptr& info_log, const char* format, + ...); // Log the specified data to *info_log if info_log is non-nullptr. 
// The default info log level is InfoLogLevel::INFO_LEVEL. -extern void Log(const shared_ptr& info_log, const char* format, ...) +extern void Log(const std::shared_ptr& info_log, const char* format, + ...) # if defined(__GNUC__) || defined(__clang__) - __attribute__((__format__ (__printf__, 2, 3))) + __attribute__((__format__(__printf__, 2, 3))) # endif ; @@ -1005,37 +1011,38 @@ class EnvWrapper : public Env { Env* target() const { return target_; } // The following text is boilerplate that forwards all methods to target() - Status NewSequentialFile(const std::string& f, unique_ptr* r, + Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, const EnvOptions& options) override { return target_->NewSequentialFile(f, r, options); } Status NewRandomAccessFile(const std::string& f, - unique_ptr* r, + std::unique_ptr* r, const EnvOptions& options) override { return target_->NewRandomAccessFile(f, r, options); } - Status NewWritableFile(const std::string& f, unique_ptr* r, + Status NewWritableFile(const std::string& f, std::unique_ptr* r, const EnvOptions& options) override { return target_->NewWritableFile(f, r, options); } Status ReopenWritableFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { return target_->ReopenWritableFile(fname, result, options); } Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, - unique_ptr* r, + std::unique_ptr* r, const EnvOptions& options) override { return target_->ReuseWritableFile(fname, old_fname, r, options); } Status NewRandomRWFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { return target_->NewRandomRWFile(fname, result, options); } Status NewDirectory(const std::string& name, - unique_ptr* result) override { + std::unique_ptr* result) override { return target_->NewDirectory(name, result); } Status FileExists(const std::string& f) override { @@ -1113,7 +1120,7 @@ class EnvWrapper : public Env { return target_->GetTestDirectory(path); } Status NewLogger(const std::string& fname, - shared_ptr* result) override { + std::shared_ptr* result) override { return target_->NewLogger(fname, result); } uint64_t NowMicros() override { return target_->NowMicros(); } @@ -1224,36 +1231,57 @@ class WritableFileWrapper : public WritableFile { Status Sync() override { return target_->Sync(); } Status Fsync() override { return target_->Fsync(); } bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + void SetIOPriority(Env::IOPriority pri) override { target_->SetIOPriority(pri); } + Env::IOPriority GetIOPriority() override { return target_->GetIOPriority(); } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + uint64_t GetFileSize() override { return target_->GetFileSize(); } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + void GetPreallocationStatus(size_t* block_size, size_t* last_allocated_block) override { target_->GetPreallocationStatus(block_size, last_allocated_block); } + size_t GetUniqueId(char* id, size_t max_size) const override { return 
target_->GetUniqueId(id, max_size); } + Status InvalidateCache(size_t offset, size_t length) override { return target_->InvalidateCache(offset, length); } - void SetPreallocationBlockSize(size_t size) override { - target_->SetPreallocationBlockSize(size); + Status RangeSync(uint64_t offset, uint64_t nbytes) override { + return target_->RangeSync(offset, nbytes); } + void PrepareWrite(size_t offset, size_t len) override { target_->PrepareWrite(offset, len); } - protected: Status Allocate(uint64_t offset, uint64_t len) override { return target_->Allocate(offset, len); } - Status RangeSync(uint64_t offset, uint64_t nbytes) override { - return target_->RangeSync(offset, nbytes); - } private: WritableFile* target_; diff --git a/include/rocksdb/env_encryption.h b/include/rocksdb/env_encryption.h index 70dce616a62..a6e91954656 100644 --- a/include/rocksdb/env_encryption.h +++ b/include/rocksdb/env_encryption.h @@ -142,8 +142,9 @@ class EncryptionProvider { // CreateCipherStream creates a block access cipher stream for a file given // given name and options. - virtual Status CreateCipherStream(const std::string& fname, const EnvOptions& options, - Slice& prefix, unique_ptr* result) = 0; + virtual Status CreateCipherStream( + const std::string& fname, const EnvOptions& options, Slice& prefix, + std::unique_ptr* result) = 0; }; // This encryption provider uses a CTR cipher stream, with a given block cipher @@ -174,10 +175,11 @@ class CTREncryptionProvider : public EncryptionProvider { // CreateCipherStream creates a block access cipher stream for a file given // given name and options. - virtual Status CreateCipherStream(const std::string& fname, const EnvOptions& options, - Slice& prefix, unique_ptr* result) override; + virtual Status CreateCipherStream( + const std::string& fname, const EnvOptions& options, Slice& prefix, + std::unique_ptr* result) override; - protected: + protected: // PopulateSecretPrefixPart initializes the data into a new prefix block // that will be encrypted. This function will store the data in plain text. // It will be encrypted later (before written to disk). @@ -187,8 +189,10 @@ class CTREncryptionProvider : public EncryptionProvider { // CreateCipherStreamFromPrefix creates a block access cipher stream for a file given // given name and options. The given prefix is already decrypted. - virtual Status CreateCipherStreamFromPrefix(const std::string& fname, const EnvOptions& options, - uint64_t initialCounter, const Slice& iv, const Slice& prefix, unique_ptr* result); + virtual Status CreateCipherStreamFromPrefix( + const std::string& fname, const EnvOptions& options, + uint64_t initialCounter, const Slice& iv, const Slice& prefix, + std::unique_ptr* result); }; } // namespace rocksdb diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index 4e1dc3bfc93..9c0904456f4 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -145,6 +145,6 @@ class FilterPolicy { // ignores trailing spaces, it would be incorrect to use a // FilterPolicy (like NewBloomFilterPolicy) that does not ignore // trailing spaces in keys. 
-extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key, - bool use_block_based_builder = true); +extern const FilterPolicy* NewBloomFilterPolicy( + int bits_per_key, bool use_block_based_builder = false); } diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index 46ce712dc5b..9b4e8a86664 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include #include @@ -143,6 +144,21 @@ struct TableFileDeletionInfo { Status status; }; +struct FileOperationInfo { + using TimePoint = std::chrono::time_point; + + const std::string& path; + uint64_t offset; + size_t length; + const TimePoint& start_timestamp; + const TimePoint& finish_timestamp; + Status status; + FileOperationInfo(const std::string& _path, const TimePoint& start, + const TimePoint& finish) + : path(_path), start_timestamp(start), finish_timestamp(finish) {} +}; + struct FlushJobInfo { // the name of the column family std::string cf_name; @@ -177,6 +193,8 @@ struct CompactionJobInfo { explicit CompactionJobInfo(const CompactionJobStats& _stats) : stats(_stats) {} + // the id of the column family where the compaction happened. + uint32_t cf_id; // the name of the column family where the compaction happened. std::string cf_name; // the status indicating whether the compaction was successful or not. @@ -297,6 +315,16 @@ class EventListener { // returned value. virtual void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) {} + // A callback function to RocksDB which will be called before + // RocksDB starts a compaction. The default implementation is + // a no-op. + // + // Note that this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + virtual void OnCompactionBegin(DB* /*db*/, + const CompactionJobInfo& /*ci*/) {} + // A callback function for RocksDB which will be called whenever // a registered RocksDB compacts a file. The default implementation // is a no-op. @@ -395,6 +423,18 @@ class EventListener { // returns. Otherwise, RocksDB may be blocked. virtual void OnStallConditionsChanged(const WriteStallInfo& /*info*/) {} + // A callback function for RocksDB which will be called whenever a file read + // operation finishes. + virtual void OnFileReadFinish(const FileOperationInfo& /* info */) {} + + // A callback function for RocksDB which will be called whenever a file write + // operation finishes. + virtual void OnFileWriteFinish(const FileOperationInfo& /* info */) {} + + // If true, OnFileReadFinish() and OnFileWriteFinish() will be called. If + // false, then they won't be called. + virtual bool ShouldBeNotifiedOnFileIO() { return false; } + // A callback function for RocksDB which will be called just before // starting the automatic recovery process for recoverable background // errors, such as NoSpace(). The callback can suppress the automatic diff --git a/include/rocksdb/memory_allocator.h b/include/rocksdb/memory_allocator.h new file mode 100644 index 00000000000..889c0e92182 --- /dev/null +++ b/include/rocksdb/memory_allocator.h @@ -0,0 +1,77 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include "rocksdb/status.h" + +#include + +namespace rocksdb { + +// MemoryAllocator is an interface that a client can implement to supply custom +// memory allocation and deallocation methods. See rocksdb/cache.h for more +// information. +// All methods should be thread-safe. +class MemoryAllocator { + public: + virtual ~MemoryAllocator() = default; + + // Name of the cache allocator, printed in the log + virtual const char* Name() const = 0; + + // Allocate a block of at least size. Has to be thread-safe. + virtual void* Allocate(size_t size) = 0; + + // Deallocate previously allocated block. Has to be thread-safe. + virtual void Deallocate(void* p) = 0; + + // Returns the memory size of the block allocated at p. The default + // implementation, which just returns the original allocation_size, is fine. + virtual size_t UsableSize(void* /*p*/, size_t allocation_size) const { + // default implementation just returns the allocation size + return allocation_size; + } +}; + +struct JemallocAllocatorOptions { + // Jemalloc tcache caches allocations by size class. For each size class, + // it caches between 20 (for large size classes) and 200 (for small size + // classes) allocations. To reduce tcache memory usage in case the allocator + // is accessed by a large number of threads, we can control whether to cache + // an allocation by its size. + bool limit_tcache_size = false; + + // Lower bound of allocation size to use tcache, if limit_tcache_size=true. + // When used with block cache, it is recommended to set it to block_size/4. + size_t tcache_size_lower_bound = 1024; + + // Upper bound of allocation size to use tcache, if limit_tcache_size=true. + // When used with block cache, it is recommended to set it to block_size. + size_t tcache_size_upper_bound = 16 * 1024; +}; + +// Generates a memory allocator that allocates through Jemalloc and uses +// madvise() with MADV_DONTDUMP to exclude cache items from core dumps. +// Applications can use the allocator with block cache to exclude block cache +// usage from core dumps. +// +// Implementation details: +// The JemallocNodumpAllocator creates a dedicated jemalloc arena, and all +// allocations of the JemallocNodumpAllocator go through that arena. +// The memory allocator hooks the arena's memory allocation and calls +// madvise() with the MADV_DONTDUMP flag to exclude the allocated memory from +// core dumps. A side benefit of using a single arena is reduced jemalloc +// metadata for some workloads. + +// To mitigate mutex contention from using one single arena, jemalloc tcache +// (thread-local cache) is enabled to cache unused allocations for future use. +// The tcache normally incurs about 0.5MB of extra memory usage per thread. +// This usage can be reduced by limiting which allocation sizes are cached. 
+extern Status NewJemallocNodumpAllocator( + JemallocAllocatorOptions& options, + std::shared_ptr* memory_allocator); + +} // namespace rocksdb diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h index a9773bf40c4..e62d4f40982 100644 --- a/include/rocksdb/metadata.h +++ b/include/rocksdb/metadata.h @@ -63,7 +63,10 @@ struct SstFileMetaData { smallestkey(""), largestkey(""), num_reads_sampled(0), - being_compacted(false) {} + being_compacted(false), + num_entries(0), + num_deletions(0) {} + SstFileMetaData(const std::string& _file_name, const std::string& _path, size_t _size, SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno, @@ -78,7 +81,9 @@ struct SstFileMetaData { smallestkey(_smallestkey), largestkey(_largestkey), num_reads_sampled(_num_reads_sampled), - being_compacted(_being_compacted) {} + being_compacted(_being_compacted), + num_entries(0), + num_deletions(0) {} // File size in bytes. size_t size; @@ -93,11 +98,15 @@ struct SstFileMetaData { std::string largestkey; // Largest user defined key in the file. uint64_t num_reads_sampled; // How many times the file is read. bool being_compacted; // true if the file is currently being compacted. + + uint64_t num_entries; + uint64_t num_deletions; }; // The full set of metadata associated with each SST file. struct LiveFileMetaData : SstFileMetaData { std::string column_family_name; // Name of the column family int level; // Level at which this file resides. + LiveFileMetaData() : column_family_name(), level(0) {} }; } // namespace rocksdb diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 0ed3ad91049..3ace2db2bad 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -188,8 +188,7 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API size_t write_buffer_size = 64 << 20; - // Compress blocks using the specified compression algorithm. This - // parameter can be changed dynamically. + // Compress blocks using the specified compression algorithm. // // Default: kSnappyCompression, if it's supported. If snappy is not linked // with the library, the default is kNoCompression. @@ -212,6 +211,8 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // - kZlibCompression: Z_DEFAULT_COMPRESSION (currently -1) // - kLZ4HCCompression: 0 // - For all others, we do not specify a compression level + // + // Dynamically changeable through SetOptions() API CompressionType compression; // Compression algorithm that will be used for the bottommost level that @@ -418,7 +419,10 @@ struct DBOptions { // files opened are always kept open. You can estimate number of files based // on target_file_size_base and target_file_size_multiplier for level-based // compaction. For universal-style compaction, you can usually set it to -1. + // // Default: -1 + // + // Dynamically changeable through SetDBOptions() API. int max_open_files = -1; // If max_open_files is -1, DB will open all files on DB::Open(). You can @@ -433,7 +437,10 @@ struct DBOptions { // [sum of all write_buffer_size * max_write_buffer_number] * 4 // This option takes effect only when there are more than one column family as // otherwise the wal size is dictated by the write_buffer_size. + // // Default: 0 + // + // Dynamically changeable through SetDBOptions() API. uint64_t max_total_wal_size = 0; // If non-null, then we should collect metrics about database operations @@ -494,13 +501,23 @@ struct DBOptions { // value is 6 hours. 
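Editor's note: an illustrative sketch (not part of this diff) that wires the jemalloc no-dump allocator declared above into the block cache through LRUCacheOptions (see cache.h earlier in this diff). The helper name, capacity, and tcache bounds are example values, and whether NewJemallocNodumpAllocator succeeds depends on how RocksDB was built.

#include <memory>

#include "rocksdb/cache.h"
#include "rocksdb/memory_allocator.h"

std::shared_ptr<rocksdb::Cache> MakeNodumpBlockCache() {
  rocksdb::JemallocAllocatorOptions jopts;
  jopts.limit_tcache_size = true;
  jopts.tcache_size_lower_bound = 1024;      // roughly block_size / 4
  jopts.tcache_size_upper_bound = 4 * 1024;  // roughly block_size

  std::shared_ptr<rocksdb::MemoryAllocator> allocator;
  rocksdb::Status s = rocksdb::NewJemallocNodumpAllocator(jopts, &allocator);

  rocksdb::LRUCacheOptions copts;
  copts.capacity = static_cast<size_t>(1) << 30;  // 1GB, example value
  if (s.ok()) {
    // Cache blocks now come from the dedicated jemalloc arena and are
    // excluded from core dumps via MADV_DONTDUMP.
    copts.memory_allocator = allocator;
  }
  return rocksdb::NewLRUCache(copts);
}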
The files that get out of scope by compaction // process will still get automatically delete on every compaction, // regardless of this setting + // + // Default: 6 hours + // + // Dynamically changeable through SetDBOptions() API. uint64_t delete_obsolete_files_period_micros = 6ULL * 60 * 60 * 1000000; // Maximum number of concurrent background jobs (compactions and flushes). + // + // Default: 2 + // + // Dynamically changeable through SetDBOptions() API. int max_background_jobs = 2; // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the // value of max_background_jobs. This option is ignored. + // + // Dynamically changeable through SetDBOptions() API. int base_background_compactions = -1; // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the @@ -515,7 +532,10 @@ struct DBOptions { // If you're increasing this, also consider increasing number of threads in // LOW priority thread pool. For more information, see // Env::SetBackgroundThreads + // // Default: -1 + // + // Dynamically changeable through SetDBOptions() API. int max_background_compactions = -1; // This value represents the maximum number of threads that will @@ -644,7 +664,10 @@ struct DBOptions { bool skip_log_error_on_recovery = false; // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec + // // Default: 600 (10 min) + // + // Dynamically changeable through SetDBOptions() API. unsigned int stats_dump_period_sec = 600; // If set true, will hint the underlying file system that the file @@ -711,6 +734,8 @@ struct DBOptions { // true. // // Default: 0 + // + // Dynamically changeable through SetDBOptions() API. size_t compaction_readahead_size = 0; // This is a maximum buffer size that is used by WinMmapReadableFile in @@ -737,6 +762,8 @@ struct DBOptions { // write requests if the logical sector size is unusual // // Default: 1024 * 1024 (1 MB) + // + // Dynamically changeable through SetDBOptions() API. size_t writable_file_max_buffer_size = 1024 * 1024; @@ -759,17 +786,23 @@ struct DBOptions { // to smooth out write I/Os over time. Users shouldn't rely on it for // persistency guarantee. // Issue one request for every bytes_per_sync written. 0 turns it off. - // Default: 0 // // You may consider using rate_limiter to regulate write rate to device. // When rate limiter is enabled, it automatically enables bytes_per_sync // to 1MB. // // This option applies to table files + // + // Default: 0, turned off + // + // Dynamically changeable through SetDBOptions() API. uint64_t bytes_per_sync = 0; // Same as bytes_per_sync, but applies to WAL files + // // Default: 0, turned off + // + // Dynamically changeable through SetDBOptions() API. uint64_t wal_bytes_per_sync = 0; // A vector of EventListeners which callback functions will be called @@ -796,6 +829,8 @@ struct DBOptions { // Unit: byte per second. // // Default: 0 + // + // Dynamically changeable through SetDBOptions() API. uint64_t delayed_write_rate = 0; // By default, a single write thread queue is maintained. The thread gets @@ -945,6 +980,20 @@ struct DBOptions { // relies on manual invocation of FlushWAL to write the WAL buffer to its // file. bool manual_wal_flush = false; + + // If true, RocksDB supports flushing multiple column families and committing + // their results atomically to MANIFEST. Note that it is not + // necessary to set atomic_flush to true if WAL is always enabled since WAL + // allows the database to be restored to the last persistent state in WAL. 
+ // This option is useful when there are column families with writes NOT + // protected by WAL. + // For manual flush, the application has to specify which column families to + // flush atomically in DB::Flush. + // For auto-triggered flush, RocksDB atomically flushes ALL column families. + // + // Currently, any WAL-enabled writes after atomic flush may be replayed + // independently if the process crashes later and tries to recover. + bool atomic_flush = false; }; // Options to control the behavior of a database (passed to DB::Open) @@ -1290,6 +1339,11 @@ struct IngestExternalFileOptions { bool write_global_seqno = true; }; -struct TraceOptions {}; +// TraceOptions is used for StartTrace +struct TraceOptions { + // To prevent the trace file from growing larger than the available storage + // space, the user can set the max trace file size in bytes. Default is 64GB + uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024; +}; } // namespace rocksdb diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index d3771d3f082..3f125c21364 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -5,6 +5,7 @@ #pragma once +#include #include #include @@ -16,12 +17,44 @@ namespace rocksdb { // and transparently. // Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats. +// Break down performance counters by level and store per-level perf context in +// PerfContextByLevel +struct PerfContextByLevel { + // # of times bloom filter has avoided file reads, i.e., negatives. + uint64_t bloom_filter_useful = 0; + // # of times bloom FullFilter has not avoided the reads. + uint64_t bloom_filter_full_positive = 0; + // # of times bloom FullFilter has not avoided the reads and data actually + // exists. + uint64_t bloom_filter_full_true_positive = 0; + + // total number of user keys returned (only includes keys that are found; + // does not include keys that are deleted or merged without a final put) + uint64_t user_key_return_count; + + // total nanos spent on reading data from SST files + uint64_t get_from_table_nanos; + + void Reset(); // reset all performance counters to zero +}; + struct PerfContext { + ~PerfContext(); + void Reset(); // reset all performance counters to zero std::string ToString(bool exclude_zero_counters = false) const; + // enable per-level perf context and allocate storage for PerfContextByLevel + void EnablePerLevelPerfContext(); + + // temporarily disable per-level perf context by setting the flag to false + void DisablePerLevelPerfContext(); + + // free the space for PerfContextByLevel, also disable per-level perf context + void ClearPerLevelPerfContext(); + uint64_t user_key_comparison_count; // total number of user key comparisons uint64_t block_cache_hit_count; // total number of block cache hits uint64_t block_read_count; // total number of block reads (with IO) @@ -168,6 +201,8 @@ struct PerfContext { uint64_t env_lock_file_nanos; uint64_t env_unlock_file_nanos; uint64_t env_new_logger_nanos; + std::map* level_to_perf_context = nullptr; + bool per_level_perf_context_enabled = false; }; // Get Thread-local PerfContext object pointer diff --git a/include/rocksdb/sst_file_reader.h b/include/rocksdb/sst_file_reader.h new file mode 100644 index 00000000000..e58c84792e6 --- /dev/null +++ b/include/rocksdb/sst_file_reader.h @@ -0,0 +1,45 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include "rocksdb/slice.h" +#include "rocksdb/options.h" +#include "rocksdb/iterator.h" +#include "rocksdb/table_properties.h" + +namespace rocksdb { + +// SstFileReader is used to read sst files that are generated by DB or +// SstFileWriter. +class SstFileReader { + public: + SstFileReader(const Options& options); + + ~SstFileReader(); + + // Prepares to read from the file located at "file_path". + Status Open(const std::string& file_path); + + // Returns a new iterator over the table contents. + // Most read options provide the same control as we read from DB. + // If "snapshot" is nullptr, the iterator returns only the latest keys. + Iterator* NewIterator(const ReadOptions& options); + + std::shared_ptr GetTableProperties() const; + + // Verifies whether there is corruption in this table. + Status VerifyChecksum(); + + private: + struct Rep; + std::unique_ptr rep_; +}; + +} // namespace rocksdb + +#endif // !ROCKSDB_LITE diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index c493a18240c..14e6195faea 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -155,7 +155,8 @@ enum Tickers : uint32_t { // Disabled by default. To enable it set stats level to kAll DB_MUTEX_WAIT_MICROS, RATE_LIMIT_DELAY_MILLIS, - NO_ITERATORS, // number of iterators currently open + // DEPRECATED number of iterators currently open + NO_ITERATORS, // Number of MultiGet calls, keys read, and bytes read NUMBER_MULTIGET_CALLS, @@ -322,159 +323,15 @@ enum Tickers : uint32_t { // Number of keys actually found in MultiGet calls (vs number requested by caller) // NUMBER_MULTIGET_KEYS_READ gives the number requested by caller NUMBER_MULTIGET_KEYS_FOUND, + + NO_ITERATOR_CREATED, // number of iterators created + NO_ITERATOR_DELETED, // number of iterators deleted TICKER_ENUM_MAX }; // The order of items listed in Tickers should be the same as // the order listed in TickersNameMap -const std::vector> TickersNameMap = { - {BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"}, - {BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"}, - {BLOCK_CACHE_ADD, "rocksdb.block.cache.add"}, - {BLOCK_CACHE_ADD_FAILURES, "rocksdb.block.cache.add.failures"}, - {BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"}, - {BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"}, - {BLOCK_CACHE_INDEX_ADD, "rocksdb.block.cache.index.add"}, - {BLOCK_CACHE_INDEX_BYTES_INSERT, "rocksdb.block.cache.index.bytes.insert"}, - {BLOCK_CACHE_INDEX_BYTES_EVICT, "rocksdb.block.cache.index.bytes.evict"}, - {BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"}, - {BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"}, - {BLOCK_CACHE_FILTER_ADD, "rocksdb.block.cache.filter.add"}, - {BLOCK_CACHE_FILTER_BYTES_INSERT, - "rocksdb.block.cache.filter.bytes.insert"}, - {BLOCK_CACHE_FILTER_BYTES_EVICT, "rocksdb.block.cache.filter.bytes.evict"}, - {BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"}, - {BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"}, - {BLOCK_CACHE_DATA_ADD, "rocksdb.block.cache.data.add"}, - {BLOCK_CACHE_DATA_BYTES_INSERT, "rocksdb.block.cache.data.bytes.insert"}, - {BLOCK_CACHE_BYTES_READ, "rocksdb.block.cache.bytes.read"}, - {BLOCK_CACHE_BYTES_WRITE, "rocksdb.block.cache.bytes.write"}, - {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"}, - 
{BLOOM_FILTER_FULL_POSITIVE, "rocksdb.bloom.filter.full.positive"}, - {BLOOM_FILTER_FULL_TRUE_POSITIVE, - "rocksdb.bloom.filter.full.true.positive"}, - {PERSISTENT_CACHE_HIT, "rocksdb.persistent.cache.hit"}, - {PERSISTENT_CACHE_MISS, "rocksdb.persistent.cache.miss"}, - {SIM_BLOCK_CACHE_HIT, "rocksdb.sim.block.cache.hit"}, - {SIM_BLOCK_CACHE_MISS, "rocksdb.sim.block.cache.miss"}, - {MEMTABLE_HIT, "rocksdb.memtable.hit"}, - {MEMTABLE_MISS, "rocksdb.memtable.miss"}, - {GET_HIT_L0, "rocksdb.l0.hit"}, - {GET_HIT_L1, "rocksdb.l1.hit"}, - {GET_HIT_L2_AND_UP, "rocksdb.l2andup.hit"}, - {COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"}, - {COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"}, - {COMPACTION_KEY_DROP_RANGE_DEL, "rocksdb.compaction.key.drop.range_del"}, - {COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"}, - {COMPACTION_RANGE_DEL_DROP_OBSOLETE, - "rocksdb.compaction.range_del.drop.obsolete"}, - {COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, - "rocksdb.compaction.optimized.del.drop.obsolete"}, - {COMPACTION_CANCELLED, "rocksdb.compaction.cancelled"}, - {NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"}, - {NUMBER_KEYS_READ, "rocksdb.number.keys.read"}, - {NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"}, - {BYTES_WRITTEN, "rocksdb.bytes.written"}, - {BYTES_READ, "rocksdb.bytes.read"}, - {NUMBER_DB_SEEK, "rocksdb.number.db.seek"}, - {NUMBER_DB_NEXT, "rocksdb.number.db.next"}, - {NUMBER_DB_PREV, "rocksdb.number.db.prev"}, - {NUMBER_DB_SEEK_FOUND, "rocksdb.number.db.seek.found"}, - {NUMBER_DB_NEXT_FOUND, "rocksdb.number.db.next.found"}, - {NUMBER_DB_PREV_FOUND, "rocksdb.number.db.prev.found"}, - {ITER_BYTES_READ, "rocksdb.db.iter.bytes.read"}, - {NO_FILE_CLOSES, "rocksdb.no.file.closes"}, - {NO_FILE_OPENS, "rocksdb.no.file.opens"}, - {NO_FILE_ERRORS, "rocksdb.no.file.errors"}, - {STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"}, - {STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"}, - {STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"}, - {STALL_MICROS, "rocksdb.stall.micros"}, - {DB_MUTEX_WAIT_MICROS, "rocksdb.db.mutex.wait.micros"}, - {RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"}, - {NO_ITERATORS, "rocksdb.num.iterators"}, - {NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"}, - {NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"}, - {NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"}, - {NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered"}, - {NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"}, - {BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"}, - {BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"}, - {NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"}, - {GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"}, - {BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"}, - {BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit"}, - {BLOCK_CACHE_COMPRESSED_ADD, "rocksdb.block.cachecompressed.add"}, - {BLOCK_CACHE_COMPRESSED_ADD_FAILURES, - "rocksdb.block.cachecompressed.add.failures"}, - {WAL_FILE_SYNCED, "rocksdb.wal.synced"}, - {WAL_FILE_BYTES, "rocksdb.wal.bytes"}, - {WRITE_DONE_BY_SELF, "rocksdb.write.self"}, - {WRITE_DONE_BY_OTHER, "rocksdb.write.other"}, - {WRITE_TIMEDOUT, "rocksdb.write.timeout"}, - {WRITE_WITH_WAL, "rocksdb.write.wal"}, - {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"}, - {COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"}, - 
{FLUSH_WRITE_BYTES, "rocksdb.flush.write.bytes"}, - {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, - "rocksdb.number.direct.load.table.properties"}, - {NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"}, - {NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"}, - {NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"}, - {NUMBER_BLOCK_COMPRESSED, "rocksdb.number.block.compressed"}, - {NUMBER_BLOCK_DECOMPRESSED, "rocksdb.number.block.decompressed"}, - {NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"}, - {MERGE_OPERATION_TOTAL_TIME, "rocksdb.merge.operation.time.nanos"}, - {FILTER_OPERATION_TOTAL_TIME, "rocksdb.filter.operation.time.nanos"}, - {ROW_CACHE_HIT, "rocksdb.row.cache.hit"}, - {ROW_CACHE_MISS, "rocksdb.row.cache.miss"}, - {READ_AMP_ESTIMATE_USEFUL_BYTES, "rocksdb.read.amp.estimate.useful.bytes"}, - {READ_AMP_TOTAL_READ_BYTES, "rocksdb.read.amp.total.read.bytes"}, - {NUMBER_RATE_LIMITER_DRAINS, "rocksdb.number.rate_limiter.drains"}, - {NUMBER_ITER_SKIP, "rocksdb.number.iter.skip"}, - {BLOB_DB_NUM_PUT, "rocksdb.blobdb.num.put"}, - {BLOB_DB_NUM_WRITE, "rocksdb.blobdb.num.write"}, - {BLOB_DB_NUM_GET, "rocksdb.blobdb.num.get"}, - {BLOB_DB_NUM_MULTIGET, "rocksdb.blobdb.num.multiget"}, - {BLOB_DB_NUM_SEEK, "rocksdb.blobdb.num.seek"}, - {BLOB_DB_NUM_NEXT, "rocksdb.blobdb.num.next"}, - {BLOB_DB_NUM_PREV, "rocksdb.blobdb.num.prev"}, - {BLOB_DB_NUM_KEYS_WRITTEN, "rocksdb.blobdb.num.keys.written"}, - {BLOB_DB_NUM_KEYS_READ, "rocksdb.blobdb.num.keys.read"}, - {BLOB_DB_BYTES_WRITTEN, "rocksdb.blobdb.bytes.written"}, - {BLOB_DB_BYTES_READ, "rocksdb.blobdb.bytes.read"}, - {BLOB_DB_WRITE_INLINED, "rocksdb.blobdb.write.inlined"}, - {BLOB_DB_WRITE_INLINED_TTL, "rocksdb.blobdb.write.inlined.ttl"}, - {BLOB_DB_WRITE_BLOB, "rocksdb.blobdb.write.blob"}, - {BLOB_DB_WRITE_BLOB_TTL, "rocksdb.blobdb.write.blob.ttl"}, - {BLOB_DB_BLOB_FILE_BYTES_WRITTEN, "rocksdb.blobdb.blob.file.bytes.written"}, - {BLOB_DB_BLOB_FILE_BYTES_READ, "rocksdb.blobdb.blob.file.bytes.read"}, - {BLOB_DB_BLOB_FILE_SYNCED, "rocksdb.blobdb.blob.file.synced"}, - {BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, - "rocksdb.blobdb.blob.index.expired.count"}, - {BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, "rocksdb.blobdb.blob.index.expired.size"}, - {BLOB_DB_BLOB_INDEX_EVICTED_COUNT, - "rocksdb.blobdb.blob.index.evicted.count"}, - {BLOB_DB_BLOB_INDEX_EVICTED_SIZE, "rocksdb.blobdb.blob.index.evicted.size"}, - {BLOB_DB_GC_NUM_FILES, "rocksdb.blobdb.gc.num.files"}, - {BLOB_DB_GC_NUM_NEW_FILES, "rocksdb.blobdb.gc.num.new.files"}, - {BLOB_DB_GC_FAILURES, "rocksdb.blobdb.gc.failures"}, - {BLOB_DB_GC_NUM_KEYS_OVERWRITTEN, "rocksdb.blobdb.gc.num.keys.overwritten"}, - {BLOB_DB_GC_NUM_KEYS_EXPIRED, "rocksdb.blobdb.gc.num.keys.expired"}, - {BLOB_DB_GC_NUM_KEYS_RELOCATED, "rocksdb.blobdb.gc.num.keys.relocated"}, - {BLOB_DB_GC_BYTES_OVERWRITTEN, "rocksdb.blobdb.gc.bytes.overwritten"}, - {BLOB_DB_GC_BYTES_EXPIRED, "rocksdb.blobdb.gc.bytes.expired"}, - {BLOB_DB_GC_BYTES_RELOCATED, "rocksdb.blobdb.gc.bytes.relocated"}, - {BLOB_DB_FIFO_NUM_FILES_EVICTED, "rocksdb.blobdb.fifo.num.files.evicted"}, - {BLOB_DB_FIFO_NUM_KEYS_EVICTED, "rocksdb.blobdb.fifo.num.keys.evicted"}, - {BLOB_DB_FIFO_BYTES_EVICTED, "rocksdb.blobdb.fifo.bytes.evicted"}, - {TXN_PREPARE_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.prepare"}, - {TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD, - "rocksdb.txn.overhead.mutex.old.commit.map"}, - {TXN_DUPLICATE_KEY_OVERHEAD, "rocksdb.txn.overhead.duplicate.key"}, - {TXN_SNAPSHOT_MUTEX_OVERHEAD, 
"rocksdb.txn.overhead.mutex.snapshot"}, - {NUMBER_MULTIGET_KEYS_FOUND, "rocksdb.number.multiget.keys.found"}, -}; +extern const std::vector> TickersNameMap; /** * Keep adding histogram's here. @@ -557,57 +414,10 @@ enum Histograms : uint32_t { // Time spent flushing memtable to disk FLUSH_TIME, - HISTOGRAM_ENUM_MAX, // TODO(ldemailly): enforce HistogramsNameMap match + HISTOGRAM_ENUM_MAX, }; -const std::vector> HistogramsNameMap = { - {DB_GET, "rocksdb.db.get.micros"}, - {DB_WRITE, "rocksdb.db.write.micros"}, - {COMPACTION_TIME, "rocksdb.compaction.times.micros"}, - {SUBCOMPACTION_SETUP_TIME, "rocksdb.subcompaction.setup.times.micros"}, - {TABLE_SYNC_MICROS, "rocksdb.table.sync.micros"}, - {COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros"}, - {WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros"}, - {MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros"}, - {TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros"}, - {DB_MULTIGET, "rocksdb.db.multiget.micros"}, - {READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros"}, - {READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros"}, - {WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros"}, - {STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"}, - {STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"}, - {STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"}, - {HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"}, - {SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"}, - {NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction"}, - {DB_SEEK, "rocksdb.db.seek.micros"}, - {WRITE_STALL, "rocksdb.db.write.stall"}, - {SST_READ_MICROS, "rocksdb.sst.read.micros"}, - {NUM_SUBCOMPACTIONS_SCHEDULED, "rocksdb.num.subcompactions.scheduled"}, - {BYTES_PER_READ, "rocksdb.bytes.per.read"}, - {BYTES_PER_WRITE, "rocksdb.bytes.per.write"}, - {BYTES_PER_MULTIGET, "rocksdb.bytes.per.multiget"}, - {BYTES_COMPRESSED, "rocksdb.bytes.compressed"}, - {BYTES_DECOMPRESSED, "rocksdb.bytes.decompressed"}, - {COMPRESSION_TIMES_NANOS, "rocksdb.compression.times.nanos"}, - {DECOMPRESSION_TIMES_NANOS, "rocksdb.decompression.times.nanos"}, - {READ_NUM_MERGE_OPERANDS, "rocksdb.read.num.merge_operands"}, - {BLOB_DB_KEY_SIZE, "rocksdb.blobdb.key.size"}, - {BLOB_DB_VALUE_SIZE, "rocksdb.blobdb.value.size"}, - {BLOB_DB_WRITE_MICROS, "rocksdb.blobdb.write.micros"}, - {BLOB_DB_GET_MICROS, "rocksdb.blobdb.get.micros"}, - {BLOB_DB_MULTIGET_MICROS, "rocksdb.blobdb.multiget.micros"}, - {BLOB_DB_SEEK_MICROS, "rocksdb.blobdb.seek.micros"}, - {BLOB_DB_NEXT_MICROS, "rocksdb.blobdb.next.micros"}, - {BLOB_DB_PREV_MICROS, "rocksdb.blobdb.prev.micros"}, - {BLOB_DB_BLOB_FILE_WRITE_MICROS, "rocksdb.blobdb.blob.file.write.micros"}, - {BLOB_DB_BLOB_FILE_READ_MICROS, "rocksdb.blobdb.blob.file.read.micros"}, - {BLOB_DB_BLOB_FILE_SYNC_MICROS, "rocksdb.blobdb.blob.file.sync.micros"}, - {BLOB_DB_GC_MICROS, "rocksdb.blobdb.gc.micros"}, - {BLOB_DB_COMPRESSION_MICROS, "rocksdb.blobdb.compression.micros"}, - {BLOB_DB_DECOMPRESSION_MICROS, "rocksdb.blobdb.decompression.micros"}, - {FLUSH_TIME, "rocksdb.db.flush.micros"}, -}; +extern const std::vector> HistogramsNameMap; struct HistogramData { double median; diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index a177d1c7ae1..a99c8bf6e72 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -47,6 +47,7 @@ enum ChecksumType : char { kNoChecksum = 0x0, kCRC32c = 0x1, kxxHash = 0x2, + kxxHash64 = 0x3, }; // For advanced 
user only @@ -137,6 +138,8 @@ struct BlockBasedTableOptions { // If non-NULL use the specified cache for compressed blocks. // If NULL, rocksdb will not use a compressed block cache. + // Note: though it looks similar to `block_cache`, RocksDB doesn't put the + // same type of object there. std::shared_ptr block_cache_compressed = nullptr; // Approximate size of user data packed per block. Note that the @@ -449,7 +452,7 @@ class TableFactory { // NewTableReader() is called in three places: // (1) TableCache::FindTable() calls the function when table cache miss // and cache the table object returned. - // (2) SstFileReader (for SST Dump) opens the table and dump the table + // (2) SstFileDumper (for SST Dump) opens the table and dump the table // contents using the iterator of the table. // (3) DBImpl::IngestExternalFile() calls this function to read the contents of // the sst file it's attempting to add @@ -461,8 +464,8 @@ class TableFactory { // table_reader is the output table reader. virtual Status NewTableReader( const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, bool prefetch_index_and_filter_in_cache = true) const = 0; // Return a table builder to write to a file for this table type. diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index d545e455ffc..75c180ff4fc 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -40,6 +40,8 @@ struct TablePropertiesNames { static const std::string kRawValueSize; static const std::string kNumDataBlocks; static const std::string kNumEntries; + static const std::string kDeletedKeys; + static const std::string kMergeOperands; static const std::string kNumRangeDeletions; static const std::string kFormatVersion; static const std::string kFixedKeyLen; @@ -152,6 +154,10 @@ struct TableProperties { uint64_t num_data_blocks = 0; // the number of entries in this table uint64_t num_entries = 0; + // the number of deletions in the table + uint64_t num_deletions = 0; + // the number of merge operands in the table + uint64_t num_merge_operands = 0; // the number of range deletions in this table uint64_t num_range_deletions = 0; // format version, reserved for backward compatibility @@ -216,6 +222,10 @@ struct TableProperties { // Below is a list of non-basic properties that are collected by database // itself. Especially some properties regarding to the internal keys (which // is unknown to `table`). +// +// DEPRECATED: these properties now belong as TableProperties members. Please +// use TableProperties::num_deletions and TableProperties::num_merge_operands, +// respectively. 
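Editor's note: an illustrative sketch (not part of this diff) that reads the new num_deletions counter directly from TableProperties, here via the SstFileReader introduced earlier in this diff, instead of the now-deprecated GetDeletedKeys() helper. The function name, file path handling, and output format are assumptions for the example.

#include <iostream>
#include <memory>
#include <string>

#include "rocksdb/options.h"
#include "rocksdb/sst_file_reader.h"

void DumpSstSummary(const std::string& path) {
  rocksdb::Options options;
  rocksdb::SstFileReader reader(options);
  if (!reader.Open(path).ok() || !reader.VerifyChecksum().ok()) {
    return;
  }
  // Iterate the file contents like a regular DB iterator.
  std::unique_ptr<rocksdb::Iterator> it(
      reader.NewIterator(rocksdb::ReadOptions()));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    std::cout << it->key().ToString() << " => " << it->value().ToString()
              << "\n";
  }
  // Table-level counters, including the new num_deletions member.
  auto props = reader.GetTableProperties();
  std::cout << props->num_entries << " entries, " << props->num_deletions
            << " deletions\n";
}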
extern uint64_t GetDeletedKeys(const UserCollectedProperties& props); extern uint64_t GetMergeOperands(const UserCollectedProperties& props, bool* property_present); diff --git a/include/rocksdb/trace_reader_writer.h b/include/rocksdb/trace_reader_writer.h index 31226487b85..28919a0fadc 100644 --- a/include/rocksdb/trace_reader_writer.h +++ b/include/rocksdb/trace_reader_writer.h @@ -24,6 +24,7 @@ class TraceWriter { virtual Status Write(const Slice& data) = 0; virtual Status Close() = 0; + virtual uint64_t GetFileSize() = 0; }; // TraceReader allows reading RocksDB traces from any system, one operation at diff --git a/include/rocksdb/transaction_log.h b/include/rocksdb/transaction_log.h index 1d8ef918612..cf80a633f1c 100644 --- a/include/rocksdb/transaction_log.h +++ b/include/rocksdb/transaction_log.h @@ -60,7 +60,7 @@ struct BatchResult { // Add empty __ctor and __dtor for the rule of five // However, preserve the original semantics and prohibit copying - // as the unique_ptr member does not copy. + // as the std::unique_ptr member does not copy. BatchResult() {} ~BatchResult() {} diff --git a/include/rocksdb/utilities/env_mirror.h b/include/rocksdb/utilities/env_mirror.h index bc27cdc4884..40e9411ffae 100644 --- a/include/rocksdb/utilities/env_mirror.h +++ b/include/rocksdb/utilities/env_mirror.h @@ -48,20 +48,21 @@ class EnvMirror : public EnvWrapper { delete b_; } - Status NewSequentialFile(const std::string& f, unique_ptr* r, + Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, const EnvOptions& options) override; Status NewRandomAccessFile(const std::string& f, - unique_ptr* r, + std::unique_ptr* r, const EnvOptions& options) override; - Status NewWritableFile(const std::string& f, unique_ptr* r, + Status NewWritableFile(const std::string& f, std::unique_ptr* r, const EnvOptions& options) override; Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, - unique_ptr* r, + std::unique_ptr* r, const EnvOptions& options) override; virtual Status NewDirectory(const std::string& name, - unique_ptr* result) override { - unique_ptr br; + std::unique_ptr* result) override { + std::unique_ptr br; Status as = a_->NewDirectory(name, result); Status bs = b_->NewDirectory(name, &br); assert(as == bs); diff --git a/include/rocksdb/utilities/object_registry.h b/include/rocksdb/utilities/object_registry.h index b046ba7c1f5..86a51b92ead 100644 --- a/include/rocksdb/utilities/object_registry.h +++ b/include/rocksdb/utilities/object_registry.h @@ -27,8 +27,8 @@ namespace rocksdb { template T* NewCustomObject(const std::string& target, std::unique_ptr* res_guard); -// Returns a new T when called with a string. Populates the unique_ptr argument -// if granting ownership to caller. +// Returns a new T when called with a string. Populates the std::unique_ptr +// argument if granting ownership to caller. 
template using FactoryFunc = std::function*)>; diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 721203f7ce4..eae3a85ea1f 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -278,6 +278,11 @@ class StackableDB : public DB { ColumnFamilyHandle* column_family) override { return db_->Flush(fopts, column_family); } + virtual Status Flush( + const FlushOptions& fopts, + const std::vector& column_families) override { + return db_->Flush(fopts, column_families); + } virtual Status SyncWAL() override { return db_->SyncWAL(); @@ -364,7 +369,7 @@ class StackableDB : public DB { } virtual Status GetUpdatesSince( - SequenceNumber seq_number, unique_ptr* iter, + SequenceNumber seq_number, std::unique_ptr* iter, const TransactionLogIterator::ReadOptions& read_options) override { return db_->GetUpdatesSince(seq_number, iter, read_options); } diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h index 86627d4f458..c1e2441bc37 100644 --- a/include/rocksdb/utilities/transaction.h +++ b/include/rocksdb/utilities/transaction.h @@ -239,14 +239,15 @@ class Transaction { // An overload of the above method that receives a PinnableSlice // For backward compatibility a default implementation is provided virtual Status GetForUpdate(const ReadOptions& options, - ColumnFamilyHandle* /*column_family*/, + ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val, - bool /*exclusive*/ = true) { + bool exclusive = true) { if (pinnable_val == nullptr) { std::string* null_str = nullptr; - return GetForUpdate(options, key, null_str); + return GetForUpdate(options, column_family, key, null_str, exclusive); } else { - auto s = GetForUpdate(options, key, pinnable_val->GetSelf()); + auto s = GetForUpdate(options, column_family, key, + pinnable_val->GetSelf(), exclusive); pinnable_val->PinSelf(); return s; } diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index 3d7bc355a37..1a692f2a7ac 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -171,8 +171,8 @@ struct KeyLockInfo { struct DeadlockInfo { TransactionID m_txn_id; uint32_t m_cf_id; - std::string m_waiting_key; bool m_exclusive; + std::string m_waiting_key; }; struct DeadlockPath { diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index c24ba1d3902..24cef677f11 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -5,8 +5,8 @@ #pragma once #define ROCKSDB_MAJOR 5 -#define ROCKSDB_MINOR 17 -#define ROCKSDB_PATCH 2 +#define ROCKSDB_MINOR 18 +#define ROCKSDB_PATCH 3 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. 
We'll deprecate these diff --git a/include/rocksdb/write_buffer_manager.h b/include/rocksdb/write_buffer_manager.h index 856cf4b2463..dea904c187e 100644 --- a/include/rocksdb/write_buffer_manager.h +++ b/include/rocksdb/write_buffer_manager.h @@ -30,6 +30,8 @@ class WriteBufferManager { bool enabled() const { return buffer_size_ != 0; } + bool cost_to_cache() const { return cache_rep_ != nullptr; } + // Only valid if enabled() size_t memory_usage() const { return memory_used_.load(std::memory_order_relaxed); diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt index 96c08b23189..8f4ec9a568a 100644 --- a/java/CMakeLists.txt +++ b/java/CMakeLists.txt @@ -25,6 +25,7 @@ set(JNI_NATIVE_SOURCES rocksjni/jnicallback.cc rocksjni/loggerjnicallback.cc rocksjni/lru_cache.cc + rocksjni/memory_util.cc rocksjni/memtablejni.cc rocksjni/merge_operator.cc rocksjni/native_comparator_wrapper_test.cc @@ -57,6 +58,7 @@ set(JNI_NATIVE_SOURCES rocksjni/writebatchhandlerjnicallback.cc rocksjni/write_batch_test.cc rocksjni/write_batch_with_index.cc + rocksjni/write_buffer_manager.cc ) set(NATIVE_JAVA_CLASSES @@ -96,6 +98,7 @@ set(NATIVE_JAVA_CLASSES org.rocksdb.IngestExternalFileOptions org.rocksdb.Logger org.rocksdb.LRUCache + org.rocksdb.MemoryUtil org.rocksdb.MemTableConfig org.rocksdb.NativeComparatorWrapper org.rocksdb.NativeLibraryLoader @@ -130,6 +133,7 @@ set(NATIVE_JAVA_CLASSES org.rocksdb.TransactionLogIterator org.rocksdb.TransactionOptions org.rocksdb.TtlDB + org.rocksdb.UInt64AddOperator org.rocksdb.VectorMemTableConfig org.rocksdb.WBWIRocksIterator org.rocksdb.WriteBatch @@ -142,6 +146,7 @@ set(NATIVE_JAVA_CLASSES org.rocksdb.SnapshotTest org.rocksdb.WriteBatchTest org.rocksdb.WriteBatchTestInternalHelper + org.rocksdb.WriteBufferManager ) include(FindJava) @@ -222,6 +227,8 @@ add_jar( src/main/java/org/rocksdb/IngestExternalFileOptions.java src/main/java/org/rocksdb/Logger.java src/main/java/org/rocksdb/LRUCache.java + src/main/java/org/rocksdb/MemoryUsageType.java + src/main/java/org/rocksdb/MemoryUtil.java src/main/java/org/rocksdb/MemTableConfig.java src/main/java/org/rocksdb/MergeOperator.java src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java @@ -278,6 +285,7 @@ add_jar( src/main/java/org/rocksdb/WriteBatch.java src/main/java/org/rocksdb/WriteBatchWithIndex.java src/main/java/org/rocksdb/WriteOptions.java + src/main/java/org/rocksdb/WriteBufferManager.java src/main/java/org/rocksdb/util/BytewiseComparator.java src/main/java/org/rocksdb/util/DirectBytewiseComparator.java src/main/java/org/rocksdb/util/Environment.java @@ -290,6 +298,7 @@ add_jar( src/test/java/org/rocksdb/RocksDBExceptionTest.java src/test/java/org/rocksdb/RocksMemoryResource.java src/test/java/org/rocksdb/SnapshotTest.java + src/main/java/org/rocksdb/UInt64AddOperator.java src/test/java/org/rocksdb/WriteBatchTest.java src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java src/test/java/org/rocksdb/util/WriteBatchGetter.java diff --git a/java/Makefile b/java/Makefile index f58fff06e50..b3b89eb8372 100644 --- a/java/Makefile +++ b/java/Makefile @@ -30,6 +30,8 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractCompactionFilter\ org.rocksdb.HashSkipListMemTableConfig\ org.rocksdb.Logger\ org.rocksdb.LRUCache\ + org.rocksdb.MemoryUsageType\ + org.rocksdb.MemoryUtil\ org.rocksdb.MergeOperator\ org.rocksdb.NativeComparatorWrapper\ org.rocksdb.OptimisticTransactionDB\ @@ -60,10 +62,12 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractCompactionFilter\ org.rocksdb.VectorMemTableConfig\ org.rocksdb.Snapshot\ 
org.rocksdb.StringAppendOperator\ + org.rocksdb.UInt64AddOperator\ org.rocksdb.WriteBatch\ org.rocksdb.WriteBatch.Handler\ org.rocksdb.WriteOptions\ org.rocksdb.WriteBatchWithIndex\ + org.rocksdb.WriteBufferManager\ org.rocksdb.WBWIRocksIterator NATIVE_JAVA_TEST_CLASSES = org.rocksdb.RocksDBExceptionTest\ @@ -111,6 +115,7 @@ JAVA_TESTS = org.rocksdb.BackupableDBOptionsTest\ org.rocksdb.KeyMayExistTest\ org.rocksdb.LoggerTest\ org.rocksdb.LRUCacheTest\ + org.rocksdb.MemoryUtilTest\ org.rocksdb.MemTableTest\ org.rocksdb.MergeTest\ org.rocksdb.MixedOptionsTest\ diff --git a/java/rocksjni/compaction_options_fifo.cc b/java/rocksjni/compaction_options_fifo.cc index 95bbfc621dc..00761b6ac5f 100644 --- a/java/rocksjni/compaction_options_fifo.cc +++ b/java/rocksjni/compaction_options_fifo.cc @@ -46,6 +46,53 @@ jlong Java_org_rocksdb_CompactionOptionsFIFO_maxTableFilesSize(JNIEnv* /*env*/, return static_cast(opt->max_table_files_size); } +/* + * Class: org_rocksdb_CompactionOptionsFIFO + * Method: setTtl + * Signature: (JJ)V + */ +void Java_org_rocksdb_CompactionOptionsFIFO_setTtl(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle, jlong ttl) { + auto* opt = reinterpret_cast(jhandle); + opt->ttl = static_cast(ttl); +} + +/* + * Class: org_rocksdb_CompactionOptionsFIFO + * Method: ttl + * Signature: (J)J + */ +jlong Java_org_rocksdb_CompactionOptionsFIFO_ttl(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->ttl); +} + +/* + * Class: org_rocksdb_CompactionOptionsFIFO + * Method: setAllowCompaction + * Signature: (JZ)V + */ +void Java_org_rocksdb_CompactionOptionsFIFO_setAllowCompaction( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + jboolean allow_compaction) { + auto* opt = reinterpret_cast(jhandle); + opt->allow_compaction = static_cast(allow_compaction); +} + +/* + * Class: org_rocksdb_CompactionOptionsFIFO + * Method: allowCompaction + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_CompactionOptionsFIFO_allowCompaction( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->allow_compaction); +} + /* * Class: org_rocksdb_CompactionOptionsFIFO * Method: disposeInternal diff --git a/java/rocksjni/memory_util.cc b/java/rocksjni/memory_util.cc new file mode 100644 index 00000000000..9c2bfd04e2b --- /dev/null +++ b/java/rocksjni/memory_util.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
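// --- Illustrative usage sketch (not part of this patch) ---
// The setTtl/setAllowCompaction JNI bindings above back the Java
// CompactionOptionsFIFO setters added later in this patch. Assuming the
// existing Options#setCompactionStyle and Options#setCompactionOptionsFIFO
// APIs, FIFO compaction with TTL-based deletion could be configured roughly
// as follows; the path, size, and 7-day TTL are arbitrary example values.
import org.rocksdb.CompactionOptionsFIFO;
import org.rocksdb.CompactionStyle;
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;

public class FifoTtlExample {
  public static void main(String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    try (final CompactionOptionsFIFO fifoOptions = new CompactionOptionsFIFO()
             .setMaxTableFilesSize(1024L * 1024L * 1024L) // keep at most ~1 GB of SST files
             .setTtl(7 * 24 * 60 * 60)                    // drop files older than 7 days
             .setAllowCompaction(true);                   // allow intra-L0 compaction of small files
         final Options options = new Options()
             .setCreateIfMissing(true)
             .setCompactionStyle(CompactionStyle.FIFO)
             .setCompactionOptionsFIFO(fifoOptions);
         final RocksDB db = RocksDB.open(options, "/tmp/fifo-ttl-example")) {
      db.put("key".getBytes(), "value".getBytes());
    }
  }
}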
+ +#include +#include +#include +#include +#include + +#include "include/org_rocksdb_MemoryUtil.h" + +#include "rocksjni/portal.h" + +#include "rocksdb/utilities/memory_util.h" + + +/* + * Class: org_rocksdb_MemoryUtil + * Method: getApproximateMemoryUsageByType + * Signature: ([J[J)Ljava/util/Map; + */ +jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType( + JNIEnv *env, jclass /*jclazz*/, jlongArray jdb_handles, jlongArray jcache_handles) { + + std::vector dbs; + jsize db_handle_count = env->GetArrayLength(jdb_handles); + if(db_handle_count > 0) { + jlong *ptr_jdb_handles = env->GetLongArrayElements(jdb_handles, nullptr); + if (ptr_jdb_handles == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + for (jsize i = 0; i < db_handle_count; i++) { + dbs.push_back(reinterpret_cast(ptr_jdb_handles[i])); + } + env->ReleaseLongArrayElements(jdb_handles, ptr_jdb_handles, JNI_ABORT); + } + + std::unordered_set cache_set; + jsize cache_handle_count = env->GetArrayLength(jcache_handles); + if(cache_handle_count > 0) { + jlong *ptr_jcache_handles = env->GetLongArrayElements(jcache_handles, nullptr); + if (ptr_jcache_handles == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + for (jsize i = 0; i < cache_handle_count; i++) { + auto *cache_ptr = + reinterpret_cast *>(ptr_jcache_handles[i]); + cache_set.insert(cache_ptr->get()); + } + env->ReleaseLongArrayElements(jcache_handles, ptr_jcache_handles, JNI_ABORT); + } + + std::map usage_by_type; + if(rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set, &usage_by_type) != rocksdb::Status::OK()) { + // Non-OK status + return nullptr; + } + + jobject jusage_by_type = rocksdb::HashMapJni::construct( + env, static_cast(usage_by_type.size())); + if (jusage_by_type == nullptr) { + // exception occurred + return nullptr; + } + const rocksdb::HashMapJni::FnMapKV + fn_map_kv = + [env](const std::pair& pair) { + // Construct key + const jobject jusage_type = + rocksdb::ByteJni::valueOf(env, rocksdb::MemoryUsageTypeJni::toJavaMemoryUsageType(pair.first)); + if (jusage_type == nullptr) { + // an error occurred + return std::unique_ptr>(nullptr); + } + // Construct value + const jobject jusage_value = + rocksdb::LongJni::valueOf(env, pair.second); + if (jusage_value == nullptr) { + // an error occurred + return std::unique_ptr>(nullptr); + } + // Construct and return pointer to pair of jobjects + return std::unique_ptr>( + new std::pair(jusage_type, + jusage_value)); + }; + + if (!rocksdb::HashMapJni::putAll(env, jusage_by_type, usage_by_type.begin(), + usage_by_type.end(), fn_map_kv)) { + // exception occcurred + jusage_by_type = nullptr; + } + + return jusage_by_type; + +} diff --git a/java/rocksjni/merge_operator.cc b/java/rocksjni/merge_operator.cc index 782153f5712..e06a06f7e35 100644 --- a/java/rocksjni/merge_operator.cc +++ b/java/rocksjni/merge_operator.cc @@ -13,6 +13,7 @@ #include #include "include/org_rocksdb_StringAppendOperator.h" +#include "include/org_rocksdb_UInt64AddOperator.h" #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" #include "rocksdb/merge_operator.h" @@ -47,3 +48,28 @@ void Java_org_rocksdb_StringAppendOperator_disposeInternal(JNIEnv* /*env*/, reinterpret_cast*>(jhandle); delete sptr_string_append_op; // delete std::shared_ptr } + +/* + * Class: org_rocksdb_UInt64AddOperator + * Method: newSharedUInt64AddOperator + * Signature: ()J + */ +jlong Java_org_rocksdb_UInt64AddOperator_newSharedUInt64AddOperator( + JNIEnv* /*env*/, jclass /*jclazz*/) { + auto* 
sptr_uint64_add_op = new std::shared_ptr( + rocksdb::MergeOperators::CreateUInt64AddOperator()); + return reinterpret_cast(sptr_uint64_add_op); +} + +/* + * Class: org_rocksdb_UInt64AddOperator + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_UInt64AddOperator_disposeInternal(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { + auto* sptr_uint64_add_op = + reinterpret_cast*>(jhandle); + delete sptr_uint64_add_op; // delete std::shared_ptr +} diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 9aed80e1e66..342ee3e9e4c 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -250,6 +250,20 @@ void Java_org_rocksdb_Options_setWriteBufferSize(JNIEnv* env, jobject /*jobj*/, } } +/* + * Class: org_rocksdb_Options + * Method: setWriteBufferManager + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setWriteBufferManager(JNIEnv* /*env*/, jobject /*jobj*/, + jlong joptions_handle, + jlong jwrite_buffer_manager_handle) { + auto* write_buffer_manager = + reinterpret_cast *>(jwrite_buffer_manager_handle); + reinterpret_cast(joptions_handle)->write_buffer_manager = + *write_buffer_manager; +} + /* * Class: org_rocksdb_Options * Method: writeBufferSize @@ -1956,8 +1970,8 @@ jbyte Java_org_rocksdb_Options_compressionType(JNIEnv* /*env*/, * @param jcompression_levels A reference to a java byte array * where each byte indicates a compression level * - * @return A unique_ptr to the vector, or unique_ptr(nullptr) if a JNI exception - * occurs + * @return A std::unique_ptr to the vector, or std::unique_ptr(nullptr) if a JNI + * exception occurs */ std::unique_ptr> rocksdb_compression_vector_helper(JNIEnv* env, jbyteArray jcompression_levels) { @@ -5518,6 +5532,20 @@ void Java_org_rocksdb_DBOptions_setDbWriteBufferSize( opt->db_write_buffer_size = static_cast(jdb_write_buffer_size); } +/* + * Class: org_rocksdb_DBOptions + * Method: setWriteBufferManager + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setWriteBufferManager(JNIEnv* /*env*/, jobject /*jobj*/, + jlong jdb_options_handle, + jlong jwrite_buffer_manager_handle) { + auto* write_buffer_manager = + reinterpret_cast *>(jwrite_buffer_manager_handle); + reinterpret_cast(jdb_options_handle)->write_buffer_manager = + *write_buffer_manager; +} + /* * Class: org_rocksdb_DBOptions * Method: dbWriteBufferSize @@ -6525,6 +6553,31 @@ jlong Java_org_rocksdb_ReadOptions_iterateUpperBound(JNIEnv* /*env*/, return reinterpret_cast(upper_bound_slice_handle); } +/* + * Class: org_rocksdb_ReadOptions + * Method: setIterateLowerBound + * Signature: (JJ)I + */ +void Java_org_rocksdb_ReadOptions_setIterateLowerBound( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + jlong jlower_bound_slice_handle) { + reinterpret_cast(jhandle)->iterate_lower_bound = + reinterpret_cast(jlower_bound_slice_handle); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: iterateLowerBound + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_iterateLowerBound(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { + auto& lower_bound_slice_handle = + reinterpret_cast(jhandle)->iterate_lower_bound; + return reinterpret_cast(lower_bound_slice_handle); +} + ///////////////////////////////////////////////////////////////////// // rocksdb::ComparatorOptions diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index a0d1846a659..0bf2867c1c0 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -26,6 +26,7 @@ #include "rocksdb/rate_limiter.h" #include "rocksdb/status.h" #include 
"rocksdb/utilities/backupable_db.h" +#include "rocksdb/utilities/memory_util.h" #include "rocksdb/utilities/transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksjni/compaction_filter_factory_jnicallback.h" @@ -2251,7 +2252,7 @@ class ByteJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getByteValueMethod(JNIEnv* env) { jclass clazz = getJClass(env); @@ -2264,6 +2265,39 @@ class ByteJni : public JavaClass { assert(mid != nullptr); return mid; } + + /** + * Calls the Java Method: Byte#valueOf, returning a constructed Byte jobject + * + * @param env A pointer to the Java environment + * + * @return A constructing Byte object or nullptr if the class or method id could not + * be retrieved, or an exception occurred + */ + static jobject valueOf(JNIEnv* env, jbyte jprimitive_byte) { + jclass clazz = getJClass(env); + if (clazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetStaticMethodID(clazz, "valueOf", "(B)Ljava/lang/Byte;"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + const jobject jbyte_obj = + env->CallStaticObjectMethod(clazz, mid, jprimitive_byte); + if (env->ExceptionCheck()) { + // exception occurred + return nullptr; + } + + return jbyte_obj; + } + }; // The portal class for java.lang.StringBuilder @@ -3345,8 +3379,12 @@ class TickerTypeJni { return 0x5D; case rocksdb::Tickers::NUMBER_MULTIGET_KEYS_FOUND: return 0x5E; - case rocksdb::Tickers::TICKER_ENUM_MAX: + case rocksdb::Tickers::NO_ITERATOR_CREATED: return 0x5F; + case rocksdb::Tickers::NO_ITERATOR_DELETED: + return 0x60; + case rocksdb::Tickers::TICKER_ENUM_MAX: + return 0x61; default: // undefined/default @@ -3549,6 +3587,10 @@ class TickerTypeJni { case 0x5E: return rocksdb::Tickers::NUMBER_MULTIGET_KEYS_FOUND; case 0x5F: + return rocksdb::Tickers::NO_ITERATOR_CREATED; + case 0x60: + return rocksdb::Tickers::NO_ITERATOR_DELETED; + case 0x61: return rocksdb::Tickers::TICKER_ENUM_MAX; default: @@ -3795,6 +3837,48 @@ class RateLimiterModeJni { } }; +// The portal class for org.rocksdb.MemoryUsageType +class MemoryUsageTypeJni { +public: + // Returns the equivalent org.rocksdb.MemoryUsageType for the provided + // C++ rocksdb::MemoryUtil::UsageType enum + static jbyte toJavaMemoryUsageType( + const rocksdb::MemoryUtil::UsageType& usage_type) { + switch(usage_type) { + case rocksdb::MemoryUtil::UsageType::kMemTableTotal: + return 0x0; + case rocksdb::MemoryUtil::UsageType::kMemTableUnFlushed: + return 0x1; + case rocksdb::MemoryUtil::UsageType::kTableReadersTotal: + return 0x2; + case rocksdb::MemoryUtil::UsageType::kCacheTotal: + return 0x3; + default: + // undefined: use kNumUsageTypes + return 0x4; + } + } + + // Returns the equivalent C++ rocksdb::MemoryUtil::UsageType enum for the + // provided Java org.rocksdb.MemoryUsageType + static rocksdb::MemoryUtil::UsageType toCppMemoryUsageType( + jbyte usage_type) { + switch(usage_type) { + case 0x0: + return rocksdb::MemoryUtil::UsageType::kMemTableTotal; + case 0x1: + return rocksdb::MemoryUtil::UsageType::kMemTableUnFlushed; + case 0x2: + return rocksdb::MemoryUtil::UsageType::kTableReadersTotal; + case 0x3: + return rocksdb::MemoryUtil::UsageType::kCacheTotal; + default: + // undefined/default: use kNumUsageTypes + return 
rocksdb::MemoryUtil::UsageType::kNumUsageTypes; + } + } +}; + // The portal class for org.rocksdb.Transaction class TransactionJni : public JavaClass { public: diff --git a/java/rocksjni/statisticsjni.cc b/java/rocksjni/statisticsjni.cc index 3ac1e5b413e..8fddc437a0b 100644 --- a/java/rocksjni/statisticsjni.cc +++ b/java/rocksjni/statisticsjni.cc @@ -11,11 +11,11 @@ namespace rocksdb { StatisticsJni::StatisticsJni(std::shared_ptr stats) - : StatisticsImpl(stats, false), m_ignore_histograms() { + : StatisticsImpl(stats), m_ignore_histograms() { } StatisticsJni::StatisticsJni(std::shared_ptr stats, - const std::set ignore_histograms) : StatisticsImpl(stats, false), + const std::set ignore_histograms) : StatisticsImpl(stats), m_ignore_histograms(ignore_histograms) { } diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index 5f5f8cd2abf..3dbd13280ad 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -37,7 +37,7 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle( /* * Class: org_rocksdb_BlockBasedTableConfig * Method: newTableFactoryHandle - * Signature: (ZJIJJIIZIZZZJIBBI)J + * Signature: (ZJIJJIIZJZZZZJZZJIBBI)J */ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( JNIEnv * /*env*/, jobject /*jobj*/, jboolean no_block_cache, @@ -45,7 +45,10 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( jlong block_size, jint block_size_deviation, jint block_restart_interval, jboolean whole_key_filtering, jlong jfilter_policy, jboolean cache_index_and_filter_blocks, + jboolean cache_index_and_filter_blocks_with_high_priority, jboolean pin_l0_filter_and_index_blocks_in_cache, + jboolean partition_filters, jlong metadata_block_size, + jboolean pin_top_level_index_and_filter, jboolean hash_index_allow_collision, jlong block_cache_compressed_size, jint block_cache_compressd_num_shard_bits, jbyte jchecksum_type, jbyte jindex_type, jint jformat_version) { @@ -77,8 +80,13 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.filter_policy = *pFilterPolicy; } options.cache_index_and_filter_blocks = cache_index_and_filter_blocks; + options.cache_index_and_filter_blocks_with_high_priority = + cache_index_and_filter_blocks_with_high_priority; options.pin_l0_filter_and_index_blocks_in_cache = pin_l0_filter_and_index_blocks_in_cache; + options.partition_filters = partition_filters; + options.metadata_block_size = metadata_block_size; + options.pin_top_level_index_and_filter = pin_top_level_index_and_filter; options.hash_index_allow_collision = hash_index_allow_collision; if (block_cache_compressed_size > 0) { if (block_cache_compressd_num_shard_bits > 0) { diff --git a/java/rocksjni/write_buffer_manager.cc b/java/rocksjni/write_buffer_manager.cc new file mode 100644 index 00000000000..043f69031c0 --- /dev/null +++ b/java/rocksjni/write_buffer_manager.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
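// --- Illustrative usage sketch (not part of this patch) ---
// The Options/DBOptions setWriteBufferManager JNI bindings and the new
// write_buffer_manager.cc above wrap a shared rocksdb::WriteBufferManager.
// Assuming the Java WriteBufferManager class added later in this patch takes
// a buffer size and a Cache, two DBs could share one memtable budget that is
// also charged against a shared block cache, roughly as follows; paths and
// sizes are arbitrary example values.
import org.rocksdb.Cache;
import org.rocksdb.LRUCache;
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;
import org.rocksdb.WriteBufferManager;

public class WriteBufferManagerExample {
  public static void main(String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    try (final Cache cache = new LRUCache(512L * 1024L * 1024L);    // shared block cache
         final WriteBufferManager writeBufferManager =
             new WriteBufferManager(128L * 1024L * 1024L, cache);   // cap memtable memory, cost it to the cache
         final Options options = new Options()
             .setCreateIfMissing(true)
             .setWriteBufferManager(writeBufferManager);
         final RocksDB db1 = RocksDB.open(options, "/tmp/wbm-example-1");
         final RocksDB db2 = RocksDB.open(options, "/tmp/wbm-example-2")) {
      // Both DBs now draw their memtable memory from the same 128 MB budget.
      db1.put("a".getBytes(), "1".getBytes());
      db2.put("b".getBytes(), "2".getBytes());
    }
  }
}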
+ +#include + +#include "include/org_rocksdb_WriteBufferManager.h" + +#include "rocksdb/cache.h" +#include "rocksdb/write_buffer_manager.h" + +/* + * Class: org_rocksdb_WriteBufferManager + * Method: newWriteBufferManager + * Signature: (JJ)J + */ +jlong Java_org_rocksdb_WriteBufferManager_newWriteBufferManager( + JNIEnv* /*env*/, jclass /*jclazz*/, jlong jbuffer_size, jlong jcache_handle) { + auto* cache_ptr = + reinterpret_cast *>(jcache_handle); + auto* write_buffer_manager = new std::shared_ptr( + std::make_shared(jbuffer_size, *cache_ptr)); + return reinterpret_cast(write_buffer_manager); +} + +/* + * Class: org_rocksdb_WriteBufferManager + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_WriteBufferManager_disposeInternal( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* write_buffer_manager = + reinterpret_cast *>(jhandle); + assert(write_buffer_manager != nullptr); + delete write_buffer_manager; +} diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java index 2dbbc64d358..1032be6e799 100644 --- a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java @@ -22,7 +22,11 @@ public BlockBasedTableConfig() { wholeKeyFiltering_ = true; filter_ = null; cacheIndexAndFilterBlocks_ = false; + cacheIndexAndFilterBlocksWithHighPriority_ = false; pinL0FilterAndIndexBlocksInCache_ = false; + partitionFilters_ = false; + metadataBlockSize_ = 4096; + pinTopLevelIndexAndFilter_ = true; hashIndexAllowCollision_ = true; blockCacheCompressedSize_ = 0; blockCacheCompressedNumShardBits_ = 0; @@ -246,6 +250,31 @@ public BlockBasedTableConfig setCacheIndexAndFilterBlocks( return this; } + /** + * Indicates if index and filter blocks will be treated as high-priority in the block cache. + * See note below about applicability. If not specified, defaults to false. + * + * @return if index and filter blocks will be treated as high-priority. + */ + public boolean cacheIndexAndFilterBlocksWithHighPriority() { + return cacheIndexAndFilterBlocksWithHighPriority_; + } + + /** + * If true, cache index and filter blocks with high priority. If set to true, + * depending on implementation of block cache, index and filter blocks may be + * less likely to be evicted than data blocks. + * + * @param cacheIndexAndFilterBlocksWithHighPriority if index and filter blocks + * will be treated as high-priority. + * @return the reference to the current config. + */ + public BlockBasedTableConfig setCacheIndexAndFilterBlocksWithHighPriority( + final boolean cacheIndexAndFilterBlocksWithHighPriority) { + cacheIndexAndFilterBlocksWithHighPriority_ = cacheIndexAndFilterBlocksWithHighPriority; + return this; + } + /** * Indicating if we'd like to pin L0 index/filter blocks to the block cache. If not specified, defaults to false. @@ -269,6 +298,70 @@ public BlockBasedTableConfig setPinL0FilterAndIndexBlocksInCache( return this; } + /** + * Indicating if we're using partitioned filters. Defaults to false. + * + * @return if we're using partition filters. + */ + public boolean partitionFilters() { + return partitionFilters_; + } + + /** + * Use partitioned full filters for each SST file. This option is incompatible with + * block-based filters. + * + * @param partitionFilters use partition filters. + * @return the reference to the current config. 
+ */ + public BlockBasedTableConfig setPartitionFilters(final boolean partitionFilters) { + partitionFilters_ = partitionFilters; + return this; + } + + /** + * @return block size for partitioned metadata. + */ + public long metadataBlockSize() { + return metadataBlockSize_; + } + + /** + * Set block size for partitioned metadata. + * + * @param metadataBlockSize Partitioned metadata block size. + * @return the reference to the current config. + */ + public BlockBasedTableConfig setMetadataBlockSize( + final long metadataBlockSize) { + metadataBlockSize_ = metadataBlockSize; + return this; + } + + /** + * Indicates if top-level index and filter blocks should be pinned. + * + * @return if top-level index and filter blocks should be pinned. + */ + public boolean pinTopLevelIndexAndFilter() { + return pinTopLevelIndexAndFilter_; + } + + /** + * If cacheIndexAndFilterBlocks is true and the below is true, then + * the top-level index of partitioned filter and index blocks are stored in + * the cache, but a reference is held in the "table reader" object so the + * blocks are pinned and only evicted from cache when the table reader is + * freed. This is not limited to l0 in LSM tree. + * + * @param pinTopLevelIndexAndFilter if top-level index and filter blocks should be pinned. + * @return the reference to the current config. + */ + public BlockBasedTableConfig setPinTopLevelIndexAndFilter(final boolean pinTopLevelIndexAndFilter) { + pinTopLevelIndexAndFilter_ = pinTopLevelIndexAndFilter; + return this; + } + /** * Influence the behavior when kHashSearch is used. if false, stores a precise prefix to block range mapping @@ -440,20 +533,27 @@ public int formatVersion() { return newTableFactoryHandle(noBlockCache_, blockCacheSize_, blockCacheNumShardBits_, blockCacheHandle, blockSize_, blockSizeDeviation_, blockRestartInterval_, wholeKeyFiltering_, filterHandle, cacheIndexAndFilterBlocks_, - pinL0FilterAndIndexBlocksInCache_, hashIndexAllowCollision_, blockCacheCompressedSize_, - blockCacheCompressedNumShardBits_, checksumType_.getValue(), indexType_.getValue(), - formatVersion_); + cacheIndexAndFilterBlocksWithHighPriority_, pinL0FilterAndIndexBlocksInCache_, + partitionFilters_, metadataBlockSize_, pinTopLevelIndexAndFilter_, + hashIndexAllowCollision_, blockCacheCompressedSize_, blockCacheCompressedNumShardBits_, + checksumType_.getValue(), indexType_.getValue(), formatVersion_); } private native long newTableFactoryHandle(boolean noBlockCache, long blockCacheSize, int blockCacheNumShardBits, long blockCacheHandle, long blockSize, int blockSizeDeviation, int blockRestartInterval, boolean wholeKeyFiltering, long filterPolicyHandle, - boolean cacheIndexAndFilterBlocks, boolean pinL0FilterAndIndexBlocksInCache, - boolean hashIndexAllowCollision, long blockCacheCompressedSize, - int blockCacheCompressedNumShardBits, byte checkSumType, byte indexType, int formatVersion); + boolean cacheIndexAndFilterBlocks, boolean cacheIndexAndFilterBlocksWithHighPriority, + boolean pinL0FilterAndIndexBlocksInCache, boolean partitionFilters, long metadataBlockSize, + boolean pinTopLevelIndexAndFilter, boolean hashIndexAllowCollision, + long blockCacheCompressedSize, int blockCacheCompressedNumShardBits, + byte checkSumType, byte indexType, int formatVersion); private boolean cacheIndexAndFilterBlocks_; + private boolean cacheIndexAndFilterBlocksWithHighPriority_; private boolean pinL0FilterAndIndexBlocksInCache_; + private boolean partitionFilters_; + private long metadataBlockSize_; + private boolean 
pinTopLevelIndexAndFilter_; private IndexType indexType_; private boolean hashIndexAllowCollision_; private ChecksumType checksumType_; diff --git a/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java b/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java index f795807804d..36d78fe6e6f 100644 --- a/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java +++ b/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java @@ -42,8 +42,77 @@ public long maxTableFilesSize() { return maxTableFilesSize(nativeHandle_); } + /** + * Drop files older than TTL. TTL based deletion will take precedence over + * size based deletion if ttl > 0. + * delete if sst_file_creation_time < (current_time - ttl). + * unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60 + * + * Default: 0 (disabled) + * + * @param ttl The ttl for the table files in seconds + * + * @return the reference to the current options. + */ + public CompactionOptionsFIFO setTtl(final long ttl) { + setTtl(nativeHandle_, ttl); + return this; + } + + /** + * The current ttl value. + * Drop files older than TTL. TTL based deletion will take precedence over + * size based deletion if ttl > 0. + * delete if sst_file_creation_time < (current_time - ttl). + * + * Default: 0 (disabled) + * + * @return the ttl in seconds + */ + public long ttl() { + return ttl(nativeHandle_); + } + + /** + * If true, try to do compaction to compact smaller files into larger ones. + * Minimum files to compact follows options.level0_file_num_compaction_trigger + * and compaction won't trigger if average compact bytes per del file is + * larger than options.write_buffer_size. This is to protect large files + * from being compacted again. + * + * Default: false + * + * @param allowCompaction should allow intra-L0 compaction? + * + * @return the reference to the current options. + */ + public CompactionOptionsFIFO setAllowCompaction(final boolean allowCompaction) { + setAllowCompaction(nativeHandle_, allowCompaction); + return this; + } + + /** + * Check if intra-L0 compaction is enabled. + * If true, try to do compaction to compact smaller files into larger ones. + * Minimum files to compact follows options.level0_file_num_compaction_trigger + * and compaction won't trigger if average compact bytes per del file is + * larger than options.write_buffer_size. This is to protect large files + * from being compacted again. 
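// --- Illustrative usage sketch (not part of this patch) ---
// The new BlockBasedTableConfig setters above (partition filters, metadata
// block size, top-level index/filter pinning, high-priority cache entries)
// are normally combined with a full Bloom filter and the two-level index.
// A rough configuration sketch assuming the existing BloomFilter, IndexType,
// and Options#setTableFormatConfig APIs; the concrete sizes are arbitrary.
import org.rocksdb.BlockBasedTableConfig;
import org.rocksdb.BloomFilter;
import org.rocksdb.IndexType;
import org.rocksdb.Options;

public class PartitionedFilterExample {
  public static Options buildOptions() {
    final BlockBasedTableConfig tableConfig = new BlockBasedTableConfig()
        .setFilter(new BloomFilter(10, false))         // full (non block-based) Bloom filter
        .setIndexType(IndexType.kTwoLevelIndexSearch)  // partitioned index
        .setPartitionFilters(true)                     // partition the filter alongside the index
        .setMetadataBlockSize(4096)                    // target size of each partition block
        .setCacheIndexAndFilterBlocks(true)
        .setCacheIndexAndFilterBlocksWithHighPriority(true)
        .setPinTopLevelIndexAndFilter(true);           // keep the top-level index/filter pinned
    return new Options()
        .setCreateIfMissing(true)
        .setTableFormatConfig(tableConfig);
  }
}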
+ * + * Default: false + * + * @return a boolean value indicating whether intra-L0 compaction is enabled + */ + public boolean allowCompaction() { + return allowCompaction(nativeHandle_); + } + private native void setMaxTableFilesSize(long handle, long maxTableFilesSize); private native long maxTableFilesSize(long handle); + private native void setTtl(long handle, long ttl); + private native long ttl(long handle); + private native void setAllowCompaction(long handle, boolean allowCompaction); + private native boolean allowCompaction(long handle); private native static long newCompactionOptionsFIFO(); @Override protected final native void disposeInternal(final long handle); diff --git a/java/src/main/java/org/rocksdb/DBOptions.java b/java/src/main/java/org/rocksdb/DBOptions.java index c3232938893..280623a208e 100644 --- a/java/src/main/java/org/rocksdb/DBOptions.java +++ b/java/src/main/java/org/rocksdb/DBOptions.java @@ -46,6 +46,7 @@ public DBOptions(DBOptions other) { this.numShardBits_ = other.numShardBits_; this.rateLimiter_ = other.rateLimiter_; this.rowCache_ = other.rowCache_; + this.writeBufferManager_ = other.writeBufferManager_; } /** @@ -668,6 +669,20 @@ public DBOptions setDbWriteBufferSize(final long dbWriteBufferSize) { } @Override + public DBOptions setWriteBufferManager(final WriteBufferManager writeBufferManager) { + assert(isOwningHandle()); + setWriteBufferManager(nativeHandle_, writeBufferManager.nativeHandle_); + this.writeBufferManager_ = writeBufferManager; + return this; + } + + @Override + public WriteBufferManager writeBufferManager() { + assert(isOwningHandle()); + return this.writeBufferManager_; + } + + @Override public long dbWriteBufferSize() { assert(isOwningHandle()); return dbWriteBufferSize(nativeHandle_); @@ -1087,6 +1102,8 @@ private native void setAdviseRandomOnOpen( private native boolean adviseRandomOnOpen(long handle); private native void setDbWriteBufferSize(final long handle, final long dbWriteBufferSize); + private native void setWriteBufferManager(final long dbOptionsHandle, + final long writeBufferManagerHandle); private native long dbWriteBufferSize(final long handle); private native void setAccessHintOnCompactionStart(final long handle, final byte accessHintOnCompactionStart); @@ -1158,4 +1175,5 @@ private native void setAvoidFlushDuringShutdown(final long handle, private int numShardBits_; private RateLimiter rateLimiter_; private Cache rowCache_; + private WriteBufferManager writeBufferManager_; } diff --git a/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/java/src/main/java/org/rocksdb/DBOptionsInterface.java index 7c406eaf8ab..accfb4c59ae 100644 --- a/java/src/main/java/org/rocksdb/DBOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/DBOptionsInterface.java @@ -991,6 +991,28 @@ public interface DBOptionsInterface { */ T setDbWriteBufferSize(long dbWriteBufferSize); + /** + * Use passed {@link WriteBufferManager} to control memory usage across + * multiple column families and/or DB instances. + * + * Check + * https://github.com/facebook/rocksdb/wiki/Write-Buffer-Manager + * for more details on when to use it + * + * @param writeBufferManager The WriteBufferManager to use + * @return the reference of the current options. + */ + T setWriteBufferManager(final WriteBufferManager writeBufferManager); + + /** + * Reference to {@link WriteBufferManager} used by it.
+ * + * Default: null (Disabled) + * + * @return a reference to WriteBufferManager + */ + WriteBufferManager writeBufferManager(); + /** * Amount of data to build up in memtables across all column * families before writing to disk. diff --git a/java/src/main/java/org/rocksdb/MemoryUsageType.java b/java/src/main/java/org/rocksdb/MemoryUsageType.java new file mode 100644 index 00000000000..3523cd0ee65 --- /dev/null +++ b/java/src/main/java/org/rocksdb/MemoryUsageType.java @@ -0,0 +1,72 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * MemoryUsageType + * + *

<p>The value will be used as a key to indicate the type of memory usage + * described</p>
+ */ +public enum MemoryUsageType { + /** + * Memory usage of all the mem-tables. + */ + kMemTableTotal((byte) 0), + /** + * Memory usage of those un-flushed mem-tables. + */ + kMemTableUnFlushed((byte) 1), + /** + * Memory usage of all the table readers. + */ + kTableReadersTotal((byte) 2), + /** + * Memory usage by Cache. + */ + kCacheTotal((byte) 3), + /** + * Max usage types - copied to keep 1:1 with native. + */ + kNumUsageTypes((byte) 4); + + /** + * Returns the byte value of the enumerations value + * + * @return byte representation + */ + public byte getValue() { + return value_; + } + + /** + *

<p>Get the MemoryUsageType enumeration value by + * passing the byte identifier to this method.</p>
+ * + * @param byteIdentifier of MemoryUsageType. + * + * @return MemoryUsageType instance. + * + * @throws IllegalArgumentException if the usage type for the byteIdentifier + * cannot be found + */ + public static MemoryUsageType getMemoryUsageType(final byte byteIdentifier) { + for (final MemoryUsageType MemoryUsageType : MemoryUsageType.values()) { + if (MemoryUsageType.getValue() == byteIdentifier) { + return MemoryUsageType; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for MemoryUsageType."); + } + + private MemoryUsageType(byte value) { + value_ = value; + } + + private final byte value_; +} diff --git a/java/src/main/java/org/rocksdb/MemoryUtil.java b/java/src/main/java/org/rocksdb/MemoryUtil.java new file mode 100644 index 00000000000..52b2175e6b1 --- /dev/null +++ b/java/src/main/java/org/rocksdb/MemoryUtil.java @@ -0,0 +1,60 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.*; + +/** + * JNI passthrough for MemoryUtil. + */ +public class MemoryUtil { + + /** + *

<p>Returns the approximate memory usage of different types in the input + * list of DBs and Cache set. For instance, in the output map the key + * kMemTableTotal will be associated with the memory + * usage of all the mem-tables from all the input rocksdb instances.</p>
+ * + *

<p>Note that for memory usage inside Cache class, we will + * only report the usage of the input "cache_set" without + * including those Cache usage inside the input list "dbs" + * of DBs.</p>
+ * + * @param dbs List of dbs to collect memory usage for. + * @param caches Set of caches to collect memory usage for. + * @return Map from {@link MemoryUsageType} to memory usage as a {@link Long}. + */ + public static Map getApproximateMemoryUsageByType(final List dbs, final Set caches) { + int dbCount = (dbs == null) ? 0 : dbs.size(); + int cacheCount = (caches == null) ? 0 : caches.size(); + long[] dbHandles = new long[dbCount]; + long[] cacheHandles = new long[cacheCount]; + if (dbCount > 0) { + ListIterator dbIter = dbs.listIterator(); + while (dbIter.hasNext()) { + dbHandles[dbIter.nextIndex()] = dbIter.next().nativeHandle_; + } + } + if (cacheCount > 0) { + // NOTE: This index handling is super ugly but I couldn't get a clean way to track both the + // index and the iterator simultaneously within a Set. + int i = 0; + for (Cache cache : caches) { + cacheHandles[i] = cache.nativeHandle_; + i++; + } + } + Map byteOutput = getApproximateMemoryUsageByType(dbHandles, cacheHandles); + Map output = new HashMap<>(); + for(Map.Entry longEntry : byteOutput.entrySet()) { + output.put(MemoryUsageType.getMemoryUsageType(longEntry.getKey()), longEntry.getValue()); + } + return output; + } + + private native static Map getApproximateMemoryUsageByType(final long[] dbHandles, + final long[] cacheHandles); +} diff --git a/java/src/main/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java index cac4fc5a368..2ff4ec12040 100644 --- a/java/src/main/java/org/rocksdb/Options.java +++ b/java/src/main/java/org/rocksdb/Options.java @@ -70,6 +70,7 @@ public Options(Options other) { this.compactionOptionsFIFO_ = other.compactionOptionsFIFO_; this.compressionOptions_ = other.compressionOptions_; this.rowCache_ = other.rowCache_; + this.writeBufferManager_ = other.writeBufferManager_; } @Override @@ -724,6 +725,20 @@ public Options setDbWriteBufferSize(final long dbWriteBufferSize) { } @Override + public Options setWriteBufferManager(final WriteBufferManager writeBufferManager) { + assert(isOwningHandle()); + setWriteBufferManager(nativeHandle_, writeBufferManager.nativeHandle_); + this.writeBufferManager_ = writeBufferManager; + return this; + } + + @Override + public WriteBufferManager writeBufferManager() { + assert(isOwningHandle()); + return this.writeBufferManager_; + } + + @Override public long dbWriteBufferSize() { assert(isOwningHandle()); return dbWriteBufferSize(nativeHandle_); @@ -1690,6 +1705,8 @@ private native void setAdviseRandomOnOpen( private native boolean adviseRandomOnOpen(long handle); private native void setDbWriteBufferSize(final long handle, final long dbWriteBufferSize); + private native void setWriteBufferManager(final long handle, + final long writeBufferManagerHandle); private native long dbWriteBufferSize(final long handle); private native void setAccessHintOnCompactionStart(final long handle, final byte accessHintOnCompactionStart); @@ -1909,4 +1926,5 @@ private native void setForceConsistencyChecks(final long handle, private CompactionOptionsFIFO compactionOptionsFIFO_; private CompressionOptions compressionOptions_; private Cache rowCache_; + private WriteBufferManager writeBufferManager_; } diff --git a/java/src/main/java/org/rocksdb/ReadOptions.java b/java/src/main/java/org/rocksdb/ReadOptions.java index be8aec6b32c..f176d249b02 100644 --- a/java/src/main/java/org/rocksdb/ReadOptions.java +++ b/java/src/main/java/org/rocksdb/ReadOptions.java @@ -27,6 +27,7 @@ public ReadOptions() { public ReadOptions(ReadOptions other) { 
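// --- Illustrative usage sketch (not part of this patch) ---
// MemoryUtil.getApproximateMemoryUsageByType, added above, reports memory
// usage broken down by MemoryUsageType for a set of DBs and caches. A rough
// example of querying it; the path and cache size are arbitrary.
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import org.rocksdb.Cache;
import org.rocksdb.LRUCache;
import org.rocksdb.MemoryUsageType;
import org.rocksdb.MemoryUtil;
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;

public class MemoryUsageExample {
  public static void main(String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    try (final Cache cache = new LRUCache(64L * 1024L * 1024L);
         final Options options = new Options().setCreateIfMissing(true);
         final RocksDB db = RocksDB.open(options, "/tmp/memory-usage-example")) {
      final Map<MemoryUsageType, Long> usage =
          MemoryUtil.getApproximateMemoryUsageByType(
              Arrays.asList(db), Collections.singleton(cache));
      System.out.println("memtable total: " + usage.get(MemoryUsageType.kMemTableTotal));
      System.out.println("table readers:  " + usage.get(MemoryUsageType.kTableReadersTotal));
      System.out.println("cache total:    " + usage.get(MemoryUsageType.kCacheTotal));
    }
  }
}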
super(copyReadOptions(other.nativeHandle_)); iterateUpperBoundSlice_ = other.iterateUpperBoundSlice_; + iterateLowerBoundSlice_ = other.iterateLowerBoundSlice_; } /** @@ -423,15 +424,65 @@ public Slice iterateUpperBound() { return null; } + /** + * Defines the smallest key at which the backward iterator can return an + * entry. Once the bound is passed, Valid() will be false. + * `iterate_lower_bound` is inclusive ie the bound value is a valid entry. + * + * If prefix_extractor is not null, the Seek target and `iterate_lower_bound` + * need to have the same prefix. This is because ordering is not guaranteed + * outside of prefix domain. + * + * Default: nullptr + * + * @param iterateLowerBound Slice representing the lower bound + * @return the reference to the current ReadOptions. + */ + public ReadOptions setIterateLowerBound(final Slice iterateLowerBound) { + assert(isOwningHandle()); + if (iterateLowerBound != null) { + // Hold onto a reference so it doesn't get garbaged collected out from under us. + iterateLowerBoundSlice_ = iterateLowerBound; + setIterateLowerBound(nativeHandle_, iterateLowerBoundSlice_.getNativeHandle()); + } + return this; + } + + /** + * Defines the smallest key at which the backward iterator can return an + * entry. Once the bound is passed, Valid() will be false. + * `iterate_lower_bound` is inclusive ie the bound value is a valid entry. + * + * If prefix_extractor is not null, the Seek target and `iterate_lower_bound` + * need to have the same prefix. This is because ordering is not guaranteed + * outside of prefix domain. + * + * Default: nullptr + * + * @return Slice representing current iterate_lower_bound setting, or null if + * one does not exist. + */ + public Slice iterateLowerBound() { + assert(isOwningHandle()); + long lowerBoundSliceHandle = iterateLowerBound(nativeHandle_); + if (lowerBoundSliceHandle != 0) { + // Disown the new slice - it's owned by the C++ side of the JNI boundary + // from the perspective of this method. + return new Slice(lowerBoundSliceHandle, false); + } + return null; + } + // instance variables // NOTE: If you add new member variables, please update the copy constructor above! // - // Hold a reference to any iterate upper bound that was set on this object - // until we're destroyed or it's overwritten. That way the caller can freely + // Hold a reference to any iterate upper/lower bound that was set on this object + // until we're destroyed or it's overwritten. That way the caller can freely // leave scope without us losing the Java Slice object, which during close() // would also reap its associated rocksdb::Slice native object since it's // possibly (likely) to be an owning handle. 
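// --- Illustrative usage sketch (not part of this patch) ---
// setIterateLowerBound, added above, complements the existing
// setIterateUpperBound: a scan is limited to [lower, upper), with the lower
// bound inclusive and the upper bound exclusive. A rough range-scan sketch;
// the path and key names are arbitrary example values.
import org.rocksdb.Options;
import org.rocksdb.ReadOptions;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;
import org.rocksdb.RocksIterator;
import org.rocksdb.Slice;

public class BoundedScanExample {
  public static void main(String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    try (final Options options = new Options().setCreateIfMissing(true);
         final RocksDB db = RocksDB.open(options, "/tmp/bounded-scan-example");
         final Slice lower = new Slice("user:0100".getBytes());
         final Slice upper = new Slice("user:0200".getBytes());
         final ReadOptions readOptions = new ReadOptions()
             .setIterateLowerBound(lower)   // inclusive; also guards backward iteration
             .setIterateUpperBound(upper);  // exclusive
         final RocksIterator it = db.newIterator(readOptions)) {
      for (it.seek("user:0100".getBytes()); it.isValid(); it.next()) {
        System.out.println(new String(it.key()) + " = " + new String(it.value()));
      }
    }
  }
}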
protected Slice iterateUpperBoundSlice_; + protected Slice iterateLowerBoundSlice_; private native static long newReadOptions(); private native static long copyReadOptions(long handle); @@ -465,6 +516,9 @@ private native void setIgnoreRangeDeletions(final long handle, private native void setIterateUpperBound(final long handle, final long upperBoundSliceHandle); private native long iterateUpperBound(final long handle); + private native void setIterateLowerBound(final long handle, + final long upperBoundSliceHandle); + private native long iterateLowerBound(final long handle); @Override protected final native void disposeInternal(final long handle); diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java index 38be3333f45..7ac08fdf05b 100644 --- a/java/src/main/java/org/rocksdb/RocksDB.java +++ b/java/src/main/java/org/rocksdb/RocksDB.java @@ -439,6 +439,12 @@ protected void storeOptionsInstance(DBOptionsInterface options) { options_ = options; } + private static void checkBounds(int offset, int len, int size) { + if ((offset | len | (offset + len) | (size - (offset + len))) < 0) { + throw new IndexOutOfBoundsException(String.format("offset(%d), len(%d), size(%d)", offset, len, size)); + } + } + /** * Set the database entry for "key" to "value". * @@ -453,6 +459,28 @@ public void put(final byte[] key, final byte[] value) put(nativeHandle_, key, 0, key.length, value, 0, value.length); } + /** + * Set the database entry for "key" to "value" + * + * @param key The specified key to be inserted + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * must be non-negative and no larger than ("key".length - offset) + * @param value the value associated with the specified key + * @param vOffset the offset of the "value" array to be used, must be non-negative and + * no longer than "key".length + * @param vLen the length of the "value" array to be used, must be non-negative and + * must be non-negative and no larger than ("value".length - offset) + * + * @throws RocksDBException thrown if errors happens in underlying native library. + */ + public void put(final byte[] key, int offset, int len, final byte[] value, int vOffset, int vLen) throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + put(nativeHandle_, key, offset, len, value, vOffset, vLen); + } + /** * Set the database entry for "key" to "value" in the specified * column family. @@ -473,6 +501,32 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, columnFamilyHandle.nativeHandle_); } + /** + * Set the database entry for "key" to "value" in the specified + * column family. 
+ * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key The specified key to be inserted + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * must be non-negative and no larger than ("key".length - offset) + * @param value the value associated with the specified key + * @param vOffset the offset of the "value" array to be used, must be non-negative and + * no longer than "key".length + * @param vLen the length of the "value" array to be used, must be non-negative and + * must be non-negative and no larger than ("value".length - offset) + * + * @throws RocksDBException thrown if errors happens in underlying native library. + */ + public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, int offset, int len, final byte[] value, int vOffset, int vLen) throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + put(nativeHandle_, key, offset, len, value, vOffset, vLen, + columnFamilyHandle.nativeHandle_); + } + /** * Set the database entry for "key" to "value". * @@ -489,6 +543,32 @@ public void put(final WriteOptions writeOpts, final byte[] key, key, 0, key.length, value, 0, value.length); } + /** + * Set the database entry for "key" to "value". + * + * @param writeOpts {@link org.rocksdb.WriteOptions} instance. + * @param key The specified key to be inserted + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * must be non-negative and no larger than ("key".length - offset) + * @param value the value associated with the specified key + * @param vOffset the offset of the "value" array to be used, must be non-negative and + * no longer than "key".length + * @param vLen the length of the "value" array to be used, must be non-negative and + * must be non-negative and no larger than ("value".length - offset) + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void put(final WriteOptions writeOpts, byte[] key, int offset, int len, byte[] value, int vOffset, int vLen) throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + put(nativeHandle_, writeOpts.nativeHandle_, + key, offset, len, value, vOffset, vLen); + } + + /** * Set the database entry for "key" to "value" for the specified * column family. @@ -512,6 +592,36 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, 0, value.length, columnFamilyHandle.nativeHandle_); } + /** + * Set the database entry for "key" to "value" for the specified + * column family. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param writeOpts {@link org.rocksdb.WriteOptions} instance. 
+ * @param key The specified key to be inserted + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * must be non-negative and no larger than ("key".length - offset) + * @param value the value associated with the specified key + * @param vOffset the offset of the "value" array to be used, must be non-negative and + * no longer than "key".length + * @param vLen the length of the "value" array to be used, must be non-negative and + * must be non-negative and no larger than ("value".length - offset) + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void put(final ColumnFamilyHandle columnFamilyHandle, + final WriteOptions writeOpts, final byte[] key, int offset, int len, + final byte[] value, int vOffset, int vLen) throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + put(nativeHandle_, writeOpts.nativeHandle_, key, offset, len, value, + vOffset, vLen, columnFamilyHandle.nativeHandle_); + } + /** * If the key definitely does not exist in the database, then this method * returns false, else true. @@ -528,6 +638,27 @@ public boolean keyMayExist(final byte[] key, final StringBuilder value) { return keyMayExist(nativeHandle_, key, 0, key.length, value); } + /** + * If the key definitely does not exist in the database, then this method + * returns false, else true. + * + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. + * + * @param key byte array of a key to search for + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * @param value StringBuilder instance which is a out parameter if a value is + * found in block-cache. + * + * @return boolean value indicating if key does not exist or might exist. + */ + public boolean keyMayExist(final byte[] key, int offset, int len, final StringBuilder value) { + checkBounds(offset, len, key.length); + return keyMayExist(nativeHandle_, key, offset, len, value); + } + /** * If the key definitely does not exist in the database, then this method * returns false, else true. @@ -547,6 +678,30 @@ public boolean keyMayExist(final ColumnFamilyHandle columnFamilyHandle, columnFamilyHandle.nativeHandle_, value); } + /** + * If the key definitely does not exist in the database, then this method + * returns false, else true. + * + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key byte array of a key to search for + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * @param value StringBuilder instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. 
+ */ + public boolean keyMayExist(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key, int offset, int len, final StringBuilder value) { + checkBounds(offset, len, key.length); + return keyMayExist(nativeHandle_, key, offset, len, + columnFamilyHandle.nativeHandle_, value); + } + + /** * If the key definitely does not exist in the database, then this method * returns false, else true. @@ -566,6 +721,29 @@ public boolean keyMayExist(final ReadOptions readOptions, key, 0, key.length, value); } + /** + * If the key definitely does not exist in the database, then this method + * returns false, else true. + * + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. + * + * @param readOptions {@link ReadOptions} instance + * @param key byte array of a key to search for + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * @param value StringBuilder instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. + */ + public boolean keyMayExist(final ReadOptions readOptions, + final byte[] key, int offset, int len, final StringBuilder value) { + checkBounds(offset, len, key.length); + return keyMayExist(nativeHandle_, readOptions.nativeHandle_, + key, offset, len, value); + } + /** * If the key definitely does not exist in the database, then this method * returns false, else true. @@ -588,6 +766,32 @@ public boolean keyMayExist(final ReadOptions readOptions, value); } + /** + * If the key definitely does not exist in the database, then this method + * returns false, else true. + * + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. + * + * @param readOptions {@link ReadOptions} instance + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key byte array of a key to search for + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * @param value StringBuilder instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. + */ + public boolean keyMayExist(final ReadOptions readOptions, + final ColumnFamilyHandle columnFamilyHandle, final byte[] key, int offset, int len, + final StringBuilder value) { + checkBounds(offset, len, key.length); + return keyMayExist(nativeHandle_, readOptions.nativeHandle_, + key, offset, len, columnFamilyHandle.nativeHandle_, + value); + } + /** * Apply the specified updates to the database. * @@ -631,6 +835,30 @@ public void merge(final byte[] key, final byte[] value) merge(nativeHandle_, key, 0, key.length, value, 0, value.length); } + /** + * Add merge operand for key/value pair. + * + * @param key the specified key to be merged. + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * @param value the value to be merged with the current value for the specified key. 
+ * @param vOffset the offset of the "value" array to be used, must be non-negative and + * no longer than "key".length + * @param vLen the length of the "value" array to be used, must be non-negative and + * must be non-negative and no larger than ("value".length - offset) + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void merge(final byte[] key, int offset, int len, final byte[] value, int vOffset, int vLen) + throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + merge(nativeHandle_, key, offset, len, value, vOffset, vLen); + } + + /** * Add merge operand for key/value pair in a ColumnFamily. * @@ -648,6 +876,32 @@ public void merge(final ColumnFamilyHandle columnFamilyHandle, columnFamilyHandle.nativeHandle_); } + /** + * Add merge operand for key/value pair in a ColumnFamily. + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key the specified key to be merged. + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * @param value the value to be merged with the current value for + * the specified key. + * @param vOffset the offset of the "value" array to be used, must be non-negative and + * no longer than "key".length + * @param vLen the length of the "value" array to be used, must be non-negative and + * must be non-negative and no larger than ("value".length - offset) + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void merge(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key, int offset, int len, final byte[] value, int vOffset, int vLen) throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + merge(nativeHandle_, key, offset, len, value, vOffset, vLen, + columnFamilyHandle.nativeHandle_); + } + /** * Add merge operand for key/value pair. * @@ -665,6 +919,32 @@ public void merge(final WriteOptions writeOpts, final byte[] key, key, 0, key.length, value, 0, value.length); } + /** + * Add merge operand for key/value pair. + * + * @param writeOpts {@link WriteOptions} for this write. + * @param key the specified key to be merged. + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * @param value the value to be merged with the current value for + * the specified key. + * @param vOffset the offset of the "value" array to be used, must be non-negative and + * no longer than "key".length + * @param vLen the length of the "value" array to be used, must be non-negative and + * must be non-negative and no larger than ("value".length - offset) + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void merge(final WriteOptions writeOpts, final byte[] key, int offset, int len, + final byte[] value, int vOffset, int vLen) throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + merge(nativeHandle_, writeOpts.nativeHandle_, + key, offset, len, value, vOffset, vLen); + } + /** * Add merge operand for key/value pair. 
* @@ -685,13 +965,44 @@ public void merge(final ColumnFamilyHandle columnFamilyHandle, columnFamilyHandle.nativeHandle_); } + /** + * Add merge operand for key/value pair. + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param writeOpts {@link WriteOptions} for this write. + * @param key the specified key to be merged. + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * @param value the value to be merged with the current value for + * the specified key. + * @param vOffset the offset of the "value" array to be used, must be non-negative and + * no longer than "key".length + * @param vLen the length of the "value" array to be used, must be non-negative and + * must be non-negative and no larger than ("value".length - offset) + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void merge(final ColumnFamilyHandle columnFamilyHandle, + final WriteOptions writeOpts, final byte[] key, int offset, int len, + final byte[] value, int vOffset, int vLen) throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + merge(nativeHandle_, writeOpts.nativeHandle_, + key, offset, len, value, vOffset, vLen, + columnFamilyHandle.nativeHandle_); + } + // TODO(AR) we should improve the #get() API, returning -1 (RocksDB.NOT_FOUND) is not very nice // when we could communicate better status into, also the C++ code show that -2 could be returned /** * Get the value associated with the specified key within column family* + * * @param key the key to retrieve the value. * @param value the out-value to receive the retrieved value. + * * @return The size of the actual value that matches the specified * {@code key} in byte. If the return value is greater than the * length of {@code value}, then it indicates that the size of the @@ -706,6 +1017,35 @@ public int get(final byte[] key, final byte[] value) throws RocksDBException { return get(nativeHandle_, key, 0, key.length, value, 0, value.length); } + /** + * Get the value associated with the specified key within column family* + * + * @param key the key to retrieve the value. + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * @param value the out-value to receive the retrieved value. + * @param vOffset the offset of the "value" array to be used, must be non-negative and + * no longer than "key".length + * @param vLen the length of the "value" array to be used, must be non-negative and + * must be non-negative and no larger than ("value".length - offset) + * + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value not + * found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. 
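+ * <p>
+ * A minimal usage sketch (illustrative only; assumes an open {@code RocksDB} named {@code db}):
+ * <pre>
+ *   byte[] keyBuf = "prefix-key-suffix".getBytes();
+ *   byte[] valBuf = new byte[128];
+ *   int size = db.get(keyBuf, 7, 3, valBuf, 0, valBuf.length);  // looks up the key "key"
+ *   // size is RocksDB.NOT_FOUND if absent; if size exceeds valBuf.length,
+ *   // only a partial value was copied into valBuf
+ * </pre>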
+ */ + public int get(final byte[] key, int offset, int len, final byte[] value, int vOffset, int vLen) throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + return get(nativeHandle_, key, offset, len, value, vOffset, vLen); + } + /** * Get the value associated with the specified key within column family. * @@ -729,6 +1069,39 @@ public int get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, columnFamilyHandle.nativeHandle_); } + /** + * Get the value associated with the specified key within column family. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key to retrieve the value. + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * @param value the out-value to receive the retrieved value. + * @param vOffset the offset of the "value" array to be used, must be non-negative and + * no longer than "key".length + * @param vLen the length of the "value" array to be used, must be non-negative and + * must be non-negative and no larger than ("value".length - offset) + * + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value not + * found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public int get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, int offset, int len, + final byte[] value, int vOffset, int vLen) throws RocksDBException, IllegalArgumentException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + return get(nativeHandle_, key, offset, len, value, vOffset, vLen, + columnFamilyHandle.nativeHandle_); + } + /** * Get the value associated with the specified key. * @@ -750,6 +1123,38 @@ public int get(final ReadOptions opt, final byte[] key, return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length, value, 0, value.length); } + + /** + * Get the value associated with the specified key. + * + * @param opt {@link org.rocksdb.ReadOptions} instance. + * @param key the key to retrieve the value. + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * @param value the out-value to receive the retrieved value. + * @param vOffset the offset of the "value" array to be used, must be non-negative and + * no longer than "key".length + * @param vLen the length of the "value" array to be used, must be non-negative and + * must be non-negative and no larger than ("value".length - offset) + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value not + * found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. 
+ */ + public int get(final ReadOptions opt, final byte[] key, int offset, int len, + final byte[] value, int vOffset, int vLen) throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + return get(nativeHandle_, opt.nativeHandle_, + key, offset, len, value, vOffset, vLen); + } + /** * Get the value associated with the specified key within column family. * @@ -775,6 +1180,40 @@ public int get(final ColumnFamilyHandle columnFamilyHandle, 0, value.length, columnFamilyHandle.nativeHandle_); } + /** + * Get the value associated with the specified key within column family. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param opt {@link org.rocksdb.ReadOptions} instance. + * @param key the key to retrieve the value. + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * @param value the out-value to receive the retrieved value. + * @param vOffset the offset of the "value" array to be used, must be non-negative and + * no longer than "key".length + * @param vLen the length of the "value" array to be used, must be non-negative and + * must be non-negative and no larger than ("value".length - offset) + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value not + * found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public int get(final ColumnFamilyHandle columnFamilyHandle, + final ReadOptions opt, final byte[] key, int offset, int len, final byte[] value, int vOffset, int vLen) + throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + return get(nativeHandle_, opt.nativeHandle_, key, offset, len, value, + vOffset, vLen, columnFamilyHandle.nativeHandle_); + } + /** * The simplified version of get which returns a new byte array storing * the value associated with the specified input key if any. null will be @@ -791,6 +1230,26 @@ public byte[] get(final byte[] key) throws RocksDBException { return get(nativeHandle_, key, 0, key.length); } + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param key the key retrieve the value. + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public byte[] get(final byte[] key, int offset, int len) throws RocksDBException { + checkBounds(offset, len, key.length); + return get(nativeHandle_, key, offset, len); + } + /** * The simplified version of get which returns a new byte array storing * the value associated with the specified input key if any. 
null will be @@ -811,6 +1270,30 @@ public byte[] get(final ColumnFamilyHandle columnFamilyHandle, columnFamilyHandle.nativeHandle_); } + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key retrieve the value. + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public byte[] get(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key, int offset, int len) throws RocksDBException { + checkBounds(offset, len, key.length); + return get(nativeHandle_, key, offset, len, + columnFamilyHandle.nativeHandle_); + } + /** * The simplified version of get which returns a new byte array storing * the value associated with the specified input key if any. null will be @@ -829,6 +1312,28 @@ public byte[] get(final ReadOptions opt, final byte[] key) return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length); } + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param key the key retrieve the value. + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * @param opt Read options. + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public byte[] get(final ReadOptions opt, final byte[] key, int offset, int len) + throws RocksDBException { + checkBounds(offset, len, key.length); + return get(nativeHandle_, opt.nativeHandle_, key, offset, len); + } + /** * The simplified version of get which returns a new byte array storing * the value associated with the specified input key if any. null will be @@ -850,6 +1355,31 @@ public byte[] get(final ColumnFamilyHandle columnFamilyHandle, columnFamilyHandle.nativeHandle_); } + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key retrieve the value. + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * @param opt Read options. + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. 
+ */ + public byte[] get(final ColumnFamilyHandle columnFamilyHandle, + final ReadOptions opt, final byte[] key, int offset, int len) throws RocksDBException { + checkBounds(offset, len, key.length); + return get(nativeHandle_, opt.nativeHandle_, key, offset, len, + columnFamilyHandle.nativeHandle_); + } + /** * Returns a map of keys for which values were found in DB. * @@ -1073,6 +1603,23 @@ public void delete(final byte[] key) throws RocksDBException { delete(nativeHandle_, key, 0, key.length); } + /** + * Delete the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. + * + * @param key Key to delete within database + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void delete(final byte[] key, int offset, int len) throws RocksDBException { + delete(nativeHandle_, key, offset, len); + } + /** * Remove the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" @@ -1110,6 +1657,26 @@ public void delete(final ColumnFamilyHandle columnFamilyHandle, delete(nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_); } + /** + * Delete the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key Key to delete within database + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void delete(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key, int offset, int len) throws RocksDBException { + delete(nativeHandle_, key, offset, len, columnFamilyHandle.nativeHandle_); + } + /** * Remove the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" @@ -1145,6 +1712,25 @@ public void delete(final WriteOptions writeOpt, final byte[] key) delete(nativeHandle_, writeOpt.nativeHandle_, key, 0, key.length); } + /** + * Delete the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. + * + * @param writeOpt WriteOptions to be used with delete operation + * @param key Key to delete within database + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void delete(final WriteOptions writeOpt, final byte[] key, int offset, int len) + throws RocksDBException { + delete(nativeHandle_, writeOpt.nativeHandle_, key, offset, len); + } + /** * Remove the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. 
It is not an error if "key" @@ -1187,6 +1773,29 @@ public void delete(final ColumnFamilyHandle columnFamilyHandle, columnFamilyHandle.nativeHandle_); } + /** + * Delete the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param writeOpt WriteOptions to be used with delete operation + * @param key Key to delete within database + * @param offset the offset of the "key" array to be used, must be non-negative and + * no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative and + * no larger than ("key".length - offset) + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void delete(final ColumnFamilyHandle columnFamilyHandle, + final WriteOptions writeOpt, final byte[] key, int offset, int len) + throws RocksDBException { + delete(nativeHandle_, writeOpt.nativeHandle_, key, offset, len, + columnFamilyHandle.nativeHandle_); + } + /** * Remove the database entry for {@code key}. Requires that the key exists * and was not overwritten. It is not an error if the key did not exist
diff --git a/java/src/main/java/org/rocksdb/StatisticsCollector.java b/java/src/main/java/org/rocksdb/StatisticsCollector.java index 48cf8af88e6..fb3f57150f0 100644 --- a/java/src/main/java/org/rocksdb/StatisticsCollector.java +++ b/java/src/main/java/org/rocksdb/StatisticsCollector.java @@ -93,9 +93,9 @@ public void run() { statsCallback.histogramCallback(histogramType, histogramData); } } - - Thread.sleep(_statsCollectionInterval); } + + Thread.sleep(_statsCollectionInterval); } catch (final InterruptedException e) { Thread.currentThread().interrupt();
diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java index fdcf62ff8a5..08ed18fb3eb 100644 --- a/java/src/main/java/org/rocksdb/TickerType.java +++ b/java/src/main/java/org/rocksdb/TickerType.java @@ -304,9 +304,9 @@ public enum TickerType { RATE_LIMIT_DELAY_MILLIS((byte) 0x37), /** - * Number of iterators currently open. + * Number of iterators created. */ - NO_ITERATORS((byte) 0x38), + NO_ITERATOR_CREATED((byte) 0x38), /** * Number of MultiGet calls. @@ -475,7 +475,12 @@ public enum TickerType { */ NUMBER_MULTIGET_KEYS_FOUND((byte) 0x5E), - TICKER_ENUM_MAX((byte) 0x5F); + /** + * Number of iterators deleted. + */ + NO_ITERATOR_DELETED((byte) 0x5F), + + TICKER_ENUM_MAX((byte) 0x60); private final byte value;
diff --git a/java/src/main/java/org/rocksdb/UInt64AddOperator.java b/java/src/main/java/org/rocksdb/UInt64AddOperator.java new file mode 100644 index 00000000000..cce9b298d8a --- /dev/null +++ b/java/src/main/java/org/rocksdb/UInt64AddOperator.java @@ -0,0 +1,19 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * UInt64AddOperator is a merge operator that accumulates a long + * integer value.
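+ * <p>
+ * A minimal usage sketch (illustrative only; it assumes an {@code Options} instance created
+ * with {@code setMergeOperator(new UInt64AddOperator())}, an open {@code RocksDB} named
+ * {@code db}, and long values encoded as 8-byte arrays via {@code java.nio.ByteBuffer},
+ * as in the accompanying MergeTest):
+ * <pre>
+ *   db.put("counter".getBytes(), ByteBuffer.allocate(8).putLong(100).array());
+ *   db.merge("counter".getBytes(), ByteBuffer.allocate(8).putLong(1).array());
+ *   // db.get("counter".getBytes()) now holds the encoded value 101
+ * </pre>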
+ */ +public class UInt64AddOperator extends MergeOperator { + public UInt64AddOperator() { + super(newSharedUInt64AddOperator()); + } + + private native static long newSharedUInt64AddOperator(); + @Override protected final native void disposeInternal(final long handle); +} diff --git a/java/src/main/java/org/rocksdb/WriteBufferManager.java b/java/src/main/java/org/rocksdb/WriteBufferManager.java new file mode 100644 index 00000000000..a5f80644fb5 --- /dev/null +++ b/java/src/main/java/org/rocksdb/WriteBufferManager.java @@ -0,0 +1,30 @@ +package org.rocksdb; + +import org.rocksdb.Cache; + +/** + * Java wrapper over native write_buffer_manager class + */ +public class WriteBufferManager extends RocksObject { + static { + RocksDB.loadLibrary(); + } + + /** + * Construct a new instance of WriteBufferManager. + * + * Check + * https://github.com/facebook/rocksdb/wiki/Write-Buffer-Manager + * for more details on when to use it + * + * @param bufferSizeBytes buffer size(in bytes) to use for native write_buffer_manager + * @param cache cache whose memory should be bounded by this write buffer manager + */ + public WriteBufferManager(final long bufferSizeBytes, final Cache cache){ + super(newWriteBufferManager(bufferSizeBytes, cache.nativeHandle_)); + } + + private native static long newWriteBufferManager(final long bufferSizeBytes, final long cacheHandle); + @Override + protected native void disposeInternal(final long handle); +} diff --git a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java index 2b15b69f812..754cf11c039 100644 --- a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java +++ b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java @@ -95,6 +95,46 @@ public void cacheIndexAndFilterBlocks() { } + @Test + public void cacheIndexAndFilterBlocksWithHighPriority() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setCacheIndexAndFilterBlocksWithHighPriority(true); + assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocksWithHighPriority()). + isTrue(); + } + + @Test + public void pinL0FilterAndIndexBlocksInCache() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setPinL0FilterAndIndexBlocksInCache(true); + assertThat(blockBasedTableConfig.pinL0FilterAndIndexBlocksInCache()). + isTrue(); + } + + @Test + public void partitionFilters() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setPartitionFilters(true); + assertThat(blockBasedTableConfig.partitionFilters()). + isTrue(); + } + + @Test + public void metadataBlockSize() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setMetadataBlockSize(1024); + assertThat(blockBasedTableConfig.metadataBlockSize()). + isEqualTo(1024); + } + + @Test + public void pinTopLevelIndexAndFilter() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setPinTopLevelIndexAndFilter(false); + assertThat(blockBasedTableConfig.pinTopLevelIndexAndFilter()). 
+ isFalse(); + } + @Test public void hashIndexAllowCollision() { BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); diff --git a/java/src/test/java/org/rocksdb/CompactionOptionsFIFOTest.java b/java/src/test/java/org/rocksdb/CompactionOptionsFIFOTest.java index 370a28e8196..df4c98ec14c 100644 --- a/java/src/test/java/org/rocksdb/CompactionOptionsFIFOTest.java +++ b/java/src/test/java/org/rocksdb/CompactionOptionsFIFOTest.java @@ -18,9 +18,27 @@ public class CompactionOptionsFIFOTest { @Test public void maxTableFilesSize() { final long size = 500 * 1024 * 1026; - try(final CompactionOptionsFIFO opt = new CompactionOptionsFIFO()) { + try (final CompactionOptionsFIFO opt = new CompactionOptionsFIFO()) { opt.setMaxTableFilesSize(size); assertThat(opt.maxTableFilesSize()).isEqualTo(size); } } + + @Test + public void ttl() { + final long ttl = 7 * 24 * 60 * 60; // 7 days + try (final CompactionOptionsFIFO opt = new CompactionOptionsFIFO()) { + opt.setTtl(ttl); + assertThat(opt.ttl()).isEqualTo(ttl); + } + } + + @Test + public void allowCompaction() { + final boolean allowCompaction = true; + try (final CompactionOptionsFIFO opt = new CompactionOptionsFIFO()) { + opt.setAllowCompaction(allowCompaction); + assertThat(opt.allowCompaction()).isEqualTo(allowCompaction); + } + } } diff --git a/java/src/test/java/org/rocksdb/DBOptionsTest.java b/java/src/test/java/org/rocksdb/DBOptionsTest.java index 453639d5744..bad01c4354b 100644 --- a/java/src/test/java/org/rocksdb/DBOptionsTest.java +++ b/java/src/test/java/org/rocksdb/DBOptionsTest.java @@ -424,6 +424,26 @@ public void dbWriteBufferSize() { } } + @Test + public void setWriteBufferManager() throws RocksDBException { + try (final DBOptions opt = new DBOptions(); + final Cache cache = new LRUCache(1 * 1024 * 1024); + final WriteBufferManager writeBufferManager = new WriteBufferManager(2000l, cache)) { + opt.setWriteBufferManager(writeBufferManager); + assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager); + } + } + + @Test + public void setWriteBufferManagerWithZeroBufferSize() throws RocksDBException { + try (final DBOptions opt = new DBOptions(); + final Cache cache = new LRUCache(1 * 1024 * 1024); + final WriteBufferManager writeBufferManager = new WriteBufferManager(0l, cache)) { + opt.setWriteBufferManager(writeBufferManager); + assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager); + } + } + @Test public void accessHintOnCompactionStart() { try(final DBOptions opt = new DBOptions()) { diff --git a/java/src/test/java/org/rocksdb/KeyMayExistTest.java b/java/src/test/java/org/rocksdb/KeyMayExistTest.java index 8092270eb2d..577fe2eadfe 100644 --- a/java/src/test/java/org/rocksdb/KeyMayExistTest.java +++ b/java/src/test/java/org/rocksdb/KeyMayExistTest.java @@ -48,12 +48,33 @@ public void keyMayExist() throws RocksDBException { assertThat(exists).isTrue(); assertThat(retValue.toString()).isEqualTo("value"); + // Slice key + StringBuilder builder = new StringBuilder("prefix"); + int offset = builder.toString().length(); + builder.append("slice key 0"); + int len = builder.toString().length() - offset; + builder.append("suffix"); + + byte[] sliceKey = builder.toString().getBytes(); + byte[] sliceValue = "slice value 0".getBytes(); + db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length); + + retValue = new StringBuilder(); + exists = db.keyMayExist(sliceKey, offset, len, retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString().getBytes()).isEqualTo(sliceValue); + // Test 
without column family but with readOptions try (final ReadOptions readOptions = new ReadOptions()) { retValue = new StringBuilder(); exists = db.keyMayExist(readOptions, "key".getBytes(), retValue); assertThat(exists).isTrue(); assertThat(retValue.toString()).isEqualTo("value"); + + retValue = new StringBuilder(); + exists = db.keyMayExist(readOptions, sliceKey, offset, len, retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString().getBytes()).isEqualTo(sliceValue); } // Test with column family @@ -63,6 +84,13 @@ public void keyMayExist() throws RocksDBException { assertThat(exists).isTrue(); assertThat(retValue.toString()).isEqualTo("value"); + // Test slice sky with column family + retValue = new StringBuilder(); + exists = db.keyMayExist(columnFamilyHandleList.get(0), sliceKey, offset, len, + retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString().getBytes()).isEqualTo(sliceValue); + // Test with column family and readOptions try (final ReadOptions readOptions = new ReadOptions()) { retValue = new StringBuilder(); @@ -71,11 +99,23 @@ public void keyMayExist() throws RocksDBException { retValue); assertThat(exists).isTrue(); assertThat(retValue.toString()).isEqualTo("value"); + + // Test slice key with column family and read options + retValue = new StringBuilder(); + exists = db.keyMayExist(readOptions, + columnFamilyHandleList.get(0), sliceKey, offset, len, + retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString().getBytes()).isEqualTo(sliceValue); } // KeyMayExist in CF1 must return false assertThat(db.keyMayExist(columnFamilyHandleList.get(1), "key".getBytes(), retValue)).isFalse(); + + // slice key + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), + sliceKey, 1, 3, retValue)).isFalse(); } finally { for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { diff --git a/java/src/test/java/org/rocksdb/MemoryUtilTest.java b/java/src/test/java/org/rocksdb/MemoryUtilTest.java new file mode 100644 index 00000000000..73fcc87c32e --- /dev/null +++ b/java/src/test/java/org/rocksdb/MemoryUtilTest.java @@ -0,0 +1,143 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.nio.charset.StandardCharsets; +import java.util.*; + +import static org.assertj.core.api.Assertions.assertThat; + +public class MemoryUtilTest { + + private static final String MEMTABLE_SIZE = "rocksdb.size-all-mem-tables"; + private static final String UNFLUSHED_MEMTABLE_SIZE = "rocksdb.cur-size-all-mem-tables"; + private static final String TABLE_READERS = "rocksdb.estimate-table-readers-mem"; + + private final byte[] key = "some-key".getBytes(StandardCharsets.UTF_8); + private final byte[] value = "some-value".getBytes(StandardCharsets.UTF_8); + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule public TemporaryFolder dbFolder1 = new TemporaryFolder(); + @Rule public TemporaryFolder dbFolder2 = new TemporaryFolder(); + + /** + * Test MemoryUtil.getApproximateMemoryUsageByType before and after a put + get + */ + @Test + public void getApproximateMemoryUsageByType() throws RocksDBException { + try (final Cache cache = new LRUCache(8 * 1024 * 1024); + final Options options = + new Options() + .setCreateIfMissing(true) + .setTableFormatConfig(new BlockBasedTableConfig().setBlockCache(cache)); + final FlushOptions flushOptions = + new FlushOptions().setWaitForFlush(true); + final RocksDB db = + RocksDB.open(options, dbFolder1.getRoot().getAbsolutePath())) { + + List dbs = new ArrayList<>(1); + dbs.add(db); + Set caches = new HashSet<>(1); + caches.add(cache); + Map usage = MemoryUtil.getApproximateMemoryUsageByType(dbs, caches); + + assertThat(usage.get(MemoryUsageType.kMemTableTotal)).isEqualTo( + db.getAggregatedLongProperty(MEMTABLE_SIZE)); + assertThat(usage.get(MemoryUsageType.kMemTableUnFlushed)).isEqualTo( + db.getAggregatedLongProperty(UNFLUSHED_MEMTABLE_SIZE)); + assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isEqualTo( + db.getAggregatedLongProperty(TABLE_READERS)); + assertThat(usage.get(MemoryUsageType.kCacheTotal)).isEqualTo(0); + + db.put(key, value); + db.flush(flushOptions); + db.get(key); + + usage = MemoryUtil.getApproximateMemoryUsageByType(dbs, caches); + assertThat(usage.get(MemoryUsageType.kMemTableTotal)).isGreaterThan(0); + assertThat(usage.get(MemoryUsageType.kMemTableTotal)).isEqualTo( + db.getAggregatedLongProperty(MEMTABLE_SIZE)); + assertThat(usage.get(MemoryUsageType.kMemTableUnFlushed)).isGreaterThan(0); + assertThat(usage.get(MemoryUsageType.kMemTableUnFlushed)).isEqualTo( + db.getAggregatedLongProperty(UNFLUSHED_MEMTABLE_SIZE)); + assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isGreaterThan(0); + assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isEqualTo( + db.getAggregatedLongProperty(TABLE_READERS)); + assertThat(usage.get(MemoryUsageType.kCacheTotal)).isGreaterThan(0); + + } + } + + /** + * Test MemoryUtil.getApproximateMemoryUsageByType with null inputs + */ + @Test + public void getApproximateMemoryUsageByTypeNulls() throws RocksDBException { + Map usage = MemoryUtil.getApproximateMemoryUsageByType(null, null); + + assertThat(usage.get(MemoryUsageType.kMemTableTotal)).isEqualTo(null); + assertThat(usage.get(MemoryUsageType.kMemTableUnFlushed)).isEqualTo(null); + assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isEqualTo(null); + assertThat(usage.get(MemoryUsageType.kCacheTotal)).isEqualTo(null); + } + + /** + * Test MemoryUtil.getApproximateMemoryUsageByType with two DBs and two caches + */ + 
@Test + public void getApproximateMemoryUsageByTypeMultiple() throws RocksDBException { + try (final Cache cache1 = new LRUCache(1 * 1024 * 1024); + final Options options1 = + new Options() + .setCreateIfMissing(true) + .setTableFormatConfig(new BlockBasedTableConfig().setBlockCache(cache1)); + final RocksDB db1 = + RocksDB.open(options1, dbFolder1.getRoot().getAbsolutePath()); + final Cache cache2 = new LRUCache(1 * 1024 * 1024); + final Options options2 = + new Options() + .setCreateIfMissing(true) + .setTableFormatConfig(new BlockBasedTableConfig().setBlockCache(cache2)); + final RocksDB db2 = + RocksDB.open(options2, dbFolder2.getRoot().getAbsolutePath()); + final FlushOptions flushOptions = + new FlushOptions().setWaitForFlush(true); + + ) { + List dbs = new ArrayList<>(1); + dbs.add(db1); + dbs.add(db2); + Set caches = new HashSet<>(1); + caches.add(cache1); + caches.add(cache2); + + for (RocksDB db: dbs) { + db.put(key, value); + db.flush(flushOptions); + db.get(key); + } + + Map usage = MemoryUtil.getApproximateMemoryUsageByType(dbs, caches); + assertThat(usage.get(MemoryUsageType.kMemTableTotal)).isEqualTo( + db1.getAggregatedLongProperty(MEMTABLE_SIZE) + db2.getAggregatedLongProperty(MEMTABLE_SIZE)); + assertThat(usage.get(MemoryUsageType.kMemTableUnFlushed)).isEqualTo( + db1.getAggregatedLongProperty(UNFLUSHED_MEMTABLE_SIZE) + db2.getAggregatedLongProperty(UNFLUSHED_MEMTABLE_SIZE)); + assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isEqualTo( + db1.getAggregatedLongProperty(TABLE_READERS) + db2.getAggregatedLongProperty(TABLE_READERS)); + assertThat(usage.get(MemoryUsageType.kCacheTotal)).isGreaterThan(0); + + } + } + +} diff --git a/java/src/test/java/org/rocksdb/MergeTest.java b/java/src/test/java/org/rocksdb/MergeTest.java index 73b90869cf1..b2ec62635a1 100644 --- a/java/src/test/java/org/rocksdb/MergeTest.java +++ b/java/src/test/java/org/rocksdb/MergeTest.java @@ -5,6 +5,7 @@ package org.rocksdb; +import java.nio.ByteBuffer; import java.util.Arrays; import java.util.List; import java.util.ArrayList; @@ -44,6 +45,38 @@ public void stringOption() } } + private byte[] longToByteArray(long l) { + ByteBuffer buf = ByteBuffer.allocate(Long.BYTES); + buf.putLong(l); + return buf.array(); + } + + private long longFromByteArray(byte[] a) { + ByteBuffer buf = ByteBuffer.allocate(Long.BYTES); + buf.put(a); + buf.flip(); + return buf.getLong(); + } + + @Test + public void uint64AddOption() + throws InterruptedException, RocksDBException { + try (final Options opt = new Options() + .setCreateIfMissing(true) + .setMergeOperatorName("uint64add"); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + // writing (long)100 under key + db.put("key".getBytes(), longToByteArray(100)); + // merge (long)1 under key + db.merge("key".getBytes(), longToByteArray(1)); + + final byte[] value = db.get("key".getBytes()); + final long longValue = longFromByteArray(value); + assertThat(longValue).isEqualTo(101); + } + } + @Test public void cFStringOption() throws InterruptedException, RocksDBException { @@ -86,6 +119,48 @@ public void cFStringOption() } } + @Test + public void cFUInt64AddOption() + throws InterruptedException, RocksDBException { + + try (final ColumnFamilyOptions cfOpt1 = new ColumnFamilyOptions() + .setMergeOperatorName("uint64add"); + final ColumnFamilyOptions cfOpt2 = new ColumnFamilyOptions() + .setMergeOperatorName("uint64add") + ) { + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpt1), 
+ new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpt2) + ); + + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions opt = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + columnFamilyHandleList)) { + try { + // writing (long)100 under key + db.put(columnFamilyHandleList.get(1), + "cfkey".getBytes(), longToByteArray(100)); + // merge (long)1 under key + db.merge(columnFamilyHandleList.get(1), + "cfkey".getBytes(), longToByteArray(1)); + + byte[] value = db.get(columnFamilyHandleList.get(1), + "cfkey".getBytes()); + long longValue = longFromByteArray(value); + assertThat(longValue).isEqualTo(101); + } finally { + for (final ColumnFamilyHandle handle : columnFamilyHandleList) { + handle.close(); + } + } + } + } + } + @Test public void operatorOption() throws InterruptedException, RocksDBException { @@ -108,6 +183,28 @@ public void operatorOption() } } + @Test + public void uint64AddOperatorOption() + throws InterruptedException, RocksDBException { + try (final UInt64AddOperator uint64AddOperator = new UInt64AddOperator(); + final Options opt = new Options() + .setCreateIfMissing(true) + .setMergeOperator(uint64AddOperator); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + // Writing (long)100 under key + db.put("key".getBytes(), longToByteArray(100)); + + // Writing (long)1 under key + db.merge("key".getBytes(), longToByteArray(1)); + + final byte[] value = db.get("key".getBytes()); + final long longValue = longFromByteArray(value); + + assertThat(longValue).isEqualTo(101); + } + } + @Test public void cFOperatorOption() throws InterruptedException, RocksDBException { @@ -170,6 +267,68 @@ public void cFOperatorOption() } } + @Test + public void cFUInt64AddOperatorOption() + throws InterruptedException, RocksDBException { + try (final UInt64AddOperator uint64AddOperator = new UInt64AddOperator(); + final ColumnFamilyOptions cfOpt1 = new ColumnFamilyOptions() + .setMergeOperator(uint64AddOperator); + final ColumnFamilyOptions cfOpt2 = new ColumnFamilyOptions() + .setMergeOperator(uint64AddOperator) + ) { + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpt1), + new ColumnFamilyDescriptor("new_cf".getBytes(), cfOpt2) + ); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions opt = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + columnFamilyHandleList) + ) { + try { + // writing (long)100 under key + db.put(columnFamilyHandleList.get(1), + "cfkey".getBytes(), longToByteArray(100)); + // merge (long)1 under key + db.merge(columnFamilyHandleList.get(1), + "cfkey".getBytes(), longToByteArray(1)); + byte[] value = db.get(columnFamilyHandleList.get(1), + "cfkey".getBytes()); + long longValue = longFromByteArray(value); + + // Test also with createColumnFamily + try (final ColumnFamilyOptions cfHandleOpts = + new ColumnFamilyOptions() + .setMergeOperator(uint64AddOperator); + final ColumnFamilyHandle cfHandle = + db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf2".getBytes(), + cfHandleOpts)) + ) { + // writing (long)200 under cfkey2 + db.put(cfHandle, "cfkey2".getBytes(), longToByteArray(200)); + // merge (long)50 under cfkey2 + db.merge(cfHandle, new WriteOptions(), 
"cfkey2".getBytes(), + longToByteArray(50)); + value = db.get(cfHandle, "cfkey2".getBytes()); + long longValueTmpCf = longFromByteArray(value); + + assertThat(longValue).isEqualTo(101); + assertThat(longValueTmpCf).isEqualTo(250); + } + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.close(); + } + } + } + } + } + @Test public void operatorGcBehaviour() throws RocksDBException { @@ -182,7 +341,6 @@ public void operatorGcBehaviour() //no-op } - // test reuse try (final Options opt = new Options() .setMergeOperator(stringAppendOperator); @@ -213,6 +371,48 @@ public void operatorGcBehaviour() } } + @Test + public void uint64AddOperatorGcBehaviour() + throws RocksDBException { + try (final UInt64AddOperator uint64AddOperator = new UInt64AddOperator()) { + try (final Options opt = new Options() + .setCreateIfMissing(true) + .setMergeOperator(uint64AddOperator); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + //no-op + } + + // test reuse + try (final Options opt = new Options() + .setMergeOperator(uint64AddOperator); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + //no-op + } + + // test param init + try (final UInt64AddOperator uint64AddOperator2 = new UInt64AddOperator(); + final Options opt = new Options() + .setMergeOperator(uint64AddOperator2); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + //no-op + } + + // test replace one with another merge operator instance + try (final Options opt = new Options() + .setMergeOperator(uint64AddOperator); + final UInt64AddOperator newUInt64AddOperator = new UInt64AddOperator()) { + opt.setMergeOperator(newUInt64AddOperator); + try (final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + //no-op + } + } + } + } + @Test public void emptyStringInSetMergeOperatorByName() { try (final Options opt = new Options() diff --git a/java/src/test/java/org/rocksdb/OptionsTest.java b/java/src/test/java/org/rocksdb/OptionsTest.java index 7f7679d732c..2571c3e26fb 100644 --- a/java/src/test/java/org/rocksdb/OptionsTest.java +++ b/java/src/test/java/org/rocksdb/OptionsTest.java @@ -645,6 +645,26 @@ public void dbWriteBufferSize() { } } + @Test + public void setWriteBufferManager() throws RocksDBException { + try (final Options opt = new Options(); + final Cache cache = new LRUCache(1 * 1024 * 1024); + final WriteBufferManager writeBufferManager = new WriteBufferManager(2000l, cache)) { + opt.setWriteBufferManager(writeBufferManager); + assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager); + } + } + + @Test + public void setWriteBufferManagerWithZeroBufferSize() throws RocksDBException { + try (final Options opt = new Options(); + final Cache cache = new LRUCache(1 * 1024 * 1024); + final WriteBufferManager writeBufferManager = new WriteBufferManager(0l, cache)) { + opt.setWriteBufferManager(writeBufferManager); + assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager); + } + } + @Test public void accessHintOnCompactionStart() { try (final Options opt = new Options()) { diff --git a/java/src/test/java/org/rocksdb/ReadOptionsTest.java b/java/src/test/java/org/rocksdb/ReadOptionsTest.java index f7d799909d9..4e860ae4ccf 100644 --- a/java/src/test/java/org/rocksdb/ReadOptionsTest.java +++ b/java/src/test/java/org/rocksdb/ReadOptionsTest.java @@ -144,16 +144,34 @@ public void iterateUpperBoundNull() { } } + @Test + public void iterateLowerBound() { + try 
(final ReadOptions opt = new ReadOptions()) { + Slice lowerBound = buildRandomSlice(); + opt.setIterateLowerBound(lowerBound); + assertThat(Arrays.equals(lowerBound.data(), opt.iterateLowerBound().data())).isTrue(); + } + } + + @Test + public void iterateLowerBoundNull() { + try (final ReadOptions opt = new ReadOptions()) { + assertThat(opt.iterateLowerBound()).isNull(); + } + } + @Test public void copyConstructor() { try (final ReadOptions opt = new ReadOptions()) { opt.setVerifyChecksums(false); opt.setFillCache(false); opt.setIterateUpperBound(buildRandomSlice()); + opt.setIterateLowerBound(buildRandomSlice()); ReadOptions other = new ReadOptions(opt); assertThat(opt.verifyChecksums()).isEqualTo(other.verifyChecksums()); assertThat(opt.fillCache()).isEqualTo(other.fillCache()); assertThat(Arrays.equals(opt.iterateUpperBound().data(), other.iterateUpperBound().data())).isTrue(); + assertThat(Arrays.equals(opt.iterateLowerBound().data(), other.iterateLowerBound().data())).isTrue(); } } @@ -237,6 +255,22 @@ public void failIterateUpperBoundUninitialized() { } } + @Test + public void failSetIterateLowerBoundUninitialized() { + try (final ReadOptions readOptions = + setupUninitializedReadOptions(exception)) { + readOptions.setIterateLowerBound(null); + } + } + + @Test + public void failIterateLowerBoundUninitialized() { + try (final ReadOptions readOptions = + setupUninitializedReadOptions(exception)) { + readOptions.iterateLowerBound(); + } + } + private ReadOptions setupUninitializedReadOptions( ExpectedException exception) { final ReadOptions readOptions = new ReadOptions(); diff --git a/java/src/test/java/org/rocksdb/RocksDBTest.java b/java/src/test/java/org/rocksdb/RocksDBTest.java index 158b8d56a89..66ebc69db8a 100644 --- a/java/src/test/java/org/rocksdb/RocksDBTest.java +++ b/java/src/test/java/org/rocksdb/RocksDBTest.java @@ -4,6 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). 
package org.rocksdb; +import org.junit.Assert; import org.junit.Assume; import org.junit.ClassRule; import org.junit.Rule; @@ -11,6 +12,7 @@ import org.junit.rules.ExpectedException; import org.junit.rules.TemporaryFolder; +import java.nio.ByteBuffer; import java.util.*; import static org.assertj.core.api.Assertions.assertThat; @@ -70,6 +72,57 @@ public void put() throws RocksDBException { "value".getBytes()); assertThat(db.get("key2".getBytes())).isEqualTo( "12345678".getBytes()); + + + // put + Segment key3 = sliceSegment("key3"); + Segment key4 = sliceSegment("key4"); + Segment value0 = sliceSegment("value 0"); + Segment value1 = sliceSegment("value 1"); + db.put(key3.data, key3.offset, key3.len, value0.data, value0.offset, value0.len); + db.put(opt, key4.data, key4.offset, key4.len, value1.data, value1.offset, value1.len); + + // compare + Assert.assertTrue(value0.isSamePayload(db.get(key3.data, key3.offset, key3.len))); + Assert.assertTrue(value1.isSamePayload(db.get(key4.data, key4.offset, key4.len))); + } + } + + private static Segment sliceSegment(String key) { + ByteBuffer rawKey = ByteBuffer.allocate(key.length() + 4); + rawKey.put((byte)0); + rawKey.put((byte)0); + rawKey.put(key.getBytes()); + + return new Segment(rawKey.array(), 2, key.length()); + } + + private static class Segment { + final byte[] data; + final int offset; + final int len; + + public boolean isSamePayload(byte[] value) { + if (value == null) { + return false; + } + if (value.length != len) { + return false; + } + + for (int i = 0; i < value.length; i++) { + if (data[i + offset] != value[i]) { + return false; + } + } + + return true; + } + + public Segment(byte[] value, int offset, int len) { + this.data = value; + this.offset = offset; + this.len = len; } } @@ -242,6 +295,18 @@ public void merge() throws RocksDBException { db.merge(wOpt, "key2".getBytes(), "xxxx".getBytes()); assertThat(db.get("key2".getBytes())).isEqualTo( "xxxx".getBytes()); + + Segment key3 = sliceSegment("key3"); + Segment key4 = sliceSegment("key4"); + Segment value0 = sliceSegment("value 0"); + Segment value1 = sliceSegment("value 1"); + + db.merge(key3.data, key3.offset, key3.len, value0.data, value0.offset, value0.len); + db.merge(wOpt, key4.data, key4.offset, key4.len, value1.data, value1.offset, value1.len); + + // compare + Assert.assertTrue(value0.isSamePayload(db.get(key3.data, key3.offset, key3.len))); + Assert.assertTrue(value1.isSamePayload(db.get(key4.data, key4.offset, key4.len))); } } @@ -259,6 +324,18 @@ public void delete() throws RocksDBException { db.delete(wOpt, "key2".getBytes()); assertThat(db.get("key1".getBytes())).isNull(); assertThat(db.get("key2".getBytes())).isNull(); + + + Segment key3 = sliceSegment("key3"); + Segment key4 = sliceSegment("key4"); + db.put("key3".getBytes(), "key3 value".getBytes()); + db.put("key4".getBytes(), "key4 value".getBytes()); + + db.delete(key3.data, key3.offset, key3.len); + db.delete(wOpt, key4.data, key4.offset, key4.len); + + assertThat(db.get("key3".getBytes())).isNull(); + assertThat(db.get("key4".getBytes())).isNull(); } } diff --git a/memtable/alloc_tracker.cc b/memtable/alloc_tracker.cc index 9889cc4230c..a1fa4938c52 100644 --- a/memtable/alloc_tracker.cc +++ b/memtable/alloc_tracker.cc @@ -24,7 +24,8 @@ AllocTracker::~AllocTracker() { FreeMem(); } void AllocTracker::Allocate(size_t bytes) { assert(write_buffer_manager_ != nullptr); - if (write_buffer_manager_->enabled()) { + if (write_buffer_manager_->enabled() || + write_buffer_manager_->cost_to_cache()) { 
bytes_allocated_.fetch_add(bytes, std::memory_order_relaxed); write_buffer_manager_->ReserveMem(bytes); } @@ -32,7 +33,8 @@ void AllocTracker::Allocate(size_t bytes) { void AllocTracker::DoneAllocating() { if (write_buffer_manager_ != nullptr && !done_allocating_) { - if (write_buffer_manager_->enabled()) { + if (write_buffer_manager_->enabled() || + write_buffer_manager_->cost_to_cache()) { write_buffer_manager_->ScheduleFreeMem( bytes_allocated_.load(std::memory_order_relaxed)); } else { @@ -47,7 +49,8 @@ void AllocTracker::FreeMem() { DoneAllocating(); } if (write_buffer_manager_ != nullptr && !freed_) { - if (write_buffer_manager_->enabled()) { + if (write_buffer_manager_->enabled() || + write_buffer_manager_->cost_to_cache()) { write_buffer_manager_->FreeMem( bytes_allocated_.load(std::memory_order_relaxed)); } else { diff --git a/memtable/hash_skiplist_rep.cc b/memtable/hash_skiplist_rep.cc index 93082b1ec28..a5c46011e3f 100644 --- a/memtable/hash_skiplist_rep.cc +++ b/memtable/hash_skiplist_rep.cc @@ -168,7 +168,7 @@ class HashSkipListRep : public MemTableRep { Bucket* list_; Bucket::Iterator iter_; // here we track if we own list_. If we own it, we are also - // responsible for it's cleaning. This is a poor man's shared_ptr + // responsible for it's cleaning. This is a poor man's std::shared_ptr bool own_list_; std::unique_ptr arena_; std::string tmp_; // For passing to EncodeKey diff --git a/memtable/write_buffer_manager.cc b/memtable/write_buffer_manager.cc index 21b18c8f76e..7f2e664ab5e 100644 --- a/memtable/write_buffer_manager.cc +++ b/memtable/write_buffer_manager.cc @@ -79,7 +79,7 @@ WriteBufferManager::~WriteBufferManager() { void WriteBufferManager::ReserveMemWithCache(size_t mem) { #ifndef ROCKSDB_LITE assert(cache_rep_ != nullptr); - // Use a mutex to protect various data structures. Can be optimzied to a + // Use a mutex to protect various data structures. Can be optimized to a // lock-free solution if it ends up with a performance bottleneck. std::lock_guard lock(cache_rep_->cache_mutex_); @@ -102,14 +102,14 @@ void WriteBufferManager::ReserveMemWithCache(size_t mem) { void WriteBufferManager::FreeMemWithCache(size_t mem) { #ifndef ROCKSDB_LITE assert(cache_rep_ != nullptr); - // Use a mutex to protect various data structures. Can be optimzied to a + // Use a mutex to protect various data structures. Can be optimized to a // lock-free solution if it ends up with a performance bottleneck. std::lock_guard lock(cache_rep_->cache_mutex_); size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) - mem; memory_used_.store(new_mem_used, std::memory_order_relaxed); // Gradually shrink memory costed in the block cache if the actual // usage is less than 3/4 of what we reserve from the block cache. - // We do this becausse: + // We do this because: // 1. we don't pay the cost of the block cache immediately a memtable is // freed, as block cache insert is expensive; // 2. 
eventually, if we walk away from a temporary memtable size increase, diff --git a/monitoring/perf_context.cc b/monitoring/perf_context.cc index 9bba841f8f5..423443869be 100644 --- a/monitoring/perf_context.cc +++ b/monitoring/perf_context.cc @@ -15,7 +15,7 @@ PerfContext perf_context; #if defined(OS_SOLARIS) __thread PerfContext perf_context_; #else -__thread PerfContext perf_context; +thread_local PerfContext perf_context; #endif #endif @@ -31,6 +31,12 @@ PerfContext* get_perf_context() { #endif } +PerfContext::~PerfContext() { +#if !defined(NPERF_CONTEXT) && defined(ROCKSDB_SUPPORT_THREAD_LOCAL) && !defined(OS_SOLARIS) + ClearPerLevelPerfContext(); +#endif +} + void PerfContext::Reset() { #ifndef NPERF_CONTEXT user_key_comparison_count = 0; @@ -104,6 +110,11 @@ void PerfContext::Reset() { env_lock_file_nanos = 0; env_unlock_file_nanos = 0; env_new_logger_nanos = 0; + if (per_level_perf_context_enabled && level_to_perf_context) { + for (auto& kv : *level_to_perf_context) { + kv.second.Reset(); + } + } #endif } @@ -112,6 +123,25 @@ void PerfContext::Reset() { ss << #counter << " = " << counter << ", "; \ } +#define PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(counter) \ + if (per_level_perf_context_enabled && \ + level_to_perf_context) { \ + ss << #counter << " = "; \ + for (auto& kv : *level_to_perf_context) { \ + if (!exclude_zero_counters || (kv.second.counter > 0)) { \ + ss << kv.second.counter << "@level" << kv.first << ", "; \ + } \ + } \ + } + +void PerfContextByLevel::Reset() { +#ifndef NPERF_CONTEXT + bloom_filter_useful = 0; + bloom_filter_full_positive = 0; + bloom_filter_full_true_positive = 0; +#endif +} + std::string PerfContext::ToString(bool exclude_zero_counters) const { #ifdef NPERF_CONTEXT return ""; @@ -186,8 +216,30 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const { PERF_CONTEXT_OUTPUT(env_lock_file_nanos); PERF_CONTEXT_OUTPUT(env_unlock_file_nanos); PERF_CONTEXT_OUTPUT(env_new_logger_nanos); + PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_useful); + PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_full_positive); + PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_full_true_positive); return ss.str(); #endif } +void PerfContext::EnablePerLevelPerfContext() { + if (!level_to_perf_context) { + level_to_perf_context = new std::map(); + } + per_level_perf_context_enabled = true; +} + +void PerfContext::DisablePerLevelPerfContext(){ + per_level_perf_context_enabled = false; +} + +void PerfContext::ClearPerLevelPerfContext(){ + if (level_to_perf_context) { + delete level_to_perf_context; + level_to_perf_context = nullptr; + } + per_level_perf_context_enabled = false; +} + } diff --git a/monitoring/perf_context_imp.h b/monitoring/perf_context_imp.h index cfcded1c96b..d67654914e8 100644 --- a/monitoring/perf_context_imp.h +++ b/monitoring/perf_context_imp.h @@ -16,7 +16,7 @@ extern PerfContext perf_context; extern __thread PerfContext perf_context_; #define perf_context (*get_perf_context()) #else -extern __thread PerfContext perf_context; +extern thread_local PerfContext perf_context; #endif #endif @@ -59,6 +59,22 @@ extern __thread PerfContext perf_context; perf_context.metric += value; \ } +// Increase metric value +#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) \ + if (perf_level >= PerfLevel::kEnableCount && \ + perf_context.per_level_perf_context_enabled && \ + perf_context.level_to_perf_context) { \ + if ((*(perf_context.level_to_perf_context)).find(level) != \ + (*(perf_context.level_to_perf_context)).end()) { \ + 
(*(perf_context.level_to_perf_context))[level].metric += value; \ + } \ + else { \ + PerfContextByLevel empty_context; \ + (*(perf_context.level_to_perf_context))[level] = empty_context; \ + (*(perf_context.level_to_perf_context))[level].metric += value; \ + } \ + } \ + #endif } diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index 59ce3d9e0a8..cba427ae4b7 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -17,13 +17,214 @@ namespace rocksdb { +// The order of items listed in Tickers should be the same as +// the order listed in TickersNameMap +const std::vector> TickersNameMap = { + {BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"}, + {BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"}, + {BLOCK_CACHE_ADD, "rocksdb.block.cache.add"}, + {BLOCK_CACHE_ADD_FAILURES, "rocksdb.block.cache.add.failures"}, + {BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"}, + {BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"}, + {BLOCK_CACHE_INDEX_ADD, "rocksdb.block.cache.index.add"}, + {BLOCK_CACHE_INDEX_BYTES_INSERT, "rocksdb.block.cache.index.bytes.insert"}, + {BLOCK_CACHE_INDEX_BYTES_EVICT, "rocksdb.block.cache.index.bytes.evict"}, + {BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"}, + {BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"}, + {BLOCK_CACHE_FILTER_ADD, "rocksdb.block.cache.filter.add"}, + {BLOCK_CACHE_FILTER_BYTES_INSERT, + "rocksdb.block.cache.filter.bytes.insert"}, + {BLOCK_CACHE_FILTER_BYTES_EVICT, "rocksdb.block.cache.filter.bytes.evict"}, + {BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"}, + {BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"}, + {BLOCK_CACHE_DATA_ADD, "rocksdb.block.cache.data.add"}, + {BLOCK_CACHE_DATA_BYTES_INSERT, "rocksdb.block.cache.data.bytes.insert"}, + {BLOCK_CACHE_BYTES_READ, "rocksdb.block.cache.bytes.read"}, + {BLOCK_CACHE_BYTES_WRITE, "rocksdb.block.cache.bytes.write"}, + {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"}, + {BLOOM_FILTER_FULL_POSITIVE, "rocksdb.bloom.filter.full.positive"}, + {BLOOM_FILTER_FULL_TRUE_POSITIVE, + "rocksdb.bloom.filter.full.true.positive"}, + {PERSISTENT_CACHE_HIT, "rocksdb.persistent.cache.hit"}, + {PERSISTENT_CACHE_MISS, "rocksdb.persistent.cache.miss"}, + {SIM_BLOCK_CACHE_HIT, "rocksdb.sim.block.cache.hit"}, + {SIM_BLOCK_CACHE_MISS, "rocksdb.sim.block.cache.miss"}, + {MEMTABLE_HIT, "rocksdb.memtable.hit"}, + {MEMTABLE_MISS, "rocksdb.memtable.miss"}, + {GET_HIT_L0, "rocksdb.l0.hit"}, + {GET_HIT_L1, "rocksdb.l1.hit"}, + {GET_HIT_L2_AND_UP, "rocksdb.l2andup.hit"}, + {COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"}, + {COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"}, + {COMPACTION_KEY_DROP_RANGE_DEL, "rocksdb.compaction.key.drop.range_del"}, + {COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"}, + {COMPACTION_RANGE_DEL_DROP_OBSOLETE, + "rocksdb.compaction.range_del.drop.obsolete"}, + {COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, + "rocksdb.compaction.optimized.del.drop.obsolete"}, + {COMPACTION_CANCELLED, "rocksdb.compaction.cancelled"}, + {NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"}, + {NUMBER_KEYS_READ, "rocksdb.number.keys.read"}, + {NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"}, + {BYTES_WRITTEN, "rocksdb.bytes.written"}, + {BYTES_READ, "rocksdb.bytes.read"}, + {NUMBER_DB_SEEK, "rocksdb.number.db.seek"}, + {NUMBER_DB_NEXT, "rocksdb.number.db.next"}, + {NUMBER_DB_PREV, "rocksdb.number.db.prev"}, + {NUMBER_DB_SEEK_FOUND, "rocksdb.number.db.seek.found"}, + {NUMBER_DB_NEXT_FOUND, 
"rocksdb.number.db.next.found"}, + {NUMBER_DB_PREV_FOUND, "rocksdb.number.db.prev.found"}, + {ITER_BYTES_READ, "rocksdb.db.iter.bytes.read"}, + {NO_FILE_CLOSES, "rocksdb.no.file.closes"}, + {NO_FILE_OPENS, "rocksdb.no.file.opens"}, + {NO_FILE_ERRORS, "rocksdb.no.file.errors"}, + {STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"}, + {STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"}, + {STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"}, + {STALL_MICROS, "rocksdb.stall.micros"}, + {DB_MUTEX_WAIT_MICROS, "rocksdb.db.mutex.wait.micros"}, + {RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"}, + {NO_ITERATORS, "rocksdb.num.iterators"}, + {NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"}, + {NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"}, + {NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"}, + {NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered"}, + {NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"}, + {BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"}, + {BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"}, + {NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"}, + {GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"}, + {BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"}, + {BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit"}, + {BLOCK_CACHE_COMPRESSED_ADD, "rocksdb.block.cachecompressed.add"}, + {BLOCK_CACHE_COMPRESSED_ADD_FAILURES, + "rocksdb.block.cachecompressed.add.failures"}, + {WAL_FILE_SYNCED, "rocksdb.wal.synced"}, + {WAL_FILE_BYTES, "rocksdb.wal.bytes"}, + {WRITE_DONE_BY_SELF, "rocksdb.write.self"}, + {WRITE_DONE_BY_OTHER, "rocksdb.write.other"}, + {WRITE_TIMEDOUT, "rocksdb.write.timeout"}, + {WRITE_WITH_WAL, "rocksdb.write.wal"}, + {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"}, + {COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"}, + {FLUSH_WRITE_BYTES, "rocksdb.flush.write.bytes"}, + {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, + "rocksdb.number.direct.load.table.properties"}, + {NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"}, + {NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"}, + {NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"}, + {NUMBER_BLOCK_COMPRESSED, "rocksdb.number.block.compressed"}, + {NUMBER_BLOCK_DECOMPRESSED, "rocksdb.number.block.decompressed"}, + {NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"}, + {MERGE_OPERATION_TOTAL_TIME, "rocksdb.merge.operation.time.nanos"}, + {FILTER_OPERATION_TOTAL_TIME, "rocksdb.filter.operation.time.nanos"}, + {ROW_CACHE_HIT, "rocksdb.row.cache.hit"}, + {ROW_CACHE_MISS, "rocksdb.row.cache.miss"}, + {READ_AMP_ESTIMATE_USEFUL_BYTES, "rocksdb.read.amp.estimate.useful.bytes"}, + {READ_AMP_TOTAL_READ_BYTES, "rocksdb.read.amp.total.read.bytes"}, + {NUMBER_RATE_LIMITER_DRAINS, "rocksdb.number.rate_limiter.drains"}, + {NUMBER_ITER_SKIP, "rocksdb.number.iter.skip"}, + {BLOB_DB_NUM_PUT, "rocksdb.blobdb.num.put"}, + {BLOB_DB_NUM_WRITE, "rocksdb.blobdb.num.write"}, + {BLOB_DB_NUM_GET, "rocksdb.blobdb.num.get"}, + {BLOB_DB_NUM_MULTIGET, "rocksdb.blobdb.num.multiget"}, + {BLOB_DB_NUM_SEEK, "rocksdb.blobdb.num.seek"}, + {BLOB_DB_NUM_NEXT, "rocksdb.blobdb.num.next"}, + {BLOB_DB_NUM_PREV, "rocksdb.blobdb.num.prev"}, + {BLOB_DB_NUM_KEYS_WRITTEN, "rocksdb.blobdb.num.keys.written"}, + {BLOB_DB_NUM_KEYS_READ, "rocksdb.blobdb.num.keys.read"}, + {BLOB_DB_BYTES_WRITTEN, "rocksdb.blobdb.bytes.written"}, 
+ {BLOB_DB_BYTES_READ, "rocksdb.blobdb.bytes.read"}, + {BLOB_DB_WRITE_INLINED, "rocksdb.blobdb.write.inlined"}, + {BLOB_DB_WRITE_INLINED_TTL, "rocksdb.blobdb.write.inlined.ttl"}, + {BLOB_DB_WRITE_BLOB, "rocksdb.blobdb.write.blob"}, + {BLOB_DB_WRITE_BLOB_TTL, "rocksdb.blobdb.write.blob.ttl"}, + {BLOB_DB_BLOB_FILE_BYTES_WRITTEN, "rocksdb.blobdb.blob.file.bytes.written"}, + {BLOB_DB_BLOB_FILE_BYTES_READ, "rocksdb.blobdb.blob.file.bytes.read"}, + {BLOB_DB_BLOB_FILE_SYNCED, "rocksdb.blobdb.blob.file.synced"}, + {BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, + "rocksdb.blobdb.blob.index.expired.count"}, + {BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, "rocksdb.blobdb.blob.index.expired.size"}, + {BLOB_DB_BLOB_INDEX_EVICTED_COUNT, + "rocksdb.blobdb.blob.index.evicted.count"}, + {BLOB_DB_BLOB_INDEX_EVICTED_SIZE, "rocksdb.blobdb.blob.index.evicted.size"}, + {BLOB_DB_GC_NUM_FILES, "rocksdb.blobdb.gc.num.files"}, + {BLOB_DB_GC_NUM_NEW_FILES, "rocksdb.blobdb.gc.num.new.files"}, + {BLOB_DB_GC_FAILURES, "rocksdb.blobdb.gc.failures"}, + {BLOB_DB_GC_NUM_KEYS_OVERWRITTEN, "rocksdb.blobdb.gc.num.keys.overwritten"}, + {BLOB_DB_GC_NUM_KEYS_EXPIRED, "rocksdb.blobdb.gc.num.keys.expired"}, + {BLOB_DB_GC_NUM_KEYS_RELOCATED, "rocksdb.blobdb.gc.num.keys.relocated"}, + {BLOB_DB_GC_BYTES_OVERWRITTEN, "rocksdb.blobdb.gc.bytes.overwritten"}, + {BLOB_DB_GC_BYTES_EXPIRED, "rocksdb.blobdb.gc.bytes.expired"}, + {BLOB_DB_GC_BYTES_RELOCATED, "rocksdb.blobdb.gc.bytes.relocated"}, + {BLOB_DB_FIFO_NUM_FILES_EVICTED, "rocksdb.blobdb.fifo.num.files.evicted"}, + {BLOB_DB_FIFO_NUM_KEYS_EVICTED, "rocksdb.blobdb.fifo.num.keys.evicted"}, + {BLOB_DB_FIFO_BYTES_EVICTED, "rocksdb.blobdb.fifo.bytes.evicted"}, + {TXN_PREPARE_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.prepare"}, + {TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD, + "rocksdb.txn.overhead.mutex.old.commit.map"}, + {TXN_DUPLICATE_KEY_OVERHEAD, "rocksdb.txn.overhead.duplicate.key"}, + {TXN_SNAPSHOT_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.snapshot"}, + {NUMBER_MULTIGET_KEYS_FOUND, "rocksdb.number.multiget.keys.found"}, + {NO_ITERATOR_CREATED, "rocksdb.num.iterator.created"}, + {NO_ITERATOR_DELETED, "rocksdb.num.iterator.deleted"}, +}; + +const std::vector> HistogramsNameMap = { + {DB_GET, "rocksdb.db.get.micros"}, + {DB_WRITE, "rocksdb.db.write.micros"}, + {COMPACTION_TIME, "rocksdb.compaction.times.micros"}, + {SUBCOMPACTION_SETUP_TIME, "rocksdb.subcompaction.setup.times.micros"}, + {TABLE_SYNC_MICROS, "rocksdb.table.sync.micros"}, + {COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros"}, + {WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros"}, + {MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros"}, + {TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros"}, + {DB_MULTIGET, "rocksdb.db.multiget.micros"}, + {READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros"}, + {READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros"}, + {WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros"}, + {STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"}, + {STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"}, + {STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"}, + {HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"}, + {SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"}, + {NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction"}, + {DB_SEEK, "rocksdb.db.seek.micros"}, + {WRITE_STALL, "rocksdb.db.write.stall"}, + {SST_READ_MICROS, "rocksdb.sst.read.micros"}, + {NUM_SUBCOMPACTIONS_SCHEDULED, 
"rocksdb.num.subcompactions.scheduled"}, + {BYTES_PER_READ, "rocksdb.bytes.per.read"}, + {BYTES_PER_WRITE, "rocksdb.bytes.per.write"}, + {BYTES_PER_MULTIGET, "rocksdb.bytes.per.multiget"}, + {BYTES_COMPRESSED, "rocksdb.bytes.compressed"}, + {BYTES_DECOMPRESSED, "rocksdb.bytes.decompressed"}, + {COMPRESSION_TIMES_NANOS, "rocksdb.compression.times.nanos"}, + {DECOMPRESSION_TIMES_NANOS, "rocksdb.decompression.times.nanos"}, + {READ_NUM_MERGE_OPERANDS, "rocksdb.read.num.merge_operands"}, + {BLOB_DB_KEY_SIZE, "rocksdb.blobdb.key.size"}, + {BLOB_DB_VALUE_SIZE, "rocksdb.blobdb.value.size"}, + {BLOB_DB_WRITE_MICROS, "rocksdb.blobdb.write.micros"}, + {BLOB_DB_GET_MICROS, "rocksdb.blobdb.get.micros"}, + {BLOB_DB_MULTIGET_MICROS, "rocksdb.blobdb.multiget.micros"}, + {BLOB_DB_SEEK_MICROS, "rocksdb.blobdb.seek.micros"}, + {BLOB_DB_NEXT_MICROS, "rocksdb.blobdb.next.micros"}, + {BLOB_DB_PREV_MICROS, "rocksdb.blobdb.prev.micros"}, + {BLOB_DB_BLOB_FILE_WRITE_MICROS, "rocksdb.blobdb.blob.file.write.micros"}, + {BLOB_DB_BLOB_FILE_READ_MICROS, "rocksdb.blobdb.blob.file.read.micros"}, + {BLOB_DB_BLOB_FILE_SYNC_MICROS, "rocksdb.blobdb.blob.file.sync.micros"}, + {BLOB_DB_GC_MICROS, "rocksdb.blobdb.gc.micros"}, + {BLOB_DB_COMPRESSION_MICROS, "rocksdb.blobdb.compression.micros"}, + {BLOB_DB_DECOMPRESSION_MICROS, "rocksdb.blobdb.decompression.micros"}, + {FLUSH_TIME, "rocksdb.db.flush.micros"}, +}; + std::shared_ptr CreateDBStatistics() { - return std::make_shared(nullptr, false); + return std::make_shared(nullptr); } -StatisticsImpl::StatisticsImpl(std::shared_ptr stats, - bool enable_internal_stats) - : stats_(std::move(stats)), enable_internal_stats_(enable_internal_stats) {} +StatisticsImpl::StatisticsImpl(std::shared_ptr stats) + : stats_(std::move(stats)) {} StatisticsImpl::~StatisticsImpl() {} @@ -33,10 +234,7 @@ uint64_t StatisticsImpl::getTickerCount(uint32_t tickerType) const { } uint64_t StatisticsImpl::getTickerCountLocked(uint32_t tickerType) const { - assert( - enable_internal_stats_ ? - tickerType < INTERNAL_TICKER_ENUM_MAX : - tickerType < TICKER_ENUM_MAX); + assert(tickerType < TICKER_ENUM_MAX); uint64_t res = 0; for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { res += per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType]; @@ -52,10 +250,7 @@ void StatisticsImpl::histogramData(uint32_t histogramType, std::unique_ptr StatisticsImpl::getHistogramImplLocked( uint32_t histogramType) const { - assert( - enable_internal_stats_ ? - histogramType < INTERNAL_HISTOGRAM_ENUM_MAX : - histogramType < HISTOGRAM_ENUM_MAX); + assert(histogramType < HISTOGRAM_ENUM_MAX); std::unique_ptr res_hist(new HistogramImpl()); for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { res_hist->Merge( @@ -80,8 +275,7 @@ void StatisticsImpl::setTickerCount(uint32_t tickerType, uint64_t count) { } void StatisticsImpl::setTickerCountLocked(uint32_t tickerType, uint64_t count) { - assert(enable_internal_stats_ ? tickerType < INTERNAL_TICKER_ENUM_MAX - : tickerType < TICKER_ENUM_MAX); + assert(tickerType < TICKER_ENUM_MAX); for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { if (core_idx == 0) { per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType] = count; @@ -95,8 +289,7 @@ uint64_t StatisticsImpl::getAndResetTickerCount(uint32_t tickerType) { uint64_t sum = 0; { MutexLock lock(&aggregate_lock_); - assert(enable_internal_stats_ ? 
tickerType < INTERNAL_TICKER_ENUM_MAX - : tickerType < TICKER_ENUM_MAX); + assert(tickerType < TICKER_ENUM_MAX); for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { sum += per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType].exchange( @@ -110,10 +303,7 @@ uint64_t StatisticsImpl::getAndResetTickerCount(uint32_t tickerType) { } void StatisticsImpl::recordTick(uint32_t tickerType, uint64_t count) { - assert( - enable_internal_stats_ ? - tickerType < INTERNAL_TICKER_ENUM_MAX : - tickerType < TICKER_ENUM_MAX); + assert(tickerType < TICKER_ENUM_MAX); per_core_stats_.Access()->tickers_[tickerType].fetch_add( count, std::memory_order_relaxed); if (stats_ && tickerType < TICKER_ENUM_MAX) { @@ -122,10 +312,7 @@ void StatisticsImpl::recordTick(uint32_t tickerType, uint64_t count) { } void StatisticsImpl::measureTime(uint32_t histogramType, uint64_t value) { - assert( - enable_internal_stats_ ? - histogramType < INTERNAL_HISTOGRAM_ENUM_MAX : - histogramType < HISTOGRAM_ENUM_MAX); + assert(histogramType < HISTOGRAM_ENUM_MAX); per_core_stats_.Access()->histograms_[histogramType].Add(value); if (stats_ && histogramType < HISTOGRAM_ENUM_MAX) { stats_->measureTime(histogramType, value); @@ -157,41 +344,36 @@ std::string StatisticsImpl::ToString() const { std::string res; res.reserve(20000); for (const auto& t : TickersNameMap) { - if (t.first < TICKER_ENUM_MAX || enable_internal_stats_) { - char buffer[kTmpStrBufferSize]; - snprintf(buffer, kTmpStrBufferSize, "%s COUNT : %" PRIu64 "\n", - t.second.c_str(), getTickerCountLocked(t.first)); - res.append(buffer); - } + assert(t.first < TICKER_ENUM_MAX); + char buffer[kTmpStrBufferSize]; + snprintf(buffer, kTmpStrBufferSize, "%s COUNT : %" PRIu64 "\n", + t.second.c_str(), getTickerCountLocked(t.first)); + res.append(buffer); } for (const auto& h : HistogramsNameMap) { - if (h.first < HISTOGRAM_ENUM_MAX || enable_internal_stats_) { - char buffer[kTmpStrBufferSize]; - HistogramData hData; - getHistogramImplLocked(h.first)->Data(&hData); - // don't handle failures - buffer should always be big enough and arguments - // should be provided correctly - int ret = snprintf( - buffer, kTmpStrBufferSize, - "%s P50 : %f P95 : %f P99 : %f P100 : %f COUNT : %" PRIu64 " SUM : %" - PRIu64 "\n", h.second.c_str(), hData.median, hData.percentile95, - hData.percentile99, hData.max, hData.count, hData.sum); - if (ret < 0 || ret >= kTmpStrBufferSize) { - assert(false); - continue; - } - res.append(buffer); + assert(h.first < HISTOGRAM_ENUM_MAX); + char buffer[kTmpStrBufferSize]; + HistogramData hData; + getHistogramImplLocked(h.first)->Data(&hData); + // don't handle failures - buffer should always be big enough and arguments + // should be provided correctly + int ret = snprintf( + buffer, kTmpStrBufferSize, + "%s P50 : %f P95 : %f P99 : %f P100 : %f COUNT : %" PRIu64 " SUM : %" + PRIu64 "\n", h.second.c_str(), hData.median, hData.percentile95, + hData.percentile99, hData.max, hData.count, hData.sum); + if (ret < 0 || ret >= kTmpStrBufferSize) { + assert(false); + continue; } + res.append(buffer); } res.shrink_to_fit(); return res; } bool StatisticsImpl::HistEnabledForType(uint32_t type) const { - if (LIKELY(!enable_internal_stats_)) { - return type < HISTOGRAM_ENUM_MAX; - } - return true; + return type < HISTOGRAM_ENUM_MAX; } } // namespace rocksdb diff --git a/monitoring/statistics.h b/monitoring/statistics.h index 4427c8c5465..dcd5f7a010c 100644 --- a/monitoring/statistics.h +++ b/monitoring/statistics.h @@ -41,8 +41,7 @@ enum HistogramsInternal 
: uint32_t { class StatisticsImpl : public Statistics { public: - StatisticsImpl(std::shared_ptr stats, - bool enable_internal_stats); + StatisticsImpl(std::shared_ptr stats); virtual ~StatisticsImpl(); virtual uint64_t getTickerCount(uint32_t ticker_type) const override; @@ -62,8 +61,6 @@ class StatisticsImpl : public Statistics { private: // If non-nullptr, forwards updates to the object pointed to by `stats_`. std::shared_ptr stats_; - // TODO(ajkr): clean this up since there are no internal stats anymore - bool enable_internal_stats_; // Synchronizes anything that operates across other cores' local data, // such that operations like Reset() can be performed atomically. mutable port::Mutex aggregate_lock_; diff --git a/monitoring/statistics_test.cc b/monitoring/statistics_test.cc index 43aacde9c1b..a77022bfb3d 100644 --- a/monitoring/statistics_test.cc +++ b/monitoring/statistics_test.cc @@ -16,7 +16,7 @@ class StatisticsTest : public testing::Test {}; // Sanity check to make sure that contents and order of TickersNameMap // match Tickers enum -TEST_F(StatisticsTest, Sanity) { +TEST_F(StatisticsTest, SanityTickers) { EXPECT_EQ(static_cast(Tickers::TICKER_ENUM_MAX), TickersNameMap.size()); @@ -26,6 +26,18 @@ TEST_F(StatisticsTest, Sanity) { } } +// Sanity check to make sure that contents and order of HistogramsNameMap +// match Tickers enum +TEST_F(StatisticsTest, SanityHistograms) { + EXPECT_EQ(static_cast(Histograms::HISTOGRAM_ENUM_MAX), + HistogramsNameMap.size()); + + for (uint32_t h = 0; h < Histograms::HISTOGRAM_ENUM_MAX; h++) { + auto pair = HistogramsNameMap[static_cast(h)]; + ASSERT_EQ(pair.first, h) << "Miss match at " << pair.second; + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/options/cf_options.h b/options/cf_options.h index 1658bf427a3..69b0b0105af 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -18,7 +18,7 @@ namespace rocksdb { // ImmutableCFOptions is a data struct used by RocksDB internal. It contains a // subset of Options that should not be changed during the entire lifetime // of DB. Raw pointers defined in this struct do not have ownership to the data -// they point to. Options contains shared_ptr to these data. +// they point to. Options contains std::shared_ptr to these data. 
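As an aside on the ownership convention the comment above spells out (Options owns its components through std::shared_ptr, while ImmutableCFOptions only borrows them), here is a small hypothetical sketch of that split; the type names are invented for the example and are not RocksDB APIs.

```cpp
// Hypothetical illustration of the borrow pattern described above: the
// long-lived options object owns components via std::shared_ptr, and the
// immutable snapshot keeps non-owning raw pointers into the same objects.
#include <memory>

struct ComparatorLike {};

struct OwningOptions {
  std::shared_ptr<const ComparatorLike> comparator =
      std::make_shared<const ComparatorLike>();
};

struct ImmutableView {
  explicit ImmutableView(const OwningOptions& opts)
      : comparator(opts.comparator.get()) {}
  // Non-owning: only valid while the OwningOptions instance is alive.
  const ComparatorLike* comparator;
};
```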
struct ImmutableCFOptions { ImmutableCFOptions(); explicit ImmutableCFOptions(const Options& options); diff --git a/options/db_options.cc b/options/db_options.cc index fd3cdcccd66..4e8134511ba 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -85,7 +85,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) allow_ingest_behind(options.allow_ingest_behind), preserve_deletes(options.preserve_deletes), two_write_queues(options.two_write_queues), - manual_wal_flush(options.manual_wal_flush) { + manual_wal_flush(options.manual_wal_flush), + atomic_flush(options.atomic_flush) { } void ImmutableDBOptions::Dump(Logger* log) const { diff --git a/options/db_options.h b/options/db_options.h index 107d35c8770..2cd83b55d43 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -78,6 +78,7 @@ struct ImmutableDBOptions { bool preserve_deletes; bool two_write_queues; bool manual_wal_flush; + bool atomic_flush; }; struct MutableDBOptions { diff --git a/options/options_helper.cc b/options/options_helper.cc index f4c59ff06e7..27a2252a02e 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -126,6 +126,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, immutable_db_options.preserve_deletes; options.two_write_queues = immutable_db_options.two_write_queues; options.manual_wal_flush = immutable_db_options.manual_wal_flush; + options.atomic_flush = immutable_db_options.atomic_flush; return options; } @@ -215,7 +216,8 @@ std::map std::unordered_map OptionsHelper::checksum_type_string_map = {{"kNoChecksum", kNoChecksum}, {"kCRC32c", kCRC32c}, - {"kxxHash", kxxHash}}; + {"kxxHash", kxxHash}, + {"kxxHash64", kxxHash64}}; std::unordered_map OptionsHelper::compression_type_string_map = { @@ -1554,7 +1556,11 @@ std::unordered_map offsetof(struct ImmutableDBOptions, manual_wal_flush)}}, {"seq_per_batch", {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, - 0}}}; + 0}}, + {"atomic_flush", + {offsetof(struct DBOptions, atomic_flush), OptionType::kBoolean, + OptionVerificationType::kNormal, false, + offsetof(struct ImmutableDBOptions, atomic_flush)}}}; std::unordered_map OptionsHelper::block_base_table_index_type_string_map = { diff --git a/options/options_parser.cc b/options/options_parser.cc index f9144b67d77..32cfb8d5316 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -48,7 +48,7 @@ Status PersistRocksDBOptions(const DBOptions& db_opt, if (!s.ok()) { return s; } - unique_ptr writable; + std::unique_ptr writable; writable.reset(new WritableFileWriter(std::move(wf), file_name, EnvOptions(), nullptr /* statistics */)); diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index ded152ba99e..cad1af3d769 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -291,7 +291,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "concurrent_prepare=false;" "two_write_queues=false;" "manual_wal_flush=false;" - "seq_per_batch=false;", + "seq_per_batch=false;" + "atomic_flush=false", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions), diff --git a/port/jemalloc_helper.h b/port/jemalloc_helper.h new file mode 100644 index 00000000000..412a80d26a4 --- /dev/null +++ b/port/jemalloc_helper.h @@ -0,0 +1,49 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifdef ROCKSDB_JEMALLOC +#ifdef __FreeBSD__ +#include +#else +#include +#endif + +// Declare non-standard jemalloc APIs as weak symbols. We can null-check these +// symbols to detect whether jemalloc is linked with the binary. +extern "C" void* mallocx(size_t, int) __attribute__((__weak__)); +extern "C" void* rallocx(void*, size_t, int) __attribute__((__weak__)); +extern "C" size_t xallocx(void*, size_t, size_t, int) __attribute__((__weak__)); +extern "C" size_t sallocx(const void*, int) __attribute__((__weak__)); +extern "C" void dallocx(void*, int) __attribute__((__weak__)); +extern "C" void sdallocx(void*, size_t, int) __attribute__((__weak__)); +extern "C" size_t nallocx(size_t, int) __attribute__((__weak__)); +extern "C" int mallctl(const char*, void*, size_t*, void*, size_t) + __attribute__((__weak__)); +extern "C" int mallctlnametomib(const char*, size_t*, size_t*) + __attribute__((__weak__)); +extern "C" int mallctlbymib(const size_t*, size_t, void*, size_t*, void*, + size_t) __attribute__((__weak__)); +extern "C" void malloc_stats_print(void (*)(void*, const char*), void*, + const char*) __attribute__((__weak__)); +extern "C" size_t malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void*) + JEMALLOC_CXX_THROW __attribute__((__weak__)); + +// Check if Jemalloc is linked with the binary. Note the main program might be +// using a different memory allocator even this method return true. +// It is loosely based on folly::usingJEMalloc(), minus the check that actually +// allocate memory and see if it is through jemalloc, to handle the dlopen() +// case: +// https://github.com/facebook/folly/blob/76cf8b5841fb33137cfbf8b224f0226437c855bc/folly/memory/Malloc.h#L147 +static inline bool HasJemalloc() { + return mallocx != nullptr && rallocx != nullptr && xallocx != nullptr && + sallocx != nullptr && dallocx != nullptr && sdallocx != nullptr && + nallocx != nullptr && mallctl != nullptr && + mallctlnametomib != nullptr && mallctlbymib != nullptr && + malloc_stats_print != nullptr && malloc_usable_size != nullptr; +} + +#endif // ROCKSDB_JEMALLOC diff --git a/port/win/env_win.cc b/port/win/env_win.cc index 723a273f0bf..d3013906709 100644 --- a/port/win/env_win.cc +++ b/port/win/env_win.cc @@ -102,7 +102,8 @@ WinEnvIO::~WinEnvIO() { Status WinEnvIO::DeleteFile(const std::string& fname) { Status result; - BOOL ret = DeleteFileA(fname.c_str()); + BOOL ret = RX_DeleteFile(RX_FN(fname).c_str()); + if(!ret) { auto lastError = GetLastError(); result = IOErrorFromWindowsError("Failed to delete: " + fname, @@ -114,7 +115,7 @@ Status WinEnvIO::DeleteFile(const std::string& fname) { Status WinEnvIO::Truncate(const std::string& fname, size_t size) { Status s; - int result = truncate(fname.c_str(), size); + int result = rocksdb::port::Truncate(fname, size); if (result != 0) { s = IOError("Failed to truncate: " + fname, errno); } @@ -151,8 +152,8 @@ Status WinEnvIO::NewSequentialFile(const std::string& fname, { IOSTATS_TIMER_GUARD(open_nanos); - hFile = CreateFileA( - fname.c_str(), GENERIC_READ, + hFile = RX_CreateFile( + RX_FN(fname).c_str(), GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, OPEN_EXISTING, // Original fopen mode is "rb" fileFlags, NULL); @@ -190,7 +191,7 @@ Status WinEnvIO::NewRandomAccessFile(const std::string& fname, { IOSTATS_TIMER_GUARD(open_nanos); 
hFile = - CreateFileA(fname.c_str(), GENERIC_READ, + RX_CreateFile(RX_FN(fname).c_str(), GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, OPEN_EXISTING, fileFlags, NULL); } @@ -217,7 +218,7 @@ Status WinEnvIO::NewRandomAccessFile(const std::string& fname, "NewRandomAccessFile failed to map empty file: " + fname, EINVAL); } - HANDLE hMap = CreateFileMappingA(hFile, NULL, PAGE_READONLY, + HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READONLY, 0, // Whole file at its present length 0, NULL); // Mapping name @@ -302,8 +303,8 @@ Status WinEnvIO::OpenWritableFile(const std::string& fname, HANDLE hFile = 0; { IOSTATS_TIMER_GUARD(open_nanos); - hFile = CreateFileA( - fname.c_str(), + hFile = RX_CreateFile( + RX_FN(fname).c_str(), desired_access, // Access desired shared_mode, NULL, // Security attributes @@ -366,7 +367,7 @@ Status WinEnvIO::NewRandomRWFile(const std::string & fname, { IOSTATS_TIMER_GUARD(open_nanos); hFile = - CreateFileA(fname.c_str(), + RX_CreateFile(RX_FN(fname).c_str(), desired_access, shared_mode, NULL, // Security attributes @@ -399,8 +400,8 @@ Status WinEnvIO::NewMemoryMappedFileBuffer(const std::string & fname, HANDLE hFile = INVALID_HANDLE_VALUE; { IOSTATS_TIMER_GUARD(open_nanos); - hFile = CreateFileA( - fname.c_str(), GENERIC_READ | GENERIC_WRITE, + hFile = RX_CreateFile( + RX_FN(fname).c_str(), GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, OPEN_EXISTING, // Open only if it exists @@ -432,7 +433,7 @@ Status WinEnvIO::NewMemoryMappedFileBuffer(const std::string & fname, "The specified file size does not fit into 32-bit memory addressing: " + fname); } - HANDLE hMap = CreateFileMappingA(hFile, NULL, PAGE_READWRITE, + HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READWRITE, 0, // Whole file at its present length 0, NULL); // Mapping name @@ -483,7 +484,7 @@ Status WinEnvIO::NewDirectory(const std::string& name, // 0 - for access means read metadata { IOSTATS_TIMER_GUARD(open_nanos); - handle = ::CreateFileA(name.c_str(), 0, + handle = RX_CreateFile(RX_FN(name).c_str(), 0, FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, @@ -509,8 +510,7 @@ Status WinEnvIO::FileExists(const std::string& fname) { // which is consistent with _access() impl on windows // but can be added WIN32_FILE_ATTRIBUTE_DATA attrs; - if (FALSE == GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, - &attrs)) { + if (FALSE == RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard, &attrs)) { auto lastError = GetLastError(); switch (lastError) { case ERROR_ACCESS_DENIED: @@ -535,11 +535,12 @@ Status WinEnvIO::GetChildren(const std::string& dir, result->clear(); std::vector output; - WIN32_FIND_DATA data; + RX_WIN32_FIND_DATA data; + memset(&data, 0, sizeof(data)); std::string pattern(dir); pattern.append("\\").append("*"); - HANDLE handle = ::FindFirstFileExA(pattern.c_str(), + HANDLE handle = RX_FindFirstFileEx(RX_FN(pattern).c_str(), FindExInfoBasic, // Do not want alternative name &data, FindExSearchNameMatch, @@ -572,8 +573,9 @@ Status WinEnvIO::GetChildren(const std::string& dir, data.cFileName[MAX_PATH - 1] = 0; while (true) { - output.emplace_back(data.cFileName); - BOOL ret =- ::FindNextFileA(handle, &data); + auto x = RX_FILESTRING(data.cFileName, RX_FNLEN(data.cFileName)); + output.emplace_back(FN_TO_RX(x)); + BOOL ret =- RX_FindNextFile(handle, &data); // If the function fails the return value is zero // and non-zero otherwise. Not TRUE or FALSE. 
if (ret == FALSE) { @@ -588,8 +590,7 @@ Status WinEnvIO::GetChildren(const std::string& dir, Status WinEnvIO::CreateDir(const std::string& name) { Status result; - - BOOL ret = CreateDirectoryA(name.c_str(), NULL); + BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL); if (!ret) { auto lastError = GetLastError(); result = IOErrorFromWindowsError( @@ -606,7 +607,7 @@ Status WinEnvIO::CreateDirIfMissing(const std::string& name) { return result; } - BOOL ret = CreateDirectoryA(name.c_str(), NULL); + BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL); if (!ret) { auto lastError = GetLastError(); if (lastError != ERROR_ALREADY_EXISTS) { @@ -622,7 +623,7 @@ Status WinEnvIO::CreateDirIfMissing(const std::string& name) { Status WinEnvIO::DeleteDir(const std::string& name) { Status result; - BOOL ret = RemoveDirectoryA(name.c_str()); + BOOL ret = RX_RemoveDirectory(RX_FN(name).c_str()); if (!ret) { auto lastError = GetLastError(); result = IOErrorFromWindowsError("Failed to remove dir: " + name, lastError); @@ -635,7 +636,7 @@ Status WinEnvIO::GetFileSize(const std::string& fname, Status s; WIN32_FILE_ATTRIBUTE_DATA attrs; - if (GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, &attrs)) { + if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard, &attrs)) { ULARGE_INTEGER file_size; file_size.HighPart = attrs.nFileSizeHigh; file_size.LowPart = attrs.nFileSizeLow; @@ -670,7 +671,7 @@ Status WinEnvIO::GetFileModificationTime(const std::string& fname, Status s; WIN32_FILE_ATTRIBUTE_DATA attrs; - if (GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, &attrs)) { + if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard, &attrs)) { *file_mtime = FileTimeToUnixTime(attrs.ftLastWriteTime); } else { auto lastError = GetLastError(); @@ -688,7 +689,7 @@ Status WinEnvIO::RenameFile(const std::string& src, // rename() is not capable of replacing the existing file as on Linux // so use OS API directly - if (!MoveFileExA(src.c_str(), target.c_str(), MOVEFILE_REPLACE_EXISTING)) { + if (!RX_MoveFileEx(RX_FN(src).c_str(), RX_FN(target).c_str(), MOVEFILE_REPLACE_EXISTING)) { DWORD lastError = GetLastError(); std::string text("Failed to rename: "); @@ -704,7 +705,7 @@ Status WinEnvIO::LinkFile(const std::string& src, const std::string& target) { Status result; - if (!CreateHardLinkA(target.c_str(), src.c_str(), NULL)) { + if (!RX_CreateHardLink(RX_FN(target).c_str(), RX_FN(src).c_str(), NULL)) { DWORD lastError = GetLastError(); if (lastError == ERROR_NOT_SAME_DEVICE) { return Status::NotSupported("No cross FS links allowed"); @@ -721,8 +722,9 @@ Status WinEnvIO::LinkFile(const std::string& src, Status WinEnvIO::NumFileLinks(const std::string& fname, uint64_t* count) { Status s; - HANDLE handle = ::CreateFileA( - fname.c_str(), 0, FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + HANDLE handle = RX_CreateFile( + RX_FN(fname).c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); if (INVALID_HANDLE_VALUE == handle) { @@ -758,7 +760,7 @@ Status WinEnvIO::AreFilesSame(const std::string& first, } // 0 - for access means read metadata - HANDLE file_1 = ::CreateFileA(first.c_str(), 0, + HANDLE file_1 = RX_CreateFile(RX_FN(first).c_str(), 0, FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, @@ -773,7 +775,7 @@ Status WinEnvIO::AreFilesSame(const std::string& first, } UniqueCloseHandlePtr g_1(file_1, CloseHandleFunc); - HANDLE file_2 = ::CreateFileA(second.c_str(), 
0, + HANDLE file_2 = RX_CreateFile(RX_FN(second).c_str(), 0, FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible @@ -835,7 +837,7 @@ Status WinEnvIO::LockFile(const std::string& lockFname, HANDLE hFile = 0; { IOSTATS_TIMER_GUARD(open_nanos); - hFile = CreateFileA(lockFname.c_str(), (GENERIC_READ | GENERIC_WRITE), + hFile = RX_CreateFile(RX_FN(lockFname).c_str(), (GENERIC_READ | GENERIC_WRITE), ExclusiveAccessON, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); } @@ -898,8 +900,8 @@ Status WinEnvIO::NewLogger(const std::string& fname, HANDLE hFile = 0; { IOSTATS_TIMER_GUARD(open_nanos); - hFile = CreateFileA( - fname.c_str(), GENERIC_WRITE, + hFile = RX_CreateFile( + RX_FN(fname).c_str(), GENERIC_WRITE, FILE_SHARE_READ | FILE_SHARE_DELETE, // In RocksDb log files are // renamed and deleted before // they are closed. This enables @@ -992,17 +994,17 @@ Status WinEnvIO::GetAbsolutePath(const std::string& db_path, // For test compatibility we will consider starting slash as an // absolute path if ((!db_path.empty() && (db_path[0] == '\\' || db_path[0] == '/')) || - !PathIsRelativeA(db_path.c_str())) { + !RX_PathIsRelative(RX_FN(db_path).c_str())) { *output_path = db_path; return Status::OK(); } - std::string result; + RX_FILESTRING result; result.resize(MAX_PATH); // Hopefully no changes the current directory while we do this // however _getcwd also suffers from the same limitation - DWORD len = GetCurrentDirectoryA(MAX_PATH, &result[0]); + DWORD len = RX_GetCurrentDirectory(MAX_PATH, &result[0]); if (len == 0) { auto lastError = GetLastError(); return IOErrorFromWindowsError("Failed to get current working directory", @@ -1010,8 +1012,9 @@ Status WinEnvIO::GetAbsolutePath(const std::string& db_path, } result.resize(len); - - result.swap(*output_path); + std::string res = FN_TO_RX(result); + + res.swap(*output_path); return Status::OK(); } @@ -1076,7 +1079,7 @@ EnvOptions WinEnvIO::OptimizeForManifestRead( // Returns true iff the named directory exists and is a directory. bool WinEnvIO::DirExists(const std::string& dname) { WIN32_FILE_ATTRIBUTE_DATA attrs; - if (GetFileAttributesExA(dname.c_str(), GetFileExInfoStandard, &attrs)) { + if (RX_GetFileAttributesEx(RX_FN(dname).c_str(), GetFileExInfoStandard, &attrs)) { return 0 != (attrs.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY); } return false; @@ -1085,7 +1088,7 @@ bool WinEnvIO::DirExists(const std::string& dname) { size_t WinEnvIO::GetSectorSize(const std::string& fname) { size_t sector_size = kSectorSize; - if (PathIsRelativeA(fname.c_str())) { + if (RX_PathIsRelative(RX_FN(fname).c_str())) { return sector_size; } diff --git a/port/win/env_win.h b/port/win/env_win.h index 81b323a7119..d61ac3acd6d 100644 --- a/port/win/env_win.h +++ b/port/win/env_win.h @@ -109,8 +109,8 @@ class WinEnvIO { // The returned file will only be accessed by one thread at a time. 
virtual Status NewRandomRWFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options); + std::unique_ptr* result, + const EnvOptions& options); virtual Status NewMemoryMappedFileBuffer( const std::string& fname, diff --git a/port/win/io_win.h b/port/win/io_win.h index 3b08c394f4a..c46876b8c0c 100644 --- a/port/win/io_win.h +++ b/port/win/io_win.h @@ -58,7 +58,7 @@ class WinFileData { protected: const std::string filename_; HANDLE hFile_; - // If ture, the I/O issued would be direct I/O which the buffer + // If true, the I/O issued would be direct I/O which the buffer // will need to be aligned (not sure there is a guarantee that the buffer // passed in is aligned). const bool use_direct_io_; diff --git a/port/win/port_win.cc b/port/win/port_win.cc index 75b4ec6de90..6ca5bba3b94 100644 --- a/port/win/port_win.cc +++ b/port/win/port_win.cc @@ -26,11 +26,30 @@ #include #include +#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES +// utf8 <-> utf16 +#include +#include +#include +#endif + #include "util/logging.h" namespace rocksdb { namespace port { +#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES +std::string utf16_to_utf8(const std::wstring& utf16) { + std::wstring_convert,wchar_t> convert; + return convert.to_bytes(utf16); +} + +std::wstring utf8_to_utf16(const std::string& utf8) { + std::wstring_convert> converter; + return converter.from_bytes(utf8); +} +#endif + void gettimeofday(struct timeval* tv, struct timezone* /* tz */) { using namespace std::chrono; @@ -110,7 +129,7 @@ void InitOnce(OnceType* once, void (*initializer)()) { struct DIR { HANDLE handle_; bool firstread_; - WIN32_FIND_DATA data_; + RX_WIN32_FIND_DATA data_; dirent entry_; DIR() : handle_(INVALID_HANDLE_VALUE), @@ -137,7 +156,7 @@ DIR* opendir(const char* name) { std::unique_ptr dir(new DIR); - dir->handle_ = ::FindFirstFileExA(pattern.c_str(), + dir->handle_ = RX_FindFirstFileEx(RX_FN(pattern).c_str(), FindExInfoBasic, // Do not want alternative name &dir->data_, FindExSearchNameMatch, @@ -148,8 +167,9 @@ DIR* opendir(const char* name) { return nullptr; } + RX_FILESTRING x(dir->data_.cFileName, RX_FNLEN(dir->data_.cFileName)); strcpy_s(dir->entry_.d_name, sizeof(dir->entry_.d_name), - dir->data_.cFileName); + FN_TO_RX(x).c_str()); return dir.release(); } @@ -165,14 +185,15 @@ struct dirent* readdir(DIR* dirp) { return &dirp->entry_; } - auto ret = ::FindNextFileA(dirp->handle_, &dirp->data_); + auto ret = RX_FindNextFile(dirp->handle_, &dirp->data_); if (ret == 0) { return nullptr; } + RX_FILESTRING x(dirp->data_.cFileName, RX_FNLEN(dirp->data_.cFileName)); strcpy_s(dirp->entry_.d_name, sizeof(dirp->entry_.d_name), - dirp->data_.cFileName); + FN_TO_RX(x).c_str()); return &dirp->entry_; } @@ -182,11 +203,15 @@ int closedir(DIR* dirp) { return 0; } -int truncate(const char* path, int64_t len) { +int truncate(const char* path, int64_t length) { if (path == nullptr) { errno = EFAULT; return -1; } + return rocksdb::port::Truncate(path, length); +} + +int Truncate(std::string path, int64_t len) { if (len < 0) { errno = EINVAL; @@ -194,7 +219,7 @@ int truncate(const char* path, int64_t len) { } HANDLE hFile = - CreateFile(path, GENERIC_READ | GENERIC_WRITE, + RX_CreateFile(RX_FN(path).c_str(), GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, // Security attrs OPEN_EXISTING, // Truncate existing file only diff --git a/port/win/port_win.h b/port/win/port_win.h index 41ccea68d45..9b8ba9ff89f 100644 --- a/port/win/port_win.h +++ b/port/win/port_win.h @@ -327,11 +327,62 @@ inline void* 
pthread_getspecific(pthread_key_t key) { // using C-runtime to implement. Note, this does not // feel space with zeros in case the file is extended. int truncate(const char* path, int64_t length); +int Truncate(std::string path, int64_t length); void Crash(const std::string& srcfile, int srcline); extern int GetMaxOpenFiles(); +std::string utf16_to_utf8(const std::wstring& utf16); +std::wstring utf8_to_utf16(const std::string& utf8); } // namespace port + +#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES + +#define RX_FILESTRING std::wstring +#define RX_FN(a) rocksdb::port::utf8_to_utf16(a) +#define FN_TO_RX(a) rocksdb::port::utf16_to_utf8(a) +#define RX_FNLEN(a) ::wcslen(a) + +#define RX_DeleteFile DeleteFileW +#define RX_CreateFile CreateFileW +#define RX_CreateFileMapping CreateFileMappingW +#define RX_GetFileAttributesEx GetFileAttributesExW +#define RX_FindFirstFileEx FindFirstFileExW +#define RX_FindNextFile FindNextFileW +#define RX_WIN32_FIND_DATA WIN32_FIND_DATAW +#define RX_CreateDirectory CreateDirectoryW +#define RX_RemoveDirectory RemoveDirectoryW +#define RX_GetFileAttributesEx GetFileAttributesExW +#define RX_MoveFileEx MoveFileExW +#define RX_CreateHardLink CreateHardLinkW +#define RX_PathIsRelative PathIsRelativeW +#define RX_GetCurrentDirectory GetCurrentDirectoryW + +#else + +#define RX_FILESTRING std::string +#define RX_FN(a) a +#define FN_TO_RX(a) a +#define RX_FNLEN(a) strlen(a) + +#define RX_DeleteFile DeleteFileA +#define RX_CreateFile CreateFileA +#define RX_CreateFileMapping CreateFileMappingA +#define RX_GetFileAttributesEx GetFileAttributesExA +#define RX_FindFirstFileEx FindFirstFileExA +#define RX_CreateDirectory CreateDirectoryA +#define RX_FindNextFile FindNextFileA +#define RX_WIN32_FIND_DATA WIN32_FIND_DATA +#define RX_CreateDirectory CreateDirectoryA +#define RX_RemoveDirectory RemoveDirectoryA +#define RX_GetFileAttributesEx GetFileAttributesExA +#define RX_MoveFileEx MoveFileExA +#define RX_CreateHardLink CreateHardLinkA +#define RX_PathIsRelative PathIsRelativeA +#define RX_GetCurrentDirectory GetCurrentDirectoryA + +#endif + using port::pthread_key_t; using port::pthread_key_create; using port::pthread_key_delete; diff --git a/port/win/win_thread.cc b/port/win/win_thread.cc index b48af2370fc..9a976e2c6b8 100644 --- a/port/win/win_thread.cc +++ b/port/win/win_thread.cc @@ -40,7 +40,7 @@ struct WindowsThread::Data { void WindowsThread::Init(std::function&& func) { data_ = std::make_shared(std::move(func)); - // We create another instance of shared_ptr to get an additional ref + // We create another instance of std::shared_ptr to get an additional ref // since we may detach and destroy this instance before the threadproc // may start to run. 
We choose to allocate this additional ref on the heap // so we do not need to synchronize and allow this thread to proceed diff --git a/src.mk b/src.mk index e2ad3f45c18..97dad2034b3 100644 --- a/src.mk +++ b/src.mk @@ -11,6 +11,7 @@ LIB_SOURCES = \ db/compaction_iterator.cc \ db/compaction_job.cc \ db/compaction_picker.cc \ + db/compaction_picker_fifo.cc \ db/compaction_picker_universal.cc \ db/convenience.cc \ db/db_filesnapshot.cc \ @@ -43,6 +44,7 @@ LIB_SOURCES = \ db/merge_helper.cc \ db/merge_operator.cc \ db/range_del_aggregator.cc \ + db/range_tombstone_fragmenter.cc \ db/repair.cc \ db/snapshot_impl.cc \ db/table_cache.cc \ @@ -120,6 +122,7 @@ LIB_SOURCES = \ table/plain_table_index.cc \ table/plain_table_key_coding.cc \ table/plain_table_reader.cc \ + table/sst_file_reader.cc \ table/sst_file_writer.cc \ table/table_properties.cc \ table/two_level_iterator.cc \ @@ -142,6 +145,7 @@ LIB_SOURCES = \ util/filename.cc \ util/filter_policy.cc \ util/hash.cc \ + util/jemalloc_nodump_allocator.cc \ util/log_buffer.cc \ util/murmurhash.cc \ util/random.cc \ @@ -341,6 +345,7 @@ MAIN_SOURCES = \ db/repair_test.cc \ db/range_del_aggregator_test.cc \ db/range_del_aggregator_bench.cc \ + db/range_tombstone_fragmenter_test.cc \ db/table_properties_collector_test.cc \ db/util_merge_operators_test.cc \ db/version_builder_test.cc \ @@ -369,6 +374,7 @@ MAIN_SOURCES = \ table/data_block_hash_index_test.cc \ table/full_filter_block_test.cc \ table/merger_test.cc \ + table/sst_file_reader_test.cc \ table/table_reader_bench.cc \ table/table_test.cc \ third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \ @@ -449,6 +455,7 @@ JNI_NATIVE_SOURCES = \ java/rocksjni/loggerjnicallback.cc \ java/rocksjni/lru_cache.cc \ java/rocksjni/memtablejni.cc \ + java/rocksjni/memory_util.cc \ java/rocksjni/merge_operator.cc \ java/rocksjni/native_comparator_wrapper_test.cc \ java/rocksjni/optimistic_transaction_db.cc \ @@ -481,4 +488,5 @@ JNI_NATIVE_SOURCES = \ java/rocksjni/write_batch.cc \ java/rocksjni/writebatchhandlerjnicallback.cc \ java/rocksjni/write_batch_test.cc \ - java/rocksjni/write_batch_with_index.cc + java/rocksjni/write_batch_with_index.cc \ + java/rocksjni/write_buffer_manager.cc diff --git a/table/adaptive_table_factory.cc b/table/adaptive_table_factory.cc index 0a3e9415ad7..bbba3b91935 100644 --- a/table/adaptive_table_factory.cc +++ b/table/adaptive_table_factory.cc @@ -42,8 +42,8 @@ extern const uint64_t kCuckooTableMagicNumber; Status AdaptiveTableFactory::NewTableReader( const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table, bool /*prefetch_index_and_filter_in_cache*/) const { Footer footer; auto s = ReadFooterFromFile(file.get(), nullptr /* prefetch_buffer */, diff --git a/table/adaptive_table_factory.h b/table/adaptive_table_factory.h index 00af6a76e95..2a82dbfa988 100644 --- a/table/adaptive_table_factory.h +++ b/table/adaptive_table_factory.h @@ -35,8 +35,8 @@ class AdaptiveTableFactory : public TableFactory { Status NewTableReader( const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table, bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( diff --git a/table/block.cc b/table/block.cc index c8247828e4f..4e8d6e5ca5a 100644 --- a/table/block.cc +++ b/table/block.cc @@ -781,47 +781,45 @@ 
Block::Block(BlockContents&& contents, SequenceNumber _global_seqno, size_ = 0; // Error marker } else { // Should only decode restart points for uncompressed blocks - if (compression_type() == kNoCompression) { - num_restarts_ = NumRestarts(); - switch (IndexType()) { - case BlockBasedTableOptions::kDataBlockBinarySearch: - restart_offset_ = static_cast(size_) - - (1 + num_restarts_) * sizeof(uint32_t); - if (restart_offset_ > size_ - sizeof(uint32_t)) { - // The size is too small for NumRestarts() and therefore - // restart_offset_ wrapped around. - size_ = 0; - } + num_restarts_ = NumRestarts(); + switch (IndexType()) { + case BlockBasedTableOptions::kDataBlockBinarySearch: + restart_offset_ = static_cast(size_) - + (1 + num_restarts_) * sizeof(uint32_t); + if (restart_offset_ > size_ - sizeof(uint32_t)) { + // The size is too small for NumRestarts() and therefore + // restart_offset_ wrapped around. + size_ = 0; + } + break; + case BlockBasedTableOptions::kDataBlockBinaryAndHash: + if (size_ < sizeof(uint32_t) /* block footer */ + + sizeof(uint16_t) /* NUM_BUCK */) { + size_ = 0; break; - case BlockBasedTableOptions::kDataBlockBinaryAndHash: - if (size_ < sizeof(uint32_t) /* block footer */ + - sizeof(uint16_t) /* NUM_BUCK */) { - size_ = 0; - break; - } - - uint16_t map_offset; - data_block_hash_index_.Initialize( - contents.data.data(), - static_cast(contents.data.size() - - sizeof(uint32_t)), /*chop off - NUM_RESTARTS*/ - &map_offset); - - restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t); - - if (restart_offset_ > map_offset) { - // map_offset is too small for NumRestarts() and - // therefore restart_offset_ wrapped around. - size_ = 0; - break; - } + } + + uint16_t map_offset; + data_block_hash_index_.Initialize( + contents.data.data(), + static_cast(contents.data.size() - + sizeof(uint32_t)), /*chop off + NUM_RESTARTS*/ + &map_offset); + + restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t); + + if (restart_offset_ > map_offset) { + // map_offset is too small for NumRestarts() and + // therefore restart_offset_ wrapped around. + size_ = 0; break; - default: - size_ = 0; // Error marker - } + } + break; + default: + size_ = 0; // Error marker + } } - } if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) { read_amp_bitmap_.reset(new BlockReadAmpBitmap( restart_offset_, read_amp_bytes_per_bit, statistics)); @@ -834,6 +832,7 @@ DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, bool /*total_order_seek*/, bool /*key_includes_seq*/, bool /*value_is_full*/, + bool block_contents_pinned, BlockPrefixIndex* /*prefix_index*/) { DataBlockIter* ret_iter; if (iter != nullptr) { @@ -852,7 +851,7 @@ DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, } else { ret_iter->Initialize( cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_, - read_amp_bitmap_.get(), cachable(), + read_amp_bitmap_.get(), block_contents_pinned, data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr); if (read_amp_bitmap_) { if (read_amp_bitmap_->GetStatistics() != stats) { @@ -870,6 +869,7 @@ IndexBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek, bool key_includes_seq, bool value_is_full, + bool block_contents_pinned, BlockPrefixIndex* prefix_index) { IndexBlockIter* ret_iter; if (iter != nullptr) { @@ -890,7 +890,8 @@ IndexBlockIter* Block::NewIterator(const Comparator* cmp, total_order_seek ? 
nullptr : prefix_index; ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, prefix_index_ptr, key_includes_seq, value_is_full, - cachable(), nullptr /* data_block_hash_index */); + block_contents_pinned, + nullptr /* data_block_hash_index */); } return ret_iter; diff --git a/table/block.h b/table/block.h index 83900b56f55..1a8073203b4 100644 --- a/table/block.h +++ b/table/block.h @@ -153,14 +153,12 @@ class Block { size_t size() const { return size_; } const char* data() const { return data_; } - bool cachable() const { return contents_.cachable; } // The additional memory space taken by the block data. size_t usable_size() const { return contents_.usable_size(); } uint32_t NumRestarts() const; + bool own_bytes() const { return contents_.own_bytes(); } + BlockBasedTableOptions::DataBlockIndexType IndexType() const; - CompressionType compression_type() const { - return contents_.compression_type; - } // If comparator is InternalKeyComparator, user_comparator is its user // comparator; they are equal otherwise. @@ -170,7 +168,7 @@ class Block { // // key_includes_seq, default true, means that the keys are in internal key // format. - // value_is_full, default ture, means that no delta encoding is + // value_is_full, default true, means that no delta encoding is // applied to values. // // NewIterator @@ -180,6 +178,14 @@ class Block { // If `prefix_index` is not nullptr this block will do hash lookup for the key // prefix. If total_order_seek is true, prefix_index_ is ignored. // + // If `block_contents_pinned` is true, the caller will guarantee that when + // the cleanup functions are transferred from the iterator to other + // classes, e.g. PinnableSlice, the pointer to the bytes will still be + // valid. Either the iterator holds cache handle or ownership of some resource + // and release them in a release function, or caller is sure that the data + // will not go away (for example, it's from mmapped file which will not be + // closed). + // // NOTE: for the hash based lookup, if a key prefix doesn't match any key, // the iterator will simply be set as "invalid", rather than returning // the key that is just pass the target key. @@ -188,7 +194,8 @@ class Block { const Comparator* comparator, const Comparator* user_comparator, TBlockIter* iter = nullptr, Statistics* stats = nullptr, bool total_order_seek = true, bool key_includes_seq = true, - bool value_is_full = true, BlockPrefixIndex* prefix_index = nullptr); + bool value_is_full = true, bool block_contents_pinned = false, + BlockPrefixIndex* prefix_index = nullptr); // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const; @@ -295,7 +302,9 @@ class BlockIter : public InternalIteratorBase { Slice value_; Status status_; bool key_pinned_; - // whether the block data is guaranteed to outlive this iterator + // Whether the block data is guaranteed to outlive this iterator, and + // as long as the cleanup functions are transferred to another class, + // e.g. PinnableSlice, the pointer to the bytes will still be valid. bool block_contents_pinned_; SequenceNumber global_seqno_; @@ -449,7 +458,7 @@ class IndexBlockIter final : public BlockIter { } // key_includes_seq, default true, means that the keys are in internal key // format. - // value_is_full, default ture, means that no delta encoding is + // value_is_full, default true, means that no delta encoding is // applied to values. 
IndexBlockIter(const Comparator* comparator, const Comparator* user_comparator, const char* data, diff --git a/table/block_based_filter_block_test.cc b/table/block_based_filter_block_test.cc index 8de857f4efc..3cba09847a8 100644 --- a/table/block_based_filter_block_test.cc +++ b/table/block_based_filter_block_test.cc @@ -55,7 +55,7 @@ class FilterBlockTest : public testing::Test { TEST_F(FilterBlockTest, EmptyBuilder) { BlockBasedFilterBlockBuilder builder(nullptr, table_options_); - BlockContents block(builder.Finish(), false, kNoCompression); + BlockContents block(builder.Finish()); ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); BlockBasedFilterBlockReader reader(nullptr, table_options_, true, std::move(block), nullptr); @@ -75,7 +75,7 @@ TEST_F(FilterBlockTest, SingleChunk) { builder.StartBlock(300); builder.Add("hello"); ASSERT_EQ(5, builder.NumAdded()); - BlockContents block(builder.Finish(), false, kNoCompression); + BlockContents block(builder.Finish()); BlockBasedFilterBlockReader reader(nullptr, table_options_, true, std::move(block), nullptr); ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100)); @@ -107,7 +107,7 @@ TEST_F(FilterBlockTest, MultiChunk) { builder.Add("box"); builder.Add("hello"); - BlockContents block(builder.Finish(), false, kNoCompression); + BlockContents block(builder.Finish()); BlockBasedFilterBlockReader reader(nullptr, table_options_, true, std::move(block), nullptr); @@ -152,7 +152,7 @@ class BlockBasedFilterBlockTest : public testing::Test { TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder( nullptr, table_options_); - BlockContents block(builder->Finish(), false, kNoCompression); + BlockContents block(builder->Finish()); ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); FilterBlockReader* reader = new BlockBasedFilterBlockReader( nullptr, table_options_, true, std::move(block), nullptr); @@ -174,7 +174,7 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { builder->Add("box"); builder->StartBlock(300); builder->Add("hello"); - BlockContents block(builder->Finish(), false, kNoCompression); + BlockContents block(builder->Finish()); FilterBlockReader* reader = new BlockBasedFilterBlockReader( nullptr, table_options_, true, std::move(block), nullptr); ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100)); @@ -210,7 +210,7 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { builder->Add("box"); builder->Add("hello"); - BlockContents block(builder->Finish(), false, kNoCompression); + BlockContents block(builder->Finish()); FilterBlockReader* reader = new BlockBasedFilterBlockReader( nullptr, table_options_, true, std::move(block), nullptr); diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 59c385d65ae..a4007b07a2c 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -42,6 +42,7 @@ #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" +#include "util/memory_allocator.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/xxhash.h" @@ -449,6 +450,11 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->props.num_entries++; r->props.raw_key_size += key.size(); r->props.raw_value_size += value.size(); + if (value_type == kTypeDeletion || value_type == kTypeSingleDeletion) { + r->props.num_deletions++; + } else if (value_type == kTypeMerge) { + r->props.num_merge_operands++; + } 
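The hunk above makes the table builder count deletion and merge entries as keys are added; once the file is finished, those counters are persisted with the table's properties. Below is a minimal sketch of how they might be read back, assuming the convenience overload of DB::GetPropertiesOfAllTables for the default column family; error handling is reduced to an early return.

```cpp
// Sketch only: dump per-file entry/deletion/merge counts. The num_deletions
// and num_merge_operands fields correspond to the counters incremented in
// BlockBasedTableBuilder::Add above.
#include <cstdio>
#include "rocksdb/db.h"
#include "rocksdb/table_properties.h"

void PrintTableEntryStats(rocksdb::DB* db) {
  rocksdb::TablePropertiesCollection props;
  rocksdb::Status s = db->GetPropertiesOfAllTables(&props);
  if (!s.ok()) {
    return;
  }
  for (const auto& file_and_props : props) {
    const rocksdb::TableProperties& tp = *file_and_props.second;
    std::printf("%s: entries=%llu deletions=%llu merges=%llu\n",
                file_and_props.first.c_str(),
                static_cast<unsigned long long>(tp.num_entries),
                static_cast<unsigned long long>(tp.num_deletions),
                static_cast<unsigned long long>(tp.num_merge_operands));
  }
}
```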
r->index_builder->OnKeyAdded(key); NotifyCollectTableCollectorsOnAdd(key, value, r->offset, @@ -609,6 +615,18 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, EncodeFixed32(trailer_without_type, XXH32_digest(xxh)); break; } + case kxxHash64: { + XXH64_state_t* const state = XXH64_createState(); + XXH64_reset(state, 0); + XXH64_update(state, block_contents.data(), + static_cast(block_contents.size())); + XXH64_update(state, trailer, 1); // Extend to cover block type + EncodeFixed32(trailer_without_type, + static_cast(XXH64_digest(state) & // lower 32 bits + uint64_t{0xffffffff})); + XXH64_freeState(state); + break; + } } assert(r->status.ok()); @@ -636,9 +654,9 @@ Status BlockBasedTableBuilder::status() const { return rep_->status; } -static void DeleteCachedBlock(const Slice& /*key*/, void* value) { - Block* block = reinterpret_cast(value); - delete block; +static void DeleteCachedBlockContents(const Slice& /*key*/, void* value) { + BlockContents* bc = reinterpret_cast(value); + delete bc; } // @@ -654,13 +672,16 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, size_t size = block_contents.size(); - std::unique_ptr ubuf(new char[size + 1]); + auto ubuf = + AllocateBlock(size + 1, block_cache_compressed->memory_allocator()); memcpy(ubuf.get(), block_contents.data(), size); ubuf[size] = type; - BlockContents results(std::move(ubuf), size, true, type); - - Block* block = new Block(std::move(results), kDisableGlobalSequenceNumber); + BlockContents* block_contents_to_cache = + new BlockContents(std::move(ubuf), size); +#ifndef NDEBUG + block_contents_to_cache->is_raw_block = true; +#endif // NDEBUG // make cache key by appending the file offset to the cache prefix id char* end = EncodeVarint64( @@ -671,8 +692,10 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, (end - r->compressed_cache_key_prefix)); // Insert into compressed block cache. - block_cache_compressed->Insert(key, block, block->ApproximateMemoryUsage(), - &DeleteCachedBlock); + block_cache_compressed->Insert( + key, block_contents_to_cache, + block_contents_to_cache->ApproximateMemoryUsage(), + &DeleteCachedBlockContents); // Invalidate OS cache. 
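The new `kxxHash64` checksum type still has to fit the existing 4-byte slot in the block trailer, so only the low 32 bits of the 64-bit digest are stored. Below is a minimal sketch of that computation, assuming the xxHash library (`xxhash.h`) is available; the streaming calls mirror the pattern above, but the helper name and the demo buffer are made up.

```cpp
#include <xxhash.h>
#include <cstdint>
#include <iostream>
#include <string>

// Hash the block payload plus the one-byte type, keep the lower 32 bits.
uint32_t BlockChecksumXxh64(const char* data, size_t n, char block_type) {
  XXH64_state_t* state = XXH64_createState();
  XXH64_reset(state, 0 /* seed */);
  XXH64_update(state, data, n);
  XXH64_update(state, &block_type, 1);  // extend the hash to cover the type byte
  uint64_t digest = XXH64_digest(state);
  XXH64_freeState(state);
  return static_cast<uint32_t>(digest & uint64_t{0xffffffff});  // lower 32 bits
}

int main() {
  std::string block = "example block contents";
  uint32_t checksum =
      BlockChecksumXxh64(block.data(), block.size(), 0 /* e.g. kNoCompression */);
  std::cout << std::hex << checksum << "\n";
  return 0;
}
```

The verification path computes the same truncated digest over the contiguous (contents + type byte) buffer, so writer and reader agree on the stored 32-bit value.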
r->file->InvalidateCache(static_cast(r->offset), size); diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 485aed87041..fbb7406a3d8 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -194,8 +194,8 @@ BlockBasedTableFactory::BlockBasedTableFactory( Status BlockBasedTableFactory::NewTableReader( const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, bool prefetch_index_and_filter_in_cache) const { return BlockBasedTable::Open( table_reader_options.ioptions, table_reader_options.env_options, diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index b30bd6232ac..cde6f653573 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -53,8 +53,8 @@ class BlockBasedTableFactory : public TableFactory { Status NewTableReader( const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 9f2e02d6806..a126de88c04 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -78,13 +78,14 @@ Status ReadBlockFromFile( RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, std::unique_ptr* result, const ImmutableCFOptions& ioptions, - bool do_uncompress, const Slice& compression_dict, + bool do_uncompress, bool maybe_compressed, const Slice& compression_dict, const PersistentCacheOptions& cache_options, SequenceNumber global_seqno, - size_t read_amp_bytes_per_bit, const bool immortal_file = false) { + size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator) { BlockContents contents; BlockFetcher block_fetcher(file, prefetch_buffer, footer, options, handle, &contents, ioptions, do_uncompress, - compression_dict, cache_options, immortal_file); + maybe_compressed, compression_dict, cache_options, + memory_allocator); Status s = block_fetcher.ReadBlockContents(); if (s.ok()) { result->reset(new Block(std::move(contents), global_seqno, @@ -94,6 +95,20 @@ Status ReadBlockFromFile( return s; } +inline MemoryAllocator* GetMemoryAllocator( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache.get() + ? table_options.block_cache->memory_allocator() + : nullptr; +} + +inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache_compressed.get() + ? table_options.block_cache_compressed->memory_allocator() + : nullptr; +} + // Delete the resource that is held by the iterator. 
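Several read paths above now take a `MemoryAllocator*` so that block buffers can come from the block cache's allocator instead of plain `new[]`. The sketch below models that plumbing with simplified stand-in types (they are not the RocksDB classes); the only point it makes is that a null allocator falls back to ordinary heap allocation, while a configured one supplies and later reclaims the buffer.

```cpp
#include <cstddef>
#include <functional>
#include <memory>

// Simplified stand-in for the allocator interface.
class MemoryAllocator {
 public:
  virtual ~MemoryAllocator() = default;
  virtual void* Allocate(size_t size) = 0;
  virtual void Deallocate(void* p) = 0;
};

using BlockBuffer = std::unique_ptr<char[], std::function<void(char*)>>;

// Counterpart of an AllocateBlock-style helper: nullptr means plain new[].
BlockBuffer AllocateBlock(size_t size, MemoryAllocator* allocator) {
  if (allocator == nullptr) {
    return BlockBuffer(new char[size], [](char* p) { delete[] p; });
  }
  char* p = static_cast<char*>(allocator->Allocate(size));
  return BlockBuffer(p, [allocator](char* q) { allocator->Deallocate(q); });
}

// Toy cache/options types, just enough to mirror the helper added above.
struct Cache {
  MemoryAllocator* memory_allocator() const { return allocator_; }
  MemoryAllocator* allocator_ = nullptr;
};

struct BlockBasedTableOptions {
  std::shared_ptr<Cache> block_cache;
};

MemoryAllocator* GetMemoryAllocator(const BlockBasedTableOptions& opts) {
  return opts.block_cache ? opts.block_cache->memory_allocator() : nullptr;
}

int main() {
  BlockBasedTableOptions opts;  // no block cache configured -> plain heap
  BlockBuffer buf = AllocateBlock(4096, GetMemoryAllocator(opts));
  buf[0] = 'x';
  return 0;
}
```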
template void DeleteHeldResource(void* arg, void* /*ignored*/) { @@ -215,13 +230,15 @@ class PartitionIndexReader : public IndexReader, public Cleanable { IndexReader** index_reader, const PersistentCacheOptions& cache_options, const int level, const bool index_key_includes_seq, - const bool index_value_is_full) { + const bool index_value_is_full, + MemoryAllocator* memory_allocator) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, &index_block, ioptions, true /* decompress */, - Slice() /*compression dict*/, cache_options, - kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */); + true /*maybe_compressed*/, Slice() /*compression dict*/, cache_options, + kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, + memory_allocator); if (s.ok()) { *index_reader = new PartitionIndexReader( @@ -239,6 +256,8 @@ class PartitionIndexReader : public IndexReader, public Cleanable { Statistics* kNullStats = nullptr; // Filters are already checked before seeking the index if (!partition_map_.empty()) { + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. return NewTwoLevelIterator( new BlockBasedTable::PartitionedIndexIteratorState( table_, &partition_map_, index_key_includes_seq_, @@ -250,6 +269,8 @@ class PartitionIndexReader : public IndexReader, public Cleanable { auto ro = ReadOptions(); ro.fill_cache = fill_cache; bool kIsIndex = true; + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. return new BlockBasedTableIterator( table_, ro, *icomparator_, index_block_->NewIterator( @@ -270,6 +291,8 @@ class PartitionIndexReader : public IndexReader, public Cleanable { IndexBlockIter biter; BlockHandle handle; Statistics* kNullStats = nullptr; + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. 
index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), &biter, kNullStats, true, index_key_includes_seq_, index_value_is_full_); @@ -312,7 +335,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable { const bool is_index = true; // TODO: Support counter batch update for partitioned index and // filter blocks - s = table_->MaybeLoadDataBlockToCache( + s = table_->MaybeReadBlockAndLoadToCache( prefetch_buffer.get(), rep, ro, handle, compression_dict, &block, is_index, nullptr /* get_context */); @@ -388,13 +411,15 @@ class BinarySearchIndexReader : public IndexReader { IndexReader** index_reader, const PersistentCacheOptions& cache_options, const bool index_key_includes_seq, - const bool index_value_is_full) { + const bool index_value_is_full, + MemoryAllocator* memory_allocator) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, &index_block, ioptions, true /* decompress */, - Slice() /*compression dict*/, cache_options, - kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */); + true /*maybe_compressed*/, Slice() /*compression dict*/, cache_options, + kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, + memory_allocator); if (s.ok()) { *index_reader = new BinarySearchIndexReader( @@ -409,6 +434,8 @@ class BinarySearchIndexReader : public IndexReader { IndexBlockIter* iter = nullptr, bool /*dont_care*/ = true, bool /*dont_care*/ = true) override { Statistics* kNullStats = nullptr; + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. return index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), iter, kNullStats, true, index_key_includes_seq_, index_value_is_full_); @@ -458,13 +485,15 @@ class HashIndexReader : public IndexReader { InternalIterator* meta_index_iter, IndexReader** index_reader, bool /*hash_index_allow_collision*/, const PersistentCacheOptions& cache_options, - const bool index_key_includes_seq, const bool index_value_is_full) { + const bool index_key_includes_seq, const bool index_value_is_full, + MemoryAllocator* memory_allocator) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, &index_block, ioptions, true /* decompress */, - Slice() /*compression dict*/, cache_options, - kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */); + true /*maybe_compressed*/, Slice() /*compression dict*/, cache_options, + kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, + memory_allocator); if (!s.ok()) { return s; @@ -502,8 +531,9 @@ class HashIndexReader : public IndexReader { BlockContents prefixes_contents; BlockFetcher prefixes_block_fetcher( file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, - &prefixes_contents, ioptions, true /* decompress */, - dummy_comp_dict /*compression dict*/, cache_options); + &prefixes_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, dummy_comp_dict /*compression dict*/, + cache_options, memory_allocator); s = prefixes_block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; @@ -511,8 +541,9 @@ class HashIndexReader : public IndexReader { BlockContents prefixes_meta_contents; BlockFetcher prefixes_meta_block_fetcher( file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, - &prefixes_meta_contents, ioptions, true /* decompress */, - dummy_comp_dict /*compression dict*/, cache_options); + &prefixes_meta_contents, ioptions, true /*decompress*/, 
+ true /*maybe_compressed*/, dummy_comp_dict /*compression dict*/, + cache_options, memory_allocator); s = prefixes_meta_block_fetcher.ReadBlockContents(); if (!s.ok()) { // TODO: log error @@ -534,10 +565,12 @@ class HashIndexReader : public IndexReader { IndexBlockIter* iter = nullptr, bool total_order_seek = true, bool /*dont_care*/ = true) override { Statistics* kNullStats = nullptr; + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. return index_block_->NewIterator( icomparator_, icomparator_->user_comparator(), iter, kNullStats, total_order_seek, index_key_includes_seq_, index_value_is_full_, - prefix_index_.get()); + false /* block_contents_pinned */, prefix_index_.get()); } virtual size_t size() const override { return index_block_->size(); } @@ -572,8 +605,7 @@ class HashIndexReader : public IndexReader { assert(index_block_ != nullptr); } - ~HashIndexReader() { - } + ~HashIndexReader() {} std::unique_ptr index_block_; std::unique_ptr prefix_index_; @@ -737,9 +769,9 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, + std::unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader, + std::unique_ptr* table_reader, const SliceTransform* prefix_extractor, const bool prefetch_index_and_filter_in_cache, const bool skip_filters, const int level, @@ -807,7 +839,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // raw pointer will be used to create HashIndexReader, whose reset may // access a dangling pointer. Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, - internal_comparator, skip_filters, + internal_comparator, skip_filters, level, immortal_table); rep->file = std::move(file); rep->footer = footer; @@ -818,7 +850,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, rep->internal_prefix_transform.reset( new InternalKeySliceTransform(prefix_extractor)); SetupCacheKeyPrefix(rep, file_size); - unique_ptr new_table(new BlockBasedTable(rep)); + std::unique_ptr new_table(new BlockBasedTable(rep)); // page cache options rep->persistent_cache_options = @@ -878,7 +910,9 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, if (s.ok()) { s = ReadProperties(meta_iter->value(), rep->file.get(), prefetch_buffer.get(), rep->footer, rep->ioptions, - &table_properties, false /* compression_type_missing */); + &table_properties, + false /* compression_type_missing */, + nullptr /* memory_allocator */); } if (!s.ok()) { @@ -921,9 +955,10 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, ReadOptions read_options; read_options.verify_checksums = false; BlockFetcher compression_block_fetcher( - rep->file.get(), prefetch_buffer.get(), rep->footer, read_options, - compression_dict_handle, compression_dict_cont.get(), rep->ioptions, false /* decompress */, - Slice() /*compression dict*/, cache_options); + rep->file.get(), prefetch_buffer.get(), rep->footer, read_options, + compression_dict_handle, compression_dict_cont.get(), rep->ioptions, + false /* decompress */, false /*maybe_compressed*/, + Slice() /*compression dict*/, cache_options); s = compression_block_fetcher.ReadBlockContents(); if (!s.ok()) { @@ -964,20 +999,22 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, rep->ioptions.info_log, "Error when seeking to range delete tombstones block from file: %s", 
s.ToString().c_str()); - } else { - if (found_range_del_block && !rep->range_del_handle.IsNull()) { - ReadOptions read_options; - s = MaybeLoadDataBlockToCache( - prefetch_buffer.get(), rep, read_options, rep->range_del_handle, - Slice() /* compression_dict */, &rep->range_del_entry, - false /* is_index */, nullptr /* get_context */); - if (!s.ok()) { - ROCKS_LOG_WARN( - rep->ioptions.info_log, - "Encountered error while reading data from range del block %s", - s.ToString().c_str()); - } + } else if (found_range_del_block && !rep->range_del_handle.IsNull()) { + ReadOptions read_options; + s = MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), rep, read_options, rep->range_del_handle, + Slice() /* compression_dict */, &rep->range_del_entry, + false /* is_index */, nullptr /* get_context */); + if (!s.ok()) { + ROCKS_LOG_WARN( + rep->ioptions.info_log, + "Encountered error while reading data from range del block %s", + s.ToString().c_str()); } + auto iter = std::unique_ptr( + new_table->NewUnfragmentedRangeTombstoneIterator(read_options)); + rep->fragmented_range_dels = std::make_shared( + std::move(iter), internal_comparator); } bool need_upper_bound_check = @@ -1019,7 +1056,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, bool disable_prefix_seek = rep->index_type == BlockBasedTableOptions::kHashSearch && need_upper_bound_check; - unique_ptr> iter( + std::unique_ptr> iter( new_table->NewIndexIterator(ReadOptions(), disable_prefix_seek, nullptr, &index_entry)); s = iter->status(); @@ -1094,7 +1131,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, if (tail_prefetch_stats != nullptr) { assert(prefetch_buffer->min_offset_read() < file_size); tail_prefetch_stats->RecordEffectiveSize( - static_cast(file_size) - prefetch_buffer->min_offset_read()); + static_cast(file_size) - prefetch_buffer->min_offset_read()); } *table_reader = std::move(new_table); } @@ -1148,9 +1185,10 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep, Status s = ReadBlockFromFile( rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(), rep->footer.metaindex_handle(), &meta, rep->ioptions, - true /* decompress */, Slice() /*compression dict*/, - rep->persistent_cache_options, kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */); + true /* decompress */, true /*maybe_compressed*/, + Slice() /*compression dict*/, rep->persistent_cache_options, + kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, + GetMemoryAllocator(rep->table_options)); if (!s.ok()) { ROCKS_LOG_ERROR(rep->ioptions.info_log, @@ -1169,15 +1207,14 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep, Status BlockBasedTable::GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, - const ImmutableCFOptions& ioptions, const ReadOptions& read_options, - BlockBasedTable::CachableEntry* block, uint32_t format_version, - const Slice& compression_dict, size_t read_amp_bytes_per_bit, bool is_index, - GetContext* get_context) { + Cache* block_cache, Cache* block_cache_compressed, Rep* rep, + const ReadOptions& read_options, + BlockBasedTable::CachableEntry* block, const Slice& compression_dict, + size_t read_amp_bytes_per_bit, bool is_index, GetContext* get_context) { Status s; - Block* compressed_block = nullptr; + BlockContents* compressed_block = nullptr; Cache::Handle* block_cache_compressed_handle = nullptr; - Statistics* statistics = ioptions.statistics; + Statistics* statistics = rep->ioptions.statistics; // 
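At open time the range-deletion block is now turned into a fragmented, shared representation instead of being re-read per iterator. The toy fragmenter below shows the underlying idea, splitting overlapping `[start, end)` tombstones at every boundary so the resulting fragments never overlap; it is not RocksDB's `FragmentedRangeTombstoneList`, and the key and sequence-number types are deliberately simplified.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <set>
#include <string>
#include <vector>

struct RangeTombstone {
  std::string start, end;  // deletes keys in [start, end)
  uint64_t seq;
};

struct Fragment {
  std::string start, end;
  std::vector<uint64_t> seqs;  // covering sequence numbers, newest first
};

std::vector<Fragment> FragmentTombstones(const std::vector<RangeTombstone>& ts) {
  // Every start/end key is a potential fragment boundary.
  std::set<std::string> bounds;
  for (const auto& t : ts) {
    bounds.insert(t.start);
    bounds.insert(t.end);
  }
  std::vector<Fragment> out;
  for (auto it = bounds.begin(); it != bounds.end(); ++it) {
    auto next = std::next(it);
    if (next == bounds.end()) break;
    Fragment f{*it, *next, {}};
    for (const auto& t : ts) {
      if (t.start <= f.start && f.end <= t.end) f.seqs.push_back(t.seq);
    }
    if (!f.seqs.empty()) {
      std::sort(f.seqs.rbegin(), f.seqs.rend());  // newest first
      out.push_back(std::move(f));
    }
  }
  return out;
}

int main() {
  std::vector<RangeTombstone> ts = {{"a", "m", 10}, {"f", "z", 20}};
  for (const auto& f : FragmentTombstones(ts)) {
    std::cout << "[" << f.start << "," << f.end << ") newest seq "
              << f.seqs.front() << "\n";
  }
  // Expected fragments: [a,f) seq 10, [f,m) seqs 20 and 10, [m,z) seq 20.
  return 0;
}
```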
Lookup uncompressed cache first if (block_cache != nullptr) { @@ -1220,32 +1257,34 @@ Status BlockBasedTable::GetDataBlockFromCache( // found compressed block RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT); - compressed_block = reinterpret_cast( + compressed_block = reinterpret_cast( block_cache_compressed->Value(block_cache_compressed_handle)); - assert(compressed_block->compression_type() != kNoCompression); + CompressionType compression_type = compressed_block->get_compression_type(); + assert(compression_type != kNoCompression); // Retrieve the uncompressed contents into a new buffer BlockContents contents; - UncompressionContext uncompresssion_ctx(compressed_block->compression_type(), - compression_dict); - s = UncompressBlockContents(uncompresssion_ctx, compressed_block->data(), - compressed_block->size(), &contents, - format_version, ioptions); + UncompressionContext uncompresssion_ctx(compression_type, compression_dict); + s = UncompressBlockContents(uncompresssion_ctx, compressed_block->data.data(), + compressed_block->data.size(), &contents, + rep->table_options.format_version, rep->ioptions, + GetMemoryAllocator(rep->table_options)); // Insert uncompressed block into block cache if (s.ok()) { block->value = - new Block(std::move(contents), compressed_block->global_seqno(), + new Block(std::move(contents), rep->get_global_seqno(is_index), read_amp_bytes_per_bit, statistics); // uncompressed block - assert(block->value->compression_type() == kNoCompression); - if (block_cache != nullptr && block->value->cachable() && + if (block_cache != nullptr && block->value->own_bytes() && read_options.fill_cache) { size_t charge = block->value->ApproximateMemoryUsage(); s = block_cache->Insert(block_cache_key, block->value, charge, &DeleteCachedEntry, &(block->cache_handle)); +#ifndef NDEBUG block_cache->TEST_mark_as_data_block(block_cache_key, charge); +#endif // NDEBUG if (s.ok()) { if (get_context != nullptr) { get_context->get_context_stats_.num_cache_add++; @@ -1290,64 +1329,77 @@ Status BlockBasedTable::PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& /*read_options*/, const ImmutableCFOptions& ioptions, - CachableEntry* block, Block* raw_block, uint32_t format_version, - const Slice& compression_dict, size_t read_amp_bytes_per_bit, bool is_index, - Cache::Priority priority, GetContext* get_context) { - assert(raw_block->compression_type() == kNoCompression || + CachableEntry* cached_block, BlockContents* raw_block_contents, + CompressionType raw_block_comp_type, uint32_t format_version, + const Slice& compression_dict, SequenceNumber seq_no, + size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, + bool is_index, Cache::Priority priority, GetContext* get_context) { + assert(raw_block_comp_type == kNoCompression || block_cache_compressed != nullptr); Status s; // Retrieve the uncompressed contents into a new buffer - BlockContents contents; + BlockContents uncompressed_block_contents; Statistics* statistics = ioptions.statistics; - if (raw_block->compression_type() != kNoCompression) { - UncompressionContext uncompression_ctx(raw_block->compression_type(), + if (raw_block_comp_type != kNoCompression) { + UncompressionContext uncompression_ctx(raw_block_comp_type, compression_dict); - s = UncompressBlockContents(uncompression_ctx, raw_block->data(), - raw_block->size(), &contents, format_version, - ioptions); + s = UncompressBlockContents( + uncompression_ctx, 
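`GetDataBlockFromCache` follows a lookup order worth spelling out: the uncompressed block cache is consulted first, and a hit in the compressed block cache is decompressed and then promoted into the uncompressed cache before being returned. A minimal sketch of that order follows, using plain maps as stand-in caches and a placeholder in place of real decompression; it ignores cache handles, statistics, and fill-cache policy.

```cpp
#include <iostream>
#include <optional>
#include <string>
#include <unordered_map>

using BlockCache = std::unordered_map<std::string, std::string>;

std::string Uncompress(const std::string& compressed) {
  return "uncompressed(" + compressed + ")";  // placeholder transform
}

std::optional<std::string> GetDataBlockFromCache(const std::string& key,
                                                 BlockCache& block_cache,
                                                 BlockCache& compressed_cache) {
  if (auto it = block_cache.find(key); it != block_cache.end()) {
    return it->second;  // fast path: already uncompressed
  }
  if (auto it = compressed_cache.find(key); it != compressed_cache.end()) {
    std::string block = Uncompress(it->second);
    block_cache.emplace(key, block);  // promote for future readers
    return block;
  }
  return std::nullopt;  // caller must read the block from the file
}

int main() {
  BlockCache block_cache, compressed_cache;
  compressed_cache["sst1:1234"] = "raw bytes";
  auto block = GetDataBlockFromCache("sst1:1234", block_cache, compressed_cache);
  std::cout << (block ? *block : "miss") << "\n";
  std::cout << block_cache.count("sst1:1234") << "\n";  // 1: now promoted
  return 0;
}
```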
raw_block_contents->data.data(), + raw_block_contents->data.size(), &uncompressed_block_contents, + format_version, ioptions, memory_allocator); } if (!s.ok()) { - delete raw_block; return s; } - if (raw_block->compression_type() != kNoCompression) { - block->value = new Block(std::move(contents), raw_block->global_seqno(), - read_amp_bytes_per_bit, - statistics); // uncompressed block + if (raw_block_comp_type != kNoCompression) { + cached_block->value = new Block(std::move(uncompressed_block_contents), + seq_no, read_amp_bytes_per_bit, + statistics); // uncompressed block } else { - block->value = raw_block; - raw_block = nullptr; + cached_block->value = + new Block(std::move(*raw_block_contents), seq_no, + read_amp_bytes_per_bit, ioptions.statistics); } // Insert compressed block into compressed block cache. // Release the hold on the compressed cache entry immediately. - if (block_cache_compressed != nullptr && raw_block != nullptr && - raw_block->cachable()) { - s = block_cache_compressed->Insert(compressed_block_cache_key, raw_block, - raw_block->ApproximateMemoryUsage(), - &DeleteCachedEntry); + if (block_cache_compressed != nullptr && + raw_block_comp_type != kNoCompression && raw_block_contents != nullptr && + raw_block_contents->own_bytes()) { +#ifndef NDEBUG + assert(raw_block_contents->is_raw_block); +#endif // NDEBUG + + // We cannot directly put raw_block_contents because this could point to + // an object in the stack. + BlockContents* block_cont_for_comp_cache = + new BlockContents(std::move(*raw_block_contents)); + s = block_cache_compressed->Insert( + compressed_block_cache_key, block_cont_for_comp_cache, + block_cont_for_comp_cache->ApproximateMemoryUsage(), + &DeleteCachedEntry); if (s.ok()) { // Avoid the following code to delete this cached block. 
- raw_block = nullptr; RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD); } else { RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + delete block_cont_for_comp_cache; } } - delete raw_block; // insert into uncompressed block cache - assert((block->value->compression_type() == kNoCompression)); - if (block_cache != nullptr && block->value->cachable()) { - size_t charge = block->value->ApproximateMemoryUsage(); - s = block_cache->Insert(block_cache_key, block->value, charge, - &DeleteCachedEntry, &(block->cache_handle), - priority); + if (block_cache != nullptr && cached_block->value->own_bytes()) { + size_t charge = cached_block->value->ApproximateMemoryUsage(); + s = block_cache->Insert(block_cache_key, cached_block->value, charge, + &DeleteCachedEntry, + &(cached_block->cache_handle), priority); +#ifndef NDEBUG block_cache->TEST_mark_as_data_block(block_cache_key, charge); +#endif // NDEBUG if (s.ok()) { - assert(block->cache_handle != nullptr); + assert(cached_block->cache_handle != nullptr); if (get_context != nullptr) { get_context->get_context_stats_.num_cache_add++; get_context->get_context_stats_.num_cache_bytes_write += charge; @@ -1373,12 +1425,12 @@ Status BlockBasedTable::PutDataBlockToCache( RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge); } } - assert(reinterpret_cast( - block_cache->Value(block->cache_handle)) == block->value); + assert(reinterpret_cast(block_cache->Value( + cached_block->cache_handle)) == cached_block->value); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); - delete block->value; - block->value = nullptr; + delete cached_block->value; + cached_block->value = nullptr; } } @@ -1399,10 +1451,11 @@ FilterBlockReader* BlockBasedTable::ReadFilter( Slice dummy_comp_dict; - BlockFetcher block_fetcher(rep->file.get(), prefetch_buffer, rep->footer, - ReadOptions(), filter_handle, &block, - rep->ioptions, false /* decompress */, - dummy_comp_dict, rep->persistent_cache_options); + BlockFetcher block_fetcher( + rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(), + filter_handle, &block, rep->ioptions, false /* decompress */, + false /*maybe_compressed*/, dummy_comp_dict, + rep->persistent_cache_options, GetMemoryAllocator(rep->table_options)); Status s = block_fetcher.ReadBlockContents(); if (!s.ok()) { @@ -1551,12 +1604,16 @@ InternalIteratorBase* BlockBasedTable::NewIndexIterator( GetContext* get_context) { // index reader has already been pre-populated. if (rep_->index_reader) { + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. return rep_->index_reader->NewIterator( input_iter, read_options.total_order_seek || disable_prefix_seek, read_options.fill_cache); } // we have a pinned index block if (rep_->index_entry.IsSet()) { + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. return rep_->index_entry.value->NewIterator( input_iter, read_options.total_order_seek || disable_prefix_seek, read_options.fill_cache); @@ -1639,6 +1696,8 @@ InternalIteratorBase* BlockBasedTable::NewIndexIterator( } assert(cache_handle); + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. 
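One detail of `PutDataBlockToCache` above is that the compressed contents are moved into a freshly heap-allocated `BlockContents` before insertion, because the argument may live on the caller's stack while the cache keeps only a raw pointer plus a deleter. The sketch below illustrates that ownership hand-off with simplified stand-in types; it is not the real `Cache` interface.

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>

struct BlockContents {
  std::string data;  // owned bytes (RocksDB uses an allocator-aware buffer)
};

// Toy cache: stores raw pointers and runs the deleter when entries go away.
class Cache {
 public:
  using Deleter = std::function<void(void*)>;
  void Insert(const std::string& key, void* value, Deleter deleter) {
    entries_[key] = {value, std::move(deleter)};
  }
  ~Cache() {
    for (auto& kv : entries_) kv.second.second(kv.second.first);
  }

 private:
  std::unordered_map<std::string, std::pair<void*, Deleter>> entries_;
};

void PutCompressedBlockToCache(const std::string& key, BlockContents* raw,
                               Cache* compressed_cache) {
  // Move the (possibly stack-resident) contents into a heap object the cache
  // can own and later delete; `raw` is left empty after this call.
  BlockContents* heap_copy = new BlockContents(std::move(*raw));
  compressed_cache->Insert(key, heap_copy, [](void* v) {
    delete static_cast<BlockContents*>(v);
  });
}

int main() {
  Cache compressed_cache;
  BlockContents raw{"compressed bytes"};  // lives on the stack
  PutCompressedBlockToCache("sst1:1234", &raw, &compressed_cache);
  std::cout << "moved-from size: " << raw.data.size() << "\n";  // typically 0
  return 0;
}
```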
auto* iter = index_reader->NewIterator( input_iter, read_options.total_order_seek || disable_prefix_seek); @@ -1673,9 +1732,9 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( if (rep->compression_dict_block) { compression_dict = rep->compression_dict_block->data; } - s = MaybeLoadDataBlockToCache(prefetch_buffer, rep, ro, handle, - compression_dict, &block, is_index, - get_context); + s = MaybeReadBlockAndLoadToCache(prefetch_buffer, rep, ro, handle, + compression_dict, &block, is_index, + get_context); } TBlockIter* iter; @@ -1697,10 +1756,13 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( READ_BLOCK_GET_MICROS); s = ReadBlockFromFile( rep->file.get(), prefetch_buffer, rep->footer, ro, handle, - &block_value, rep->ioptions, rep->blocks_maybe_compressed, - compression_dict, rep->persistent_cache_options, + &block_value, rep->ioptions, + rep->blocks_maybe_compressed /*do_decompress*/, + rep->blocks_maybe_compressed, compression_dict, + rep->persistent_cache_options, is_index ? kDisableGlobalSequenceNumber : rep->global_seqno, - rep->table_options.read_amp_bytes_per_bit, rep->immortal_table); + rep->table_options.read_amp_bytes_per_bit, + GetMemoryAllocator(rep->table_options)); } if (s.ok()) { block.value = block_value.release(); @@ -1710,10 +1772,20 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( if (s.ok()) { assert(block.value != nullptr); const bool kTotalOrderSeek = true; + // Block contents are pinned and remain pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to an immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. + bool block_contents_pinned = + (block.cache_handle != nullptr || + (!block.value->own_bytes() && rep->immortal_table)); iter = block.value->NewIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq, - index_key_is_full); + index_key_is_full, block_contents_pinned); if (block.cache_handle != nullptr) { iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, block.cache_handle); @@ -1722,7 +1794,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( // insert a dummy record to block cache to track the memory usage Cache::Handle* cache_handle; // There are two other types of cache keys: 1) SST cache key added in - // `MaybeLoadDataBlockToCache` 2) dummy cache key added in + // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in // `write_buffer_manager`.
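The `block_contents_pinned` decision above boils down to a small predicate, and restating it standalone makes the two cases easier to see: either a cache handle keeps the bytes alive until the registered cleanup runs, or the block does not own a private copy and the underlying file is immortal (for example an mmapped file that is never closed). Names below are illustrative, not RocksDB API.

```cpp
#include <cassert>

// True if the bytes backing the block stay valid after the iterator is gone,
// provided the iterator's cleanup functions are transferred to another owner.
bool BlockContentsPinned(bool has_cache_handle, bool block_owns_bytes,
                         bool immortal_source) {
  return has_cache_handle || (!block_owns_bytes && immortal_source);
}

int main() {
  // Cached block: pinned regardless of the file's lifetime.
  assert(BlockContentsPinned(true, true, false));
  // Block reading straight out of an mmapped, never-closed file: pinned.
  assert(BlockContentsPinned(false, false, true));
  // Freshly heap-allocated block with no cache handle: not pinned; the
  // iterator owns the only copy and deletes it on destruction.
  assert(!BlockContentsPinned(false, true, false));
  return 0;
}
```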
Use longer prefix (41 bytes) to differentiate // from SST cache key(31 bytes), and use non-zero prefix to // differentiate from `write_buffer_manager` @@ -1758,25 +1830,28 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( return iter; } -Status BlockBasedTable::MaybeLoadDataBlockToCache( +Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro, const BlockHandle& handle, Slice compression_dict, CachableEntry* block_entry, bool is_index, GetContext* get_context) { assert(block_entry != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); Cache* block_cache = rep->table_options.block_cache.get(); + + // No point to cache compressed blocks if it never goes away Cache* block_cache_compressed = - rep->table_options.block_cache_compressed.get(); + rep->immortal_table ? nullptr + : rep->table_options.block_cache_compressed.get(); + // First, try to get the block from the cache + // // If either block cache is enabled, we'll try to read from it. Status s; + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice key /* key to the block cache */; + Slice ckey /* key to the compressed block cache */; if (block_cache != nullptr || block_cache_compressed != nullptr) { - Statistics* statistics = rep->ioptions.statistics; - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - Slice key, /* key to the block cache */ - ckey /* key to the compressed block cache */; - // create key for block cache if (block_cache != nullptr) { key = GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, @@ -1789,30 +1864,42 @@ Status BlockBasedTable::MaybeLoadDataBlockToCache( compressed_cache_key); } - s = GetDataBlockFromCache( - key, ckey, block_cache, block_cache_compressed, rep->ioptions, ro, - block_entry, rep->table_options.format_version, compression_dict, - rep->table_options.read_amp_bytes_per_bit, is_index, get_context); + s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, + rep, ro, block_entry, compression_dict, + rep->table_options.read_amp_bytes_per_bit, + is_index, get_context); + // Can't find the block from the cache. If I/O is allowed, read from the + // file. if (block_entry->value == nullptr && !no_io && ro.fill_cache) { - std::unique_ptr raw_block; + Statistics* statistics = rep->ioptions.statistics; + bool do_decompress = + block_cache_compressed == nullptr && rep->blocks_maybe_compressed; + CompressionType raw_block_comp_type; + BlockContents raw_block_contents; { StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS); - s = ReadBlockFromFile( + BlockFetcher block_fetcher( rep->file.get(), prefetch_buffer, rep->footer, ro, handle, - &raw_block, rep->ioptions, - block_cache_compressed == nullptr && rep->blocks_maybe_compressed, + &raw_block_contents, rep->ioptions, + do_decompress /* do uncompress */, rep->blocks_maybe_compressed, compression_dict, rep->persistent_cache_options, - is_index ? 
kDisableGlobalSequenceNumber : rep->global_seqno, - rep->table_options.read_amp_bytes_per_bit, rep->immortal_table); + GetMemoryAllocator(rep->table_options), + GetMemoryAllocatorForCompressedBlock(rep->table_options)); + s = block_fetcher.ReadBlockContents(); + raw_block_comp_type = block_fetcher.get_compression_type(); } if (s.ok()) { + SequenceNumber seq_no = rep->get_global_seqno(is_index); + // If filling cache is allowed and a cache is configured, try to put the + // block to the cache. s = PutDataBlockToCache( key, ckey, block_cache, block_cache_compressed, ro, rep->ioptions, - block_entry, raw_block.release(), rep->table_options.format_version, - compression_dict, rep->table_options.read_amp_bytes_per_bit, - is_index, + block_entry, &raw_block_contents, raw_block_comp_type, + rep->table_options.format_version, compression_dict, seq_no, + rep->table_options.read_amp_bytes_per_bit, + GetMemoryAllocator(rep->table_options), is_index, is_index && rep->table_options .cache_index_and_filter_blocks_with_high_priority ? Cache::Priority::HIGH @@ -1855,6 +1942,8 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ, block_cache->GetUsage(block->second.cache_handle)); Statistics* kNullStats = nullptr; + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. return block->second.value->NewIterator( &rep->internal_comparator, rep->internal_comparator.user_comparator(), nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_); @@ -1933,7 +2022,7 @@ bool BlockBasedTable::PrefixMayMatch( // Then, try find it within each block // we already know prefix_extractor and prefix_extractor_name must match // because `CheckPrefixMayMatch` first checks `check_filter_ == true` - unique_ptr> iiter( + std::unique_ptr> iiter( NewIndexIterator(no_io_read_options, /* need_upper_bound_check */ false)); iiter->Seek(internal_prefix); @@ -2249,7 +2338,20 @@ InternalIterator* BlockBasedTable::NewIterator( } } -InternalIterator* BlockBasedTable::NewRangeTombstoneIterator( +FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( + const ReadOptions& read_options) { + if (rep_->fragmented_range_dels == nullptr) { + return nullptr; + } + SequenceNumber snapshot = kMaxSequenceNumber; + if (read_options.snapshot != nullptr) { + snapshot = read_options.snapshot->GetSequenceNumber(); + } + return new FragmentedRangeTombstoneIterator( + rep_->fragmented_range_dels, rep_->internal_comparator, snapshot); +} + +InternalIterator* BlockBasedTable::NewUnfragmentedRangeTombstoneIterator( const ReadOptions& read_options) { if (rep_->range_del_handle.IsNull()) { // The block didn't exist, nullptr indicates no range tombstones. 
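`NewRangeTombstoneIterator` now bounds the fragmented tombstones by the read snapshot, falling back to `kMaxSequenceNumber` when no snapshot is set. Below is a toy check of that visibility rule, reusing the simplified fragment type from the earlier sketch; it is not the `FragmentedRangeTombstoneIterator` API, and the sentinel value is a stand-in for RocksDB's constant.

```cpp
#include <cstdint>
#include <iostream>
#include <limits>
#include <string>
#include <vector>

// Toy stand-in for RocksDB's "no snapshot" sentinel.
constexpr uint64_t kMaxSequenceNumber = std::numeric_limits<uint64_t>::max();

struct Fragment {
  std::string start, end;     // [start, end)
  std::vector<uint64_t> seqs; // covering sequence numbers, newest first
};

// Is `user_key` covered by a tombstone visible at `snapshot`?
bool RangeDeleted(const std::string& user_key, uint64_t snapshot,
                  const std::vector<Fragment>& fragments) {
  for (const auto& f : fragments) {
    if (f.start <= user_key && user_key < f.end) {
      for (uint64_t seq : f.seqs) {
        if (seq <= snapshot) return true;  // visible tombstone covers the key
      }
    }
  }
  return false;
}

int main() {
  std::vector<Fragment> fragments = {{"f", "m", {20}}};
  std::cout << RangeDeleted("g", kMaxSequenceNumber, fragments) << "\n";  // 1
  std::cout << RangeDeleted("g", 15, fragments) << "\n";  // 0: older snapshot
  return 0;
}
```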
@@ -2302,6 +2404,7 @@ bool BlockBasedTable::FullFilterKeyMayMatch( } if (may_match) { RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); } return may_match; } @@ -2326,6 +2429,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, if (!FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor)) { RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); } else { IndexBlockIter iiter_on_stack; // if prefix_extractor found in block differs from options, disable @@ -2358,6 +2462,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, // TODO: think about interaction with Merge. If a user key cannot // cross one data block, we should be fine. RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); break; } else { DataBlockIter biter; @@ -2410,6 +2515,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } if (matched && filter != nullptr && !filter->IsBlockBased()) { RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, + rep_->level); } if (s.ok()) { s = iiter->status(); @@ -2524,11 +2631,11 @@ Status BlockBasedTable::VerifyChecksumInBlocks( BlockHandle handle = index_iter->value(); BlockContents contents; Slice dummy_comp_dict; - BlockFetcher block_fetcher(rep_->file.get(), nullptr /* prefetch buffer */, - rep_->footer, ReadOptions(), handle, &contents, - rep_->ioptions, false /* decompress */, - dummy_comp_dict /*compression dict*/, - rep_->persistent_cache_options); + BlockFetcher block_fetcher( + rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, + ReadOptions(), handle, &contents, rep_->ioptions, + false /* decompress */, false /*maybe_compressed*/, + dummy_comp_dict /*compression dict*/, rep_->persistent_cache_options); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { break; @@ -2550,11 +2657,11 @@ Status BlockBasedTable::VerifyChecksumInBlocks( s = handle.DecodeFrom(&input); BlockContents contents; Slice dummy_comp_dict; - BlockFetcher block_fetcher(rep_->file.get(), nullptr /* prefetch buffer */, - rep_->footer, ReadOptions(), handle, &contents, - rep_->ioptions, false /* decompress */, - dummy_comp_dict /*compression dict*/, - rep_->persistent_cache_options); + BlockFetcher block_fetcher( + rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, + ReadOptions(), handle, &contents, rep_->ioptions, + false /* decompress */, false /*maybe_compressed*/, + dummy_comp_dict /*compression dict*/, rep_->persistent_cache_options); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { break; @@ -2583,8 +2690,7 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, Status s; s = GetDataBlockFromCache( - cache_key, ckey, block_cache, nullptr, rep_->ioptions, options, &block, - rep_->table_options.format_version, + cache_key, ckey, block_cache, nullptr, rep_, options, &block, rep_->compression_dict_block ? 
rep_->compression_dict_block->data : Slice(), 0 /* read_amp_bytes_per_bit */); @@ -2644,7 +2750,8 @@ Status BlockBasedTable::CreateIndexReader( rep_->table_properties == nullptr || rep_->table_properties->index_key_is_user_key == 0, rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0); + rep_->table_properties->index_value_is_delta_encoded == 0, + GetMemoryAllocator(rep_->table_options)); } case BlockBasedTableOptions::kBinarySearch: { return BinarySearchIndexReader::Create( @@ -2653,7 +2760,8 @@ Status BlockBasedTable::CreateIndexReader( rep_->table_properties == nullptr || rep_->table_properties->index_key_is_user_key == 0, rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0); + rep_->table_properties->index_value_is_delta_encoded == 0, + GetMemoryAllocator(rep_->table_options)); } case BlockBasedTableOptions::kHashSearch: { std::unique_ptr meta_guard; @@ -2675,7 +2783,8 @@ Status BlockBasedTable::CreateIndexReader( rep_->table_properties == nullptr || rep_->table_properties->index_key_is_user_key == 0, rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0); + rep_->table_properties->index_value_is_delta_encoded == 0, + GetMemoryAllocator(rep_->table_options)); } meta_index_iter = meta_iter_guard.get(); } @@ -2688,7 +2797,8 @@ Status BlockBasedTable::CreateIndexReader( rep_->table_properties == nullptr || rep_->table_properties->index_key_is_user_key == 0, rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0); + rep_->table_properties->index_value_is_delta_encoded == 0, + GetMemoryAllocator(rep_->table_options)); } default: { std::string error_message = @@ -2699,7 +2809,7 @@ Status BlockBasedTable::CreateIndexReader( } uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) { - unique_ptr> index_iter( + std::unique_ptr> index_iter( NewIndexIterator(ReadOptions())); index_iter->Seek(key); @@ -2857,7 +2967,8 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file, BlockFetcher block_fetcher( rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer, ReadOptions(), handle, &block, rep_->ioptions, - false /*decompress*/, dummy_comp_dict /*compression dict*/, + false /*decompress*/, false /*maybe_compressed*/, + dummy_comp_dict /*compression dict*/, rep_->persistent_cache_options); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 3cada0c2c2d..cb6a865660c 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -16,6 +16,7 @@ #include #include +#include "db/range_tombstone_fragmenter.h" #include "options/cf_options.h" #include "rocksdb/options.h" #include "rocksdb/persistent_cache.h" @@ -88,8 +89,9 @@ class BlockBasedTable : public TableReader { const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_key_comparator, - unique_ptr&& file, - uint64_t file_size, unique_ptr* table_reader, + std::unique_ptr&& file, + uint64_t file_size, + std::unique_ptr* table_reader, const SliceTransform* prefix_extractor = nullptr, bool prefetch_index_and_filter_in_cache = true, bool skip_filters = false, int level = -1, @@ -112,7 +114,7 @@ class BlockBasedTable : public TableReader { bool skip_filters = false, bool for_compaction = false) override; - InternalIterator* NewRangeTombstoneIterator( + FragmentedRangeTombstoneIterator* 
NewRangeTombstoneIterator( const ReadOptions& read_options) override; // @param skip_filters Disables loading/accessing the filter block @@ -255,13 +257,11 @@ class BlockBasedTable : public TableReader { // @param block_entry value is set to the uncompressed block if found. If // in uncompressed block cache, also sets cache_handle to reference that // block. - static Status MaybeLoadDataBlockToCache(FilePrefetchBuffer* prefetch_buffer, - Rep* rep, const ReadOptions& ro, - const BlockHandle& handle, - Slice compression_dict, - CachableEntry* block_entry, - bool is_index = false, - GetContext* get_context = nullptr); + static Status MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro, + const BlockHandle& handle, Slice compression_dict, + CachableEntry* block_entry, bool is_index = false, + GetContext* get_context = nullptr); // For the following two functions: // if `no_io == true`, we will not try to read filter/index from sst file @@ -299,9 +299,9 @@ class BlockBasedTable : public TableReader { // dictionary. static Status GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, - const ImmutableCFOptions& ioptions, const ReadOptions& read_options, - BlockBasedTable::CachableEntry* block, uint32_t format_version, + Cache* block_cache, Cache* block_cache_compressed, Rep* rep, + const ReadOptions& read_options, + BlockBasedTable::CachableEntry* block, const Slice& compression_dict, size_t read_amp_bytes_per_bit, bool is_index = false, GetContext* get_context = nullptr); @@ -311,16 +311,18 @@ class BlockBasedTable : public TableReader { // On success, Status::OK will be returned; also @block will be populated with // uncompressed block and its cache handle. // - // REQUIRES: raw_block is heap-allocated. PutDataBlockToCache() will be - // responsible for releasing its memory if error occurs. + // Allocated memory managed by raw_block_contents will be transferred to + // PutDataBlockToCache(). After the call, the object will be invalid. // @param compression_dict Data for presetting the compression library's // dictionary. 
static Status PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, const ImmutableCFOptions& ioptions, - CachableEntry* block, Block* raw_block, uint32_t format_version, - const Slice& compression_dict, size_t read_amp_bytes_per_bit, + CachableEntry* block, BlockContents* raw_block_contents, + CompressionType raw_block_comp_type, uint32_t format_version, + const Slice& compression_dict, SequenceNumber seq_no, + size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, bool is_index = false, Cache::Priority pri = Cache::Priority::LOW, GetContext* get_context = nullptr); @@ -383,6 +385,9 @@ class BlockBasedTable : public TableReader { friend class PartitionedFilterBlockReader; friend class PartitionedFilterBlockTest; + + InternalIterator* NewUnfragmentedRangeTombstoneIterator( + const ReadOptions& read_options); }; // Maitaning state of a two-level iteration on a partitioned index structure @@ -431,7 +436,7 @@ struct BlockBasedTable::Rep { Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options, const BlockBasedTableOptions& _table_opt, const InternalKeyComparator& _internal_comparator, bool skip_filters, - const bool _immortal_table) + int _level, const bool _immortal_table) : ioptions(_ioptions), env_options(_env_options), table_options(_table_opt), @@ -444,6 +449,7 @@ struct BlockBasedTable::Rep { prefix_filtering(true), range_del_handle(BlockHandle::NullBlockHandle()), global_seqno(kDisableGlobalSequenceNumber), + level(_level), immortal_table(_immortal_table) {} const ImmutableCFOptions& ioptions; @@ -452,7 +458,7 @@ struct BlockBasedTable::Rep { const FilterPolicy* const filter_policy; const InternalKeyComparator& internal_comparator; Status status; - unique_ptr file; + std::unique_ptr file; char cache_key_prefix[kMaxCacheKeyPrefixSize]; size_t cache_key_prefix_size = 0; char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize]; @@ -468,8 +474,8 @@ struct BlockBasedTable::Rep { // index_reader and filter will be populated and used only when // options.block_cache is nullptr; otherwise we will get the index block via // the block cache. - unique_ptr index_reader; - unique_ptr filter; + std::unique_ptr index_reader; + std::unique_ptr filter; enum class FilterType { kNoFilter, @@ -494,7 +500,7 @@ struct BlockBasedTable::Rep { // module should not be relying on db module. However to make things easier // and compatible with existing code, we introduce a wrapper that allows // block to extract prefix without knowing if a key is internal or not. - unique_ptr internal_prefix_transform; + std::unique_ptr internal_prefix_transform; std::shared_ptr table_prefix_extractor; // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache is @@ -509,6 +515,7 @@ struct BlockBasedTable::Rep { // cache is enabled. CachableEntry range_del_entry; BlockHandle range_del_handle; + std::shared_ptr fragmented_range_dels; // If global_seqno is used, all Keys in this file will have the same // seqno with value `global_seqno`. @@ -517,12 +524,20 @@ struct BlockBasedTable::Rep { // and every key have it's own seqno. SequenceNumber global_seqno; + // the level when the table is opened, could potentially change when trivial + // move is involved + int level; + // If false, blocks in this file are definitely all uncompressed. Knowing this // before reading individual blocks enables certain optimizations. 
bool blocks_maybe_compressed = true; bool closed = false; const bool immortal_table; + + SequenceNumber get_global_seqno(bool is_index) const { + return is_index ? kDisableGlobalSequenceNumber : global_seqno; + } }; template diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index ea97066ec40..9ad254a59f5 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -17,13 +17,14 @@ #include "rocksdb/env.h" #include "table/block.h" #include "table/block_based_table_reader.h" -#include "table/persistent_cache_helper.h" #include "table/format.h" +#include "table/persistent_cache_helper.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" #include "util/logging.h" +#include "util/memory_allocator.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/xxhash.h" @@ -48,6 +49,12 @@ void BlockFetcher::CheckBlockChecksum() { case kxxHash: actual = XXH32(data, static_cast(block_size_) + 1, 0); break; + case kxxHash64: + actual =static_cast ( + XXH64(data, static_cast(block_size_) + 1, 0) & + uint64_t{0xffffffff} + ); + break; default: status_ = Status::Corruption( "unknown checksum type " + ToString(footer_.checksum()) + " in " + @@ -107,9 +114,11 @@ bool BlockFetcher::TryGetCompressedBlockFromPersistentCache() { if (cache_options_.persistent_cache && cache_options_.persistent_cache->IsCompressed()) { // lookup uncompressed cache mode p-cache + std::unique_ptr raw_data; status_ = PersistentCacheHelper::LookupRawPage( - cache_options_, handle_, &heap_buf_, block_size_ + kBlockTrailerSize); + cache_options_, handle_, &raw_data, block_size_ + kBlockTrailerSize); if (status_.ok()) { + heap_buf_ = CacheAllocationPtr(raw_data.release()); used_buf_ = heap_buf_.get(); slice_ = Slice(heap_buf_.get(), block_size_); return true; @@ -131,8 +140,13 @@ void BlockFetcher::PrepareBufferForBlockFromFile() { // If we've got a small enough hunk of data, read it in to the // trivially allocated stack buffer instead of needing a full malloc() used_buf_ = &stack_buf_[0]; + } else if (maybe_compressed_ && !do_uncompress_) { + compressed_buf_ = AllocateBlock(block_size_ + kBlockTrailerSize, + memory_allocator_compressed_); + used_buf_ = compressed_buf_.get(); } else { - heap_buf_.reset(new char[block_size_ + kBlockTrailerSize]); + heap_buf_ = + AllocateBlock(block_size_ + kBlockTrailerSize, memory_allocator_); used_buf_ = heap_buf_.get(); } } @@ -159,29 +173,45 @@ void BlockFetcher::InsertUncompressedBlockToPersistentCacheIfNeeded() { } } +inline void BlockFetcher::CopyBufferToHeap() { + assert(used_buf_ != heap_buf_.get()); + heap_buf_ = AllocateBlock(block_size_ + kBlockTrailerSize, memory_allocator_); + memcpy(heap_buf_.get(), used_buf_, block_size_ + kBlockTrailerSize); +} + inline void BlockFetcher::GetBlockContents() { if (slice_.data() != used_buf_) { // the slice content is not the buffer provided - *contents_ = BlockContents(Slice(slice_.data(), block_size_), - immortal_source_, compression_type); + *contents_ = BlockContents(Slice(slice_.data(), block_size_)); } else { // page can be either uncompressed or compressed, the buffer either stack // or heap provided. 
Refer to https://github.com/facebook/rocksdb/pull/4096 if (got_from_prefetch_buffer_ || used_buf_ == &stack_buf_[0]) { - assert(used_buf_ != heap_buf_.get()); - heap_buf_.reset(new char[block_size_ + kBlockTrailerSize]); - memcpy(heap_buf_.get(), used_buf_, block_size_ + kBlockTrailerSize); + CopyBufferToHeap(); + } else if (used_buf_ == compressed_buf_.get()) { + if (compression_type_ == kNoCompression && + memory_allocator_ != memory_allocator_compressed_) { + CopyBufferToHeap(); + } else { + heap_buf_ = std::move(compressed_buf_); + } } - *contents_ = BlockContents(std::move(heap_buf_), block_size_, true, - compression_type); + *contents_ = BlockContents(std::move(heap_buf_), block_size_); } +#ifndef NDEBUG + contents_->is_raw_block = true; +#endif } Status BlockFetcher::ReadBlockContents() { block_size_ = static_cast(handle_.size()); if (TryGetUncompressBlockFromPersistentCache()) { + compression_type_ = kNoCompression; +#ifndef NDEBUG + contents_->is_raw_block = true; +#endif // NDEBUG return Status::OK(); } if (TryGetFromPrefetchBuffer()) { @@ -222,15 +252,16 @@ Status BlockFetcher::ReadBlockContents() { PERF_TIMER_GUARD(block_decompress_time); - compression_type = - static_cast(slice_.data()[block_size_]); + compression_type_ = get_block_compression_type(slice_.data(), block_size_); - if (do_uncompress_ && compression_type != kNoCompression) { + if (do_uncompress_ && compression_type_ != kNoCompression) { // compressed page, uncompress, update cache - UncompressionContext uncompression_ctx(compression_type, compression_dict_); - status_ = - UncompressBlockContents(uncompression_ctx, slice_.data(), block_size_, - contents_, footer_.version(), ioptions_); + UncompressionContext uncompression_ctx(compression_type_, + compression_dict_); + status_ = UncompressBlockContents(uncompression_ctx, slice_.data(), + block_size_, contents_, footer_.version(), + ioptions_, memory_allocator_); + compression_type_ = kNoCompression; } else { GetBlockContents(); } diff --git a/table/block_fetcher.h b/table/block_fetcher.h index 9e0d2448dd5..aed73a39252 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -10,6 +10,7 @@ #pragma once #include "table/block.h" #include "table/format.h" +#include "util/memory_allocator.h" namespace rocksdb { class BlockFetcher { @@ -24,9 +25,11 @@ class BlockFetcher { FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& read_options, const BlockHandle& handle, BlockContents* contents, const ImmutableCFOptions& ioptions, - bool do_uncompress, const Slice& compression_dict, + bool do_uncompress, bool maybe_compressed, + const Slice& compression_dict, const PersistentCacheOptions& cache_options, - const bool immortal_source = false) + MemoryAllocator* memory_allocator = nullptr, + MemoryAllocator* memory_allocator_compressed = nullptr) : file_(file), prefetch_buffer_(prefetch_buffer), footer_(footer), @@ -35,10 +38,13 @@ class BlockFetcher { contents_(contents), ioptions_(ioptions), do_uncompress_(do_uncompress), - immortal_source_(immortal_source), + maybe_compressed_(maybe_compressed), compression_dict_(compression_dict), - cache_options_(cache_options) {} + cache_options_(cache_options), + memory_allocator_(memory_allocator), + memory_allocator_compressed_(memory_allocator_compressed) {} Status ReadBlockContents(); + CompressionType get_compression_type() const { return compression_type_; } private: static const uint32_t kDefaultStackBufferSize = 5000; @@ -51,17 +57,20 @@ class BlockFetcher { BlockContents* contents_; const 
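The fetch path now distinguishes three destinations for the raw bytes, and `CopyBufferToHeap` exists to re-home data that landed in the stack (or prefetch) buffer before ownership is transferred to a longer-lived object. Below is a simplified sketch of that policy; the trailer size, threshold, and types are stand-ins rather than the real `BlockFetcher` members.

```cpp
#include <cstddef>
#include <cstring>
#include <memory>

constexpr size_t kStackBufferSize = 5000;
constexpr size_t kBlockTrailerSize = 5;  // assumed: 1 type byte + 4 checksum bytes

enum class BufferKind { kStack, kCompressedHeap, kHeap };

BufferKind ChooseBuffer(size_t block_size, bool maybe_compressed,
                        bool do_uncompress) {
  if (block_size + kBlockTrailerSize < kStackBufferSize) {
    return BufferKind::kStack;  // small read: no allocation at all
  }
  if (maybe_compressed && !do_uncompress) {
    return BufferKind::kCompressedHeap;  // will be kept/cached compressed as-is
  }
  return BufferKind::kHeap;  // regular allocator-backed buffer
}

// Re-home bytes that currently live in a stack (or prefetch) buffer so a
// longer-lived object can own them.
std::unique_ptr<char[]> CopyBufferToHeap(const char* src, size_t n) {
  std::unique_ptr<char[]> heap(new char[n]);
  std::memcpy(heap.get(), src, n);
  return heap;
}

int main() {
  char stack_buf[kStackBufferSize] = "small block bytes";
  if (ChooseBuffer(18, /*maybe_compressed=*/true, /*do_uncompress=*/true) ==
      BufferKind::kStack) {
    auto owned = CopyBufferToHeap(stack_buf, 18);
    (void)owned;
  }
  return 0;
}
```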
ImmutableCFOptions& ioptions_; bool do_uncompress_; - const bool immortal_source_; + bool maybe_compressed_; const Slice& compression_dict_; const PersistentCacheOptions& cache_options_; + MemoryAllocator* memory_allocator_; + MemoryAllocator* memory_allocator_compressed_; Status status_; Slice slice_; char* used_buf_ = nullptr; size_t block_size_; - std::unique_ptr heap_buf_; + CacheAllocationPtr heap_buf_; + CacheAllocationPtr compressed_buf_; char stack_buf_[kDefaultStackBufferSize]; bool got_from_prefetch_buffer_ = false; - rocksdb::CompressionType compression_type; + rocksdb::CompressionType compression_type_; // return true if found bool TryGetUncompressBlockFromPersistentCache(); @@ -69,6 +78,8 @@ class BlockFetcher { bool TryGetFromPrefetchBuffer(); bool TryGetCompressedBlockFromPersistentCache(); void PrepareBufferForBlockFromFile(); + // Copy content from used_buf_ to new heap buffer. + void CopyBufferToHeap(); void GetBlockContents(); void InsertCompressedBlockToPersistentCacheIfNeeded(); void InsertUncompressedBlockToPersistentCacheIfNeeded(); diff --git a/table/block_test.cc b/table/block_test.cc index 0ca6ec3f6de..5ac9ffb2141 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -117,7 +117,6 @@ TEST_F(BlockTest, SimpleTest) { // create block reader BlockContents contents; contents.data = rawblock; - contents.cachable = false; Block reader(std::move(contents), kDisableGlobalSequenceNumber); // read contents of block sequentially @@ -188,7 +187,6 @@ TEST_F(BlockTest, ValueDeltaEncodingTest) { // create block reader BlockContents contents; contents.data = rawblock; - contents.cachable = false; Block reader(std::move(contents), kDisableGlobalSequenceNumber); const bool kTotalOrderSeek = true; @@ -247,7 +245,6 @@ BlockContents GetBlockContents(std::unique_ptr *builder, BlockContents contents; contents.data = rawblock; - contents.cachable = false; return contents; } @@ -257,8 +254,7 @@ void CheckBlockContents(BlockContents contents, const int max_key, const std::vector &values) { const size_t prefix_size = 6; // create block reader - BlockContents contents_ref(contents.data, contents.cachable, - contents.compression_type); + BlockContents contents_ref(contents.data); Block reader1(std::move(contents), kDisableGlobalSequenceNumber); Block reader2(std::move(contents_ref), kDisableGlobalSequenceNumber); @@ -486,7 +482,6 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { // create block reader BlockContents contents; contents.data = rawblock; - contents.cachable = true; Block reader(std::move(contents), kDisableGlobalSequenceNumber, kBytesPerBit, stats.get()); @@ -521,7 +516,6 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { // create block reader BlockContents contents; contents.data = rawblock; - contents.cachable = true; Block reader(std::move(contents), kDisableGlobalSequenceNumber, kBytesPerBit, stats.get()); @@ -558,7 +552,6 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { // create block reader BlockContents contents; contents.data = rawblock; - contents.cachable = true; Block reader(std::move(contents), kDisableGlobalSequenceNumber, kBytesPerBit, stats.get()); diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index 7d9842a95f0..f590e6ad405 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -289,6 +289,7 @@ Status CuckooTableBuilder::Finish() { } } properties_.num_entries = num_entries_; + properties_.num_deletions = num_entries_ - num_values_; properties_.fixed_key_len = key_size_; properties_.user_collected_properties[ 
CuckooTablePropertyNames::kValueLength].assign( diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index 27eacf6ec95..c1e350327f3 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -43,8 +43,15 @@ class CuckooBuilderTest : public testing::Test { std::string expected_unused_bucket, uint64_t expected_table_size, uint32_t expected_num_hash_func, bool expected_is_last_level, uint32_t expected_cuckoo_block_size = 1) { + uint64_t num_deletions = 0; + for (const auto& key : keys) { + ParsedInternalKey parsed; + if (ParseInternalKey(key, &parsed) && parsed.type == kTypeDeletion) { + num_deletions++; + } + } // Read file - unique_ptr read_file; + std::unique_ptr read_file; ASSERT_OK(env_->NewRandomAccessFile(fname, &read_file, env_options_)); uint64_t read_file_size; ASSERT_OK(env_->GetFileSize(fname, &read_file_size)); @@ -56,7 +63,7 @@ class CuckooBuilderTest : public testing::Test { // Assert Table Properties. TableProperties* props = nullptr; - unique_ptr file_reader( + std::unique_ptr file_reader( new RandomAccessFileReader(std::move(read_file), fname)); ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size, kCuckooTableMagicNumber, ioptions, @@ -90,6 +97,7 @@ class CuckooBuilderTest : public testing::Test { ASSERT_EQ(expected_is_last_level, is_last_level_found); ASSERT_EQ(props->num_entries, keys.size()); + ASSERT_EQ(props->num_deletions, num_deletions); ASSERT_EQ(props->fixed_key_len, keys.empty() ? 0 : keys[0].size()); ASSERT_EQ(props->data_size, expected_unused_bucket.size() * (expected_table_size + expected_cuckoo_block_size - 1)); @@ -126,9 +134,10 @@ class CuckooBuilderTest : public testing::Test { } } - std::string GetInternalKey(Slice user_key, bool zero_seqno) { + std::string GetInternalKey(Slice user_key, bool zero_seqno, + ValueType type = kTypeValue) { IterKey ikey; - ikey.SetInternalKey(user_key, zero_seqno ? 0 : 1000, kTypeValue); + ikey.SetInternalKey(user_key, zero_seqno ? 
0 : 1000, type); return ikey.GetInternalKey().ToString(); } @@ -152,10 +161,10 @@ class CuckooBuilderTest : public testing::Test { }; TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) { - unique_ptr writable_file; + std::unique_ptr writable_file; fname = test::PerThreadDBPath("EmptyFile"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( + std::unique_ptr file_writer( new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, 4, 100, BytewiseComparator(), 1, false, false, @@ -169,50 +178,57 @@ TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) { } TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { - uint32_t num_hash_fun = 4; - std::vector user_keys = {"key01", "key02", "key03", "key04"}; - std::vector values = {"v01", "v02", "v03", "v04"}; - // Need to have a temporary variable here as VS compiler does not currently - // support operator= with initializer_list as a parameter - std::unordered_map> hm = { - {user_keys[0], {0, 1, 2, 3}}, - {user_keys[1], {1, 2, 3, 4}}, - {user_keys[2], {2, 3, 4, 5}}, - {user_keys[3], {3, 4, 5, 6}}}; - hash_map = std::move(hm); - - std::vector expected_locations = {0, 1, 2, 3}; - std::vector keys; - for (auto& user_key : user_keys) { - keys.push_back(GetInternalKey(user_key, false)); - } - uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - - unique_ptr writable_file; - fname = test::PerThreadDBPath("NoCollisionFullKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); - CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, - 100, BytewiseComparator(), 1, false, false, - GetSliceHash, 0 /* column_family_id */, - kDefaultColumnFamilyName); - ASSERT_OK(builder.status()); - for (uint32_t i = 0; i < user_keys.size(); i++) { - builder.Add(Slice(keys[i]), Slice(values[i])); - ASSERT_EQ(builder.NumEntries(), i + 1); + for (auto type : {kTypeValue, kTypeDeletion}) { + uint32_t num_hash_fun = 4; + std::vector user_keys = {"key01", "key02", "key03", "key04"}; + std::vector values; + if (type == kTypeValue) { + values = {"v01", "v02", "v03", "v04"}; + } else { + values = {"", "", "", ""}; + } + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map> hm = { + {user_keys[0], {0, 1, 2, 3}}, + {user_keys[1], {1, 2, 3, 4}}, + {user_keys[2], {2, 3, 4, 5}}, + {user_keys[3], {3, 4, 5, 6}}}; + hash_map = std::move(hm); + + std::vector expected_locations = {0, 1, 2, 3}; + std::vector keys; + for (auto& user_key : user_keys) { + keys.push_back(GetInternalKey(user_key, false, type)); + } + uint64_t expected_table_size = GetExpectedTableSize(keys.size()); + + std::unique_ptr writable_file; + fname = test::PerThreadDBPath("NoCollisionFullKey"); + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(keys[i]), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + 
ASSERT_OK(builder.status()); + } + size_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = GetInternalKey("key00", true); + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(keys, values, expected_locations, expected_unused_bucket, + expected_table_size, 2, false); } - size_t bucket_size = keys[0].size() + values[0].size(); - ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); - ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - - std::string expected_unused_bucket = GetInternalKey("key00", true); - expected_unused_bucket += std::string(values[0].size(), 'a'); - CheckFileContents(keys, values, expected_locations, - expected_unused_bucket, expected_table_size, 2, false); } TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { @@ -236,10 +252,10 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - unique_ptr writable_file; + std::unique_ptr writable_file; fname = test::PerThreadDBPath("WithCollisionFullKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( + std::unique_ptr file_writer( new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, @@ -284,11 +300,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - unique_ptr writable_file; + std::unique_ptr writable_file; uint32_t cuckoo_block_size = 2; fname = test::PerThreadDBPath("WithCollisionFullKey2"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( + std::unique_ptr file_writer( new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder( file_writer.get(), kHashTableRatio, num_hash_fun, 100, @@ -338,10 +354,10 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) { } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - unique_ptr writable_file; + std::unique_ptr writable_file; fname = test::PerThreadDBPath("WithCollisionPathFullKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( + std::unique_ptr file_writer( new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, @@ -388,10 +404,10 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - unique_ptr writable_file; + std::unique_ptr writable_file; fname = test::PerThreadDBPath("WithCollisionPathFullKeyAndCuckooBlock"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( + std::unique_ptr file_writer( new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 2, false, false, @@ -431,10 +447,10 @@ TEST_F(CuckooBuilderTest, 
WriteSuccessNoCollisionUserKey) { std::vector expected_locations = {0, 1, 2, 3}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - unique_ptr writable_file; + std::unique_ptr writable_file; fname = test::PerThreadDBPath("NoCollisionUserKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( + std::unique_ptr file_writer( new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, @@ -475,10 +491,10 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { std::vector expected_locations = {0, 1, 2, 3}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - unique_ptr writable_file; + std::unique_ptr writable_file; fname = test::PerThreadDBPath("WithCollisionUserKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( + std::unique_ptr file_writer( new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, @@ -521,10 +537,10 @@ TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) { std::vector expected_locations = {0, 1, 3, 4, 2}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - unique_ptr writable_file; + std::unique_ptr writable_file; fname = test::PerThreadDBPath("WithCollisionPathUserKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( + std::unique_ptr file_writer( new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, @@ -566,10 +582,10 @@ TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) { }; hash_map = std::move(hm); - unique_ptr writable_file; + std::unique_ptr writable_file; fname = test::PerThreadDBPath("WithCollisionPathUserKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( + std::unique_ptr file_writer( new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, @@ -594,10 +610,10 @@ TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) { uint32_t num_hash_fun = 4; std::string user_key = "repeatedkey"; - unique_ptr writable_file; + std::unique_ptr writable_file; fname = test::PerThreadDBPath("FailWhenSameKeyInserted"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( + std::unique_ptr file_writer( new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, diff --git a/table/cuckoo_table_factory.cc b/table/cuckoo_table_factory.cc index 84d22468eb9..74d18d51213 100644 --- a/table/cuckoo_table_factory.cc +++ b/table/cuckoo_table_factory.cc @@ -14,7 +14,7 @@ namespace rocksdb { Status CuckooTableFactory::NewTableReader( const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, + std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table, bool /*prefetch_index_and_filter_in_cache*/) const { std::unique_ptr new_reader(new CuckooTableReader( diff --git 
a/table/cuckoo_table_factory.h b/table/cuckoo_table_factory.h index a96635de57d..eb3c5e51768 100644 --- a/table/cuckoo_table_factory.h +++ b/table/cuckoo_table_factory.h @@ -60,8 +60,8 @@ class CuckooTableFactory : public TableFactory { Status NewTableReader( const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table, bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 36083c54747..74fb52e6c78 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -95,7 +95,7 @@ class CuckooReaderTest : public testing::Test { const Comparator* ucomp = BytewiseComparator()) { std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); - unique_ptr file_writer( + std::unique_ptr file_writer( new WritableFileWriter(std::move(writable_file), fname, env_options)); CuckooTableBuilder builder( @@ -115,7 +115,7 @@ class CuckooReaderTest : public testing::Test { // Check reader now. std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - unique_ptr file_reader( + std::unique_ptr file_reader( new RandomAccessFileReader(std::move(read_file), fname)); const ImmutableCFOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, @@ -144,7 +144,7 @@ class CuckooReaderTest : public testing::Test { void CheckIterator(const Comparator* ucomp = BytewiseComparator()) { std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - unique_ptr file_reader( + std::unique_ptr file_reader( new RandomAccessFileReader(std::move(read_file), fname)); const ImmutableCFOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, @@ -323,7 +323,7 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { CreateCuckooFileAndCheckReader(); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - unique_ptr file_reader( + std::unique_ptr file_reader( new RandomAccessFileReader(std::move(read_file), fname)); const ImmutableCFOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucmp, @@ -411,7 +411,7 @@ void WriteFile(const std::vector& keys, std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); - unique_ptr file_writer( + std::unique_ptr file_writer( new WritableFileWriter(std::move(writable_file), fname, env_options)); CuckooTableBuilder builder( file_writer.get(), hash_ratio, 64, 1000, test::Uint64Comparator(), 5, @@ -432,7 +432,7 @@ void WriteFile(const std::vector& keys, env->GetFileSize(fname, &file_size); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - unique_ptr file_reader( + std::unique_ptr file_reader( new RandomAccessFileReader(std::move(read_file), fname)); const ImmutableCFOptions ioptions(options); @@ -464,7 +464,7 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { env->GetFileSize(fname, &file_size); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - unique_ptr file_reader( + std::unique_ptr file_reader( new RandomAccessFileReader(std::move(read_file), fname)); const ImmutableCFOptions ioptions(options); diff --git 
a/table/data_block_hash_index_test.cc b/table/data_block_hash_index_test.cc index dc62917f2a1..ac12bbf935d 100644 --- a/table/data_block_hash_index_test.cc +++ b/table/data_block_hash_index_test.cc @@ -7,12 +7,14 @@ #include #include +#include "db/table_properties_collector.h" #include "rocksdb/slice.h" #include "table/block.h" #include "table/block_based_table_reader.h" #include "table/block_builder.h" #include "table/data_block_hash_index.h" #include "table/get_context.h" +#include "table/table_builder.h" #include "util/testharness.h" #include "util/testutil.h" @@ -282,7 +284,6 @@ TEST(DataBlockHashIndex, BlockRestartIndexExceedMax) { // create block reader BlockContents contents; contents.data = rawblock; - contents.cachable = false; Block reader(std::move(contents), kDisableGlobalSequenceNumber); ASSERT_EQ(reader.IndexType(), @@ -305,7 +306,6 @@ TEST(DataBlockHashIndex, BlockRestartIndexExceedMax) { // create block reader BlockContents contents; contents.data = rawblock; - contents.cachable = false; Block reader(std::move(contents), kDisableGlobalSequenceNumber); ASSERT_EQ(reader.IndexType(), @@ -337,7 +337,6 @@ TEST(DataBlockHashIndex, BlockSizeExceedMax) { // create block reader BlockContents contents; contents.data = rawblock; - contents.cachable = false; Block reader(std::move(contents), kDisableGlobalSequenceNumber); ASSERT_EQ(reader.IndexType(), @@ -362,7 +361,6 @@ TEST(DataBlockHashIndex, BlockSizeExceedMax) { // create block reader BlockContents contents; contents.data = rawblock; - contents.cachable = false; Block reader(std::move(contents), kDisableGlobalSequenceNumber); // the index type have fallen back to binary when build finish. @@ -390,7 +388,6 @@ TEST(DataBlockHashIndex, BlockTestSingleKey) { // create block reader BlockContents contents; contents.data = rawblock; - contents.cachable = false; Block reader(std::move(contents), kDisableGlobalSequenceNumber); const InternalKeyComparator icmp(BytewiseComparator()); @@ -472,7 +469,6 @@ TEST(DataBlockHashIndex, BlockTestLarge) { // create block reader BlockContents contents; contents.data = rawblock; - contents.cachable = false; Block reader(std::move(contents), kDisableGlobalSequenceNumber); const InternalKeyComparator icmp(BytewiseComparator()); @@ -540,9 +536,9 @@ TEST(DataBlockHashIndex, BlockTestLarge) { void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, std::string& v2, InternalKey& seek_ikey, GetContext& get_context, Options& options) { - unique_ptr file_writer; - unique_ptr file_reader; - unique_ptr table_reader; + std::unique_ptr file_writer; + std::unique_ptr file_reader; + std::unique_ptr table_reader; int level_ = -1; std::vector keys; @@ -555,7 +551,7 @@ void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, soptions.use_mmap_reads = ioptions.allow_mmap_reads; file_writer.reset( test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); - unique_ptr builder; + std::unique_ptr builder; std::vector> int_tbl_prop_collector_factories; std::string column_family_name; diff --git a/table/format.cc b/table/format.cc index 16d959c3dce..0e43e824334 100644 --- a/table/format.cc +++ b/table/format.cc @@ -24,6 +24,7 @@ #include "util/crc32c.h" #include "util/file_reader_writer.h" #include "util/logging.h" +#include "util/memory_allocator.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/xxhash.h" @@ -279,8 +280,8 @@ Status ReadFooterFromFile(RandomAccessFileReader* file, Status UncompressBlockContentsForCompressionType( const UncompressionContext& 
uncompression_ctx, const char* data, size_t n, BlockContents* contents, uint32_t format_version, - const ImmutableCFOptions& ioptions) { - std::unique_ptr ubuf; + const ImmutableCFOptions& ioptions, MemoryAllocator* allocator) { + CacheAllocationPtr ubuf; assert(uncompression_ctx.type() != kNoCompression && "Invalid compression type"); @@ -296,81 +297,82 @@ Status UncompressBlockContentsForCompressionType( if (!Snappy_GetUncompressedLength(data, n, &ulength)) { return Status::Corruption(snappy_corrupt_msg); } - ubuf.reset(new char[ulength]); + ubuf = AllocateBlock(ulength, allocator); if (!Snappy_Uncompress(data, n, ubuf.get())) { return Status::Corruption(snappy_corrupt_msg); } - *contents = BlockContents(std::move(ubuf), ulength, true, kNoCompression); + *contents = BlockContents(std::move(ubuf), ulength); break; } case kZlibCompression: - ubuf.reset(Zlib_Uncompress( + ubuf = Zlib_Uncompress( uncompression_ctx, data, n, &decompress_size, - GetCompressFormatForVersion(kZlibCompression, format_version))); + GetCompressFormatForVersion(kZlibCompression, format_version), + allocator); if (!ubuf) { static char zlib_corrupt_msg[] = "Zlib not supported or corrupted Zlib compressed block contents"; return Status::Corruption(zlib_corrupt_msg); } - *contents = - BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + *contents = BlockContents(std::move(ubuf), decompress_size); break; case kBZip2Compression: - ubuf.reset(BZip2_Uncompress( + ubuf = BZip2_Uncompress( data, n, &decompress_size, - GetCompressFormatForVersion(kBZip2Compression, format_version))); + GetCompressFormatForVersion(kBZip2Compression, format_version), + allocator); if (!ubuf) { static char bzip2_corrupt_msg[] = "Bzip2 not supported or corrupted Bzip2 compressed block contents"; return Status::Corruption(bzip2_corrupt_msg); } - *contents = - BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + *contents = BlockContents(std::move(ubuf), decompress_size); break; case kLZ4Compression: - ubuf.reset(LZ4_Uncompress( + ubuf = LZ4_Uncompress( uncompression_ctx, data, n, &decompress_size, - GetCompressFormatForVersion(kLZ4Compression, format_version))); + GetCompressFormatForVersion(kLZ4Compression, format_version), + allocator); if (!ubuf) { static char lz4_corrupt_msg[] = "LZ4 not supported or corrupted LZ4 compressed block contents"; return Status::Corruption(lz4_corrupt_msg); } - *contents = - BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + *contents = BlockContents(std::move(ubuf), decompress_size); break; case kLZ4HCCompression: - ubuf.reset(LZ4_Uncompress( + ubuf = LZ4_Uncompress( uncompression_ctx, data, n, &decompress_size, - GetCompressFormatForVersion(kLZ4HCCompression, format_version))); + GetCompressFormatForVersion(kLZ4HCCompression, format_version), + allocator); if (!ubuf) { static char lz4hc_corrupt_msg[] = "LZ4HC not supported or corrupted LZ4HC compressed block contents"; return Status::Corruption(lz4hc_corrupt_msg); } - *contents = - BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + *contents = BlockContents(std::move(ubuf), decompress_size); break; case kXpressCompression: + // XPRESS allocates memory internally, thus no support for custom + // allocator. 
ubuf.reset(XPRESS_Uncompress(data, n, &decompress_size)); if (!ubuf) { static char xpress_corrupt_msg[] = "XPRESS not supported or corrupted XPRESS compressed block contents"; return Status::Corruption(xpress_corrupt_msg); } - *contents = - BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + *contents = BlockContents(std::move(ubuf), decompress_size); break; case kZSTD: case kZSTDNotFinalCompression: - ubuf.reset(ZSTD_Uncompress(uncompression_ctx, data, n, &decompress_size)); + ubuf = ZSTD_Uncompress(uncompression_ctx, data, n, &decompress_size, + allocator); if (!ubuf) { static char zstd_corrupt_msg[] = "ZSTD not supported or corrupted ZSTD compressed block contents"; return Status::Corruption(zstd_corrupt_msg); } - *contents = - BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + *contents = BlockContents(std::move(ubuf), decompress_size); break; default: return Status::Corruption("bad block type"); @@ -396,11 +398,13 @@ Status UncompressBlockContentsForCompressionType( Status UncompressBlockContents(const UncompressionContext& uncompression_ctx, const char* data, size_t n, BlockContents* contents, uint32_t format_version, - const ImmutableCFOptions& ioptions) { + const ImmutableCFOptions& ioptions, + MemoryAllocator* allocator) { assert(data[n] != kNoCompression); assert(data[n] == uncompression_ctx.type()); - return UncompressBlockContentsForCompressionType( - uncompression_ctx, data, n, contents, format_version, ioptions); + return UncompressBlockContentsForCompressionType(uncompression_ctx, data, n, + contents, format_version, + ioptions, allocator); } } // namespace rocksdb diff --git a/table/format.h b/table/format.h index 6e0e99c1c74..0039c70a417 100644 --- a/table/format.h +++ b/table/format.h @@ -26,6 +26,7 @@ #include "port/port.h" // noexcept #include "table/persistent_cache_options.h" #include "util/file_reader_writer.h" +#include "util/memory_allocator.h" namespace rocksdb { @@ -188,24 +189,42 @@ Status ReadFooterFromFile(RandomAccessFileReader* file, // 1-byte type + 32-bit crc static const size_t kBlockTrailerSize = 5; +inline CompressionType get_block_compression_type(const char* block_data, + size_t block_size) { + return static_cast(block_data[block_size]); +} + struct BlockContents { Slice data; // Actual contents of data - bool cachable; // True iff data can be cached - CompressionType compression_type; - std::unique_ptr allocation; + CacheAllocationPtr allocation; + +#ifndef NDEBUG + // Whether the block is a raw block, which contains compression type + // byte. It is only used for assertion. 
+ bool is_raw_block = false; +#endif // NDEBUG + + BlockContents() {} + + BlockContents(const Slice& _data) : data(_data) {} - BlockContents() : cachable(false), compression_type(kNoCompression) {} + BlockContents(CacheAllocationPtr&& _data, size_t _size) + : data(_data.get(), _size), allocation(std::move(_data)) {} - BlockContents(const Slice& _data, bool _cachable, - CompressionType _compression_type) - : data(_data), cachable(_cachable), compression_type(_compression_type) {} + BlockContents(std::unique_ptr&& _data, size_t _size) + : data(_data.get(), _size) { + allocation.reset(_data.release()); + } + + bool own_bytes() const { return allocation.get() != nullptr; } - BlockContents(std::unique_ptr&& _data, size_t _size, bool _cachable, - CompressionType _compression_type) - : data(_data.get(), _size), - cachable(_cachable), - compression_type(_compression_type), - allocation(std::move(_data)) {} + // It's the caller's responsibility to make sure that this is + // for raw block contents, which contains the compression + // byte in the end. + CompressionType get_compression_type() const { + assert(is_raw_block); + return get_block_compression_type(data.data(), data.size()); + } // The additional memory space taken by the block data. size_t usable_size() const { @@ -220,15 +239,20 @@ struct BlockContents { } } + size_t ApproximateMemoryUsage() const { + return usable_size() + sizeof(*this); + } + BlockContents(BlockContents&& other) ROCKSDB_NOEXCEPT { *this = std::move(other); } BlockContents& operator=(BlockContents&& other) { data = std::move(other.data); - cachable = other.cachable; - compression_type = other.compression_type; allocation = std::move(other.allocation); +#ifndef NDEBUG + is_raw_block = other.is_raw_block; +#endif // NDEBUG return *this; } }; @@ -252,7 +276,7 @@ extern Status ReadBlockContents( extern Status UncompressBlockContents( const UncompressionContext& uncompression_ctx, const char* data, size_t n, BlockContents* contents, uint32_t compress_format_version, - const ImmutableCFOptions& ioptions); + const ImmutableCFOptions& ioptions, MemoryAllocator* allocator = nullptr); // This is an extension to UncompressBlockContents that accepts // a specific compression type. This is used by un-wrapped blocks @@ -260,7 +284,7 @@ extern Status UncompressBlockContents( extern Status UncompressBlockContentsForCompressionType( const UncompressionContext& uncompression_ctx, const char* data, size_t n, BlockContents* contents, uint32_t compress_format_version, - const ImmutableCFOptions& ioptions); + const ImmutableCFOptions& ioptions, MemoryAllocator* allocator = nullptr); // Implementation details follow. 
Clients should ignore, diff --git a/table/get_context.cc b/table/get_context.cc index 0aa75b6079c..6f0bd2ebbc3 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -43,7 +43,7 @@ GetContext::GetContext(const Comparator* ucmp, Statistics* statistics, GetState init_state, const Slice& user_key, PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context, - RangeDelAggregator* _range_del_agg, Env* env, + SequenceNumber* _max_covering_tombstone_seq, Env* env, SequenceNumber* seq, PinnedIteratorsManager* _pinned_iters_mgr, ReadCallback* callback, bool* is_blob_index) @@ -56,7 +56,7 @@ GetContext::GetContext(const Comparator* ucmp, pinnable_val_(pinnable_val), value_found_(value_found), merge_context_(merge_context), - range_del_agg_(_range_del_agg), + max_covering_tombstone_seq_(_max_covering_tombstone_seq), env_(env), seq_(seq), replay_log_(nullptr), @@ -185,7 +185,8 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, auto type = parsed_key.type; // Key matches. Process it if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) && - range_del_agg_ != nullptr && range_del_agg_->ShouldDelete(parsed_key)) { + max_covering_tombstone_seq_ != nullptr && + *max_covering_tombstone_seq_ > parsed_key.sequence) { type = kTypeRangeDeletion; } switch (type) { diff --git a/table/get_context.h b/table/get_context.h index 066be104ba8..407473808f1 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -6,7 +6,6 @@ #pragma once #include #include "db/merge_context.h" -#include "db/range_del_aggregator.h" #include "db/read_callback.h" #include "rocksdb/env.h" #include "rocksdb/statistics.h" @@ -52,8 +51,9 @@ class GetContext { GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, const Slice& user_key, PinnableSlice* value, bool* value_found, - MergeContext* merge_context, RangeDelAggregator* range_del_agg, - Env* env, SequenceNumber* seq = nullptr, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, Env* env, + SequenceNumber* seq = nullptr, PinnedIteratorsManager* _pinned_iters_mgr = nullptr, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr); @@ -76,7 +76,9 @@ class GetContext { GetState State() const { return state_; } - RangeDelAggregator* range_del_agg() { return range_del_agg_; } + SequenceNumber* max_covering_tombstone_seq() { + return max_covering_tombstone_seq_; + } PinnedIteratorsManager* pinned_iters_mgr() { return pinned_iters_mgr_; } @@ -111,7 +113,7 @@ class GetContext { PinnableSlice* pinnable_val_; bool* value_found_; // Is value set correctly? 
Used by KeyMayExist MergeContext* merge_context_; - RangeDelAggregator* range_del_agg_; + SequenceNumber* max_covering_tombstone_seq_; Env* env_; // If a key is found, seq_ will be set to the SequenceNumber of most recent // write to the key or kMaxSequenceNumber if unknown diff --git a/table/iterator.cc b/table/iterator.cc index 97c47fb2854..3a1063f6ef9 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -103,7 +103,7 @@ Status Iterator::GetProperty(std::string prop_name, std::string* prop) { *prop = "0"; return Status::OK(); } - return Status::InvalidArgument("Undentified property."); + return Status::InvalidArgument("Unidentified property."); } namespace { diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 256730bfa7a..fdf8a56120e 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -79,6 +79,8 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { Add(TablePropertiesNames::kIndexValueIsDeltaEncoded, props.index_value_is_delta_encoded); Add(TablePropertiesNames::kNumEntries, props.num_entries); + Add(TablePropertiesNames::kDeletedKeys, props.num_deletions); + Add(TablePropertiesNames::kMergeOperands, props.num_merge_operands); Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions); Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); Add(TablePropertiesNames::kFilterSize, props.filter_size); @@ -173,7 +175,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ImmutableCFOptions& ioptions, TableProperties** table_properties, - bool compression_type_missing) { + bool /*compression_type_missing*/, + MemoryAllocator* memory_allocator) { assert(table_properties); Slice v = handle_value; @@ -189,15 +192,13 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, Slice compression_dict; PersistentCacheOptions cache_options; - BlockFetcher block_fetcher( - file, prefetch_buffer, footer, read_options, handle, &block_contents, - ioptions, false /* decompress */, compression_dict, cache_options); + BlockFetcher block_fetcher(file, prefetch_buffer, footer, read_options, + handle, &block_contents, ioptions, + false /* decompress */, false /*maybe_compressed*/, + compression_dict, cache_options, memory_allocator); s = block_fetcher.ReadBlockContents(); - // override compression_type when table file is known to contain undefined - // value at compression type marker - if (compression_type_missing) { - block_contents.compression_type = kNoCompression; - } + // property block is never compressed. Need to add uncompress logic if we are + // to compress it.. 
if (!s.ok()) { return s; @@ -229,6 +230,10 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, {TablePropertiesNames::kNumDataBlocks, &new_table_properties->num_data_blocks}, {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries}, + {TablePropertiesNames::kDeletedKeys, + &new_table_properties->num_deletions}, + {TablePropertiesNames::kMergeOperands, + &new_table_properties->num_merge_operands}, {TablePropertiesNames::kNumRangeDeletions, &new_table_properties->num_range_deletions}, {TablePropertiesNames::kFormatVersion, @@ -263,6 +268,12 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, {key, handle.offset() + iter.ValueOffset()}); if (pos != predefined_uint64_properties.end()) { + if (key == TablePropertiesNames::kDeletedKeys || + key == TablePropertiesNames::kMergeOperands) { + // Insert in user-collected properties for API backwards compatibility + new_table_properties->user_collected_properties.insert( + {key, raw_val.ToString()}); + } // handle predefined rocksdb properties uint64_t val; if (!GetVarint64(&raw_val, &val)) { @@ -305,9 +316,10 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions &ioptions, + const ImmutableCFOptions& ioptions, TableProperties** properties, - bool compression_type_missing) { + bool compression_type_missing, + MemoryAllocator* memory_allocator) { // -- Read metaindex block Footer footer; auto s = ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, @@ -323,19 +335,17 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, Slice compression_dict; PersistentCacheOptions cache_options; - BlockFetcher block_fetcher( - file, nullptr /* prefetch_buffer */, footer, read_options, - metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, - compression_dict, cache_options); + BlockFetcher block_fetcher(file, nullptr /* prefetch_buffer */, footer, + read_options, metaindex_handle, + &metaindex_contents, ioptions, + false /* decompress */, false /*maybe_compressed*/, + compression_dict, cache_options, memory_allocator); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; } - // override compression_type when table file is known to contain undefined - // value at compression type marker - if (compression_type_missing) { - metaindex_contents.compression_type = kNoCompression; - } + // property blocks are never compressed. Need to add uncompress logic if we + // are to compress it. 
Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); std::unique_ptr meta_iter( @@ -352,7 +362,8 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, TableProperties table_properties; if (found_properties_block == true) { s = ReadProperties(meta_iter->value(), file, nullptr /* prefetch_buffer */, - footer, ioptions, properties, compression_type_missing); + footer, ioptions, properties, compression_type_missing, + memory_allocator); } else { s = Status::NotFound(); } @@ -375,10 +386,11 @@ Status FindMetaBlock(InternalIterator* meta_index_iter, Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions &ioptions, + const ImmutableCFOptions& ioptions, const std::string& meta_block_name, BlockHandle* block_handle, - bool compression_type_missing) { + bool /*compression_type_missing*/, + MemoryAllocator* memory_allocator) { Footer footer; auto s = ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, &footer, table_magic_number); @@ -395,16 +407,14 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, BlockFetcher block_fetcher( file, nullptr /* prefetch_buffer */, footer, read_options, metaindex_handle, &metaindex_contents, ioptions, - false /* do decompression */, compression_dict, cache_options); + false /* do decompression */, false /*maybe_compressed*/, + compression_dict, cache_options, memory_allocator); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; } - // override compression_type when table file is known to contain undefined - // value at compression type marker - if (compression_type_missing) { - metaindex_contents.compression_type = kNoCompression; - } + // meta blocks are never compressed. Need to add uncompress logic if we are to + // compress it. Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); @@ -420,7 +430,8 @@ Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t table_magic_number, const ImmutableCFOptions& ioptions, const std::string& meta_block_name, - BlockContents* contents, bool compression_type_missing) { + BlockContents* contents, bool /*compression_type_missing*/, + MemoryAllocator* memory_allocator) { Status status; Footer footer; status = ReadFooterFromFile(file, prefetch_buffer, file_size, &footer, @@ -439,17 +450,14 @@ Status ReadMetaBlock(RandomAccessFileReader* file, BlockFetcher block_fetcher(file, prefetch_buffer, footer, read_options, metaindex_handle, &metaindex_contents, ioptions, - false /* decompress */, compression_dict, - cache_options); + false /* decompress */, false /*maybe_compressed*/, + compression_dict, cache_options, memory_allocator); status = block_fetcher.ReadBlockContents(); if (!status.ok()) { return status; } - // override compression_type when table file is known to contain undefined - // value at compression type marker - if (compression_type_missing) { - metaindex_contents.compression_type = kNoCompression; - } + // meta block is never compressed. Need to add uncompress logic if we are to + // compress it. 
// Finding metablock Block metaindex_block(std::move(metaindex_contents), @@ -469,7 +477,8 @@ Status ReadMetaBlock(RandomAccessFileReader* file, // Reading metablock BlockFetcher block_fetcher2( file, prefetch_buffer, footer, read_options, block_handle, contents, - ioptions, false /* decompress */, compression_dict, cache_options); + ioptions, false /* decompress */, false /*maybe_compressed*/, + compression_dict, cache_options, memory_allocator); return block_fetcher2.ReadBlockContents(); } diff --git a/table/meta_blocks.h b/table/meta_blocks.h index a18c8edc47c..1c8fe686ca8 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -11,12 +11,13 @@ #include "db/builder.h" #include "db/table_properties_collector.h" -#include "util/kv_map.h" #include "rocksdb/comparator.h" +#include "rocksdb/memory_allocator.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "table/block_builder.h" #include "table/format.h" +#include "util/kv_map.h" namespace rocksdb { @@ -96,7 +97,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ImmutableCFOptions& ioptions, TableProperties** table_properties, - bool compression_type_missing = false); + bool compression_type_missing = false, + MemoryAllocator* memory_allocator = nullptr); // Directly read the properties from the properties block of a plain table. // @returns a status to indicate if the operation succeeded. On success, @@ -108,9 +110,10 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, // `ReadProperties`, `FindMetaBlock`, and `ReadMetaBlock` Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions &ioptions, + const ImmutableCFOptions& ioptions, TableProperties** properties, - bool compression_type_missing = false); + bool compression_type_missing = false, + MemoryAllocator* memory_allocator = nullptr); // Find the meta block from the meta index block. Status FindMetaBlock(InternalIterator* meta_index_iter, @@ -120,10 +123,11 @@ Status FindMetaBlock(InternalIterator* meta_index_iter, // Find the meta block Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions &ioptions, + const ImmutableCFOptions& ioptions, const std::string& meta_block_name, BlockHandle* block_handle, - bool compression_type_missing = false); + bool compression_type_missing = false, + MemoryAllocator* memory_allocator = nullptr); // Read the specified meta block with name meta_block_name // from `file` and initialize `contents` with contents of this block. 
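The meta_blocks.{h,cc} hunks above and below thread an optional `MemoryAllocator*` through the meta-block readers (`ReadProperties`, `ReadTableProperties`, `FindMetaBlock`, `ReadMetaBlock`), so property and meta block buffers can come from a caller-supplied allocator instead of a plain `new char[]`. A minimal caller-side sketch, assuming the `MemoryAllocator` interface in `rocksdb/memory_allocator.h` exposes `Name`/`Allocate`/`Deallocate` as it does around this change; `CountingAllocator` and `ReadPropsWithAllocator` are illustrative names, not part of the patch:

```cpp
#include <atomic>
#include <new>

#include "rocksdb/memory_allocator.h"
#include "table/meta_blocks.h"

namespace {

// Illustrative allocator that just tracks how many bytes were handed out for
// block buffers; not part of this patch.
class CountingAllocator : public rocksdb::MemoryAllocator {
 public:
  const char* Name() const override { return "CountingAllocator"; }

  void* Allocate(size_t size) override {
    allocated_.fetch_add(size, std::memory_order_relaxed);
    return ::operator new(size);
  }

  void Deallocate(void* p) override { ::operator delete(p); }

  size_t allocated() const {
    return allocated_.load(std::memory_order_relaxed);
  }

 private:
  std::atomic<size_t> allocated_{0};
};

// Hypothetical call site: the allocator goes in the new trailing parameter of
// ReadTableProperties added by this diff.
rocksdb::Status ReadPropsWithAllocator(
    rocksdb::RandomAccessFileReader* file, uint64_t file_size,
    uint64_t table_magic_number, const rocksdb::ImmutableCFOptions& ioptions,
    rocksdb::TableProperties** props, CountingAllocator* allocator) {
  return rocksdb::ReadTableProperties(file, file_size, table_magic_number,
                                      ioptions, props,
                                      false /* compression_type_missing */,
                                      allocator);
}

}  // namespace
```

Passing `nullptr` (the default) keeps the previous `new[]`-backed behavior, which is why existing call sites that do not care about the allocator compile unchanged.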
@@ -134,6 +138,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, const ImmutableCFOptions& ioptions, const std::string& meta_block_name, BlockContents* contents, - bool compression_type_missing = false); + bool compression_type_missing = false, + MemoryAllocator* memory_allocator = nullptr); } // namespace rocksdb diff --git a/table/mock_table.cc b/table/mock_table.cc index a5473b30bc8..65a43616969 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -60,8 +60,8 @@ MockTableFactory::MockTableFactory() : next_id_(1) {} Status MockTableFactory::NewTableReader( const TableReaderOptions& /*table_reader_options*/, - unique_ptr&& file, uint64_t /*file_size*/, - unique_ptr* table_reader, + std::unique_ptr&& file, uint64_t /*file_size*/, + std::unique_ptr* table_reader, bool /*prefetch_index_and_filter_in_cache*/) const { uint32_t id = GetIDFromFile(file.get()); diff --git a/table/mock_table.h b/table/mock_table.h index 92cf87370ff..2f123a963cd 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -157,8 +157,8 @@ class MockTableFactory : public TableFactory { const char* Name() const override { return "MockTable"; } Status NewTableReader( const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, diff --git a/table/partitioned_filter_block_test.cc b/table/partitioned_filter_block_test.cc index 0b11c0df2a7..ffa8a9a5630 100644 --- a/table/partitioned_filter_block_test.cc +++ b/table/partitioned_filter_block_test.cc @@ -33,7 +33,7 @@ class MockedBlockBasedTable : public BlockBasedTable { const SliceTransform* prefix_extractor) const override { Slice slice = slices[filter_blk_handle.offset()]; auto obj = new FullFilterBlockReader( - prefix_extractor, true, BlockContents(slice, false, kNoCompression), + prefix_extractor, true, BlockContents(slice), rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); return {obj, nullptr}; } @@ -44,7 +44,7 @@ class MockedBlockBasedTable : public BlockBasedTable { const SliceTransform* prefix_extractor) const override { Slice slice = slices[filter_blk_handle.offset()]; auto obj = new FullFilterBlockReader( - prefix_extractor, true, BlockContents(slice, false, kNoCompression), + prefix_extractor, true, BlockContents(slice), rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); return obj; } @@ -147,10 +147,10 @@ class PartitionedFilterBlockTest const bool kImmortal = true; table.reset(new MockedBlockBasedTable( new BlockBasedTable::Rep(ioptions, env_options, table_options_, icomp, - !kSkipFilters, !kImmortal))); + !kSkipFilters, 0, !kImmortal))); auto reader = new PartitionedFilterBlockReader( - prefix_extractor, true, BlockContents(slice, false, kNoCompression), - nullptr, nullptr, icomp, table.get(), pib->seperator_is_key_plus_seq(), + prefix_extractor, true, BlockContents(slice), nullptr, nullptr, icomp, + table.get(), pib->seperator_is_key_plus_seq(), !pib->get_use_value_delta_encoding()); return reader; } diff --git a/table/persistent_cache_helper.cc b/table/persistent_cache_helper.cc index 103f57c80ac..4e90697a6e5 100644 --- a/table/persistent_cache_helper.cc +++ b/table/persistent_cache_helper.cc @@ -29,12 +29,9 @@ void PersistentCacheHelper::InsertUncompressedPage( const BlockContents& contents) { 
assert(cache_options.persistent_cache); assert(!cache_options.persistent_cache->IsCompressed()); - if (!contents.cachable || contents.compression_type != kNoCompression) { - // We shouldn't cache this. Either - // (1) content is not cacheable - // (2) content is compressed - return; - } + // Precondition: + // (1) content is cacheable + // (2) content is not compressed // construct the page key char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; @@ -109,8 +106,7 @@ Status PersistentCacheHelper::LookupUncompressedPage( // update stats RecordTick(cache_options.statistics, PERSISTENT_CACHE_HIT); // construct result and return - *contents = - BlockContents(std::move(data), size, false /*cacheable*/, kNoCompression); + *contents = BlockContents(std::move(data), size); return Status::OK(); } diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index 717635cc1a9..453b6c768b5 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -166,6 +166,12 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { properties_.num_entries++; properties_.raw_key_size += key.size(); properties_.raw_value_size += value.size(); + if (internal_key.type == kTypeDeletion || + internal_key.type == kTypeSingleDeletion) { + properties_.num_deletions++; + } else if (internal_key.type == kTypeMerge) { + properties_.num_merge_operands++; + } // notify property collectors NotifyCollectTableCollectorsOnAdd( diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index b88a689d4b0..273a1bd4f2f 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -19,8 +19,8 @@ namespace rocksdb { Status PlainTableFactory::NewTableReader( const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table, bool /*prefetch_index_and_filter_in_cache*/) const { return PlainTableReader::Open( table_reader_options.ioptions, table_reader_options.env_options, diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index f540a92b89d..157e3acda01 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -149,8 +149,8 @@ class PlainTableFactory : public TableFactory { const char* Name() const override { return "PlainTable"; } Status NewTableReader(const TableReaderOptions& table_reader_options, - unique_ptr&& file, - uint64_t file_size, unique_ptr* table, + std::unique_ptr&& file, + uint64_t file_size, std::unique_ptr* table, bool prefetch_index_and_filter_in_cache) const override; TableBuilder* NewTableBuilder( diff --git a/table/plain_table_key_coding.h b/table/plain_table_key_coding.h index 321e0aed594..9a27ad06b78 100644 --- a/table/plain_table_key_coding.h +++ b/table/plain_table_key_coding.h @@ -114,7 +114,7 @@ class PlainTableFileReader { }; // Keep buffers for two recent reads. 
- std::array, 2> buffers_; + std::array, 2> buffers_; uint32_t num_buf_; Status status_; diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index 4f6c99f94af..ae656763cbb 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -91,14 +91,13 @@ class PlainTableIterator : public InternalIterator { }; extern const uint64_t kPlainTableMagicNumber; -PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions, - unique_ptr&& file, - const EnvOptions& storage_options, - const InternalKeyComparator& icomparator, - EncodingType encoding_type, - uint64_t file_size, - const TableProperties* table_properties, - const SliceTransform* prefix_extractor) +PlainTableReader::PlainTableReader( + const ImmutableCFOptions& ioptions, + std::unique_ptr&& file, + const EnvOptions& storage_options, const InternalKeyComparator& icomparator, + EncodingType encoding_type, uint64_t file_size, + const TableProperties* table_properties, + const SliceTransform* prefix_extractor) : internal_comparator_(icomparator), encoding_type_(encoding_type), full_scan_mode_(false), @@ -118,8 +117,8 @@ PlainTableReader::~PlainTableReader() { Status PlainTableReader::Open( const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader, const int bloom_bits_per_key, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, const int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, bool full_scan_mode, const SliceTransform* prefix_extractor) { if (file_size > PlainTableIndex::kMaxFileSize) { diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index df08a98fa17..5f8248dd717 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -48,7 +48,7 @@ struct PlainTableReaderFileInfo { bool is_mmap_mode; Slice file_data; uint32_t data_end_offset; - unique_ptr file; + std::unique_ptr file; PlainTableReaderFileInfo(unique_ptr&& _file, const EnvOptions& storage_options, @@ -71,8 +71,8 @@ class PlainTableReader: public TableReader { static Status Open(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, - uint64_t file_size, unique_ptr* table, + std::unique_ptr&& file, + uint64_t file_size, std::unique_ptr* table, const int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, bool full_scan_mode, @@ -104,7 +104,7 @@ class PlainTableReader: public TableReader { } PlainTableReader(const ImmutableCFOptions& ioptions, - unique_ptr&& file, + std::unique_ptr&& file, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, EncodingType encoding_type, uint64_t file_size, @@ -153,8 +153,8 @@ class PlainTableReader: public TableReader { DynamicBloom bloom_; PlainTableReaderFileInfo file_info_; Arena arena_; - std::unique_ptr index_block_alloc_; - std::unique_ptr bloom_block_alloc_; + CacheAllocationPtr index_block_alloc_; + CacheAllocationPtr bloom_block_alloc_; const ImmutableCFOptions& ioptions_; uint64_t file_size_; diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc new file mode 100644 index 00000000000..a915449bee0 --- /dev/null +++ b/table/sst_file_reader.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "rocksdb/sst_file_reader.h" + +#include "db/db_iter.h" +#include "options/cf_options.h" +#include "table/get_context.h" +#include "table/table_reader.h" +#include "table/table_builder.h" +#include "util/file_reader_writer.h" + +namespace rocksdb { + +struct SstFileReader::Rep { + Options options; + EnvOptions soptions; + ImmutableCFOptions ioptions; + MutableCFOptions moptions; + + std::unique_ptr table_reader; + + Rep(const Options& opts) + : options(opts), + soptions(options), + ioptions(options), + moptions(ColumnFamilyOptions(options)) {} +}; + +SstFileReader::SstFileReader(const Options& options) + : rep_(new Rep(options)) {} + +SstFileReader::~SstFileReader() {} + +Status SstFileReader::Open(const std::string& file_path) { + auto r = rep_.get(); + Status s; + uint64_t file_size = 0; + std::unique_ptr file; + std::unique_ptr file_reader; + s = r->options.env->GetFileSize(file_path, &file_size); + if (s.ok()) { + s = r->options.env->NewRandomAccessFile(file_path, &file, r->soptions); + } + if (s.ok()) { + file_reader.reset(new RandomAccessFileReader(std::move(file), file_path)); + } + if (s.ok()) { + s = r->options.table_factory->NewTableReader( + TableReaderOptions(r->ioptions, r->moptions.prefix_extractor.get(), + r->soptions, r->ioptions.internal_comparator), + std::move(file_reader), file_size, &r->table_reader); + } + return s; +} + +Iterator* SstFileReader::NewIterator(const ReadOptions& options) { + auto r = rep_.get(); + auto sequence = options.snapshot != nullptr ? + options.snapshot->GetSequenceNumber() : + kMaxSequenceNumber; + auto internal_iter = r->table_reader->NewIterator( + options, r->moptions.prefix_extractor.get()); + return NewDBIterator(r->options.env, options, r->ioptions, r->moptions, + r->ioptions.user_comparator, internal_iter, sequence, + r->moptions.max_sequential_skip_in_iterations, + nullptr /* read_callback */); +} + +std::shared_ptr SstFileReader::GetTableProperties() const { + return rep_->table_reader->GetTableProperties(); +} + +Status SstFileReader::VerifyChecksum() { + return rep_->table_reader->VerifyChecksum(); +} + +} // namespace rocksdb + +#endif // !ROCKSDB_LITE diff --git a/table/sst_file_reader_test.cc b/table/sst_file_reader_test.cc new file mode 100644 index 00000000000..8da366fd7cc --- /dev/null +++ b/table/sst_file_reader_test.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE + +#include + +#include "rocksdb/sst_file_reader.h" +#include "rocksdb/sst_file_writer.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "utilities/merge_operators.h" + +namespace rocksdb { + +std::string EncodeAsString(uint64_t v) { + char buf[16]; + snprintf(buf, sizeof(buf), "%08" PRIu64, v); + return std::string(buf); +} + +std::string EncodeAsUint64(uint64_t v) { + std::string dst; + PutFixed64(&dst, v); + return dst; +} + +class SstFileReaderTest : public testing::Test { + public: + SstFileReaderTest() { + options_.merge_operator = MergeOperators::CreateUInt64AddOperator(); + sst_name_ = test::PerThreadDBPath("sst_file"); + } + + void CreateFileAndCheck(const std::vector& keys) { + SstFileWriter writer(soptions_, options_); + ASSERT_OK(writer.Open(sst_name_)); + for (size_t i = 0; i + 2 < keys.size(); i += 3) { + ASSERT_OK(writer.Put(keys[i], keys[i])); + ASSERT_OK(writer.Merge(keys[i+1], EncodeAsUint64(i+1))); + ASSERT_OK(writer.Delete(keys[i+2])); + } + ASSERT_OK(writer.Finish()); + + ReadOptions ropts; + SstFileReader reader(options_); + ASSERT_OK(reader.Open(sst_name_)); + ASSERT_OK(reader.VerifyChecksum()); + std::unique_ptr iter(reader.NewIterator(ropts)); + iter->SeekToFirst(); + for (size_t i = 0; i + 2 < keys.size(); i += 3) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(keys[i]), 0); + ASSERT_EQ(iter->value().compare(keys[i]), 0); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(keys[i+1]), 0); + ASSERT_EQ(iter->value().compare(EncodeAsUint64(i+1)), 0); + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + } + + protected: + Options options_; + EnvOptions soptions_; + std::string sst_name_; +}; + +const uint64_t kNumKeys = 100; + +TEST_F(SstFileReaderTest, Basic) { + std::vector keys; + for (uint64_t i = 0; i < kNumKeys; i++) { + keys.emplace_back(EncodeAsString(i)); + } + CreateFileAndCheck(keys); +} + +TEST_F(SstFileReaderTest, Uint64Comparator) { + options_.comparator = test::Uint64Comparator(); + std::vector keys; + for (uint64_t i = 0; i < kNumKeys; i++) { + keys.emplace_back(EncodeAsUint64(i)); + } + CreateFileAndCheck(keys); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as SstFileReader is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index e0c4c31896b..a752504c8f6 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -238,7 +238,8 @@ Status SstFileWriter::Open(const std::string& file_path) { nullptr /* compression_dict */, r->skip_filters, r->column_family_name, unknown_level); r->file_writer.reset( - new WritableFileWriter(std::move(sst_file), file_path, r->env_options)); + new WritableFileWriter(std::move(sst_file), file_path, r->env_options, + nullptr /* stats */, r->ioptions.listeners)); // TODO(tec) : If table_factory is using compressed block cache, we will // be adding the external sst file blocks into it, which is wasteful. 
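The new `SstFileReader` added above (`table/sst_file_reader.cc` plus its test) exposes standalone read access to an SST file, mirroring `SstFileWriter` on the write side. A minimal round-trip sketch using only the API shown in this diff (`Open`, `VerifyChecksum`, `GetTableProperties`, `NewIterator`); the path and keys are placeholders and error handling is reduced to early returns:

```cpp
#include <cstdio>
#include <memory>
#include <string>

#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_reader.h"
#include "rocksdb/sst_file_writer.h"
#include "rocksdb/table_properties.h"

// Write a small SST with SstFileWriter, then inspect it with the new
// SstFileReader. "/tmp/example.sst" is a placeholder path.
int main() {
  rocksdb::Options options;
  rocksdb::EnvOptions env_options;
  const std::string path = "/tmp/example.sst";

  rocksdb::SstFileWriter writer(env_options, options);
  rocksdb::Status s = writer.Open(path);
  if (s.ok()) s = writer.Put("key1", "value1");
  if (s.ok()) s = writer.Put("key2", "value2");  // keys must be added in order
  if (s.ok()) s = writer.Finish();
  if (!s.ok()) {
    std::fprintf(stderr, "write failed: %s\n", s.ToString().c_str());
    return 1;
  }

  rocksdb::SstFileReader reader(options);
  s = reader.Open(path);
  if (s.ok()) s = reader.VerifyChecksum();
  if (!s.ok()) {
    std::fprintf(stderr, "read failed: %s\n", s.ToString().c_str());
    return 1;
  }

  // Table properties now also carry the deletion / merge-operand counters
  // introduced elsewhere in this diff.
  std::shared_ptr<const rocksdb::TableProperties> props =
      reader.GetTableProperties();
  std::printf("entries=%llu deletions=%llu\n",
              static_cast<unsigned long long>(props->num_entries),
              static_cast<unsigned long long>(props->num_deletions));

  std::unique_ptr<rocksdb::Iterator> iter(
      reader.NewIterator(rocksdb::ReadOptions()));
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    std::printf("%s -> %s\n", iter->key().ToString().c_str(),
                iter->value().ToString().c_str());
  }
  return iter->status().ok() ? 0 : 1;
}
```

The accompanying `table/sst_file_reader_test.cc` exercises the same sequence, additionally covering `Merge` and `Delete` entries and a `Uint64Comparator`.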
diff --git a/table/table_properties.cc b/table/table_properties.cc index 207a6419119..56e1d03f1f7 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -78,6 +78,9 @@ std::string TableProperties::ToString( AppendProperty(result, "# data blocks", num_data_blocks, prop_delim, kv_delim); AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim); + AppendProperty(result, "# deletions", num_deletions, prop_delim, kv_delim); + AppendProperty(result, "# merge operands", num_merge_operands, prop_delim, + kv_delim); AppendProperty(result, "# range deletions", num_range_deletions, prop_delim, kv_delim); @@ -170,6 +173,8 @@ void TableProperties::Add(const TableProperties& tp) { raw_value_size += tp.raw_value_size; num_data_blocks += tp.num_data_blocks; num_entries += tp.num_entries; + num_deletions += tp.num_deletions; + num_merge_operands += tp.num_merge_operands; num_range_deletions += tp.num_range_deletions; } @@ -195,6 +200,9 @@ const std::string TablePropertiesNames::kNumDataBlocks = "rocksdb.num.data.blocks"; const std::string TablePropertiesNames::kNumEntries = "rocksdb.num.entries"; +const std::string TablePropertiesNames::kDeletedKeys = "rocksdb.deleted.keys"; +const std::string TablePropertiesNames::kMergeOperands = + "rocksdb.merge.operands"; const std::string TablePropertiesNames::kNumRangeDeletions = "rocksdb.num.range-deletions"; const std::string TablePropertiesNames::kFilterPolicy = diff --git a/table/table_reader.h b/table/table_reader.h index 505b5ba1fb8..a5f15e13044 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -9,6 +9,7 @@ #pragma once #include +#include "db/range_tombstone_fragmenter.h" #include "rocksdb/slice_transform.h" #include "table/internal_iterator.h" @@ -44,7 +45,7 @@ class TableReader { bool skip_filters = false, bool for_compaction = false) = 0; - virtual InternalIterator* NewRangeTombstoneIterator( + virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& /*read_options*/) { return nullptr; } diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index 4032c4a5a1e..fbcfac826c8 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -86,9 +86,9 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, const ImmutableCFOptions ioptions(opts); const ColumnFamilyOptions cfo(opts); const MutableCFOptions moptions(cfo); - unique_ptr file_writer; + std::unique_ptr file_writer; if (!through_db) { - unique_ptr file; + std::unique_ptr file; env->NewWritableFile(file_name, &file, env_options); std::vector > @@ -127,9 +127,9 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, db->Flush(FlushOptions()); } - unique_ptr table_reader; + std::unique_ptr table_reader; if (!through_db) { - unique_ptr raf; + std::unique_ptr raf; s = env->NewRandomAccessFile(file_name, &raf, env_options); if (!s.ok()) { fprintf(stderr, "Create File Error: %s\n", s.ToString().c_str()); @@ -137,7 +137,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, } uint64_t file_size; env->GetFileSize(file_name, &file_size); - unique_ptr file_reader( + std::unique_ptr file_reader( new RandomAccessFileReader(std::move(raf), file_name)); s = opts.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), @@ -170,12 +170,12 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, if (!through_db) { PinnableSlice value; MergeContext merge_context; - RangeDelAggregator range_del_agg(ikc, {} /* snapshots */); + 
SequenceNumber max_covering_tombstone_seq = 0; GetContext get_context(ioptions.user_comparator, ioptions.merge_operator, ioptions.info_log, ioptions.statistics, GetContext::kNotFound, Slice(key), &value, nullptr, &merge_context, - &range_del_agg, env); + &max_covering_tombstone_seq, env); s = table_reader->Get(read_options, key, &get_context, nullptr); } else { s = db->Get(read_options, key, &result); diff --git a/table/table_test.cc b/table/table_test.cc index 26383fa8179..5ec613bec44 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -232,7 +232,6 @@ class BlockConstructor: public Constructor { data_ = builder.Finish().ToString(); BlockContents contents; contents.data = data_; - contents.cachable = false; block_ = new Block(std::move(contents), kDisableGlobalSequenceNumber); return Status::OK(); } @@ -325,7 +324,7 @@ class TableConstructor: public Constructor { soptions.use_mmap_reads = ioptions.allow_mmap_reads; file_writer_.reset(test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); - unique_ptr builder; + std::unique_ptr builder; std::vector> int_tbl_prop_collector_factories; std::string column_family_name; @@ -423,9 +422,9 @@ class TableConstructor: public Constructor { } uint64_t uniq_id_; - unique_ptr file_writer_; - unique_ptr file_reader_; - unique_ptr table_reader_; + std::unique_ptr file_writer_; + std::unique_ptr file_reader_; + std::unique_ptr table_reader_; bool convert_to_internal_key_; int level_; @@ -508,7 +507,7 @@ class InternalIteratorFromIterator : public InternalIterator { virtual Status status() const override { return it_->status(); } private: - unique_ptr it_; + std::unique_ptr it_; }; class DBConstructor: public Constructor { @@ -1024,7 +1023,7 @@ class HarnessTest : public testing::Test { WriteBufferManager write_buffer_; bool support_prev_; bool only_support_prefix_seek_; - shared_ptr internal_comparator_; + std::shared_ptr internal_comparator_; }; static bool Between(uint64_t val, uint64_t low, uint64_t high) { @@ -1278,6 +1277,13 @@ TEST_P(BlockBasedTableTest, RangeDelBlock) { std::vector keys = {"1pika", "2chu"}; std::vector vals = {"p", "c"}; + std::vector expected_tombstones = { + {"1pika", "2chu", 0}, + {"2chu", "c", 1}, + {"2chu", "c", 0}, + {"c", "p", 0}, + }; + for (int i = 0; i < 2; i++) { RangeTombstone t(keys[i], vals[i], i); std::pair p = t.Serialize(); @@ -1310,14 +1316,15 @@ TEST_P(BlockBasedTableTest, RangeDelBlock) { ASSERT_FALSE(iter->Valid()); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); - for (int i = 0; i < 2; i++) { + for (size_t i = 0; i < expected_tombstones.size(); i++) { ASSERT_TRUE(iter->Valid()); ParsedInternalKey parsed_key; ASSERT_TRUE(ParseInternalKey(iter->key(), &parsed_key)); RangeTombstone t(parsed_key, iter->value()); - ASSERT_EQ(t.start_key_, keys[i]); - ASSERT_EQ(t.end_key_, vals[i]); - ASSERT_EQ(t.seq_, i); + const auto& expected_t = expected_tombstones[i]; + ASSERT_EQ(t.start_key_, expected_t.start_key_); + ASSERT_EQ(t.end_key_, expected_t.end_key_); + ASSERT_EQ(t.seq_, expected_t.seq_); iter->Next(); } ASSERT_TRUE(!iter->Valid()); @@ -1385,8 +1392,8 @@ void PrefetchRange(TableConstructor* c, Options* opt, // prefetch auto* table_reader = dynamic_cast(c->GetTableReader()); Status s; - unique_ptr begin, end; - unique_ptr i_begin, i_end; + std::unique_ptr begin, end; + std::unique_ptr i_begin, i_end; if (key_begin != nullptr) { if (c->ConvertToInternalKey()) { i_begin.reset(new InternalKey(key_begin, kMaxSequenceNumber, kTypeValue)); @@ -1417,7 +1424,7 @@ TEST_P(BlockBasedTableTest, 
PrefetchTest) { // The purpose of this test is to test the prefetching operation built into // BlockBasedTable. Options opt; - unique_ptr ikc; + std::unique_ptr ikc; ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); opt.compression = kNoCompression; BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); @@ -2009,7 +2016,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { // -- PART 1: Open with regular block cache. // Since block_cache is disabled, no cache activities will be involved. - unique_ptr iter; + std::unique_ptr iter; int64_t last_cache_bytes_read = 0; // At first, no block will be accessed. @@ -2343,7 +2350,7 @@ TEST_P(BlockBasedTableTest, NoObjectInCacheAfterTableClose) { } // Create a table Options opt; - unique_ptr ikc; + std::unique_ptr ikc; ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); opt.compression = kNoCompression; BlockBasedTableOptions table_options = @@ -2419,7 +2426,7 @@ TEST_P(BlockBasedTableTest, BlockCacheLeak) { // unique ID from the file. Options opt; - unique_ptr ikc; + std::unique_ptr ikc; ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); opt.compression = kNoCompression; BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); @@ -2442,7 +2449,7 @@ TEST_P(BlockBasedTableTest, BlockCacheLeak) { const MutableCFOptions moptions(opt); c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); - unique_ptr iter( + std::unique_ptr iter( c.NewIterator(moptions.prefix_extractor.get())); iter->SeekToFirst(); while (iter->Valid()) { @@ -2477,6 +2484,78 @@ TEST_P(BlockBasedTableTest, BlockCacheLeak) { c.ResetTableReader(); } +namespace { +class CustomMemoryAllocator : public MemoryAllocator { + public: + virtual const char* Name() const override { return "CustomMemoryAllocator"; } + + void* Allocate(size_t size) override { + ++numAllocations; + auto ptr = new char[size + 16]; + memcpy(ptr, "memory_allocator_", 16); // mangle first 16 bytes + return reinterpret_cast(ptr + 16); + } + void Deallocate(void* p) override { + ++numDeallocations; + char* ptr = reinterpret_cast(p) - 16; + delete[] ptr; + } + + std::atomic numAllocations; + std::atomic numDeallocations; +}; +} // namespace + +TEST_P(BlockBasedTableTest, MemoryAllocator) { + auto custom_memory_allocator = std::make_shared(); + { + Options opt; + std::unique_ptr ikc; + ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); + opt.compression = kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + LRUCacheOptions lruOptions; + lruOptions.memory_allocator = custom_memory_allocator; + lruOptions.capacity = 16 * 1024 * 1024; + lruOptions.num_shard_bits = 4; + table_options.block_cache = NewLRUCache(std::move(lruOptions)); + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + TableConstructor c(BytewiseComparator(), + true /* convert_to_internal_key_ */); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector keys; + stl_wrappers::KVMap kvmap; + const ImmutableCFOptions ioptions(opt); + const MutableCFOptions moptions(opt); + c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); + + std::unique_ptr iter( + c.NewIterator(moptions.prefix_extractor.get())); + iter->SeekToFirst(); + while (iter->Valid()) { + iter->key(); + iter->value(); + 
iter->Next(); + } + ASSERT_OK(iter->status()); + } + + // out of scope, block cache should have been deleted, all allocations + // deallocated + EXPECT_EQ(custom_memory_allocator->numAllocations.load(), + custom_memory_allocator->numDeallocations.load()); + // make sure that allocations actually happened through the cache allocator + EXPECT_GT(custom_memory_allocator->numAllocations.load(), 0); +} + TEST_P(BlockBasedTableTest, NewIndexIteratorLeak) { // A regression test to avoid data race described in // https://github.com/facebook/rocksdb/issues/1267 @@ -2550,7 +2629,7 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { PlainTableFactory factory(plain_table_options); test::StringSink sink; - unique_ptr file_writer( + std::unique_ptr file_writer( test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); Options options; const ImmutableCFOptions ioptions(options); @@ -2579,7 +2658,7 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { test::StringSink* ss = static_cast(file_writer->writable_file()); - unique_ptr file_reader( + std::unique_ptr file_reader( test::GetRandomAccessFileReader( new test::StringSource(ss->contents(), 72242, true))); @@ -2658,9 +2737,9 @@ static void DoCompressionTest(CompressionType comp) { ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6100)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3500)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3500)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6500)); c.ResetTableReader(); } @@ -2706,6 +2785,7 @@ TEST_F(GeneralTableTest, ApproximateOffsetOfCompressed) { } } +#ifndef ROCKSDB_VALGRIND_RUN // RandomizedHarnessTest is very slow for certain combination of arguments // Split into 8 pieces to reduce the time individual tests take. 
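// The MemoryAllocator test above exercises a new extension point: block cache
// allocations can be routed through a user-supplied allocator via
// LRUCacheOptions::memory_allocator. A condensed sketch of the wiring follows;
// the CountingAllocator class and the "rocksdb/memory_allocator.h" header
// name are assumptions based on this patch, not guaranteed details of any
// particular release.
#include <atomic>
#include <cstddef>
#include <memory>

#include "rocksdb/cache.h"
#include "rocksdb/memory_allocator.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

class CountingAllocator : public rocksdb::MemoryAllocator {
 public:
  const char* Name() const override { return "CountingAllocator"; }
  void* Allocate(size_t size) override {
    bytes_allocated_ += size;
    return ::operator new(size);
  }
  void Deallocate(void* p) override { ::operator delete(p); }

 private:
  std::atomic<size_t> bytes_allocated_{0};
};

rocksdb::Options OptionsWithCustomCacheAllocator() {
  rocksdb::LRUCacheOptions cache_opts;
  cache_opts.capacity = 64 << 20;
  cache_opts.num_shard_bits = 4;
  cache_opts.memory_allocator = std::make_shared<CountingAllocator>();

  rocksdb::BlockBasedTableOptions table_opts;
  table_opts.block_cache = rocksdb::NewLRUCache(cache_opts);

  rocksdb::Options options;
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_opts));
  return options;
}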
TEST_F(HarnessTest, Randomized1) { @@ -2789,6 +2869,7 @@ TEST_F(HarnessTest, RandomizedLongDB) { ASSERT_GT(files, 0); } #endif // ROCKSDB_LITE +#endif // ROCKSDB_VALGRIND_RUN class MemTableTest : public testing::Test {}; @@ -2824,7 +2905,8 @@ TEST_F(MemTableTest, Simple) { iter = memtable->NewIterator(ReadOptions(), &arena); arena_iter_guard.set(iter); } else { - iter = memtable->NewRangeTombstoneIterator(ReadOptions()); + iter = memtable->NewRangeTombstoneIterator( + ReadOptions(), kMaxSequenceNumber /* read_seq */); iter_guard.reset(iter); } if (iter == nullptr) { @@ -2924,6 +3006,26 @@ TEST_F(HarnessTest, FooterTests) { ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); ASSERT_EQ(decoded_footer.version(), 1U); } + { + // xxhash64 block based + std::string encoded; + Footer footer(kBlockBasedTableMagicNumber, 1); + BlockHandle meta_index(10, 5), index(20, 15); + footer.set_metaindex_handle(meta_index); + footer.set_index_handle(index); + footer.set_checksum(kxxHash64); + footer.EncodeTo(&encoded); + Footer decoded_footer; + Slice encoded_slice(encoded); + decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum(), kxxHash64); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); + ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); + ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 1U); + } // Plain table is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE { @@ -3151,7 +3253,7 @@ TEST_F(PrefixTest, PrefixAndWholeKeyTest) { TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); test::StringSink* sink = new test::StringSink(); - unique_ptr file_writer( + std::unique_ptr file_writer( test::GetWritableFileWriter(sink, "" /* don't care */)); Options options; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); @@ -3189,7 +3291,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { // Helper function to get version, global_seqno, global_seqno_offset std::function GetVersionAndGlobalSeqno = [&]() { - unique_ptr file_reader( + std::unique_ptr file_reader( test::GetRandomAccessFileReader( new test::StringSource(ss_rw.contents(), 73342, true))); @@ -3218,9 +3320,9 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { }; // Helper function to get the contents of the table InternalIterator - unique_ptr table_reader; + std::unique_ptr table_reader; std::function GetTableInternalIter = [&]() { - unique_ptr file_reader( + std::unique_ptr file_reader( test::GetRandomAccessFileReader( new test::StringSource(ss_rw.contents(), 73342, true))); @@ -3333,7 +3435,7 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); bbto.block_align = true; test::StringSink* sink = new test::StringSink(); - unique_ptr file_writer( + std::unique_ptr file_writer( test::GetWritableFileWriter(sink, "" /* don't care */)); Options options; options.compression = kNoCompression; @@ -3365,7 +3467,7 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { file_writer->Flush(); test::RandomRWStringSink ss_rw(sink); - unique_ptr file_reader( + std::unique_ptr file_reader( test::GetRandomAccessFileReader( new test::StringSource(ss_rw.contents(), 73342, true))); @@ -3423,7 +3525,7 @@ TEST_P(BlockBasedTableTest, 
PropertiesBlockRestartPointTest) { BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); bbto.block_align = true; test::StringSink* sink = new test::StringSink(); - unique_ptr file_writer( + std::unique_ptr file_writer( test::GetWritableFileWriter(sink, "" /* don't care */)); Options options; @@ -3458,7 +3560,7 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { file_writer->Flush(); test::RandomRWStringSink ss_rw(sink); - unique_ptr file_reader( + std::unique_ptr file_reader( test::GetRandomAccessFileReader( new test::StringSource(ss_rw.contents(), 73342, true))); @@ -3477,10 +3579,10 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { Slice compression_dict; PersistentCacheOptions cache_options; - BlockFetcher block_fetcher(file, nullptr /* prefetch_buffer */, footer, - read_options, handle, contents, ioptions, - false /* decompress */, compression_dict, - cache_options); + BlockFetcher block_fetcher( + file, nullptr /* prefetch_buffer */, footer, read_options, handle, + contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, compression_dict, cache_options); ASSERT_OK(block_fetcher.ReadBlockContents()); }; @@ -3566,7 +3668,8 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { BlockFetcher block_fetcher( table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(), metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, - compression_dict, pcache_opts); + false /*maybe_compressed*/, compression_dict, pcache_opts, + nullptr /*memory_allocator*/); ASSERT_OK(block_fetcher.ReadBlockContents()); Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); diff --git a/tools/benchmark.sh b/tools/benchmark.sh index 6d09204900f..0ba1081e195 100755 --- a/tools/benchmark.sh +++ b/tools/benchmark.sh @@ -151,8 +151,8 @@ function summarize_result { stall_pct=$( grep "^Cumulative stall" $test_out| tail -1 | awk '{ print $5 }' ) ops_sec=$( grep ^${bench_name} $test_out | awk '{ print $5 }' ) mb_sec=$( grep ^${bench_name} $test_out | awk '{ print $7 }' ) - lo_wgb=$( grep "^ L0" $test_out | tail -1 | awk '{ print $8 }' ) - sum_wgb=$( grep "^ Sum" $test_out | tail -1 | awk '{ print $8 }' ) + lo_wgb=$( grep "^ L0" $test_out | tail -1 | awk '{ print $9 }' ) + sum_wgb=$( grep "^ Sum" $test_out | tail -1 | awk '{ print $9 }' ) sum_size=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.1f", $3 / 1024.0 }' ) wamp=$( echo "scale=1; $sum_wgb / $lo_wgb" | bc ) wmb_ps=$( echo "scale=1; ( $sum_wgb * 1024.0 ) / $uptime" | bc ) diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh index 5959fb83293..2d260c7ecc3 100755 --- a/tools/check_format_compatible.sh +++ b/tools/check_format_compatible.sh @@ -56,7 +56,7 @@ declare -a backward_compatible_checkout_objs=("2.2.fb.branch" "2.3.fb.branch" "2 declare -a forward_compatible_checkout_objs=("3.10.fb" "3.11.fb" "3.12.fb" "3.13.fb" "4.0.fb" "4.1.fb" "4.2.fb" "4.3.fb" "4.4.fb" "4.5.fb" "4.6.fb" "4.7.fb" "4.8.fb" "4.9.fb" "4.10.fb" "4.11.fb" "4.12.fb" "4.13.fb" "5.0.fb" "5.1.fb" "5.2.fb" "5.3.fb" "5.4.fb" "5.5.fb" "5.6.fb" "5.7.fb" "5.8.fb" "5.9.fb" "5.10.fb") declare -a forward_compatible_with_options_checkout_objs=("5.11.fb" "5.12.fb" "5.13.fb" "5.14.fb") declare -a checkout_objs=(${backward_compatible_checkout_objs[@]} ${forward_compatible_checkout_objs[@]} ${forward_compatible_with_options_checkout_objs[@]}) -declare -a extern_sst_ingestion_compatible_checkout_objs=("5.14.fb" "5.15.fb") +declare -a 
extern_sst_ingestion_compatible_checkout_objs=("5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb") generate_db() { diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 2dd3f402fef..2e20fd8275f 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -34,10 +34,12 @@ #include "cloud/aws/aws_env.h" #include "db/db_impl.h" +#include "db/malloc_stats.h" #include "db/version_set.h" #include "hdfs/env_hdfs.h" #include "monitoring/histogram.h" #include "monitoring/statistics.h" +#include "options/cf_options.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" @@ -46,7 +48,6 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/memtablerep.h" #include "rocksdb/options.h" -#include "options/cf_options.h" #include "rocksdb/perf_context.h" #include "rocksdb/persistent_cache.h" #include "rocksdb/rate_limiter.h" @@ -249,6 +250,10 @@ DEFINE_bool(reverse_iterator, false, "When true use Prev rather than Next for iterators that do " "Seek and then Next"); +DEFINE_int64(max_scan_distance, 0, + "Used to define iterate_upper_bound (or iterate_lower_bound " + "if FLAGS_reverse_iterator is set to true) when value is nonzero"); + DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator"); DEFINE_int64(batch_size, 1, "Batch size"); @@ -641,9 +646,11 @@ DEFINE_bool(optimize_filters_for_hits, false, DEFINE_uint64(delete_obsolete_files_period_micros, 0, "Ignored. Left here for backward compatibility"); +DEFINE_int64(writes_before_delete_range, 0, + "Number of writes before DeleteRange is called regularly."); + DEFINE_int64(writes_per_range_tombstone, 0, - "Number of writes between range " - "tombstones"); + "Number of writes between range tombstones"); DEFINE_int64(range_tombstone_width, 100, "Number of keys in tombstone's range"); @@ -941,6 +948,9 @@ DEFINE_uint64(max_compaction_bytes, rocksdb::Options().max_compaction_bytes, #ifndef ROCKSDB_LITE DEFINE_bool(readonly, false, "Run read only benchmarks."); + +DEFINE_bool(print_malloc_stats, false, + "Print malloc stats to stdout after benchmarks finish."); #endif // ROCKSDB_LITE DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions"); @@ -1195,11 +1205,12 @@ class ReportFileOpEnv : public EnvWrapper { counters_.bytes_written_ = 0; } - Status NewSequentialFile(const std::string& f, unique_ptr* r, + Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, const EnvOptions& soptions) override { class CountingFile : public SequentialFile { private: - unique_ptr target_; + std::unique_ptr target_; ReportFileOpCounters* counters_; public: @@ -1227,11 +1238,11 @@ class ReportFileOpEnv : public EnvWrapper { } Status NewRandomAccessFile(const std::string& f, - unique_ptr* r, + std::unique_ptr* r, const EnvOptions& soptions) override { class CountingFile : public RandomAccessFile { private: - unique_ptr target_; + std::unique_ptr target_; ReportFileOpCounters* counters_; public: @@ -1256,11 +1267,11 @@ class ReportFileOpEnv : public EnvWrapper { return s; } - Status NewWritableFile(const std::string& f, unique_ptr* r, + Status NewWritableFile(const std::string& f, std::unique_ptr* r, const EnvOptions& soptions) override { class CountingFile : public WritableFile { private: - unique_ptr target_; + std::unique_ptr target_; ReportFileOpCounters* counters_; public: @@ -2026,12 +2037,15 @@ class Benchmark { int prefix_size_; int64_t keys_per_prefix_; int64_t entries_per_batch_; + int64_t writes_before_delete_range_; int64_t writes_per_range_tombstone_; int64_t range_tombstone_width_; 
int64_t max_num_range_tombstones_; WriteOptions write_options_; Options open_options_; // keep options around to properly destroy db later +#ifndef ROCKSDB_LITE TraceOptions trace_options_; +#endif int64_t reads_; int64_t deletes_; double read_random_exp_range_; @@ -2553,6 +2567,7 @@ void VerifyDBFromDB(std::string& truth_db_name) { value_size_ = FLAGS_value_size; key_size_ = FLAGS_key_size; entries_per_batch_ = FLAGS_batch_size; + writes_before_delete_range_ = FLAGS_writes_before_delete_range; writes_per_range_tombstone_ = FLAGS_writes_per_range_tombstone; range_tombstone_width_ = FLAGS_range_tombstone_width; max_num_range_tombstones_ = FLAGS_max_num_range_tombstones; @@ -2907,6 +2922,7 @@ void VerifyDBFromDB(std::string& truth_db_name) { } SetPerfLevel(static_cast (shared->perf_level)); + perf_context.EnablePerLevelPerfContext(); thread->stats.Start(thread->tid); (arg->bm->*(arg->method))(thread); thread->stats.Stop(); @@ -3934,9 +3950,13 @@ void VerifyDBFromDB(std::string& truth_db_name) { bytes += value_size_ + key_size_; ++num_written; if (writes_per_range_tombstone_ > 0 && - num_written / writes_per_range_tombstone_ <= + num_written > writes_before_delete_range_ && + (num_written - writes_before_delete_range_) / + writes_per_range_tombstone_ <= max_num_range_tombstones_ && - num_written % writes_per_range_tombstone_ == 0) { + (num_written - writes_before_delete_range_) % + writes_per_range_tombstone_ == + 0) { int64_t begin_num = key_gens[id]->Next(); if (FLAGS_expand_range_tombstones) { for (int64_t offset = 0; offset < range_tombstone_width_; @@ -4287,7 +4307,7 @@ void VerifyDBFromDB(std::string& truth_db_name) { } if (levelMeta.level == 0) { for (auto& fileMeta : levelMeta.files) { - fprintf(stdout, "Level[%d]: %s(size: %" PRIu64 " bytes)\n", + fprintf(stdout, "Level[%d]: %s(size: %" ROCKSDB_PRIszt " bytes)\n", levelMeta.level, fileMeta.name.c_str(), fileMeta.size); } } else { @@ -4606,9 +4626,31 @@ void VerifyDBFromDB(std::string& truth_db_name) { std::unique_ptr key_guard; Slice key = AllocateKey(&key_guard); + std::unique_ptr upper_bound_key_guard; + Slice upper_bound = AllocateKey(&upper_bound_key_guard); + std::unique_ptr lower_bound_key_guard; + Slice lower_bound = AllocateKey(&lower_bound_key_guard); + Duration duration(FLAGS_duration, reads_); char value_buffer[256]; while (!duration.Done(1)) { + int64_t seek_pos = thread->rand.Next() % FLAGS_num; + GenerateKeyFromInt((uint64_t)seek_pos, FLAGS_num, &key); + if (FLAGS_max_scan_distance != 0) { + if (FLAGS_reverse_iterator) { + GenerateKeyFromInt( + (uint64_t)std::max((int64_t)0, + seek_pos - FLAGS_max_scan_distance), + FLAGS_num, &lower_bound); + options.iterate_lower_bound = &lower_bound; + } else { + GenerateKeyFromInt( + (uint64_t)std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance), + FLAGS_num, &upper_bound); + options.iterate_upper_bound = &upper_bound; + } + } + if (!FLAGS_use_tailing_iterator) { if (db_.db != nullptr) { delete single_iter; @@ -4629,7 +4671,6 @@ void VerifyDBFromDB(std::string& truth_db_name) { iter_to_use = multi_iters[thread->rand.Next() % multi_iters.size()]; } - GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); iter_to_use->Seek(key); read++; if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) { @@ -5726,7 +5767,7 @@ void VerifyDBFromDB(std::string& truth_db_name) { void Replay(ThreadState* /*thread*/, DBWithColumnFamilies* db_with_cfh) { Status s; - unique_ptr trace_reader; + std::unique_ptr trace_reader; s = NewFileTraceReader(FLAGS_env, EnvOptions(), 
FLAGS_trace_file, &trace_reader); if (!s.ok()) { @@ -5854,6 +5895,15 @@ int db_bench_tool(int argc, char** argv) { rocksdb::Benchmark benchmark; benchmark.Run(); + +#ifndef ROCKSDB_LITE + if (FLAGS_print_malloc_stats) { + std::string stats_string; + rocksdb::DumpMallocStats(&stats_string); + fprintf(stdout, "Malloc stats:\n%s\n", stats_string.c_str()); + } +#endif // ROCKSDB_LITE + return 0; } } // namespace rocksdb diff --git a/tools/db_bench_tool_test.cc b/tools/db_bench_tool_test.cc index 67426066eb9..dfc461193c4 100644 --- a/tools/db_bench_tool_test.cc +++ b/tools/db_bench_tool_test.cc @@ -279,7 +279,7 @@ const std::string options_file_content = R"OPTIONS_FILE( TEST_F(DBBenchTest, OptionsFileFromFile) { const std::string kOptionsFileName = test_path_ + "/OPTIONS_flash"; - unique_ptr writable; + std::unique_ptr writable; ASSERT_OK(Env::Default()->NewWritableFile(kOptionsFileName, &writable, EnvOptions())); ASSERT_OK(writable->Append(options_file_content)); diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 59528128b4c..0bf43780df5 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -15,6 +15,9 @@ # default_params < {blackbox,whitebox}_default_params < # simple_default_params < # {blackbox,whitebox}_simple_default_params < args +# for enable_atomic_flush: +# default_params < {blackbox,whitebox}_default_params < +# atomic_flush_params < args expected_values_file = tempfile.NamedTemporaryFile() @@ -122,6 +125,15 @@ def is_direct_io_supported(dbname): whitebox_simple_default_params = {} +atomic_flush_params = { + "atomic_flush": 1, + "disable_wal": 1, + "reopen": 0, + # use small value for write_buffer_size so that RocksDB triggers flush + # more frequently + "write_buffer_size": 1024 * 1024, +} + def finalize_and_sanitize(src_params): dest_params = dict([(k, v() if callable(v) else v) @@ -152,6 +164,8 @@ def gen_cmd_params(args): params.update(blackbox_simple_default_params) if args.test_type == 'whitebox': params.update(whitebox_simple_default_params) + if args.enable_atomic_flush: + params.update(atomic_flush_params) for k, v in vars(args).items(): if v is not None: @@ -164,7 +178,7 @@ def gen_cmd(params, unknown_params): '--{0}={1}'.format(k, v) for k, v in finalize_and_sanitize(params).items() if k not in set(['test_type', 'simple', 'duration', 'interval', - 'random_kill_odd']) + 'random_kill_odd', 'enable_atomic_flush']) and v is not None] + unknown_params return cmd @@ -356,6 +370,7 @@ def main(): db_stress multiple times") parser.add_argument("test_type", choices=["blackbox", "whitebox"]) parser.add_argument("--simple", action="store_true") + parser.add_argument("--enable_atomic_flush", action='store_true') all_params = dict(default_params.items() + blackbox_default_params.items() diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc index 5901b97778e..c640b5945b0 100644 --- a/tools/db_repl_stress.cc +++ b/tools/db_repl_stress.cc @@ -67,7 +67,7 @@ struct ReplicationThread { static void ReplicationThreadBody(void* arg) { ReplicationThread* t = reinterpret_cast(arg); DB* db = t->db; - unique_ptr iter; + std::unique_ptr iter; SequenceNumber currentSeqNum = 1; while (!t->stop.load(std::memory_order_acquire)) { iter.reset(); diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 45a7c9a0d0a..20b2899e957 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -133,6 +133,8 @@ DEFINE_bool(test_batches_snapshots, false, "\t(b) No long validation at the end (more speed up)\n" "\t(c) Test snapshot and atomicity of batch writes"); 
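// The --max_scan_distance flag added to db_bench above bounds each seek by
// filling in ReadOptions::iterate_upper_bound (or iterate_lower_bound when
// scanning in reverse). Outside of db_bench the same bounded-scan pattern
// looks roughly like this; the function name and key arguments are
// illustrative assumptions.
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice.h"

void BoundedForwardScan(rocksdb::DB* db, const rocksdb::Slice& start,
                        const rocksdb::Slice& end_exclusive) {
  rocksdb::ReadOptions ro;
  // The bound must stay alive for the iterator's lifetime; the iterator stops
  // before reaching end_exclusive, so the scan cannot run past the bound.
  ro.iterate_upper_bound = &end_exclusive;
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  for (it->Seek(start); it->Valid(); it->Next()) {
    // consume it->key() / it->value()
  }
}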
+DEFINE_bool(atomic_flush, false, "If true, the test enables atomic flush\n"); + DEFINE_int32(threads, 32, "Number of concurrent threads to run."); DEFINE_int32(ttl, -1, @@ -790,46 +792,36 @@ class Stats { } } - void AddBytesForWrites(int nwrites, size_t nbytes) { + void AddBytesForWrites(long nwrites, size_t nbytes) { writes_ += nwrites; bytes_ += nbytes; } - void AddGets(int ngets, int nfounds) { + void AddGets(long ngets, long nfounds) { founds_ += nfounds; gets_ += ngets; } - void AddPrefixes(int nprefixes, int count) { + void AddPrefixes(long nprefixes, long count) { prefixes_ += nprefixes; iterator_size_sums_ += count; } - void AddIterations(int n) { - iterations_ += n; - } + void AddIterations(long n) { iterations_ += n; } - void AddDeletes(int n) { - deletes_ += n; - } + void AddDeletes(long n) { deletes_ += n; } void AddSingleDeletes(size_t n) { single_deletes_ += n; } - void AddRangeDeletions(int n) { - range_deletions_ += n; - } + void AddRangeDeletions(long n) { range_deletions_ += n; } - void AddCoveredByRangeDeletions(int n) { - covered_by_range_deletions_ += n; - } + void AddCoveredByRangeDeletions(long n) { covered_by_range_deletions_ += n; } - void AddErrors(int n) { - errors_ += n; - } + void AddErrors(long n) { errors_ += n; } - void AddNumCompactFilesSucceed(int n) { num_compact_files_succeed_ += n; } + void AddNumCompactFilesSucceed(long n) { num_compact_files_succeed_ += n; } - void AddNumCompactFilesFailed(int n) { num_compact_files_failed_ += n; } + void AddNumCompactFilesFailed(long n) { num_compact_files_failed_ += n; } void Report(const char* name) { std::string extra; @@ -948,7 +940,7 @@ class SharedState { if (status.ok()) { status = FLAGS_env->GetFileSize(FLAGS_expected_values_path, &size); } - unique_ptr wfile; + std::unique_ptr wfile; if (status.ok() && size == 0) { const EnvOptions soptions; status = FLAGS_env->NewWritableFile(FLAGS_expected_values_path, &wfile, @@ -1743,6 +1735,9 @@ class StressTest { } } if (snap_state.key_vec != nullptr) { + // When `prefix_extractor` is set, seeking to beginning and scanning + // across prefixes are only supported with `total_order_seek` set. 
+ ropt.total_order_seek = true; std::unique_ptr iterator(db->NewIterator(ropt)); std::unique_ptr> tmp_bitvec(new std::vector(FLAGS_max_key)); for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { @@ -1892,27 +1887,6 @@ class StressTest { } } - if (FLAGS_backup_one_in > 0 && - thread->rand.Uniform(FLAGS_backup_one_in) == 0) { - std::string backup_dir = FLAGS_db + "/.backup" + ToString(thread->tid); - BackupableDBOptions backup_opts(backup_dir); - BackupEngine* backup_engine = nullptr; - Status s = BackupEngine::Open(FLAGS_env, backup_opts, &backup_engine); - if (s.ok()) { - s = backup_engine->CreateNewBackup(db_); - } - if (s.ok()) { - s = backup_engine->PurgeOldBackups(0 /* num_backups_to_keep */); - } - if (!s.ok()) { - printf("A BackupEngine operation failed with: %s\n", - s.ToString().c_str()); - } - if (backup_engine != nullptr) { - delete backup_engine; - } - } - if (FLAGS_compact_files_one_in > 0 && thread->rand.Uniform(FLAGS_compact_files_one_in) == 0) { auto* random_cf = @@ -1975,15 +1949,6 @@ class StressTest { auto column_family = column_families_[rand_column_family]; - if (FLAGS_flush_one_in > 0 && - thread->rand.Uniform(FLAGS_flush_one_in) == 0) { - FlushOptions flush_opts; - Status status = db_->Flush(flush_opts, column_family); - if (!status.ok()) { - fprintf(stdout, "Unable to perform Flush(): %s\n", status.ToString().c_str()); - } - } - if (FLAGS_compact_range_one_in > 0 && thread->rand.Uniform(FLAGS_compact_range_one_in) == 0) { int64_t end_key_num; @@ -2007,6 +1972,21 @@ class StressTest { std::vector rand_column_families = GenerateColumnFamilies(FLAGS_column_families, rand_column_family); + + if (FLAGS_flush_one_in > 0 && + thread->rand.Uniform(FLAGS_flush_one_in) == 0) { + FlushOptions flush_opts; + std::vector cfhs; + std::for_each( + rand_column_families.begin(), rand_column_families.end(), + [this, &cfhs](int k) { cfhs.push_back(column_families_[k]); }); + Status status = db_->Flush(flush_opts, cfhs); + if (!status.ok()) { + fprintf(stdout, "Unable to perform Flush(): %s\n", + status.ToString().c_str()); + } + } + std::vector rand_keys = GenerateKeys(rand_key); if (FLAGS_ingest_external_file_one_in > 0 && @@ -2014,6 +1994,15 @@ class StressTest { TestIngestExternalFile(thread, rand_column_families, rand_keys, lock); } + if (FLAGS_backup_one_in > 0 && + thread->rand.Uniform(FLAGS_backup_one_in) == 0) { + Status s = TestBackupRestore(thread, rand_column_families, rand_keys); + if (!s.ok()) { + VerificationAbort(shared, "Backup/restore gave inconsistent state", + s); + } + } + if (FLAGS_acquire_snapshot_one_in > 0 && thread->rand.Uniform(FLAGS_acquire_snapshot_one_in) == 0) { auto snapshot = db_->GetSnapshot(); @@ -2029,6 +2018,9 @@ class StressTest { if (FLAGS_compare_full_db_state_snapshot && (thread->tid == 0)) { key_vec = new std::vector(FLAGS_max_key); + // When `prefix_extractor` is set, seeking to beginning and scanning + // across prefixes are only supported with `total_order_seek` set. 
+ ropt.total_order_seek = true; std::unique_ptr iterator(db_->NewIterator(ropt)); for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { uint64_t key_val; @@ -2199,6 +2191,106 @@ class StressTest { return s; } +#ifdef ROCKSDB_LITE + virtual Status TestBackupRestore( + ThreadState* /* thread */, + const std::vector& /* rand_column_families */, + const std::vector& /* rand_keys */) { + assert(false); + fprintf(stderr, + "RocksDB lite does not support " + "TestBackupRestore\n"); + std::terminate(); + } +#else // ROCKSDB_LITE + virtual Status TestBackupRestore(ThreadState* thread, + const std::vector& rand_column_families, + const std::vector& rand_keys) { + // Note the column families chosen by `rand_column_families` cannot be + // dropped while the locks for `rand_keys` are held. So we should not have + // to worry about accessing those column families throughout this function. + assert(rand_column_families.size() == rand_keys.size()); + std::string backup_dir = FLAGS_db + "/.backup" + ToString(thread->tid); + std::string restore_dir = FLAGS_db + "/.restore" + ToString(thread->tid); + BackupableDBOptions backup_opts(backup_dir); + BackupEngine* backup_engine = nullptr; + Status s = BackupEngine::Open(FLAGS_env, backup_opts, &backup_engine); + if (s.ok()) { + s = backup_engine->CreateNewBackup(db_); + } + if (s.ok()) { + delete backup_engine; + backup_engine = nullptr; + s = BackupEngine::Open(FLAGS_env, backup_opts, &backup_engine); + } + if (s.ok()) { + s = backup_engine->RestoreDBFromLatestBackup(restore_dir /* db_dir */, + restore_dir /* wal_dir */); + } + if (s.ok()) { + s = backup_engine->PurgeOldBackups(0 /* num_backups_to_keep */); + } + DB* restored_db = nullptr; + std::vector restored_cf_handles; + if (s.ok()) { + Options restore_options(options_); + restore_options.listeners.clear(); + std::vector cf_descriptors; + // TODO(ajkr): `column_family_names_` is not safe to access here when + // `clear_column_family_one_in != 0`. But we can't easily switch to + // `ListColumnFamilies` to get names because it won't necessarily give + // the same order as `column_family_names_`. 
+ assert(FLAGS_clear_column_family_one_in == 0); + for (auto name : column_family_names_) { + cf_descriptors.emplace_back(name, ColumnFamilyOptions(restore_options)); + } + s = DB::Open(DBOptions(restore_options), restore_dir, cf_descriptors, + &restored_cf_handles, &restored_db); + } + // for simplicity, currently only verifies existence/non-existence of a few + // keys + for (size_t i = 0; s.ok() && i < rand_column_families.size(); ++i) { + std::string key_str = Key(rand_keys[i]); + Slice key = key_str; + std::string restored_value; + Status get_status = restored_db->Get( + ReadOptions(), restored_cf_handles[rand_column_families[i]], key, + &restored_value); + bool exists = + thread->shared->Exists(rand_column_families[i], rand_keys[i]); + if (get_status.ok()) { + if (!exists) { + s = Status::Corruption( + "key exists in restore but not in original db"); + } + } else if (get_status.IsNotFound()) { + if (exists) { + s = Status::Corruption( + "key exists in original db but not in restore"); + } + } else { + s = get_status; + } + } + if (backup_engine != nullptr) { + delete backup_engine; + backup_engine = nullptr; + } + if (restored_db != nullptr) { + for (auto* cf_handle : restored_cf_handles) { + restored_db->DestroyColumnFamilyHandle(cf_handle); + } + delete restored_db; + restored_db = nullptr; + } + if (!s.ok()) { + printf("A backup/restore operation failed with: %s\n", + s.ToString().c_str()); + } + return s; + } +#endif // ROCKSDB_LITE + void VerificationAbort(SharedState* shared, std::string msg, Status s) const { printf("Verification failed: %s. Status is %s\n", msg.c_str(), s.ToString().c_str()); @@ -2218,6 +2310,8 @@ class StressTest { fprintf(stdout, "Format version : %d\n", FLAGS_format_version); fprintf(stdout, "TransactionDB : %s\n", FLAGS_use_txn ? "true" : "false"); + fprintf(stdout, "Atomic flush : %s\n", + FLAGS_atomic_flush ? 
"true" : "false"); fprintf(stdout, "Column families : %d\n", FLAGS_column_families); if (!FLAGS_test_batches_snapshots) { fprintf(stdout, "Clear CFs one in : %d\n", @@ -2363,6 +2457,7 @@ class StressTest { FLAGS_universal_max_merge_width; options_.compaction_options_universal.max_size_amplification_percent = FLAGS_universal_max_size_amplification_percent; + options_.atomic_flush = FLAGS_atomic_flush; } else { #ifdef ROCKSDB_LITE fprintf(stderr, "--options_file not supported in lite mode\n"); @@ -2594,7 +2689,7 @@ class NonBatchedOpsStressTest : public StressTest { } if (!thread->rand.OneIn(2)) { // Use iterator to verify this range - unique_ptr iter( + std::unique_ptr iter( db_->NewIterator(options, column_families_[cf])); iter->Seek(Key(start)); for (auto i = start; i < end; i++) { @@ -2733,16 +2828,15 @@ class NonBatchedOpsStressTest : public StressTest { } Iterator* iter = db_->NewIterator(ro_copy, cfh); - int64_t count = 0; + long count = 0; for (iter->Seek(prefix); iter->Valid() && iter->key().starts_with(prefix); iter->Next()) { ++count; } - assert(count <= - (static_cast(1) << ((8 - FLAGS_prefix_size) * 8))); + assert(count <= (static_cast(1) << ((8 - FLAGS_prefix_size) * 8))); Status s = iter->status(); if (iter->status().ok()) { - thread->stats.AddPrefixes(1, static_cast(count)); + thread->stats.AddPrefixes(1, count); } else { thread->stats.AddErrors(1); } @@ -3272,7 +3366,7 @@ class BatchedOpsStressTest : public StressTest { iters[i]->Seek(prefix_slices[i]); } - int count = 0; + long count = 0; while (iters[0]->Valid() && iters[0]->key().starts_with(prefix_slices[0])) { count++; std::string values[10]; @@ -3327,6 +3421,274 @@ class BatchedOpsStressTest : public StressTest { virtual void VerifyDb(ThreadState* /* thread */) const {} }; +class AtomicFlushStressTest : public StressTest { + public: + AtomicFlushStressTest() : batch_id_(0) {} + + virtual ~AtomicFlushStressTest() {} + + virtual Status TestPut(ThreadState* thread, WriteOptions& write_opts, + const ReadOptions& /* read_opts */, + const std::vector& rand_column_families, + const std::vector& rand_keys, + char (&value)[100], + std::unique_ptr& /* lock */) { + std::string key_str = Key(rand_keys[0]); + Slice key = key_str; + uint64_t value_base = batch_id_.fetch_add(1); + size_t sz = + GenerateValue(static_cast(value_base), value, sizeof(value)); + Slice v(value, sz); + WriteBatch batch; + for (auto cf : rand_column_families) { + ColumnFamilyHandle* cfh = column_families_[cf]; + if (FLAGS_use_merge) { + batch.Merge(cfh, key, v); + } else { /* !FLAGS_use_merge */ + batch.Put(cfh, key, v); + } + } + Status s = db_->Write(write_opts, &batch); + if (!s.ok()) { + fprintf(stderr, "multi put or merge error: %s\n", s.ToString().c_str()); + thread->stats.AddErrors(1); + } else { + auto num = static_cast(rand_column_families.size()); + thread->stats.AddBytesForWrites(num, (sz + 1) * num); + } + + return s; + } + + virtual Status TestDelete(ThreadState* thread, WriteOptions& write_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys, + std::unique_ptr& /* lock */) { + std::string key_str = Key(rand_keys[0]); + Slice key = key_str; + WriteBatch batch; + for (auto cf : rand_column_families) { + ColumnFamilyHandle* cfh = column_families_[cf]; + batch.Delete(cfh, key); + } + Status s = db_->Write(write_opts, &batch); + if (!s.ok()) { + fprintf(stderr, "multidel error: %s\n", s.ToString().c_str()); + thread->stats.AddErrors(1); + } else { + thread->stats.AddDeletes(static_cast(rand_column_families.size())); + } + 
return s; + } + + virtual Status TestDeleteRange(ThreadState* thread, WriteOptions& write_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys, + std::unique_ptr& /* lock */) { + int64_t rand_key = rand_keys[0]; + auto shared = thread->shared; + int64_t max_key = shared->GetMaxKey(); + if (rand_key > max_key - FLAGS_range_deletion_width) { + rand_key = + thread->rand.Next() % (max_key - FLAGS_range_deletion_width + 1); + } + std::string key_str = Key(rand_key); + Slice key = key_str; + std::string end_key_str = Key(rand_key + FLAGS_range_deletion_width); + Slice end_key = end_key_str; + WriteBatch batch; + for (auto cf : rand_column_families) { + ColumnFamilyHandle* cfh = column_families_[rand_column_families[cf]]; + batch.DeleteRange(cfh, key, end_key); + } + Status s = db_->Write(write_opts, &batch); + if (!s.ok()) { + fprintf(stderr, "multi del range error: %s\n", s.ToString().c_str()); + thread->stats.AddErrors(1); + } else { + thread->stats.AddRangeDeletions( + static_cast(rand_column_families.size())); + } + return s; + } + + virtual void TestIngestExternalFile( + ThreadState* /* thread */, + const std::vector& /* rand_column_families */, + const std::vector& /* rand_keys */, + std::unique_ptr& /* lock */) { + assert(false); + fprintf(stderr, + "AtomicFlushStressTest does not support TestIngestExternalFile " + "because it's not possible to verify the result\n"); + std::terminate(); + } + + virtual Status TestGet(ThreadState* thread, const ReadOptions& readoptions, + const std::vector& rand_column_families, + const std::vector& rand_keys) { + std::string key_str = Key(rand_keys[0]); + Slice key = key_str; + auto cfh = + column_families_[rand_column_families[thread->rand.Next() % + rand_column_families.size()]]; + std::string from_db; + Status s = db_->Get(readoptions, cfh, key, &from_db); + if (s.ok()) { + thread->stats.AddGets(1, 1); + } else if (s.IsNotFound()) { + thread->stats.AddGets(1, 0); + } else { + thread->stats.AddErrors(1); + } + return s; + } + + virtual Status TestPrefixScan(ThreadState* thread, + const ReadOptions& readoptions, + const std::vector& rand_column_families, + const std::vector& rand_keys) { + std::string key_str = Key(rand_keys[0]); + Slice key = key_str; + Slice prefix = Slice(key.data(), FLAGS_prefix_size); + + std::string upper_bound; + Slice ub_slice; + ReadOptions ro_copy = readoptions; + if (thread->rand.OneIn(2) && GetNextPrefix(prefix, &upper_bound)) { + ub_slice = Slice(upper_bound); + ro_copy.iterate_upper_bound = &ub_slice; + } + auto cfh = + column_families_[rand_column_families[thread->rand.Next() % + rand_column_families.size()]]; + Iterator* iter = db_->NewIterator(ro_copy, cfh); + long count = 0; + for (iter->Seek(prefix); iter->Valid() && iter->key().starts_with(prefix); + iter->Next()) { + ++count; + } + assert(count <= (static_cast(1) << ((8 - FLAGS_prefix_size) * 8))); + Status s = iter->status(); + if (s.ok()) { + thread->stats.AddPrefixes(1, count); + } else { + thread->stats.AddErrors(1); + } + delete iter; + return s; + } + + virtual void VerifyDb(ThreadState* thread) const { + ReadOptions options(FLAGS_verify_checksum, true); + // We must set total_order_seek to true because we are doing a SeekToFirst + // on a column family whose memtables may support (by default) prefix-based + // iterator. In this case, NewIterator with options.total_order_seek being + // false returns a prefix-based iterator. Calling SeekToFirst using this + // iterator causes the iterator to become invalid. 
That means we cannot + // iterate the memtable using this iterator any more, although the memtable + // contains the most up-to-date key-values. + options.total_order_seek = true; + assert(thread != nullptr); + auto shared = thread->shared; + std::vector > iters(column_families_.size()); + for (size_t i = 0; i != column_families_.size(); ++i) { + iters[i].reset(db_->NewIterator(options, column_families_[i])); + } + for (auto& iter : iters) { + iter->SeekToFirst(); + } + size_t num = column_families_.size(); + assert(num == iters.size()); + std::vector statuses(num, Status::OK()); + do { + size_t valid_cnt = 0; + size_t idx = 0; + for (auto& iter : iters) { + if (iter->Valid()) { + ++valid_cnt; + } else { + statuses[idx] = iter->status(); + } + ++idx; + } + if (valid_cnt == 0) { + Status status; + for (size_t i = 0; i != num; ++i) { + const auto& s = statuses[i]; + if (!s.ok()) { + status = s; + fprintf(stderr, "Iterator on cf %s has error: %s\n", + column_families_[i]->GetName().c_str(), + s.ToString().c_str()); + shared->SetVerificationFailure(); + } + } + if (status.ok()) { + fprintf(stdout, "Finished scanning all column families.\n"); + } + break; + } else if (valid_cnt != iters.size()) { + for (size_t i = 0; i != num; ++i) { + if (!iters[i]->Valid()) { + if (statuses[i].ok()) { + fprintf(stderr, "Finished scanning cf %s\n", + column_families_[i]->GetName().c_str()); + } else { + fprintf(stderr, "Iterator on cf %s has error: %s\n", + column_families_[i]->GetName().c_str(), + statuses[i].ToString().c_str()); + } + } else { + fprintf(stderr, "cf %s has remaining data to scan\n", + column_families_[i]->GetName().c_str()); + } + } + shared->SetVerificationFailure(); + break; + } + // If the program reaches here, then all column families' iterators are + // still valid. 
+ Slice key; + Slice value; + for (size_t i = 0; i != num; ++i) { + if (i == 0) { + key = iters[i]->key(); + value = iters[i]->value(); + } else { + if (key.compare(iters[i]->key()) != 0) { + fprintf(stderr, "Verification failed\n"); + fprintf(stderr, "cf%s: %s => %s\n", + column_families_[0]->GetName().c_str(), + key.ToString(true /* hex */).c_str(), + value.ToString(/* hex */).c_str()); + fprintf(stderr, "cf%s: %s => %s\n", + column_families_[i]->GetName().c_str(), + iters[i]->key().ToString(true /* hex */).c_str(), + iters[i]->value().ToString(true /* hex */).c_str()); + shared->SetVerificationFailure(); + } + } + } + for (auto& iter : iters) { + iter->Next(); + } + } while (true); + } + + virtual std::vector GenerateColumnFamilies( + const int /* num_column_families */, int /* rand_column_family */) const { + std::vector ret; + int num = static_cast(column_families_.size()); + int k = 0; + std::generate_n(back_inserter(ret), num, [&k]() -> int { return k++; }); + return ret; + } + + private: + std::atomic batch_id_; +}; + } // namespace rocksdb int main(int argc, char** argv) { @@ -3415,6 +3777,11 @@ int main(int argc, char** argv) { "Error: nooverwritepercent must be 0 when using file ingestion\n"); exit(1); } + if (FLAGS_clear_column_family_one_in > 0 && FLAGS_backup_one_in > 0) { + fprintf(stderr, + "Error: clear_column_family_one_in must be 0 when using backup\n"); + exit(1); + } // Choose a location for the test database if none given with --db= if (FLAGS_db.empty()) { @@ -3428,7 +3795,9 @@ int main(int argc, char** argv) { rocksdb_kill_prefix_blacklist = SplitString(FLAGS_kill_prefix_blacklist); std::unique_ptr stress; - if (FLAGS_test_batches_snapshots) { + if (FLAGS_atomic_flush) { + stress.reset(new rocksdb::AtomicFlushStressTest()); + } else if (FLAGS_test_batches_snapshots) { stress.reset(new rocksdb::BatchedOpsStressTest()); } else { stress.reset(new rocksdb::NonBatchedOpsStressTest()); diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 4b6f6f4d8a2..997718ef28e 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -1964,11 +1964,11 @@ void DumpWalFile(std::string wal_file, bool print_header, bool print_values, bool is_write_committed, LDBCommandExecuteResult* exec_state) { Env* env_ = Env::Default(); EnvOptions soptions; - unique_ptr wal_file_reader; + std::unique_ptr wal_file_reader; Status status; { - unique_ptr file; + std::unique_ptr file; status = env_->NewSequentialFile(wal_file, &file, soptions); if (status.ok()) { wal_file_reader.reset( @@ -1999,7 +1999,8 @@ void DumpWalFile(std::string wal_file, bool print_header, bool print_values, } DBOptions db_options; log::Reader reader(db_options.info_log, std::move(wal_file_reader), - &reporter, true /* checksum */, log_number); + &reporter, true /* checksum */, log_number, + false /* retry_after_eof */); std::string scratch; WriteBatch batch; Slice record; @@ -2844,8 +2845,8 @@ void DumpSstFile(std::string filename, bool output_hex, bool show_properties) { return; } // no verification - rocksdb::SstFileReader reader(filename, false, output_hex); - Status st = reader.ReadSequential(true, std::numeric_limits::max(), false, // has_from + rocksdb::SstFileDumper dumper(filename, false, output_hex); + Status st = dumper.ReadSequential(true, std::numeric_limits::max(), false, // has_from from_key, false, // has_to to_key); if (!st.ok()) { @@ -2859,21 +2860,17 @@ void DumpSstFile(std::string filename, bool output_hex, bool show_properties) { std::shared_ptr table_properties_from_reader; - st = 
reader.ReadTableProperties(&table_properties_from_reader); + st = dumper.ReadTableProperties(&table_properties_from_reader); if (!st.ok()) { std::cerr << filename << ": " << st.ToString() << ". Try to use initial table properties" << std::endl; - table_properties = reader.GetInitTableProperties(); + table_properties = dumper.GetInitTableProperties(); } else { table_properties = table_properties_from_reader.get(); } if (table_properties != nullptr) { std::cout << std::endl << "Table Properties:" << std::endl; std::cout << table_properties->ToString("\n") << std::endl; - std::cout << "# deleted keys: " - << rocksdb::GetDeletedKeys( - table_properties->user_collected_properties) - << std::endl; } } } diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc index beab224d129..9032123cc6f 100644 --- a/tools/sst_dump_test.cc +++ b/tools/sst_dump_test.cc @@ -43,7 +43,7 @@ void createSST(const std::string& file_name, std::shared_ptr tf; tf.reset(new rocksdb::BlockBasedTableFactory(table_options)); - unique_ptr file; + std::unique_ptr file; Env* env = Env::Default(); EnvOptions env_options; ReadOptions read_options; @@ -51,7 +51,7 @@ void createSST(const std::string& file_name, const ImmutableCFOptions imoptions(opts); const MutableCFOptions moptions(opts); rocksdb::InternalKeyComparator ikc(opts.comparator); - unique_ptr tb; + std::unique_ptr tb; ASSERT_OK(env->NewWritableFile(file_name, &file, env_options)); diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index 6ca56aad98c..25699777e89 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -43,7 +43,7 @@ namespace rocksdb { -SstFileReader::SstFileReader(const std::string& file_path, bool verify_checksum, +SstFileDumper::SstFileDumper(const std::string& file_path, bool verify_checksum, bool output_hex) : file_name_(file_path), read_num_(0), @@ -74,7 +74,7 @@ static const std::vector> {CompressionType::kXpressCompression, "kXpressCompression"}, {CompressionType::kZSTD, "kZSTD"}}; -Status SstFileReader::GetTableReader(const std::string& file_path) { +Status SstFileDumper::GetTableReader(const std::string& file_path) { // Warning about 'magic_number' being uninitialized shows up only in UBsan // builds. Though access is guarded by 's.ok()' checks, fix the issue to // avoid any warnings. 
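// The AtomicFlushStressTest added above leans on two pieces of public API:
// the atomic_flush option and the DB::Flush() overload that takes a set of
// column family handles. A minimal sketch of how a user would drive it; the
// function name is an illustrative assumption.
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status FlushAllAtomically(
    rocksdb::DB* db, const std::vector<rocksdb::ColumnFamilyHandle*>& cfs) {
  // Requires the DB to have been opened with options.atomic_flush = true so
  // that the memtables of the listed column families are flushed as a unit.
  rocksdb::FlushOptions flush_opts;
  return db->Flush(flush_opts, cfs);
}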
@@ -83,7 +83,7 @@ Status SstFileReader::GetTableReader(const std::string& file_path) { // read table magic number Footer footer; - unique_ptr file; + std::unique_ptr file; uint64_t file_size = 0; Status s = options_.env->NewRandomAccessFile(file_path, &file, soptions_); if (s.ok()) { @@ -123,10 +123,10 @@ Status SstFileReader::GetTableReader(const std::string& file_path) { return s; } -Status SstFileReader::NewTableReader( +Status SstFileDumper::NewTableReader( const ImmutableCFOptions& /*ioptions*/, const EnvOptions& /*soptions*/, const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size, - unique_ptr* /*table_reader*/) { + std::unique_ptr* /*table_reader*/) { // We need to turn off pre-fetching of index and filter nodes for // BlockBasedTable if (BlockBasedTableFactory::kName == options_.table_factory->Name()) { @@ -143,12 +143,12 @@ Status SstFileReader::NewTableReader( std::move(file_), file_size, &table_reader_); } -Status SstFileReader::VerifyChecksum() { +Status SstFileDumper::VerifyChecksum() { return table_reader_->VerifyChecksum(); } -Status SstFileReader::DumpTable(const std::string& out_filename) { - unique_ptr out_file; +Status SstFileDumper::DumpTable(const std::string& out_filename) { + std::unique_ptr out_file; Env* env = Env::Default(); env->NewWritableFile(out_filename, &out_file, soptions_); Status s = table_reader_->DumpTable(out_file.get(), @@ -157,23 +157,23 @@ Status SstFileReader::DumpTable(const std::string& out_filename) { return s; } -uint64_t SstFileReader::CalculateCompressedTableSize( +uint64_t SstFileDumper::CalculateCompressedTableSize( const TableBuilderOptions& tb_options, size_t block_size) { - unique_ptr out_file; - unique_ptr env(NewMemEnv(Env::Default())); + std::unique_ptr out_file; + std::unique_ptr env(NewMemEnv(Env::Default())); env->NewWritableFile(testFileName, &out_file, soptions_); - unique_ptr dest_writer; + std::unique_ptr dest_writer; dest_writer.reset( new WritableFileWriter(std::move(out_file), testFileName, soptions_)); BlockBasedTableOptions table_options; table_options.block_size = block_size; BlockBasedTableFactory block_based_tf(table_options); - unique_ptr table_builder; + std::unique_ptr table_builder; table_builder.reset(block_based_tf.NewTableBuilder( tb_options, TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, dest_writer.get())); - unique_ptr iter(table_reader_->NewIterator( + std::unique_ptr iter(table_reader_->NewIterator( ReadOptions(), moptions_.prefix_extractor.get())); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { if (!iter->status().ok()) { @@ -192,7 +192,7 @@ uint64_t SstFileReader::CalculateCompressedTableSize( return size; } -int SstFileReader::ShowAllCompressionSizes( +int SstFileDumper::ShowAllCompressionSizes( size_t block_size, const std::vector>& compression_types) { @@ -226,7 +226,7 @@ int SstFileReader::ShowAllCompressionSizes( return 0; } -Status SstFileReader::ReadTableProperties(uint64_t table_magic_number, +Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number, RandomAccessFileReader* file, uint64_t file_size) { TableProperties* table_properties = nullptr; @@ -240,7 +240,7 @@ Status SstFileReader::ReadTableProperties(uint64_t table_magic_number, return s; } -Status SstFileReader::SetTableOptionsByMagicNumber( +Status SstFileDumper::SetTableOptionsByMagicNumber( uint64_t table_magic_number) { assert(table_properties_); if (table_magic_number == kBlockBasedTableMagicNumber || @@ -283,7 +283,7 @@ Status SstFileReader::SetTableOptionsByMagicNumber( return 
Status::OK(); } -Status SstFileReader::SetOldTableOptions() { +Status SstFileDumper::SetOldTableOptions() { assert(table_properties_ == nullptr); options_.table_factory = std::make_shared(); fprintf(stdout, "Sst file format: block-based(old version)\n"); @@ -291,7 +291,7 @@ Status SstFileReader::SetOldTableOptions() { return Status::OK(); } -Status SstFileReader::ReadSequential(bool print_kv, uint64_t read_num, +Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, bool has_from, const std::string& from_key, bool has_to, const std::string& to_key, bool use_from_as_prefix) { @@ -348,7 +348,7 @@ Status SstFileReader::ReadSequential(bool print_kv, uint64_t read_num, return ret; } -Status SstFileReader::ReadTableProperties( +Status SstFileDumper::ReadTableProperties( std::shared_ptr* table_properties) { if (!table_reader_) { return init_result_; @@ -570,16 +570,16 @@ int SSTDumpTool::Run(int argc, char** argv) { filename = std::string(dir_or_file) + "/" + filename; } - rocksdb::SstFileReader reader(filename, verify_checksum, + rocksdb::SstFileDumper dumper(filename, verify_checksum, output_hex); - if (!reader.getStatus().ok()) { + if (!dumper.getStatus().ok()) { fprintf(stderr, "%s: %s\n", filename.c_str(), - reader.getStatus().ToString().c_str()); + dumper.getStatus().ToString().c_str()); continue; } if (command == "recompress") { - reader.ShowAllCompressionSizes( + dumper.ShowAllCompressionSizes( set_block_size ? block_size : 16384, compression_types.empty() ? kCompressions : compression_types); return 0; @@ -589,7 +589,7 @@ int SSTDumpTool::Run(int argc, char** argv) { std::string out_filename = filename.substr(0, filename.length() - 4); out_filename.append("_dump.txt"); - st = reader.DumpTable(out_filename); + st = dumper.DumpTable(out_filename); if (!st.ok()) { fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); exit(1); @@ -601,7 +601,7 @@ int SSTDumpTool::Run(int argc, char** argv) { // scan all files in give file path. if (command == "" || command == "scan" || command == "check") { - st = reader.ReadSequential( + st = dumper.ReadSequential( command == "scan", read_num > 0 ? 
(read_num - total_read) : read_num, has_from || use_from_as_prefix, from_key, has_to, to_key, use_from_as_prefix); @@ -609,14 +609,14 @@ int SSTDumpTool::Run(int argc, char** argv) { fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); } - total_read += reader.GetReadNumber(); + total_read += dumper.GetReadNumber(); if (read_num > 0 && total_read > read_num) { break; } } if (command == "verify") { - st = reader.VerifyChecksum(); + st = dumper.VerifyChecksum(); if (!st.ok()) { fprintf(stderr, "%s is corrupted: %s\n", filename.c_str(), st.ToString().c_str()); @@ -631,11 +631,11 @@ int SSTDumpTool::Run(int argc, char** argv) { std::shared_ptr table_properties_from_reader; - st = reader.ReadTableProperties(&table_properties_from_reader); + st = dumper.ReadTableProperties(&table_properties_from_reader); if (!st.ok()) { fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); fprintf(stderr, "Try to use initial table properties\n"); - table_properties = reader.GetInitTableProperties(); + table_properties = dumper.GetInitTableProperties(); } else { table_properties = table_properties_from_reader.get(); } @@ -646,19 +646,6 @@ int SSTDumpTool::Run(int argc, char** argv) { "------------------------------\n" " %s", table_properties->ToString("\n ", ": ").c_str()); - fprintf(stdout, "# deleted keys: %" PRIu64 "\n", - rocksdb::GetDeletedKeys( - table_properties->user_collected_properties)); - - bool property_present; - uint64_t merge_operands = rocksdb::GetMergeOperands( - table_properties->user_collected_properties, &property_present); - if (property_present) { - fprintf(stdout, " # merge operands: %" PRIu64 "\n", - merge_operands); - } else { - fprintf(stdout, " # merge operands: UNKNOWN\n"); - } } total_num_files += 1; total_num_data_blocks += table_properties->num_data_blocks; diff --git a/tools/sst_dump_tool_imp.h b/tools/sst_dump_tool_imp.h index ca60dd93c9c..9e83d8d0402 100644 --- a/tools/sst_dump_tool_imp.h +++ b/tools/sst_dump_tool_imp.h @@ -15,9 +15,9 @@ namespace rocksdb { -class SstFileReader { +class SstFileDumper { public: - explicit SstFileReader(const std::string& file_name, bool verify_checksum, + explicit SstFileDumper(const std::string& file_name, bool verify_checksum, bool output_hex); Status ReadSequential(bool print_kv, uint64_t read_num, bool has_from, @@ -57,7 +57,7 @@ class SstFileReader { const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, uint64_t file_size, - unique_ptr* table_reader); + std::unique_ptr* table_reader); std::string file_name_; uint64_t read_num_; @@ -70,13 +70,13 @@ class SstFileReader { Options options_; Status init_result_; - unique_ptr table_reader_; - unique_ptr file_; + std::unique_ptr table_reader_; + std::unique_ptr file_; const ImmutableCFOptions ioptions_; const MutableCFOptions moptions_; InternalKeyComparator internal_comparator_; - unique_ptr table_properties_; + std::unique_ptr table_properties_; }; } // namespace rocksdb diff --git a/tools/trace_analyzer_tool.cc b/tools/trace_analyzer_tool.cc index 7915322f0e7..49f2175a394 100644 --- a/tools/trace_analyzer_tool.cc +++ b/tools/trace_analyzer_tool.cc @@ -139,7 +139,7 @@ DEFINE_bool(no_key, false, DEFINE_bool(print_overall_stats, true, " Print the stats of the whole trace, " "like total requests, keys, and etc."); -DEFINE_bool(print_key_distribution, false, "Print the key size distribution."); +DEFINE_bool(output_key_distribution, false, "Print the key size distribution."); DEFINE_bool( output_value_distribution, false, "Out put the value size 
distribution, only available for Put and Merge.\n" @@ -158,6 +158,9 @@ DEFINE_int32(value_interval, 8, "To output the value distribution, we need to set the value " "intervals and make the statistic of the value size distribution " "in different intervals. The default is 8."); +DEFINE_double(sample_ratio, 1.0, + "If the trace size is extremely huge or user want to sample " + "the trace when analyzing, sample ratio can be set (0, 1.0]"); namespace rocksdb { @@ -276,9 +279,17 @@ TraceAnalyzer::TraceAnalyzer(std::string& trace_path, std::string& output_path, total_access_keys_ = 0; total_gets_ = 0; total_writes_ = 0; + trace_create_time_ = 0; begin_time_ = 0; end_time_ = 0; time_series_start_ = 0; + cur_time_sec_ = 0; + if (FLAGS_sample_ratio > 1.0 || FLAGS_sample_ratio <= 0) { + sample_max_ = 1; + } else { + sample_max_ = static_cast(1.0 / FLAGS_sample_ratio); + } + ta_.resize(kTaTypeNum); ta_[0].type_name = "get"; if (FLAGS_analyze_get) { @@ -328,6 +339,9 @@ TraceAnalyzer::TraceAnalyzer(std::string& trace_path, std::string& output_path, } else { ta_[7].enabled = false; } + for (int i = 0; i < kTaTypeNum; i++) { + ta_[i].sample_count = 0; + } } TraceAnalyzer::~TraceAnalyzer() {} @@ -363,6 +377,13 @@ Status TraceAnalyzer::PrepareProcessing() { if (!s.ok()) { return s; } + + qps_stats_name = + output_path_ + "/" + FLAGS_output_prefix + "-cf_qps_stats.txt"; + s = env_->NewWritableFile(qps_stats_name, &cf_qps_f_, env_options_); + if (!s.ok()) { + return s; + } } return Status::OK(); } @@ -422,6 +443,7 @@ Status TraceAnalyzer::StartProcessing() { fprintf(stderr, "Cannot read the header\n"); return s; } + trace_create_time_ = header.ts; if (FLAGS_output_time_series) { time_series_start_ = header.ts; } @@ -521,7 +543,7 @@ Status TraceAnalyzer::MakeStatistics() { } // Generate the key size distribution data - if (FLAGS_print_key_distribution) { + if (FLAGS_output_key_distribution) { if (stat.second.a_key_size_stats.find(record.first.size()) == stat.second.a_key_size_stats.end()) { stat.second.a_key_size_stats[record.first.size()] = 1; @@ -565,17 +587,31 @@ Status TraceAnalyzer::MakeStatistics() { // find the medium of the key size uint64_t k_count = 0; + bool get_mid = false; for (auto& record : stat.second.a_key_size_stats) { k_count += record.second; - if (k_count >= stat.second.a_key_mid) { + if (!get_mid && k_count >= stat.second.a_key_mid) { stat.second.a_key_mid = record.first; - break; + get_mid = true; + } + if (FLAGS_output_key_distribution && stat.second.a_key_size_f) { + ret = sprintf(buffer_, "%" PRIu64 " %" PRIu64 "\n", record.first, + record.second); + if (ret < 0) { + return Status::IOError("Format output failed"); + } + std::string printout(buffer_); + s = stat.second.a_key_size_f->Append(printout); + if (!s.ok()) { + fprintf(stderr, "Write key size distribution file failed\n"); + return s; + } } } // output the value size distribution uint64_t v_begin = 0, v_end = 0, v_count = 0; - bool get_mid = false; + get_mid = false; for (auto& record : stat.second.a_value_size_stats) { v_begin = v_end; v_end = (record.first + 1) * FLAGS_value_interval; @@ -740,6 +776,9 @@ Status TraceAnalyzer::MakeStatisticCorrelation(TraceStats& stats, // Process the statistics of QPS Status TraceAnalyzer::MakeStatisticQPS() { + if(begin_time_ == 0) { + begin_time_ = trace_create_time_; + } uint32_t duration = static_cast((end_time_ - begin_time_) / 1000000); int ret; @@ -818,6 +857,32 @@ Status TraceAnalyzer::MakeStatisticQPS() { stat.second.a_ave_qps = (static_cast(cf_qps_sum)) / duration; } + // Output the 
accessed unique key number change overtime + if (stat.second.a_key_num_f) { + uint64_t cur_uni_key = + static_cast(stat.second.a_key_stats.size()); + double cur_ratio = 0.0; + uint64_t cur_num = 0; + for (uint32_t i = 0; i < duration; i++) { + auto find_time = stat.second.uni_key_num.find(i); + if (find_time != stat.second.uni_key_num.end()) { + cur_ratio = (static_cast(find_time->second)) / cur_uni_key; + cur_num = find_time->second; + } + ret = sprintf(buffer_, "%" PRIu64 " %.12f\n", cur_num, cur_ratio); + if (ret < 0) { + return Status::IOError("Format the output failed"); + } + std::string printout(buffer_); + s = stat.second.a_key_num_f->Append(printout); + if (!s.ok()) { + fprintf(stderr, + "Write accessed unique key number change file failed\n"); + return s; + } + } + } + // output the prefix of top k access peak if (FLAGS_output_prefix_cut > 0 && stat.second.a_top_qps_prefix_f) { while (!stat.second.top_k_qps_sec.empty()) { @@ -882,6 +947,33 @@ Status TraceAnalyzer::MakeStatisticQPS() { } } + if (cf_qps_f_) { + int cfs_size = static_cast(cfs_.size()); + uint32_t v; + for (uint32_t i = 0; i < duration; i++) { + for (int cf = 0; cf < cfs_size; cf++) { + if (cfs_[cf].cf_qps.find(i) != cfs_[cf].cf_qps.end()) { + v = cfs_[cf].cf_qps[i]; + } else { + v = 0; + } + if (cf < cfs_size - 1) { + ret = sprintf(buffer_, "%u ", v); + } else { + ret = sprintf(buffer_, "%u\n", v); + } + if (ret < 0) { + return Status::IOError("Format the output failed"); + } + std::string printout(buffer_); + s = cf_qps_f_->Append(printout); + if (!s.ok()) { + return s; + } + } + } + } + qps_peak_ = qps_peak; for (int type = 0; type <= kTaTypeNum; type++) { if (duration == 0) { @@ -1010,7 +1102,7 @@ Status TraceAnalyzer::ReProcessing() { } // Make the statistics fo the key size distribution - if (FLAGS_print_key_distribution) { + if (FLAGS_output_key_distribution) { if (cfs_[cf_id].w_key_size_stats.find(input_key.size()) == cfs_[cf_id].w_key_size_stats.end()) { cfs_[cf_id].w_key_size_stats[input_key.size()] = 1; @@ -1129,6 +1221,11 @@ Status TraceAnalyzer::KeyStatsInsertion(const uint32_t& type, tmp_qps_map[prefix] = 1; ta_[type].stats[cf_id].a_qps_prefix_stats[time_in_sec] = tmp_qps_map; } + if (time_in_sec != cur_time_sec_) { + ta_[type].stats[cf_id].uni_key_num[cur_time_sec_] = + static_cast(ta_[type].stats[cf_id].a_key_stats.size()); + cur_time_sec_ = time_in_sec; + } } else { found_stats->second.a_count++; found_stats->second.a_key_size_sqsum += MultiplyCheckOverflow( @@ -1149,6 +1246,11 @@ Status TraceAnalyzer::KeyStatsInsertion(const uint32_t& type, s = StatsUnitCorrelationUpdate(found_key->second, type, ts, key); } } + if (time_in_sec != cur_time_sec_) { + found_stats->second.uni_key_num[cur_time_sec_] = + static_cast(found_stats->second.a_key_stats.size()); + cur_time_sec_ = time_in_sec; + } auto found_value = found_stats->second.a_value_size_stats.find(dist_value_size); @@ -1189,6 +1291,10 @@ Status TraceAnalyzer::KeyStatsInsertion(const uint32_t& type, cfs_[cf_id] = cf_unit; } + if (FLAGS_output_qps_stats) { + cfs_[cf_id].cf_qps[time_in_sec]++; + } + if (FLAGS_output_time_series) { TraceUnit trace_u; trace_u.type = type; @@ -1251,6 +1357,9 @@ Status TraceAnalyzer::OpenStatsOutputFiles(const std::string& type, if (FLAGS_output_key_stats) { s = CreateOutputFile(type, new_stats.cf_name, "accessed_key_stats.txt", &new_stats.a_key_f); + s = CreateOutputFile(type, new_stats.cf_name, + "accessed_unique_key_num_change.txt", + &new_stats.a_key_num_f); if (!FLAGS_key_space_dir.empty()) { s = CreateOutputFile(type, 
new_stats.cf_name, "whole_key_stats.txt", &new_stats.w_key_f); @@ -1289,6 +1398,12 @@ Status TraceAnalyzer::OpenStatsOutputFiles(const std::string& type, &new_stats.a_value_size_f); } + if (FLAGS_output_key_distribution) { + s = CreateOutputFile(type, new_stats.cf_name, + "accessed_key_size_distribution.txt", + &new_stats.a_key_size_f); + } + if (FLAGS_output_qps_stats) { s = CreateOutputFile(type, new_stats.cf_name, "qps_stats.txt", &new_stats.a_qps_f); @@ -1328,6 +1443,10 @@ void TraceAnalyzer::CloseOutputFiles() { stat.second.a_key_f->Close(); } + if (stat.second.a_key_num_f) { + stat.second.a_key_num_f->Close(); + } + if (stat.second.a_count_dist_f) { stat.second.a_count_dist_f->Close(); } @@ -1340,6 +1459,10 @@ void TraceAnalyzer::CloseOutputFiles() { stat.second.a_value_size_f->Close(); } + if (stat.second.a_key_size_f) { + stat.second.a_key_size_f->Close(); + } + if (stat.second.a_qps_f) { stat.second.a_qps_f->Close(); } @@ -1373,6 +1496,15 @@ Status TraceAnalyzer::HandleGet(uint32_t column_family_id, } } + if (ta_[TraceOperationType::kGet].sample_count >= sample_max_) { + ta_[TraceOperationType::kGet].sample_count = 0; + } + if (ta_[TraceOperationType::kGet].sample_count > 0) { + ta_[TraceOperationType::kGet].sample_count++; + return Status::OK(); + } + ta_[TraceOperationType::kGet].sample_count++; + if (!ta_[TraceOperationType::kGet].enabled) { return Status::OK(); } @@ -1400,6 +1532,15 @@ Status TraceAnalyzer::HandlePut(uint32_t column_family_id, const Slice& key, } } + if (ta_[TraceOperationType::kPut].sample_count >= sample_max_) { + ta_[TraceOperationType::kPut].sample_count = 0; + } + if (ta_[TraceOperationType::kPut].sample_count > 0) { + ta_[TraceOperationType::kPut].sample_count++; + return Status::OK(); + } + ta_[TraceOperationType::kPut].sample_count++; + if (!ta_[TraceOperationType::kPut].enabled) { return Status::OK(); } @@ -1424,6 +1565,15 @@ Status TraceAnalyzer::HandleDelete(uint32_t column_family_id, } } + if (ta_[TraceOperationType::kDelete].sample_count >= sample_max_) { + ta_[TraceOperationType::kDelete].sample_count = 0; + } + if (ta_[TraceOperationType::kDelete].sample_count > 0) { + ta_[TraceOperationType::kDelete].sample_count++; + return Status::OK(); + } + ta_[TraceOperationType::kDelete].sample_count++; + if (!ta_[TraceOperationType::kDelete].enabled) { return Status::OK(); } @@ -1448,6 +1598,15 @@ Status TraceAnalyzer::HandleSingleDelete(uint32_t column_family_id, } } + if (ta_[TraceOperationType::kSingleDelete].sample_count >= sample_max_) { + ta_[TraceOperationType::kSingleDelete].sample_count = 0; + } + if (ta_[TraceOperationType::kSingleDelete].sample_count > 0) { + ta_[TraceOperationType::kSingleDelete].sample_count++; + return Status::OK(); + } + ta_[TraceOperationType::kSingleDelete].sample_count++; + if (!ta_[TraceOperationType::kSingleDelete].enabled) { return Status::OK(); } @@ -1473,6 +1632,15 @@ Status TraceAnalyzer::HandleDeleteRange(uint32_t column_family_id, } } + if (ta_[TraceOperationType::kRangeDelete].sample_count >= sample_max_) { + ta_[TraceOperationType::kRangeDelete].sample_count = 0; + } + if (ta_[TraceOperationType::kRangeDelete].sample_count > 0) { + ta_[TraceOperationType::kRangeDelete].sample_count++; + return Status::OK(); + } + ta_[TraceOperationType::kRangeDelete].sample_count++; + if (!ta_[TraceOperationType::kRangeDelete].enabled) { return Status::OK(); } @@ -1499,6 +1667,15 @@ Status TraceAnalyzer::HandleMerge(uint32_t column_family_id, const Slice& key, } } + if (ta_[TraceOperationType::kMerge].sample_count >= 
sample_max_) { + ta_[TraceOperationType::kMerge].sample_count = 0; + } + if (ta_[TraceOperationType::kMerge].sample_count > 0) { + ta_[TraceOperationType::kMerge].sample_count++; + return Status::OK(); + } + ta_[TraceOperationType::kMerge].sample_count++; + if (!ta_[TraceOperationType::kMerge].enabled) { return Status::OK(); } @@ -1535,6 +1712,15 @@ Status TraceAnalyzer::HandleIter(uint32_t column_family_id, } } + if (ta_[type].sample_count >= sample_max_) { + ta_[type].sample_count = 0; + } + if (ta_[type].sample_count > 0) { + ta_[type].sample_count++; + return Status::OK(); + } + ta_[type].sample_count++; + if (!ta_[type].enabled) { return Status::OK(); } @@ -1596,6 +1782,8 @@ void TraceAnalyzer::PrintStatistics() { ta_[type].total_succ_access += stat.a_succ_count; printf("*********************************************************\n"); printf("colume family id: %u\n", stat.cf_id); + printf("Total number of queries to this cf by %s: %" PRIu64 "\n", + ta_[type].type_name.c_str(), stat.a_count); printf("Total unique keys in this cf: %" PRIu64 "\n", total_a_keys); printf("Average key size: %f key size medium: %" PRIu64 " Key size Variation: %f\n", @@ -1642,15 +1830,6 @@ void TraceAnalyzer::PrintStatistics() { } } - // print the key size distribution - if (FLAGS_print_key_distribution) { - printf("The key size distribution\n"); - for (auto& record : stat.a_key_size_stats) { - printf("key_size %" PRIu64 " nums: %" PRIu64 "\n", record.first, - record.second); - } - } - // print the operation correlations if (!FLAGS_print_correlation.empty()) { for (int correlation = 0; @@ -1700,6 +1879,8 @@ void TraceAnalyzer::PrintStatistics() { printf("Average QPS per second: %f Peak QPS: %u\n", qps_ave_[kTaTypeNum], qps_peak_[kTaTypeNum]); } + printf("The statistics related to query number need to times: %u\n", + sample_max_); printf("Total_requests: %" PRIu64 " Total_accessed_keys: %" PRIu64 " Total_gets: %" PRIu64 " Total_write_batch: %" PRIu64 "\n", total_requests_, total_access_keys_, total_gets_, total_writes_); diff --git a/tools/trace_analyzer_tool.h b/tools/trace_analyzer_tool.h index ac9f42f1c07..be96f5005da 100644 --- a/tools/trace_analyzer_tool.h +++ b/tools/trace_analyzer_tool.h @@ -115,12 +115,15 @@ struct TraceStats { top_k_qps_sec; std::list time_series; std::vector> correlation_output; + std::map uni_key_num; std::unique_ptr time_series_f; std::unique_ptr a_key_f; std::unique_ptr a_count_dist_f; std::unique_ptr a_prefix_cut_f; std::unique_ptr a_value_size_f; + std::unique_ptr a_key_size_f; + std::unique_ptr a_key_num_f; std::unique_ptr a_qps_f; std::unique_ptr a_top_qps_prefix_f; std::unique_ptr w_key_f; @@ -140,6 +143,7 @@ struct TypeUnit { uint64_t total_keys; uint64_t total_access; uint64_t total_succ_access; + uint32_t sample_count; std::map stats; TypeUnit() = default; ~TypeUnit() = default; @@ -155,6 +159,7 @@ struct CfUnit { uint64_t a_count; // the total keys in this cf that are accessed std::map w_key_size_stats; // whole key space key size // statistic this cf + std::map cf_qps; }; class TraceAnalyzer { @@ -204,11 +209,15 @@ class TraceAnalyzer { uint64_t total_access_keys_; uint64_t total_gets_; uint64_t total_writes_; + uint64_t trace_create_time_; uint64_t begin_time_; uint64_t end_time_; uint64_t time_series_start_; + uint32_t sample_max_; + uint32_t cur_time_sec_; std::unique_ptr trace_sequence_f_; // readable trace std::unique_ptr qps_f_; // overall qps + std::unique_ptr cf_qps_f_; // The qps of each CF> std::unique_ptr wkey_input_f_; std::vector ta_; // The main statistic 
collecting data structure std::map cfs_; // All the cf_id appears in this trace; diff --git a/util/auto_roll_logger_test.cc b/util/auto_roll_logger_test.cc index 5a6b3abc112..284a9815218 100644 --- a/util/auto_roll_logger_test.cc +++ b/util/auto_roll_logger_test.cc @@ -230,7 +230,7 @@ TEST_F(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) { TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { DBOptions options; NoSleepEnv nse(Env::Default()); - shared_ptr logger; + std::shared_ptr logger; // Normal logger ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); @@ -273,7 +273,7 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { TEST_F(AutoRollLoggerTest, LogFlushWhileRolling) { DBOptions options; - shared_ptr logger; + std::shared_ptr logger; InitTestDb(); options.max_log_file_size = 1024 * 5; diff --git a/util/autovector.h b/util/autovector.h index b5c84712450..97348d818a0 100644 --- a/util/autovector.h +++ b/util/autovector.h @@ -271,7 +271,12 @@ class autovector { template void emplace_back(Args&&... args) { - push_back(value_type(args...)); + if (num_stack_items_ < kSize) { + values_[num_stack_items_++] = + std::move(value_type(std::forward(args)...)); + } else { + vect_.emplace_back(std::forward(args)...); + } } void pop_back() { diff --git a/util/compression.h b/util/compression.h index e918e14fbec..e91faeac658 100644 --- a/util/compression.h +++ b/util/compression.h @@ -14,8 +14,10 @@ #include #include "rocksdb/options.h" +#include "rocksdb/table.h" #include "util/coding.h" #include "util/compression_context_cache.h" +#include "util/memory_allocator.h" #ifdef SNAPPY #include @@ -495,11 +497,10 @@ inline bool Zlib_Compress(const CompressionContext& ctx, // header in varint32 format // @param compression_dict Data for presetting the compression library's // dictionary. -inline char* Zlib_Uncompress(const UncompressionContext& ctx, - const char* input_data, size_t input_length, - int* decompress_size, - uint32_t compress_format_version, - int windowBits = -14) { +inline CacheAllocationPtr Zlib_Uncompress( + const UncompressionContext& ctx, const char* input_data, + size_t input_length, int* decompress_size, uint32_t compress_format_version, + MemoryAllocator* allocator = nullptr, int windowBits = -14) { #ifdef ZLIB uint32_t output_len = 0; if (compress_format_version == 2) { @@ -541,9 +542,9 @@ inline char* Zlib_Uncompress(const UncompressionContext& ctx, _stream.next_in = (Bytef*)input_data; _stream.avail_in = static_cast(input_length); - char* output = new char[output_len]; + auto output = AllocateBlock(output_len, allocator); - _stream.next_out = (Bytef*)output; + _stream.next_out = (Bytef*)output.get(); _stream.avail_out = static_cast(output_len); bool done = false; @@ -561,19 +562,17 @@ inline char* Zlib_Uncompress(const UncompressionContext& ctx, size_t old_sz = output_len; uint32_t output_len_delta = output_len / 5; output_len += output_len_delta < 10 ? 10 : output_len_delta; - char* tmp = new char[output_len]; - memcpy(tmp, output, old_sz); - delete[] output; - output = tmp; + auto tmp = AllocateBlock(output_len, allocator); + memcpy(tmp.get(), output.get(), old_sz); + output = std::move(tmp); // Set more output. 
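// Note: AllocateBlock() returns a CacheAllocationPtr, a unique_ptr over the decompression buffer
// whose CustomDeleter hands the memory back to the supplied MemoryAllocator, or falls back to
// delete[] when no allocator is given; that is why the error paths below can simply return
// nullptr without freeing the buffer manually.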
- _stream.next_out = (Bytef*)(output + old_sz); + _stream.next_out = (Bytef*)(output.get() + old_sz); _stream.avail_out = static_cast(output_len - old_sz); break; } case Z_BUF_ERROR: default: - delete[] output; inflateEnd(&_stream); return nullptr; } @@ -590,6 +589,7 @@ inline char* Zlib_Uncompress(const UncompressionContext& ctx, (void)input_length; (void)decompress_size; (void)compress_format_version; + (void)allocator; (void)windowBits; return nullptr; #endif @@ -660,9 +660,9 @@ inline bool BZip2_Compress(const CompressionContext& /*ctx*/, // block header // compress_format_version == 2 -- decompressed size is included in the block // header in varint32 format -inline char* BZip2_Uncompress(const char* input_data, size_t input_length, - int* decompress_size, - uint32_t compress_format_version) { +inline CacheAllocationPtr BZip2_Uncompress( + const char* input_data, size_t input_length, int* decompress_size, + uint32_t compress_format_version, MemoryAllocator* allocator = nullptr) { #ifdef BZIP2 uint32_t output_len = 0; if (compress_format_version == 2) { @@ -690,9 +690,9 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length, _stream.next_in = (char*)input_data; _stream.avail_in = static_cast(input_length); - char* output = new char[output_len]; + auto output = AllocateBlock(output_len, allocator); - _stream.next_out = (char*)output; + _stream.next_out = (char*)output.get(); _stream.avail_out = static_cast(output_len); bool done = false; @@ -709,18 +709,16 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length, assert(compress_format_version != 2); uint32_t old_sz = output_len; output_len = output_len * 1.2; - char* tmp = new char[output_len]; - memcpy(tmp, output, old_sz); - delete[] output; - output = tmp; + auto tmp = AllocateBlock(output_len, allocator); + memcpy(tmp.get(), output.get(), old_sz); + output = std::move(tmp); // Set more output. - _stream.next_out = (char*)(output + old_sz); + _stream.next_out = (char*)(output.get() + old_sz); _stream.avail_out = static_cast(output_len - old_sz); break; } default: - delete[] output; BZ2_bzDecompressEnd(&_stream); return nullptr; } @@ -736,6 +734,7 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length, (void)input_length; (void)decompress_size; (void)compress_format_version; + (void)allocator; return nullptr; #endif } @@ -791,6 +790,7 @@ inline bool LZ4_Compress(const CompressionContext& ctx, #else // up to r123 outlen = LZ4_compress_limitedOutput(input, &(*output)[output_header_len], static_cast(length), compress_bound); + (void)ctx; #endif // LZ4_VERSION_NUMBER >= 10400 if (outlen == 0) { @@ -814,10 +814,12 @@ inline bool LZ4_Compress(const CompressionContext& ctx, // header in varint32 format // @param compression_dict Data for presetting the compression library's // dictionary. 
-inline char* LZ4_Uncompress(const UncompressionContext& ctx, - const char* input_data, size_t input_length, - int* decompress_size, - uint32_t compress_format_version) { +inline CacheAllocationPtr LZ4_Uncompress(const UncompressionContext& ctx, + const char* input_data, + size_t input_length, + int* decompress_size, + uint32_t compress_format_version, + MemoryAllocator* allocator = nullptr) { #ifdef LZ4 uint32_t output_len = 0; if (compress_format_version == 2) { @@ -837,7 +839,7 @@ inline char* LZ4_Uncompress(const UncompressionContext& ctx, input_data += 8; } - char* output = new char[output_len]; + auto output = AllocateBlock(output_len, allocator); #if LZ4_VERSION_NUMBER >= 10400 // r124+ LZ4_streamDecode_t* stream = LZ4_createStreamDecode(); if (ctx.dict().size()) { @@ -845,17 +847,17 @@ inline char* LZ4_Uncompress(const UncompressionContext& ctx, static_cast(ctx.dict().size())); } *decompress_size = LZ4_decompress_safe_continue( - stream, input_data, output, static_cast(input_length), + stream, input_data, output.get(), static_cast(input_length), static_cast(output_len)); LZ4_freeStreamDecode(stream); #else // up to r123 - *decompress_size = - LZ4_decompress_safe(input_data, output, static_cast(input_length), - static_cast(output_len)); + *decompress_size = LZ4_decompress_safe(input_data, output.get(), + static_cast(input_length), + static_cast(output_len)); + (void)ctx; #endif // LZ4_VERSION_NUMBER >= 10400 if (*decompress_size < 0) { - delete[] output; return nullptr; } assert(*decompress_size == static_cast(output_len)); @@ -866,6 +868,7 @@ inline char* LZ4_Uncompress(const UncompressionContext& ctx, (void)input_length; (void)decompress_size; (void)compress_format_version; + (void)allocator; return nullptr; #endif } @@ -1028,9 +1031,10 @@ inline bool ZSTD_Compress(const CompressionContext& ctx, const char* input, // @param compression_dict Data for presetting the compression library's // dictionary. 
-inline char* ZSTD_Uncompress(const UncompressionContext& ctx, - const char* input_data, size_t input_length, - int* decompress_size) { +inline CacheAllocationPtr ZSTD_Uncompress( + const UncompressionContext& ctx, const char* input_data, + size_t input_length, int* decompress_size, + MemoryAllocator* allocator = nullptr) { #ifdef ZSTD uint32_t output_len = 0; if (!compression::GetDecompressedSizeInfo(&input_data, &input_length, @@ -1038,17 +1042,17 @@ inline char* ZSTD_Uncompress(const UncompressionContext& ctx, return nullptr; } - char* output = new char[output_len]; + auto output = AllocateBlock(output_len, allocator); size_t actual_output_length; #if ZSTD_VERSION_NUMBER >= 500 // v0.5.0+ ZSTD_DCtx* context = ctx.GetZSTDContext(); assert(context != nullptr); actual_output_length = ZSTD_decompress_usingDict( - context, output, output_len, input_data, input_length, ctx.dict().data(), - ctx.dict().size()); + context, output.get(), output_len, input_data, input_length, + ctx.dict().data(), ctx.dict().size()); #else // up to v0.4.x actual_output_length = - ZSTD_decompress(output, output_len, input_data, input_length); + ZSTD_decompress(output.get(), output_len, input_data, input_length); #endif // ZSTD_VERSION_NUMBER >= 500 assert(actual_output_length == output_len); *decompress_size = static_cast(actual_output_length); @@ -1058,6 +1062,7 @@ inline char* ZSTD_Uncompress(const UncompressionContext& ctx, (void)input_data; (void)input_length; (void)decompress_size; + (void)allocator; return nullptr; #endif } diff --git a/util/delete_scheduler.cc b/util/delete_scheduler.cc index 1d51055a3bf..f5ee2844896 100644 --- a/util/delete_scheduler.cc +++ b/util/delete_scheduler.cc @@ -52,11 +52,12 @@ DeleteScheduler::~DeleteScheduler() { } Status DeleteScheduler::DeleteFile(const std::string& file_path, - const std::string& dir_to_sync) { + const std::string& dir_to_sync, + const bool force_bg) { Status s; - if (rate_bytes_per_sec_.load() <= 0 || + if (rate_bytes_per_sec_.load() <= 0 || (!force_bg && total_trash_size_.load() > - sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load()) { + sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load())) { // Rate limiting is disabled or trash size makes up more than // max_trash_db_ratio_ (default 25%) of the total DB size TEST_SYNC_POINT("DeleteScheduler::DeleteFile"); @@ -275,7 +276,7 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash, Status my_status = env_->NumFileLinks(path_in_trash, &num_hard_links); if (my_status.ok()) { if (num_hard_links == 1) { - unique_ptr wf; + std::unique_ptr wf; my_status = env_->ReopenWritableFile(path_in_trash, &wf, EnvOptions()); if (my_status.ok()) { diff --git a/util/delete_scheduler.h b/util/delete_scheduler.h index cbd13ecefd0..29b70517b67 100644 --- a/util/delete_scheduler.h +++ b/util/delete_scheduler.h @@ -46,8 +46,11 @@ class DeleteScheduler { rate_bytes_per_sec_.store(bytes_per_sec); } - // Mark file as trash directory and schedule it's deletion - Status DeleteFile(const std::string& fname, const std::string& dir_to_sync); + // Mark file as trash directory and schedule it's deletion. If force_bg is + // set, it forces the file to always be deleted in the background thread, + // except when rate limiting is disabled + Status DeleteFile(const std::string& fname, const std::string& dir_to_sync, + const bool force_bg = false); // Wait for all files being deleteing in the background to finish or for // destructor to be called. 
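The `force_bg` flag added to `DeleteScheduler::DeleteFile` above lets a caller insist on background, rate-limited deletion even when accumulated trash already exceeds `max_trash_db_ratio` (25% of the DB size by default); only a disabled rate limit still falls back to an immediate delete. Below is a minimal caller sketch using the `DeleteDBFile` helper this patch introduces in `util/file_util.cc` further down, assuming an `ImmutableDBOptions` obtained elsewhere; the wrapper name `PurgeObsoleteFile` is purely illustrative.

```cpp
// Sketch only: both headers are internal, so this stands in for RocksDB's own
// file-deletion call sites rather than application code.
#include <string>

#include "options/db_options.h"  // rocksdb::ImmutableDBOptions
#include "util/file_util.h"      // rocksdb::DeleteDBFile

rocksdb::Status PurgeObsoleteFile(const rocksdb::ImmutableDBOptions* db_options,
                                  const std::string& fname,
                                  const std::string& dir_to_sync) {
  // force_bg = true: always hand the file to the DeleteScheduler's background
  // thread, skipping the trash-to-DB-size ratio check; if rate limiting is
  // disabled the file is still deleted immediately.
  return rocksdb::DeleteDBFile(db_options, fname, dir_to_sync,
                               /*force_bg=*/true);
}
```

With `force_bg=false` (the default) the behavior matches the existing `DeleteSSTFile` path, which the patch reimplements as a thin wrapper around `DeleteDBFile`.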
diff --git a/util/fault_injection_test_env.cc b/util/fault_injection_test_env.cc index 3b3dbbe99bd..64e9da1aac6 100644 --- a/util/fault_injection_test_env.cc +++ b/util/fault_injection_test_env.cc @@ -29,12 +29,12 @@ std::string GetDirName(const std::string filename) { // A basic file truncation function suitable for this test. Status Truncate(Env* env, const std::string& filename, uint64_t length) { - unique_ptr orig_file; + std::unique_ptr orig_file; const EnvOptions options; Status s = env->NewSequentialFile(filename, &orig_file, options); if (!s.ok()) { - fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(), - s.ToString().c_str()); + fprintf(stderr, "Cannot open file %s for truncation: %s\n", + filename.c_str(), s.ToString().c_str()); return s; } @@ -46,7 +46,7 @@ Status Truncate(Env* env, const std::string& filename, uint64_t length) { #endif if (s.ok()) { std::string tmp_name = GetDirName(filename) + "/truncate.tmp"; - unique_ptr tmp_file; + std::unique_ptr tmp_file; s = env->NewWritableFile(tmp_name, &tmp_file, options); if (s.ok()) { s = tmp_file->Append(result); @@ -103,7 +103,7 @@ Status TestDirectory::Fsync() { } TestWritableFile::TestWritableFile(const std::string& fname, - unique_ptr&& f, + std::unique_ptr&& f, FaultInjectionTestEnv* env) : state_(fname), target_(std::move(f)), @@ -157,8 +157,8 @@ Status TestWritableFile::Sync() { } Status FaultInjectionTestEnv::NewDirectory(const std::string& name, - unique_ptr* result) { - unique_ptr r; + std::unique_ptr* result) { + std::unique_ptr r; Status s = target()->NewDirectory(name, &r); assert(s.ok()); if (!s.ok()) { @@ -168,9 +168,9 @@ Status FaultInjectionTestEnv::NewDirectory(const std::string& name, return Status::OK(); } -Status FaultInjectionTestEnv::NewWritableFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& soptions) { +Status FaultInjectionTestEnv::NewWritableFile( + const std::string& fname, std::unique_ptr* result, + const EnvOptions& soptions) { if (!IsFilesystemActive()) { return GetError(); } @@ -197,6 +197,27 @@ Status FaultInjectionTestEnv::NewWritableFile(const std::string& fname, return s; } +Status FaultInjectionTestEnv::ReopenWritableFile( + const std::string& fname, std::unique_ptr* result, + const EnvOptions& soptions) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status s = target()->ReopenWritableFile(fname, result, soptions); + if (s.ok()) { + result->reset(new TestWritableFile(fname, std::move(*result), this)); + // WritableFileWriter* file is opened + // again then it will be truncated - so forget our saved state. 
+ UntrackFile(fname); + MutexLock l(&mutex_); + open_files_.insert(fname); + auto dir_and_name = GetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list.insert(dir_and_name.second); + } + return s; +} + Status FaultInjectionTestEnv::NewRandomAccessFile( const std::string& fname, std::unique_ptr* result, const EnvOptions& soptions) { diff --git a/util/fault_injection_test_env.h b/util/fault_injection_test_env.h index 563986e29ec..d3775d3a3fe 100644 --- a/util/fault_injection_test_env.h +++ b/util/fault_injection_test_env.h @@ -56,7 +56,7 @@ struct FileState { class TestWritableFile : public WritableFile { public: explicit TestWritableFile(const std::string& fname, - unique_ptr&& f, + std::unique_ptr&& f, FaultInjectionTestEnv* env); virtual ~TestWritableFile(); virtual Status Append(const Slice& data) override; @@ -77,7 +77,7 @@ class TestWritableFile : public WritableFile { private: FileState state_; - unique_ptr target_; + std::unique_ptr target_; bool writable_file_opened_; FaultInjectionTestEnv* env_; }; @@ -94,7 +94,7 @@ class TestDirectory : public Directory { private: FaultInjectionTestEnv* env_; std::string dirname_; - unique_ptr dir_; + std::unique_ptr dir_; }; class FaultInjectionTestEnv : public EnvWrapper { @@ -104,12 +104,16 @@ class FaultInjectionTestEnv : public EnvWrapper { virtual ~FaultInjectionTestEnv() {} Status NewDirectory(const std::string& name, - unique_ptr* result) override; + std::unique_ptr* result) override; Status NewWritableFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& soptions) override; + Status ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& soptions) override; + Status NewRandomAccessFile(const std::string& fname, std::unique_ptr* result, const EnvOptions& soptions) override; diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index cd09f712255..9e40d4d4082 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -98,8 +98,21 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, allowed = read_size; } Slice tmp; + + FileOperationInfo::TimePoint start_ts; + uint64_t orig_offset = 0; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + orig_offset = aligned_offset + buf.CurrentSize(); + } s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, &tmp, buf.Destination()); + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts, + s); + } + buf.Size(buf.CurrentSize() + tmp.size()); if (!s.ok() || tmp.size() < allowed) { break; @@ -131,7 +144,22 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, allowed = n; } Slice tmp_result; + +#ifndef ROCKSDB_LITE + FileOperationInfo::TimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + } +#endif s = file_->Read(offset + pos, allowed, &tmp_result, scratch + pos); +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts, + finish_ts, s); + } +#endif + if (res_scratch == nullptr) { // we can't simply use `scratch` because reads of mmap'd files return // data in a different buffer. 
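The instrumentation above stamps each `RandomAccessFile::Read` (and, in the following hunks, each buffered and direct write) with `FileOperationInfo` start/finish timestamps and, when listeners are registered, forwards them to those `EventListener`s. Here is a rough sketch of such a listener, assuming the `OnFileReadFinish`/`OnFileWriteFinish`/`ShouldBeNotifiedOnFileIO` hooks this patch relies on; the class name `FileIoCounter` is hypothetical.

```cpp
// Sketch only: counts bytes moved by successful reads and writes. Only
// listeners that return true from ShouldBeNotifiedOnFileIO() are retained by
// the RandomAccessFileReader / WritableFileWriter constructors.
#include <atomic>
#include <cstdint>

#include "rocksdb/listener.h"

class FileIoCounter : public rocksdb::EventListener {
 public:
  void OnFileReadFinish(const rocksdb::FileOperationInfo& info) override {
    if (info.status.ok()) {
      read_bytes_.fetch_add(info.length, std::memory_order_relaxed);
    }
  }

  void OnFileWriteFinish(const rocksdb::FileOperationInfo& info) override {
    if (info.status.ok()) {
      written_bytes_.fetch_add(info.length, std::memory_order_relaxed);
    }
  }

  bool ShouldBeNotifiedOnFileIO() override { return true; }

  uint64_t read_bytes() const { return read_bytes_.load(); }
  uint64_t written_bytes() const { return written_bytes_.load(); }

 private:
  std::atomic<uint64_t> read_bytes_{0};
  std::atomic<uint64_t> written_bytes_{0};
};
```

An instance of such a listener would be supplied through the new `listeners` vectors accepted by the extended reader and writer constructors shown in the `file_reader_writer.h` hunks below.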
@@ -414,7 +442,22 @@ Status WritableFileWriter::WriteBuffered(const char* data, size_t size) { { IOSTATS_TIMER_GUARD(write_nanos); TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + +#ifndef ROCKSDB_LITE + FileOperationInfo::TimePoint start_ts; + uint64_t old_size = writable_file_->GetFileSize(); + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + old_size = next_write_offset_; + } +#endif s = writable_file_->Append(Slice(src, allowed)); +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s); + } +#endif if (!s.ok()) { return s; } @@ -477,8 +520,16 @@ Status WritableFileWriter::WriteDirect() { { IOSTATS_TIMER_GUARD(write_nanos); TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + FileOperationInfo::TimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + } // direct writes must be positional s = writable_file_->PositionedAppend(Slice(src, size), write_offset); + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s); + } if (!s.ok()) { buf_.Size(file_advance + leftover_tail); return s; @@ -753,7 +804,7 @@ std::unique_ptr NewReadaheadRandomAccessFile( } Status NewWritableFile(Env* env, const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) { Status s = env->NewWritableFile(fname, result, options); TEST_KILL_RANDOM("NewWritableFile:0", rocksdb_kill_odds * REDUCE_ODDS2); diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index a2c90f2b330..1083c685cb7 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -12,6 +12,7 @@ #include #include "port/port.h" #include "rocksdb/env.h" +#include "rocksdb/listener.h" #include "rocksdb/rate_limiter.h" #include "util/aligned_buffer.h" #include "util/sync_point.h" @@ -62,6 +63,24 @@ class SequentialFileReader { class RandomAccessFileReader { private: +#ifndef ROCKSDB_LITE + void NotifyOnFileReadFinish(uint64_t offset, size_t length, + const FileOperationInfo::TimePoint& start_ts, + const FileOperationInfo::TimePoint& finish_ts, + const Status& status) const { + FileOperationInfo info(file_name_, start_ts, finish_ts); + info.offset = offset; + info.length = length; + info.status = status; + + for (auto& listener : listeners_) { + listener->OnFileReadFinish(info); + } + } +#endif // ROCKSDB_LITE + + bool ShouldNotifyListeners() const { return !listeners_.empty(); } + std::unique_ptr file_; std::string file_name_; Env* env_; @@ -70,16 +89,15 @@ class RandomAccessFileReader { HistogramImpl* file_read_hist_; RateLimiter* rate_limiter_; bool for_compaction_; + std::vector> listeners_; public: - explicit RandomAccessFileReader(std::unique_ptr&& raf, - std::string _file_name, - Env* env = nullptr, - Statistics* stats = nullptr, - uint32_t hist_type = 0, - HistogramImpl* file_read_hist = nullptr, - RateLimiter* rate_limiter = nullptr, - bool for_compaction = false) + explicit RandomAccessFileReader( + std::unique_ptr&& raf, std::string _file_name, + Env* env = nullptr, Statistics* stats = nullptr, uint32_t hist_type = 0, + HistogramImpl* file_read_hist = nullptr, + RateLimiter* rate_limiter = nullptr, bool for_compaction = false, + const std::vector>& listeners = {}) : file_(std::move(raf)), file_name_(std::move(_file_name)), env_(env), @@ -87,7 +105,19 @@ class 
RandomAccessFileReader { hist_type_(hist_type), file_read_hist_(file_read_hist), rate_limiter_(rate_limiter), - for_compaction_(for_compaction) {} + for_compaction_(for_compaction), + listeners_() { +#ifndef ROCKSDB_LITE + std::for_each(listeners.begin(), listeners.end(), + [this](const std::shared_ptr& e) { + if (e->ShouldBeNotifiedOnFileIO()) { + listeners_.emplace_back(e); + } + }); +#else // !ROCKSDB_LITE + (void)listeners; +#endif + } RandomAccessFileReader(RandomAccessFileReader&& o) ROCKSDB_NOEXCEPT { *this = std::move(o); @@ -124,6 +154,24 @@ class RandomAccessFileReader { // Use posix write to write data to a file. class WritableFileWriter { private: +#ifndef ROCKSDB_LITE + void NotifyOnFileWriteFinish(uint64_t offset, size_t length, + const FileOperationInfo::TimePoint& start_ts, + const FileOperationInfo::TimePoint& finish_ts, + const Status& status) { + FileOperationInfo info(file_name_, start_ts, finish_ts); + info.offset = offset; + info.length = length; + info.status = status; + + for (auto& listener : listeners_) { + listener->OnFileWriteFinish(info); + } + } +#endif // ROCKSDB_LITE + + bool ShouldNotifyListeners() const { return !listeners_.empty(); } + std::unique_ptr writable_file_; std::string file_name_; AlignedBuffer buf_; @@ -142,11 +190,13 @@ class WritableFileWriter { uint64_t bytes_per_sync_; RateLimiter* rate_limiter_; Statistics* stats_; + std::vector> listeners_; public: - WritableFileWriter(std::unique_ptr&& file, - const std::string& _file_name, const EnvOptions& options, - Statistics* stats = nullptr) + WritableFileWriter( + std::unique_ptr&& file, const std::string& _file_name, + const EnvOptions& options, Statistics* stats = nullptr, + const std::vector>& listeners = {}) : writable_file_(std::move(file)), file_name_(_file_name), buf_(), @@ -159,11 +209,22 @@ class WritableFileWriter { last_sync_size_(0), bytes_per_sync_(options.bytes_per_sync), rate_limiter_(options.rate_limiter), - stats_(stats) { + stats_(stats), + listeners_() { TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0", reinterpret_cast(max_buffer_size_)); buf_.Alignment(writable_file_->GetRequiredBufferAlignment()); buf_.AllocateNewBuffer(std::min((size_t)65536, max_buffer_size_)); +#ifndef ROCKSDB_LITE + std::for_each(listeners.begin(), listeners.end(), + [this](const std::shared_ptr& e) { + if (e->ShouldBeNotifiedOnFileIO()) { + listeners_.emplace_back(e); + } + }); +#else // !ROCKSDB_LITE + (void)listeners; +#endif } WritableFileWriter(const WritableFileWriter&) = delete; @@ -254,7 +315,7 @@ class FilePrefetchBuffer { }; extern Status NewWritableFile(Env* env, const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options); bool ReadOneLine(std::istringstream* iss, SequentialFile* seq_file, std::string* output, bool* has_data, Status* result); diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc index 3ca44ecc095..72dd625c1fb 100644 --- a/util/file_reader_writer_test.cc +++ b/util/file_reader_writer_test.cc @@ -71,8 +71,8 @@ TEST_F(WritableFileWriterTest, RangeSync) { EnvOptions env_options; env_options.bytes_per_sync = kMb; - unique_ptr wf(new FakeWF); - unique_ptr writer( + std::unique_ptr wf(new FakeWF); + std::unique_ptr writer( new WritableFileWriter(std::move(wf), "" /* don't care */, env_options)); Random r(301); std::unique_ptr large_buf(new char[10 * kMb]); @@ -147,14 +147,14 @@ TEST_F(WritableFileWriterTest, IncrementalBuffer) { env_options.writable_file_max_buffer_size = (attempt < kNumAttempts / 2) 
? 512 * 1024 : 700 * 1024; std::string actual; - unique_ptr wf(new FakeWF(&actual, + std::unique_ptr wf(new FakeWF(&actual, #ifndef ROCKSDB_LITE - attempt % 2 == 1, + attempt % 2 == 1, #else - false, + false, #endif - no_flush)); - unique_ptr writer(new WritableFileWriter( + no_flush)); + std::unique_ptr writer(new WritableFileWriter( std::move(wf), "" /* don't care */, env_options)); std::string target; @@ -206,9 +206,9 @@ TEST_F(WritableFileWriterTest, AppendStatusReturn) { bool use_direct_io_; bool io_error_; }; - unique_ptr wf(new FakeWF()); + std::unique_ptr wf(new FakeWF()); wf->Setuse_direct_io(true); - unique_ptr writer( + std::unique_ptr writer( new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions())); ASSERT_OK(writer->Append(std::string(2 * kMb, 'a'))); diff --git a/util/file_util.cc b/util/file_util.cc index aa2994b1e9f..3f730f3e840 100644 --- a/util/file_util.cc +++ b/util/file_util.cc @@ -19,16 +19,16 @@ Status CopyFile(Env* env, const std::string& source, const std::string& destination, uint64_t size, bool use_fsync) { const EnvOptions soptions; Status s; - unique_ptr src_reader; - unique_ptr dest_writer; + std::unique_ptr src_reader; + std::unique_ptr dest_writer; { - unique_ptr srcfile; + std::unique_ptr srcfile; s = env->NewSequentialFile(source, &srcfile, soptions); if (!s.ok()) { return s; } - unique_ptr destfile; + std::unique_ptr destfile; s = env->NewWritableFile(destination, &destfile, soptions); if (!s.ok()) { return s; @@ -71,9 +71,9 @@ Status CreateFile(Env* env, const std::string& destination, const std::string& contents, bool use_fsync) { const EnvOptions soptions; Status s; - unique_ptr dest_writer; + std::unique_ptr dest_writer; - unique_ptr destfile; + std::unique_ptr destfile; s = env->NewWritableFile(destination, &destfile, soptions); if (!s.ok()) { return s; @@ -89,16 +89,23 @@ Status CreateFile(Env* env, const std::string& destination, Status DeleteSSTFile(const ImmutableDBOptions* db_options, const std::string& fname, const std::string& dir_to_sync) { + return DeleteDBFile(db_options, fname, dir_to_sync, false); +} + +Status DeleteDBFile(const ImmutableDBOptions* db_options, + const std::string& fname, const std::string& dir_to_sync, + const bool force_bg) { #ifndef ROCKSDB_LITE auto sfm = static_cast(db_options->sst_file_manager.get()); if (sfm) { - return sfm->ScheduleFileDeletion(fname, dir_to_sync); + return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg); } else { return db_options->env->DeleteFile(fname); } #else (void)dir_to_sync; + (void)force_bg; // SstFileManager is not supported in ROCKSDB_LITE return db_options->env->DeleteFile(fname); #endif diff --git a/util/file_util.h b/util/file_util.h index 5c05c9def6e..cd054518e17 100644 --- a/util/file_util.h +++ b/util/file_util.h @@ -25,4 +25,9 @@ extern Status DeleteSSTFile(const ImmutableDBOptions* db_options, const std::string& fname, const std::string& path_to_sync); +extern Status DeleteDBFile(const ImmutableDBOptions* db_options, + const std::string& fname, + const std::string& path_to_sync, + const bool force_bg); + } // namespace rocksdb diff --git a/util/heap.h b/util/heap.h index 4d5894134f2..6093c20e2bf 100644 --- a/util/heap.h +++ b/util/heap.h @@ -92,9 +92,9 @@ class BinaryHeap { reset_root_cmp_cache(); } - bool empty() const { - return data_.empty(); - } + bool empty() const { return data_.empty(); } + + size_t size() const { return data_.size(); } void reset_root_cmp_cache() { root_cmp_cache_ = port::kMaxSizet; } diff --git a/util/jemalloc_nodump_allocator.cc 
b/util/jemalloc_nodump_allocator.cc new file mode 100644 index 00000000000..cdd08e932e3 --- /dev/null +++ b/util/jemalloc_nodump_allocator.cc @@ -0,0 +1,206 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/jemalloc_nodump_allocator.h" + +#include +#include + +#include "port/likely.h" +#include "port/port.h" +#include "util/string_util.h" + +namespace rocksdb { + +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + +std::atomic JemallocNodumpAllocator::original_alloc_{nullptr}; + +JemallocNodumpAllocator::JemallocNodumpAllocator( + JemallocAllocatorOptions& options, + std::unique_ptr&& arena_hooks, unsigned arena_index) + : options_(options), + arena_hooks_(std::move(arena_hooks)), + arena_index_(arena_index), + tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache) {} + +int JemallocNodumpAllocator::GetThreadSpecificCache(size_t size) { + // We always enable tcache. The only corner case is when there are a ton of + // threads accessing with low frequency, then it could consume a lot of + // memory (may reach # threads * ~1MB) without bringing too much benefit. + if (options_.limit_tcache_size && (size <= options_.tcache_size_lower_bound || + size > options_.tcache_size_upper_bound)) { + return MALLOCX_TCACHE_NONE; + } + unsigned* tcache_index = reinterpret_cast(tcache_.Get()); + if (UNLIKELY(tcache_index == nullptr)) { + // Instantiate tcache. + tcache_index = new unsigned(0); + size_t tcache_index_size = sizeof(unsigned); + int ret = + mallctl("tcache.create", tcache_index, &tcache_index_size, nullptr, 0); + if (ret != 0) { + // No good way to expose the error. Silently disable tcache. + delete tcache_index; + return MALLOCX_TCACHE_NONE; + } + tcache_.Reset(static_cast(tcache_index)); + } + return MALLOCX_TCACHE(*tcache_index); +} + +void* JemallocNodumpAllocator::Allocate(size_t size) { + int tcache_flag = GetThreadSpecificCache(size); + return mallocx(size, MALLOCX_ARENA(arena_index_) | tcache_flag); +} + +void JemallocNodumpAllocator::Deallocate(void* p) { + // Obtain tcache. + size_t size = 0; + if (options_.limit_tcache_size) { + size = malloc_usable_size(p); + } + int tcache_flag = GetThreadSpecificCache(size); + // No need to pass arena index to dallocx(). Jemalloc will find arena index + // from its own metadata. + dallocx(p, tcache_flag); +} + +void* JemallocNodumpAllocator::Alloc(extent_hooks_t* extent, void* new_addr, + size_t size, size_t alignment, bool* zero, + bool* commit, unsigned arena_ind) { + extent_alloc_t* original_alloc = + original_alloc_.load(std::memory_order_relaxed); + assert(original_alloc != nullptr); + void* result = original_alloc(extent, new_addr, size, alignment, zero, commit, + arena_ind); + if (result != nullptr) { + int ret = madvise(result, size, MADV_DONTDUMP); + if (ret != 0) { + fprintf( + stderr, + "JemallocNodumpAllocator failed to set MADV_DONTDUMP, error code: %d", + ret); + assert(false); + } + } + return result; +} + +Status JemallocNodumpAllocator::DestroyArena(unsigned arena_index) { + assert(arena_index != 0); + std::string key = "arena." 
+ ToString(arena_index) + ".destroy"; + int ret = mallctl(key.c_str(), nullptr, 0, nullptr, 0); + if (ret != 0) { + return Status::Incomplete("Failed to destroy jemalloc arena, error code: " + + ToString(ret)); + } + return Status::OK(); +} + +void JemallocNodumpAllocator::DestroyThreadSpecificCache(void* ptr) { + assert(ptr != nullptr); + unsigned* tcache_index = static_cast(ptr); + size_t tcache_index_size = sizeof(unsigned); + int ret __attribute__((__unused__)) = + mallctl("tcache.destroy", nullptr, 0, tcache_index, tcache_index_size); + // Silently ignore error. + assert(ret == 0); + delete tcache_index; +} + +JemallocNodumpAllocator::~JemallocNodumpAllocator() { + // Destroy tcache before destroying arena. + autovector tcache_list; + tcache_.Scrape(&tcache_list, nullptr); + for (void* tcache_index : tcache_list) { + DestroyThreadSpecificCache(tcache_index); + } + // Destroy arena. Silently ignore error. + Status s __attribute__((__unused__)) = DestroyArena(arena_index_); + assert(s.ok()); +} + +size_t JemallocNodumpAllocator::UsableSize(void* p, + size_t /*allocation_size*/) const { + return malloc_usable_size(static_cast(p)); +} +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + +Status NewJemallocNodumpAllocator( + JemallocAllocatorOptions& options, + std::shared_ptr* memory_allocator) { + *memory_allocator = nullptr; + Status unsupported = Status::NotSupported( + "JemallocNodumpAllocator only available with jemalloc version >= 5 " + "and MADV_DONTDUMP is available."); +#ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + (void)options; + return unsupported; +#else + if (!HasJemalloc()) { + return unsupported; + } + if (memory_allocator == nullptr) { + return Status::InvalidArgument("memory_allocator must be non-null."); + } + if (options.limit_tcache_size && + options.tcache_size_lower_bound >= options.tcache_size_upper_bound) { + return Status::InvalidArgument( + "tcache_size_lower_bound larger or equal to tcache_size_upper_bound."); + } + + // Create arena. + unsigned arena_index = 0; + size_t arena_index_size = sizeof(arena_index); + int ret = + mallctl("arenas.create", &arena_index, &arena_index_size, nullptr, 0); + if (ret != 0) { + return Status::Incomplete("Failed to create jemalloc arena, error code: " + + ToString(ret)); + } + assert(arena_index != 0); + + // Read existing hooks. + std::string key = "arena." + ToString(arena_index) + ".extent_hooks"; + extent_hooks_t* hooks; + size_t hooks_size = sizeof(hooks); + ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0); + if (ret != 0) { + JemallocNodumpAllocator::DestroyArena(arena_index); + return Status::Incomplete("Failed to read existing hooks, error code: " + + ToString(ret)); + } + + // Store existing alloc. + extent_alloc_t* original_alloc = hooks->alloc; + extent_alloc_t* expected = nullptr; + bool success = + JemallocNodumpAllocator::original_alloc_.compare_exchange_strong( + expected, original_alloc); + if (!success && original_alloc != expected) { + JemallocNodumpAllocator::DestroyArena(arena_index); + return Status::Incomplete("Original alloc conflict."); + } + + // Set the custom hook. + std::unique_ptr new_hooks(new extent_hooks_t(*hooks)); + new_hooks->alloc = &JemallocNodumpAllocator::Alloc; + extent_hooks_t* hooks_ptr = new_hooks.get(); + ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr)); + if (ret != 0) { + JemallocNodumpAllocator::DestroyArena(arena_index); + return Status::Incomplete("Failed to set custom hook, error code: " + + ToString(ret)); + } + + // Create cache allocator. 
+ memory_allocator->reset( + new JemallocNodumpAllocator(options, std::move(new_hooks), arena_index)); + return Status::OK(); +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR +} + +} // namespace rocksdb diff --git a/util/jemalloc_nodump_allocator.h b/util/jemalloc_nodump_allocator.h new file mode 100644 index 00000000000..e93c1223778 --- /dev/null +++ b/util/jemalloc_nodump_allocator.h @@ -0,0 +1,79 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "port/jemalloc_helper.h" +#include "port/port.h" +#include "rocksdb/memory_allocator.h" +#include "util/core_local.h" +#include "util/thread_local.h" + +#if defined(ROCKSDB_JEMALLOC) && defined(ROCKSDB_PLATFORM_POSIX) + +#include + +#if (JEMALLOC_VERSION_MAJOR >= 5) && defined(MADV_DONTDUMP) +#define ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + +namespace rocksdb { + +class JemallocNodumpAllocator : public MemoryAllocator { + public: + JemallocNodumpAllocator(JemallocAllocatorOptions& options, + std::unique_ptr&& arena_hooks, + unsigned arena_index); + ~JemallocNodumpAllocator(); + + const char* Name() const override { return "JemallocNodumpAllocator"; } + void* Allocate(size_t size) override; + void Deallocate(void* p) override; + size_t UsableSize(void* p, size_t allocation_size) const override; + + private: + friend Status NewJemallocNodumpAllocator( + JemallocAllocatorOptions& options, + std::shared_ptr* memory_allocator); + + // Custom alloc hook to replace jemalloc default alloc. + static void* Alloc(extent_hooks_t* extent, void* new_addr, size_t size, + size_t alignment, bool* zero, bool* commit, + unsigned arena_ind); + + // Destroy arena on destruction of the allocator, or on failure. + static Status DestroyArena(unsigned arena_index); + + // Destroy tcache on destruction of the allocator, or thread exit. + static void DestroyThreadSpecificCache(void* ptr); + + // Get or create tcache. Return flag suitable to use with `mallocx`: + // either MALLOCX_TCACHE_NONE or MALLOCX_TCACHE(tc). + int GetThreadSpecificCache(size_t size); + + // A function pointer to jemalloc default alloc. Use atomic to make sure + // NewJemallocNodumpAllocator is thread-safe. + // + // Hack: original_alloc_ needs to be static for Alloc() to access it. + // alloc needs to be static to pass to jemalloc as function pointer. + static std::atomic original_alloc_; + + const JemallocAllocatorOptions options_; + + // Custom hooks has to outlive corresponding arena. + const std::unique_ptr arena_hooks_; + + // Arena index. + const unsigned arena_index_; + + // Hold thread-local tcache index. 
+ ThreadLocalPtr tcache_; +}; + +} // namespace rocksdb +#endif // (JEMALLOC_VERSION_MAJOR >= 5) && MADV_DONTDUMP +#endif // ROCKSDB_JEMALLOC && ROCKSDB_PLATFORM_POSIX diff --git a/util/log_write_bench.cc b/util/log_write_bench.cc index b4e12b948c5..5c9b3e84bf4 100644 --- a/util/log_write_bench.cc +++ b/util/log_write_bench.cc @@ -35,9 +35,9 @@ void RunBenchmark() { Env* env = Env::Default(); EnvOptions env_options = env->OptimizeForLogWrite(EnvOptions()); env_options.bytes_per_sync = FLAGS_bytes_per_sync; - unique_ptr file; + std::unique_ptr file; env->NewWritableFile(file_name, &file, env_options); - unique_ptr writer; + std::unique_ptr writer; writer.reset(new WritableFileWriter(std::move(file), env_options)); std::string record; diff --git a/util/logging.h b/util/logging.h index 992e0018d7c..f605d36a5ac 100644 --- a/util/logging.h +++ b/util/logging.h @@ -11,40 +11,47 @@ // with macros. #pragma once -#include "port/port.h" // Helper macros that include information about file name and line number -#define STRINGIFY(x) #x -#define TOSTRING(x) STRINGIFY(x) -#define PREPEND_FILE_LINE(FMT) ("[" __FILE__ ":" TOSTRING(__LINE__) "] " FMT) +#define ROCKS_LOG_STRINGIFY(x) #x +#define ROCKS_LOG_TOSTRING(x) ROCKS_LOG_STRINGIFY(x) +#define ROCKS_LOG_PREPEND_FILE_LINE(FMT) ("[%s:" ROCKS_LOG_TOSTRING(__LINE__) "] " FMT) + +inline const char* RocksLogShorterFileName(const char* file) +{ + // 15 is the length of "util/logging.h". + // If the name of this file changed, please change this number, too. + return file + (sizeof(__FILE__) > 15 ? sizeof(__FILE__) - 15 : 0); +} // Don't inclide file/line info in HEADER level -#define ROCKS_LOG_HEADER(LGR, FMT, ...) \ +#define ROCKS_LOG_HEADER(LGR, FMT, ...) \ rocksdb::Log(InfoLogLevel::HEADER_LEVEL, LGR, FMT, ##__VA_ARGS__) -#define ROCKS_LOG_DEBUG(LGR, FMT, ...) \ - rocksdb::Log(InfoLogLevel::DEBUG_LEVEL, LGR, PREPEND_FILE_LINE(FMT), \ - ##__VA_ARGS__) +#define ROCKS_LOG_DEBUG(LGR, FMT, ...) \ + rocksdb::Log(InfoLogLevel::DEBUG_LEVEL, LGR, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \ + RocksLogShorterFileName(__FILE__), ##__VA_ARGS__) -#define ROCKS_LOG_INFO(LGR, FMT, ...) \ - rocksdb::Log(InfoLogLevel::INFO_LEVEL, LGR, PREPEND_FILE_LINE(FMT), \ - ##__VA_ARGS__) +#define ROCKS_LOG_INFO(LGR, FMT, ...) \ + rocksdb::Log(InfoLogLevel::INFO_LEVEL, LGR, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \ + RocksLogShorterFileName(__FILE__), ##__VA_ARGS__) -#define ROCKS_LOG_WARN(LGR, FMT, ...) \ - rocksdb::Log(InfoLogLevel::WARN_LEVEL, LGR, PREPEND_FILE_LINE(FMT), \ - ##__VA_ARGS__) +#define ROCKS_LOG_WARN(LGR, FMT, ...) \ + rocksdb::Log(InfoLogLevel::WARN_LEVEL, LGR, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \ + RocksLogShorterFileName(__FILE__), ##__VA_ARGS__) -#define ROCKS_LOG_ERROR(LGR, FMT, ...) \ - rocksdb::Log(InfoLogLevel::ERROR_LEVEL, LGR, PREPEND_FILE_LINE(FMT), \ - ##__VA_ARGS__) +#define ROCKS_LOG_ERROR(LGR, FMT, ...) \ + rocksdb::Log(InfoLogLevel::ERROR_LEVEL, LGR, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \ + RocksLogShorterFileName(__FILE__), ##__VA_ARGS__) -#define ROCKS_LOG_FATAL(LGR, FMT, ...) \ - rocksdb::Log(InfoLogLevel::FATAL_LEVEL, LGR, PREPEND_FILE_LINE(FMT), \ - ##__VA_ARGS__) +#define ROCKS_LOG_FATAL(LGR, FMT, ...) \ + rocksdb::Log(InfoLogLevel::FATAL_LEVEL, LGR, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \ + RocksLogShorterFileName(__FILE__), ##__VA_ARGS__) -#define ROCKS_LOG_BUFFER(LOG_BUF, FMT, ...) \ - rocksdb::LogToBuffer(LOG_BUF, PREPEND_FILE_LINE(FMT), ##__VA_ARGS__) +#define ROCKS_LOG_BUFFER(LOG_BUF, FMT, ...) 
\ + rocksdb::LogToBuffer(LOG_BUF, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \ + RocksLogShorterFileName(__FILE__), ##__VA_ARGS__) -#define ROCKS_LOG_BUFFER_MAX_SZ(LOG_BUF, MAX_LOG_SIZE, FMT, ...) \ - rocksdb::LogToBuffer(LOG_BUF, MAX_LOG_SIZE, PREPEND_FILE_LINE(FMT), \ - ##__VA_ARGS__) +#define ROCKS_LOG_BUFFER_MAX_SZ(LOG_BUF, MAX_LOG_SIZE, FMT, ...) \ + rocksdb::LogToBuffer(LOG_BUF, MAX_LOG_SIZE, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \ + RocksLogShorterFileName(__FILE__), ##__VA_ARGS__) diff --git a/util/memory_allocator.h b/util/memory_allocator.h new file mode 100644 index 00000000000..99a7241d0a9 --- /dev/null +++ b/util/memory_allocator.h @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include "rocksdb/memory_allocator.h" + +namespace rocksdb { + +struct CustomDeleter { + CustomDeleter(MemoryAllocator* a = nullptr) : allocator(a) {} + + void operator()(char* ptr) const { + if (allocator) { + allocator->Deallocate(reinterpret_cast(ptr)); + } else { + delete[] ptr; + } + } + + MemoryAllocator* allocator; +}; + +using CacheAllocationPtr = std::unique_ptr; + +inline CacheAllocationPtr AllocateBlock(size_t size, + MemoryAllocator* allocator) { + if (allocator) { + auto block = reinterpret_cast(allocator->Allocate(size)); + return CacheAllocationPtr(block, allocator); + } + return CacheAllocationPtr(new char[size]); +} + +} // namespace rocksdb diff --git a/util/mock_time_env.h b/util/mock_time_env.h new file mode 100644 index 00000000000..c6ab8a7483d --- /dev/null +++ b/util/mock_time_env.h @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/env.h" + +namespace rocksdb { + +class MockTimeEnv : public EnvWrapper { + public: + explicit MockTimeEnv(Env* base) : EnvWrapper(base) {} + + virtual Status GetCurrentTime(int64_t* time) override { + assert(time != nullptr); + assert(current_time_ <= + static_cast(std::numeric_limits::max())); + *time = static_cast(current_time_); + return Status::OK(); + } + + virtual uint64_t NowMicros() override { + assert(current_time_ <= std::numeric_limits::max() / 1000000); + return current_time_ * 1000000; + } + + virtual uint64_t NowNanos() override { + assert(current_time_ <= std::numeric_limits::max() / 1000000000); + return current_time_ * 1000000000; + } + + void set_current_time(uint64_t time) { + assert(time >= current_time_); + current_time_ = time; + } + + private: + std::atomic current_time_{0}; +}; + +} // namespace rocksdb diff --git a/util/repeatable_thread.h b/util/repeatable_thread.h index 34164ca562b..3506234f9e9 100644 --- a/util/repeatable_thread.h +++ b/util/repeatable_thread.h @@ -10,6 +10,7 @@ #include "port/port.h" #include "rocksdb/env.h" +#include "util/mock_time_env.h" #include "util/mutexlock.h" namespace rocksdb { @@ -80,7 +81,17 @@ class RepeatableThread { cond_var_.SignalAll(); #endif while (running_) { +#ifndef NDEBUG + if (dynamic_cast(env_) != nullptr) { + // MockTimeEnv is used. Since it is not easy to mock TimedWait, + // we wait without timeout to wait for TEST_WaitForRun to wake us up. 
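The MockTimeEnv added above is exactly what the NDEBUG branch below detects. A minimal sketch of how a test drives it, using only the members shown in mock_time_env.h:

#include <cassert>
#include <cstdint>

#include "rocksdb/env.h"
#include "util/mock_time_env.h"

// Freeze the clock at an explicit value and observe it through the Env API.
void MockTimeEnvExample() {
  rocksdb::MockTimeEnv mock_env(rocksdb::Env::Default());
  mock_env.set_current_time(100);  // seconds; the mock clock only moves forward
  assert(mock_env.NowMicros() == 100 * 1000000ULL);
  int64_t now = 0;
  rocksdb::Status s = mock_env.GetCurrentTime(&now);
  assert(s.ok() && now == 100);
  (void)s;
  (void)now;
}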
+ cond_var_.Wait(); + } else { + cond_var_.TimedWait(wait_until); + } +#else cond_var_.TimedWait(wait_until); +#endif if (env_->NowMicros() >= wait_until) { break; } diff --git a/util/slice_transform_test.cc b/util/slice_transform_test.cc index ddbb9f4bfac..2eb56af6d6c 100644 --- a/util/slice_transform_test.cc +++ b/util/slice_transform_test.cc @@ -24,7 +24,7 @@ TEST_F(SliceTransformTest, CapPrefixTransform) { std::string s; s = "abcdefge"; - unique_ptr transform; + std::unique_ptr transform; transform.reset(NewCappedPrefixTransform(6)); ASSERT_EQ(transform->Transform(s).ToString(), "abcdef"); @@ -115,7 +115,7 @@ TEST_F(SliceTransformDBTest, CapPrefix) { ASSERT_OK(db()->Put(wo, "foo3", "bar3")); ASSERT_OK(db()->Flush(fo)); - unique_ptr iter(db()->NewIterator(ro)); + std::unique_ptr iter(db()->NewIterator(ro)); iter->Seek("foo"); ASSERT_OK(iter->status()); diff --git a/util/sst_file_manager_impl.cc b/util/sst_file_manager_impl.cc index ee1394bc91e..733cd9cf609 100644 --- a/util/sst_file_manager_impl.cc +++ b/util/sst_file_manager_impl.cc @@ -402,8 +402,11 @@ bool SstFileManagerImpl::CancelErrorRecovery(ErrorHandler* handler) { } Status SstFileManagerImpl::ScheduleFileDeletion( - const std::string& file_path, const std::string& path_to_sync) { - return delete_scheduler_.DeleteFile(file_path, path_to_sync); + const std::string& file_path, const std::string& path_to_sync, + const bool force_bg) { + TEST_SYNC_POINT("SstFileManagerImpl::ScheduleFileDeletion"); + return delete_scheduler_.DeleteFile(file_path, path_to_sync, + force_bg); } void SstFileManagerImpl::WaitForEmptyTrash() { diff --git a/util/sst_file_manager_impl.h b/util/sst_file_manager_impl.h index d11035df80c..211b4fa7160 100644 --- a/util/sst_file_manager_impl.h +++ b/util/sst_file_manager_impl.h @@ -111,9 +111,12 @@ class SstFileManagerImpl : public SstFileManager { // not guaranteed bool CancelErrorRecovery(ErrorHandler* db); - // Mark file as trash and schedule it's deletion. + // Mark file as trash and schedule it's deletion. If force_bg is set, it + // forces the file to be deleting in the background regardless of DB size, + // except when rate limited delete is disabled virtual Status ScheduleFileDeletion(const std::string& file_path, - const std::string& dir_to_sync); + const std::string& dir_to_sync, + const bool force_bg = false); // Wait for all files being deleteing in the background to finish or for // destructor to be called. 
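The force_bg flag documented above only changes how eagerly a trashed file is removed. A hypothetical call site (every name other than ScheduleFileDeletion is an assumption for illustration):

#include <cassert>
#include <string>

#include "util/sst_file_manager_impl.h"

// Ask the SstFileManager to trash a file in the background even when the DB
// is small enough that it would normally be deleted inline. If rate-limited
// deletion is disabled, the flag has no effect and the file is removed
// immediately.
void TrashObsoleteFile(rocksdb::SstFileManagerImpl* sfm,
                       const std::string& file_path,
                       const std::string& dir_to_sync) {
  rocksdb::Status s =
      sfm->ScheduleFileDeletion(file_path, dir_to_sync, /*force_bg=*/true);
  assert(s.ok());
  (void)s;
}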
diff --git a/util/sync_point.cc b/util/sync_point.cc index ce0fa0a9727..4599c256d9f 100644 --- a/util/sync_point.cc +++ b/util/sync_point.cc @@ -17,9 +17,7 @@ SyncPoint* SyncPoint::GetInstance() { return &sync_point; } -SyncPoint::SyncPoint() : - impl_(new Data) { -} +SyncPoint::SyncPoint() : impl_(new Data) {} SyncPoint:: ~SyncPoint() { delete impl_; diff --git a/util/testutil.cc b/util/testutil.cc index 0983f759ce9..2f8e31cd571 100644 --- a/util/testutil.cc +++ b/util/testutil.cc @@ -126,19 +126,19 @@ const Comparator* Uint64Comparator() { WritableFileWriter* GetWritableFileWriter(WritableFile* wf, const std::string& fname) { - unique_ptr file(wf); + std::unique_ptr file(wf); return new WritableFileWriter(std::move(file), fname, EnvOptions()); } RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf) { - unique_ptr file(raf); + std::unique_ptr file(raf); return new RandomAccessFileReader(std::move(file), "[test RandomAccessFileReader]"); } SequentialFileReader* GetSequentialFileReader(SequentialFile* se, const std::string& fname) { - unique_ptr file(se); + std::unique_ptr file(se); return new SequentialFileReader(std::move(file), fname); } @@ -401,5 +401,21 @@ Status DestroyDir(Env* env, const std::string& dir) { return s; } +bool IsDirectIOSupported(Env* env, const std::string& dir) { + EnvOptions env_options; + env_options.use_mmap_writes = false; + env_options.use_direct_writes = true; + std::string tmp = TempFileName(dir, 999); + Status s; + { + std::unique_ptr file; + s = env->NewWritableFile(tmp, &file, env_options); + } + if (s.ok()) { + s = env->DeleteFile(tmp); + } + return s.ok(); +} + } // namespace test } // namespace rocksdb diff --git a/util/testutil.h b/util/testutil.h index c16c0cbe503..2aab3df72c4 100644 --- a/util/testutil.h +++ b/util/testutil.h @@ -64,7 +64,7 @@ class ErrorEnv : public EnvWrapper { num_writable_file_errors_(0) { } virtual Status NewWritableFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& soptions) override { result->reset(); if (writable_file_error_) { @@ -554,7 +554,7 @@ class StringEnv : public EnvWrapper { const Status WriteToNewFile(const std::string& file_name, const std::string& content) { - unique_ptr r; + std::unique_ptr r; auto s = NewWritableFile(file_name, &r, EnvOptions()); if (!s.ok()) { return s; @@ -567,7 +567,8 @@ class StringEnv : public EnvWrapper { } // The following text is boilerplate that forwards all methods to target() - Status NewSequentialFile(const std::string& f, unique_ptr* r, + Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, const EnvOptions& /*options*/) override { auto iter = files_.find(f); if (iter == files_.end()) { @@ -577,11 +578,11 @@ class StringEnv : public EnvWrapper { return Status::OK(); } Status NewRandomAccessFile(const std::string& /*f*/, - unique_ptr* /*r*/, + std::unique_ptr* /*r*/, const EnvOptions& /*options*/) override { return Status::NotSupported(); } - Status NewWritableFile(const std::string& f, unique_ptr* r, + Status NewWritableFile(const std::string& f, std::unique_ptr* r, const EnvOptions& /*options*/) override { auto iter = files_.find(f); if (iter != files_.end()) { @@ -591,7 +592,7 @@ class StringEnv : public EnvWrapper { return Status::OK(); } virtual Status NewDirectory(const std::string& /*name*/, - unique_ptr* /*result*/) override { + std::unique_ptr* /*result*/) override { return Status::NotSupported(); } Status FileExists(const std::string& f) override { @@ -747,5 +748,7 @@ std::string 
RandomName(Random* rnd, const size_t len); Status DestroyDir(Env* env, const std::string& dir); +bool IsDirectIOSupported(Env* env, const std::string& dir); + } // namespace test } // namespace rocksdb diff --git a/util/thread_local.cc b/util/thread_local.cc index dea2002a021..7346eff11e8 100644 --- a/util/thread_local.cc +++ b/util/thread_local.cc @@ -204,7 +204,7 @@ extern "C" { // The linker must not discard thread_callback_on_exit. (We force a reference // to this variable with a linker /include:symbol pragma to ensure that.) If // this variable is discarded, the OnThreadExit function will never be called. -#ifdef _WIN64 +#ifndef _X86_ // .CRT section is merged with .rdata on x64 so it must be constant data. #pragma const_seg(".CRT$XLB") @@ -219,7 +219,7 @@ const PIMAGE_TLS_CALLBACK p_thread_callback_on_exit = #pragma comment(linker, "/include:_tls_used") #pragma comment(linker, "/include:p_thread_callback_on_exit") -#else // _WIN64 +#else // _X86_ #pragma data_seg(".CRT$XLB") PIMAGE_TLS_CALLBACK p_thread_callback_on_exit = wintlscleanup::WinOnThreadExit; @@ -229,7 +229,7 @@ PIMAGE_TLS_CALLBACK p_thread_callback_on_exit = wintlscleanup::WinOnThreadExit; #pragma comment(linker, "/INCLUDE:__tls_used") #pragma comment(linker, "/INCLUDE:_p_thread_callback_on_exit") -#endif // _WIN64 +#endif // _X86_ #else // https://github.com/couchbase/gperftools/blob/master/src/windows/port.cc diff --git a/util/thread_operation.h b/util/thread_operation.h index 025392b59de..f1827da0a0c 100644 --- a/util/thread_operation.h +++ b/util/thread_operation.h @@ -70,7 +70,7 @@ static OperationStageInfo global_op_stage_table[] = { {ThreadStatus::STAGE_MEMTABLE_ROLLBACK, "MemTableList::RollbackMemtableFlush"}, {ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS, - "MemTableList::InstallMemtableFlushResults"}, + "MemTableList::TryInstallMemtableFlushResults"}, }; // The structure that describes a state. diff --git a/util/threadpool_imp.cc b/util/threadpool_imp.cc index d850b7c9e9f..b431830ee6d 100644 --- a/util/threadpool_imp.cc +++ b/util/threadpool_imp.cc @@ -188,7 +188,7 @@ void ThreadPoolImpl::Impl::BGThread(size_t thread_id) { bool low_cpu_priority = false; while (true) { -// Wait until there is an item that is ready to run + // Wait until there is an item that is ready to run std::unique_lock lock(mu_); // Stop waiting if the thread needs to do work or needs to terminate. 
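The IsDirectIOSupported() helper declared above (and defined in testutil.cc earlier in this diff) exists so tests can skip direct-I/O cases on filesystems such as tmpfs that reject O_DIRECT. A small sketch of the intended pattern; the wrapper name is an assumption:

#include <string>

#include "rocksdb/env.h"
#include "util/testutil.h"

// Returns true when a direct-I/O test case should run under `dir`. The helper
// simply tries to create a throwaway file with use_direct_writes = true and
// reports whether the open succeeded.
bool ShouldRunDirectIOCase(rocksdb::Env* env, const std::string& dir) {
  return rocksdb::test::IsDirectIOSupported(env, dir);
}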
while (!exit_all_threads_ && !IsLastExcessiveThread(thread_id) && @@ -198,7 +198,7 @@ void ThreadPoolImpl::Impl::BGThread(size_t thread_id) { if (exit_all_threads_) { // mechanism to let BG threads exit safely - if(!wait_for_jobs_to_complete_ || + if (!wait_for_jobs_to_complete_ || queue_.empty()) { break; } diff --git a/util/trace_replay.cc b/util/trace_replay.cc index cd2e3ee95e2..5b9bec651e4 100644 --- a/util/trace_replay.cc +++ b/util/trace_replay.cc @@ -16,6 +16,8 @@ namespace rocksdb { +const std::string kTraceMagic = "feedcafedeadbeef"; + namespace { void EncodeCFAndKey(std::string* dst, uint32_t cf_id, const Slice& key) { PutFixed32(dst, cf_id); @@ -29,14 +31,20 @@ void DecodeCFAndKey(std::string& buffer, uint32_t* cf_id, Slice* key) { } } // namespace -Tracer::Tracer(Env* env, std::unique_ptr&& trace_writer) - : env_(env), trace_writer_(std::move(trace_writer)) { +Tracer::Tracer(Env* env, const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) + : env_(env), + trace_options_(trace_options), + trace_writer_(std::move(trace_writer)) { WriteHeader(); } Tracer::~Tracer() { trace_writer_.reset(); } Status Tracer::Write(WriteBatch* write_batch) { + if (IsTraceFileOverMax()) { + return Status::OK(); + } Trace trace; trace.ts = env_->NowMicros(); trace.type = kTraceWrite; @@ -45,6 +53,9 @@ Status Tracer::Write(WriteBatch* write_batch) { } Status Tracer::Get(ColumnFamilyHandle* column_family, const Slice& key) { + if (IsTraceFileOverMax()) { + return Status::OK(); + } Trace trace; trace.ts = env_->NowMicros(); trace.type = kTraceGet; @@ -53,6 +64,9 @@ Status Tracer::Get(ColumnFamilyHandle* column_family, const Slice& key) { } Status Tracer::IteratorSeek(const uint32_t& cf_id, const Slice& key) { + if (IsTraceFileOverMax()) { + return Status::OK(); + } Trace trace; trace.ts = env_->NowMicros(); trace.type = kTraceIteratorSeek; @@ -61,6 +75,9 @@ Status Tracer::IteratorSeek(const uint32_t& cf_id, const Slice& key) { } Status Tracer::IteratorSeekForPrev(const uint32_t& cf_id, const Slice& key) { + if (IsTraceFileOverMax()) { + return Status::OK(); + } Trace trace; trace.ts = env_->NowMicros(); trace.type = kTraceIteratorSeekForPrev; @@ -68,6 +85,11 @@ Status Tracer::IteratorSeekForPrev(const uint32_t& cf_id, const Slice& key) { return WriteTrace(trace); } +bool Tracer::IsTraceFileOverMax() { + uint64_t trace_file_size = trace_writer_->GetFileSize(); + return (trace_file_size > trace_options_.max_trace_file_size); +} + Status Tracer::WriteHeader() { std::ostringstream s; s << kTraceMagic << "\t" @@ -103,7 +125,7 @@ Status Tracer::WriteTrace(const Trace& trace) { Status Tracer::Close() { return WriteFooter(); } Replayer::Replayer(DB* db, const std::vector& handles, - unique_ptr&& reader) + std::unique_ptr&& reader) : trace_reader_(std::move(reader)) { assert(db != nullptr); db_ = static_cast(db->GetRootDB()); diff --git a/util/trace_replay.h b/util/trace_replay.h index b324696f013..d935f65ce7e 100644 --- a/util/trace_replay.h +++ b/util/trace_replay.h @@ -10,6 +10,7 @@ #include #include "rocksdb/env.h" +#include "rocksdb/options.h" #include "rocksdb/trace_reader_writer.h" namespace rocksdb { @@ -21,7 +22,7 @@ class DBImpl; class Slice; class WriteBatch; -const std::string kTraceMagic = "feedcafedeadbeef"; +extern const std::string kTraceMagic; const unsigned int kTraceTimestampSize = 8; const unsigned int kTraceTypeSize = 1; const unsigned int kTracePayloadLengthSize = 4; @@ -55,13 +56,15 @@ struct Trace { // Trace RocksDB operations using a TraceWriter. 
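The TraceOptions plumbed into Tracer below is what enforces the file-size cap: once IsTraceFileOverMax() reports the cap exceeded, further Write/Get/IteratorSeek calls become no-ops. A possible end-to-end sketch from the user side, assuming the DB::StartTrace/DB::EndTrace entry points and the NewFileTraceWriter factory from the same release:

#include <memory>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/trace_reader_writer.h"

// Record a workload trace, capped at roughly 64 MB of trace file.
rocksdb::Status TraceWorkload(rocksdb::DB* db, rocksdb::Env* env,
                              const std::string& trace_file) {
  rocksdb::TraceOptions trace_opts;
  trace_opts.max_trace_file_size = 64ull << 20;
  std::unique_ptr<rocksdb::TraceWriter> writer;
  rocksdb::Status s = rocksdb::NewFileTraceWriter(env, rocksdb::EnvOptions(),
                                                  trace_file, &writer);
  if (!s.ok()) {
    return s;
  }
  s = db->StartTrace(trace_opts, std::move(writer));
  if (!s.ok()) {
    return s;
  }
  // ... run the queries of interest; once the writer's file size exceeds the
  // cap, the tracer silently stops appending ...
  return db->EndTrace();
}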
class Tracer { public: - Tracer(Env* env, std::unique_ptr&& trace_writer); + Tracer(Env* env, const TraceOptions& trace_options, + std::unique_ptr&& trace_writer); ~Tracer(); Status Write(WriteBatch* write_batch); Status Get(ColumnFamilyHandle* cfname, const Slice& key); Status IteratorSeek(const uint32_t& cf_id, const Slice& key); Status IteratorSeekForPrev(const uint32_t& cf_id, const Slice& key); + bool IsTraceFileOverMax(); Status Close(); @@ -71,7 +74,8 @@ class Tracer { Status WriteTrace(const Trace& trace); Env* env_; - unique_ptr trace_writer_; + TraceOptions trace_options_; + std::unique_ptr trace_writer_; }; // Replay RocksDB operations from a trace. diff --git a/util/transaction_test_util.cc b/util/transaction_test_util.cc index 63339189170..58d95b2ae19 100644 --- a/util/transaction_test_util.cc +++ b/util/transaction_test_util.cc @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -135,8 +136,7 @@ bool RandomTransactionInserter::DoInsert(DB* db, Transaction* txn, std::vector set_vec(num_sets_); std::iota(set_vec.begin(), set_vec.end(), static_cast(0)); - std::random_shuffle(set_vec.begin(), set_vec.end(), - [&](uint64_t r) { return rand_->Uniform(r); }); + std::shuffle(set_vec.begin(), set_vec.end(), std::random_device{}); // For each set, pick a key at random and increment it for (uint16_t set_i : set_vec) { @@ -258,10 +258,8 @@ Status RandomTransactionInserter::Verify(DB* db, uint16_t num_sets, std::vector set_vec(num_sets); std::iota(set_vec.begin(), set_vec.end(), static_cast(0)); - if (rand) { - std::random_shuffle(set_vec.begin(), set_vec.end(), - [&](uint64_t r) { return rand->Uniform(r); }); - } + std::shuffle(set_vec.begin(), set_vec.end(), std::random_device{}); + // For each set of keys with the same prefix, sum all the values for (uint16_t set_i : set_vec) { // Five digits (since the largest uint16_t is 65535) plus the NUL diff --git a/util/vector_iterator.h b/util/vector_iterator.h new file mode 100644 index 00000000000..da60eb229cf --- /dev/null +++ b/util/vector_iterator.h @@ -0,0 +1,100 @@ +#pragma once + +#include +#include +#include + +#include "db/dbformat.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice.h" +#include "table/internal_iterator.h" + +namespace rocksdb { + +// Iterator over a vector of keys/values +class VectorIterator : public InternalIterator { + public: + VectorIterator(std::vector keys, std::vector values, + const InternalKeyComparator* icmp) + : keys_(std::move(keys)), + values_(std::move(values)), + indexed_cmp_(icmp, &keys_), + current_(keys.size()) { + assert(keys_.size() == values_.size()); + + indices_.reserve(keys_.size()); + for (size_t i = 0; i < keys_.size(); i++) { + indices_.push_back(i); + } + std::sort(indices_.begin(), indices_.end(), indexed_cmp_); + } + + virtual bool Valid() const override { + return !indices_.empty() && current_ < indices_.size(); + } + + virtual void SeekToFirst() override { current_ = 0; } + virtual void SeekToLast() override { current_ = indices_.size() - 1; } + + virtual void Seek(const Slice& target) override { + current_ = std::lower_bound(indices_.begin(), indices_.end(), target, + indexed_cmp_) - + indices_.begin(); + } + + virtual void SeekForPrev(const Slice& target) override { + current_ = std::lower_bound(indices_.begin(), indices_.end(), target, + indexed_cmp_) - + indices_.begin(); + if (!Valid()) { + SeekToLast(); + } else { + Prev(); + } + } + + virtual void Next() override { current_++; } + virtual void Prev() override { current_--; } + + virtual Slice 
key() const override { + return Slice(keys_[indices_[current_]]); + } + virtual Slice value() const override { + return Slice(values_[indices_[current_]]); + } + + virtual Status status() const override { return Status::OK(); } + + virtual bool IsKeyPinned() const override { return true; } + virtual bool IsValuePinned() const override { return true; } + + private: + struct IndexedKeyComparator { + IndexedKeyComparator(const InternalKeyComparator* c, + const std::vector* ks) + : cmp(c), keys(ks) {} + + bool operator()(size_t a, size_t b) const { + return cmp->Compare((*keys)[a], (*keys)[b]) < 0; + } + + bool operator()(size_t a, const Slice& b) const { + return cmp->Compare((*keys)[a], b) < 0; + } + + bool operator()(const Slice& a, size_t b) const { + return cmp->Compare(a, (*keys)[b]) < 0; + } + + const InternalKeyComparator* cmp; + const std::vector* keys; + }; + + std::vector keys_; + std::vector values_; + IndexedKeyComparator indexed_cmp_; + std::vector indices_; + size_t current_; +}; + +} // namespace rocksdb diff --git a/util/xxhash.cc b/util/xxhash.cc index 4bce61a4878..2ec95a636e5 100644 --- a/util/xxhash.cc +++ b/util/xxhash.cc @@ -34,6 +34,39 @@ You can contact the author at : //************************************** // Tuning parameters //************************************** +/*!XXH_FORCE_MEMORY_ACCESS : + * By default, access to unaligned memory is controlled by `memcpy()`, which is + * safe and portable. Unfortunately, on some target/compiler combinations, the + * generated assembly is sub-optimal. The below switch allow to select different + * access method for improved performance. Method 0 (default) : use `memcpy()`. + * Safe and portable. Method 1 : `__packed` statement. It depends on compiler + * extension (ie, not portable). This method is safe if your compiler supports + * it, and *generally* as fast or faster than `memcpy`. Method 2 : direct + * access. This method doesn't depend on compiler but violate C standard. It can + * generate buggy code on targets which do not support unaligned memory + * accesses. But in some circumstances, it's the only known way to get the most + * performance (ie GCC + ARMv6) See http://stackoverflow.com/a/32095106/646947 + * for details. Prefer these methods in priority order (0 > 1 > 2) + */ + +#include "util/util.h" + +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line \ + for example */ +#if defined(__GNUC__) && \ + (defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \ + defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__)) +#define XXH_FORCE_MEMORY_ACCESS 2 +#elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ + (defined(__GNUC__) && \ + (defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ + defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || \ + defined(__ARM_ARCH_7S__))) +#define XXH_FORCE_MEMORY_ACCESS 1 +#endif +#endif + // Unaligned memory access is automatically enabled for "common" CPU, such as x86. // For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected. // If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance. @@ -58,6 +91,21 @@ You can contact the author at : // This option has no impact on Little_Endian CPU. #define XXH_FORCE_NATIVE_FORMAT 0 +/*!XXH_FORCE_ALIGN_CHECK : + * This is a minor performance trick, only useful with lots of very small keys. 
+ * It means : check for aligned/unaligned input. + * The check costs one initial branch per hash; + * set it to 0 when the input is guaranteed to be aligned, + * or when alignment doesn't matter for performance. + */ +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ +#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || \ + defined(_M_X64) +#define XXH_FORCE_ALIGN_CHECK 0 +#else +#define XXH_FORCE_ALIGN_CHECK 1 +#endif +#endif //************************************** // Compiler Specific Options @@ -91,7 +139,7 @@ FORCE_INLINE void XXH_free (void* p) { free(p); } // for memcpy() #include FORCE_INLINE void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); } - +#include /* assert */ namespace rocksdb { //************************************** @@ -134,6 +182,34 @@ typedef struct _U32_S { U32 v; } _PACKED U32_S; #define A32(x) (((U32_S *)(x))->v) +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory + * access in hardware */ +static U32 XXH_read32(const void* memPtr) { return *(const U32*)memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1)) + +/* __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { + U32 u32; +} __attribute__((packed)) unalign; +static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } + +#else + +/* portable and safe solution. Generally efficient. + * see : http://stackoverflow.com/a/32095106/646947 + */ +static U32 XXH_read32(const void* memPtr) { + U32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ //*************************************** // Compiler-specific Functions and Macros @@ -143,8 +219,10 @@ typedef struct _U32_S { U32 v; } _PACKED U32_S; // Note : although _rotl exists for minGW (GCC under windows), performance seems poor #if defined(_MSC_VER) # define XXH_rotl32(x,r) _rotl(x,r) +#define XXH_rotl64(x, r) _rotl64(x, r) #else # define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) +#define XXH_rotl64(x, r) ((x << r) | (x >> (64 - r))) #endif #if defined(_MSC_VER) // Visual Studio @@ -199,12 +277,25 @@ FORCE_INLINE U32 XXH_readLE32_align(const U32* ptr, XXH_endianess endian, XXH_al return endian==XXH_littleEndian ? *ptr : XXH_swap32(*ptr); } -FORCE_INLINE U32 XXH_readLE32(const U32* ptr, XXH_endianess endian) { return XXH_readLE32_align(ptr, endian, XXH_unaligned); } +FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, + XXH_alignment align) { + if (align == XXH_unaligned) + return endian == XXH_littleEndian ? XXH_read32(ptr) + : XXH_swap32(XXH_read32(ptr)); + else + return endian == XXH_littleEndian ? 
*(const U32*)ptr + : XXH_swap32(*(const U32*)ptr); +} +FORCE_INLINE U32 XXH_readLE32(const U32* ptr, XXH_endianess endian) { + return XXH_readLE32_align(ptr, endian, XXH_unaligned); +} //**************************** // Simple Hash Functions //**************************** +#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align) + FORCE_INLINE U32 XXH32_endian_align(const void* input, int len, U32 seed, XXH_endianess endian, XXH_alignment align) { const BYTE* p = (const BYTE*)input; @@ -476,4 +567,508 @@ U32 XXH32_digest (void* state_in) return h32; } +/* ******************************************************************* + * 64-bit hash functions + *********************************************************************/ + + #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + + /* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ + static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; } + + #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + + /* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ + /* currently only defined for gcc and icc */ + typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign64; + static U64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } + + #else + + /* portable and safe solution. Generally efficient. + * see : http://stackoverflow.com/a/32095106/646947 + */ + + static U64 XXH_read64(const void* memPtr) + { + U64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; + } +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +#define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +#define XXH_swap64 __builtin_bswap64 +#else +static U64 XXH_swap64(U64 x) { + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + +FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, + XXH_alignment align) { + if (align == XXH_unaligned) + return endian == XXH_littleEndian ? XXH_read64(ptr) + : XXH_swap64(XXH_read64(ptr)); + else + return endian == XXH_littleEndian ? *(const U64*)ptr + : XXH_swap64(*(const U64*)ptr); +} + +FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) { + return XXH_readLE64_align(ptr, endian, XXH_unaligned); +} + +static U64 XXH_readBE64(const void* ptr) { + return XXH_CPU_LITTLE_ENDIAN ? 
XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} + +/*====== xxh64 ======*/ + +static const U64 PRIME64_1 = + 11400714785074694791ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 + */ +static const U64 PRIME64_2 = + 14029467366897019727ULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 + */ +static const U64 PRIME64_3 = + 1609587929392839161ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 + */ +static const U64 PRIME64_4 = + 9650029242287828579ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 + */ +static const U64 PRIME64_5 = + 2870177450012600261ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 + */ + +static U64 XXH64_round(U64 acc, U64 input) { + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static U64 XXH64_mergeRound(U64 acc, U64 val) { + val = XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +static U64 XXH64_avalanche(U64 h64) { + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + +#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align) + +static U64 XXH64_finalize(U64 h64, const void* ptr, size_t len, + XXH_endianess endian, XXH_alignment align) { + const BYTE* p = (const BYTE*)ptr; + +#define PROCESS1_64 \ + h64 ^= (*p++) * PRIME64_5; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; + +#define PROCESS4_64 \ + h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; \ + p += 4; \ + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + +#define PROCESS8_64 \ + { \ + U64 const k1 = XXH64_round(0, XXH_get64bits(p)); \ + p += 8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64, 27) * PRIME64_1 + PRIME64_4; \ + } + + switch (len & 31) { + case 24: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 16: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 8: + PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 20: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 12: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 4: + PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 17: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 9: + PROCESS8_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 29: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 21: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 13: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 5: + PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 18: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 10: + PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 30: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 22: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 14: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 6: + PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 19: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 11: + PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return 
XXH64_avalanche(h64); + + case 31: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 23: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 15: + PROCESS8_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 7: + PROCESS4_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 3: + PROCESS1_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 2: + PROCESS1_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 1: + PROCESS1_64; + FALLTHROUGH_INTENDED; + /* fallthrough */ + case 0: + return XXH64_avalanche(h64); + } + + /* impossible to reach */ + assert(0); + return 0; /* unreachable, but some compilers complain without it */ +} + +FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, + XXH_endianess endian, XXH_alignment align) { + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ + (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) + if (p == NULL) { + len = 0; + bEnd = p = (const BYTE*)(size_t)32; + } +#endif + + if (len >= 32) { + const BYTE* const limit = bEnd - 32; + U64 v1 = seed + PRIME64_1 + PRIME64_2; + U64 v2 = seed + PRIME64_2; + U64 v3 = seed + 0; + U64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(p)); + p += 8; + v2 = XXH64_round(v2, XXH_get64bits(p)); + p += 8; + v3 = XXH64_round(v3, XXH_get64bits(p)); + p += 8; + v4 = XXH64_round(v4, XXH_get64bits(p)); + p += 8; + } while (p <= limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (U64)len; + + return XXH64_finalize(h64, p, len, endian, align); +} + +unsigned long long XXH64(const void* input, size_t len, + unsigned long long seed) { +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, input, len); + return XXH64_digest(&state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7) == + 0) { /* Input is aligned, let's leverage the speed advantage */ + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, + XXH_aligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } + } + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, + XXH_unaligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + +/*====== Hash Streaming ======*/ + +XXH64_state_t* XXH64_createState(void) { + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) { + XXH_free(statePtr); + return XXH_OK; +} + +void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) { + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed) { + XXH64_state_t state; /* using a local state to memcpy() in order to avoid + strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - 
PRIME64_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + +FORCE_INLINE XXH_errorcode XXH64_update_endian(XXH64_state_t* state, + const void* input, size_t len, + XXH_endianess endian) { + if (input == NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \ + (XXH_ACCEPT_NULL_INPUT_POINTER >= 1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { + const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len); + state->memsize += (U32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, + 32 - state->memsize); + state->v1 = + XXH64_round(state->v1, XXH_readLE64(state->mem64 + 0, endian)); + state->v2 = + XXH64_round(state->v2, XXH_readLE64(state->mem64 + 1, endian)); + state->v3 = + XXH64_round(state->v3, XXH_readLE64(state->mem64 + 2, endian)); + state->v4 = + XXH64_round(state->v4, XXH_readLE64(state->mem64 + 3, endian)); + p += 32 - state->memsize; + state->memsize = 0; + } + + if (p + 32 <= bEnd) { + const BYTE* const limit = bEnd - 32; + U64 v1 = state->v1; + U64 v2 = state->v2; + U64 v3 = state->v3; + U64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p, endian)); + p += 8; + v2 = XXH64_round(v2, XXH_readLE64(p, endian)); + p += 8; + v3 = XXH64_round(v3, XXH_readLE64(p, endian)); + p += 8; + v4 = XXH64_round(v4, XXH_readLE64(p, endian)); + p += 8; + } while (p <= limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd - p)); + state->memsize = (unsigned)(bEnd - p); + } + } + + return XXH_OK; +} + +XXH_errorcode XXH64_update(XXH64_state_t* state_in, const void* input, + size_t len) { + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH64_update_endian(state_in, input, len, XXH_bigEndian); +} + +FORCE_INLINE U64 XXH64_digest_endian(const XXH64_state_t* state, + XXH_endianess endian) { + U64 h64; + + if (state->total_len >= 32) { + U64 const v1 = state->v1; + U64 const v2 = state->v2; + U64 const v3 = state->v3; + U64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (U64)state->total_len; + + return XXH64_finalize(h64, state->mem64, (size_t)state->total_len, endian, + XXH_aligned); +} + +unsigned long long XXH64_digest(const XXH64_state_t* state_in) { + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_digest_endian(state_in, XXH_littleEndian); + else + return XXH64_digest_endian(state_in, XXH_bigEndian); +} + +/*====== Canonical representation ======*/ + +void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) { + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + 
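The streaming reset/update/digest path defined above and the one-shot XXH64() compute the same digest for the same input and seed. A small consistency-check sketch using only the functions declared in xxhash.h:

#include <cassert>
#include <cstddef>

#include "util/xxhash.h"

// One-shot and streaming XXH64 agree on the same input and seed.
unsigned long long HashBothWays(const void* data, size_t len) {
  unsigned long long one_shot = rocksdb::XXH64(data, len, /*seed=*/0);

  rocksdb::XXH64_state_t* state = rocksdb::XXH64_createState();
  rocksdb::XXH64_reset(state, /*seed=*/0);
  rocksdb::XXH64_update(state, data, len);
  unsigned long long streamed = rocksdb::XXH64_digest(state);
  rocksdb::XXH64_freeState(state);

  assert(one_shot == streamed);
  return one_shot;
}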
memcpy(dst, &hash, sizeof(*dst)); +} + +XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) { + return XXH_readBE64(src); +} } // namespace rocksdb diff --git a/util/xxhash.h b/util/xxhash.h index 3343e3488f4..88352ac75f9 100644 --- a/util/xxhash.h +++ b/util/xxhash.h @@ -59,6 +59,14 @@ It depends on successfully passing SMHasher test set. #pragma once +#include + +#if !defined(__VMS) && \ + (defined(__cplusplus) || \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)) +#include +#endif + #if defined (__cplusplus) namespace rocksdb { #endif @@ -67,6 +75,7 @@ namespace rocksdb { //**************************** // Type //**************************** +/* size_t */ typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; @@ -157,7 +166,74 @@ To free memory context, use XXH32_digest(), or free(). #define XXH32_result XXH32_digest #define XXH32_getIntermediateResult XXH32_intermediateDigest +/*-********************************************************************** + * 64-bit hash + ************************************************************************/ +typedef unsigned long long XXH64_hash_t; +/*! XXH64() : + Calculate the 64-bit hash of sequence of length "len" stored at memory + address "input". "seed" can be used to alter the result predictably. This + function runs faster on 64-bit systems, but slower on 32-bit systems (see + benchmark). +*/ +XXH64_hash_t XXH64(const void* input, size_t length, unsigned long long seed); + +/*====== Streaming ======*/ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH64_state_t* XXH64_createState(void); +XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed); +XXH_errorcode XXH64_update(XXH64_state_t* statePtr, const void* input, + size_t length); +XXH64_hash_t XXH64_digest(const XXH64_state_t* statePtr); + +/*====== Canonical representation ======*/ +typedef struct { + unsigned char digest[8]; +} XXH64_canonical_t; +void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + +/* These definitions are only present to allow + * static allocation of XXH state, on stack or in a struct for example. + * Never **ever** use members directly. 
*/ + +#if !defined(__VMS) && \ + (defined(__cplusplus) || \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)) + +struct XXH64_state_s { + uint64_t total_len; + uint64_t v1; + uint64_t v2; + uint64_t v3; + uint64_t v4; + uint64_t mem64[4]; + uint32_t memsize; + uint32_t reserved[2]; /* never read nor write, might be removed in a future + version */ +}; /* typedef'd to XXH64_state_t */ + +#else + +#ifndef XXH_NO_LONG_LONG /* remove 64-bit support */ +struct XXH64_state_s { + unsigned long long total_len; + unsigned long long v1; + unsigned long long v2; + unsigned long long v3; + unsigned long long v4; + unsigned long long mem64[4]; + unsigned memsize; + unsigned reserved[2]; /* never read nor write, might be removed in a future + version */ +}; /* typedef'd to XXH64_state_t */ +#endif + +#endif #if defined (__cplusplus) } // namespace rocksdb diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 4cafc6ab148..78def188cf4 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -305,16 +305,16 @@ class BackupEngineImpl : public BackupEngine { // @param contents If non-empty, the file will be created with these contents. Status CopyOrCreateFile(const std::string& src, const std::string& dst, const std::string& contents, Env* src_env, - Env* dst_env, bool sync, RateLimiter* rate_limiter, + Env* dst_env, const EnvOptions& src_env_options, + bool sync, RateLimiter* rate_limiter, uint64_t* size = nullptr, uint32_t* checksum_value = nullptr, uint64_t size_limit = 0, std::function progress_callback = []() {}); - Status CalculateChecksum(const std::string& src, - Env* src_env, - uint64_t size_limit, - uint32_t* checksum_value); + Status CalculateChecksum(const std::string& src, Env* src_env, + const EnvOptions& src_env_options, + uint64_t size_limit, uint32_t* checksum_value); struct CopyOrCreateResult { uint64_t size; @@ -331,6 +331,7 @@ class BackupEngineImpl : public BackupEngine { std::string contents; Env* src_env; Env* dst_env; + EnvOptions src_env_options; bool sync; RateLimiter* rate_limiter; uint64_t size_limit; @@ -338,14 +339,15 @@ class BackupEngineImpl : public BackupEngine { std::function progress_callback; CopyOrCreateWorkItem() - : src_path(""), - dst_path(""), - contents(""), - src_env(nullptr), - dst_env(nullptr), - sync(false), - rate_limiter(nullptr), - size_limit(0) {} + : src_path(""), + dst_path(""), + contents(""), + src_env(nullptr), + dst_env(nullptr), + src_env_options(), + sync(false), + rate_limiter(nullptr), + size_limit(0) {} CopyOrCreateWorkItem(const CopyOrCreateWorkItem&) = delete; CopyOrCreateWorkItem& operator=(const CopyOrCreateWorkItem&) = delete; @@ -360,6 +362,7 @@ class BackupEngineImpl : public BackupEngine { contents = std::move(o.contents); src_env = o.src_env; dst_env = o.dst_env; + src_env_options = std::move(o.src_env_options); sync = o.sync; rate_limiter = o.rate_limiter; size_limit = o.size_limit; @@ -370,14 +373,15 @@ class BackupEngineImpl : public BackupEngine { CopyOrCreateWorkItem(std::string _src_path, std::string _dst_path, std::string _contents, Env* _src_env, Env* _dst_env, - bool _sync, RateLimiter* _rate_limiter, - uint64_t _size_limit, + EnvOptions _src_env_options, bool _sync, + RateLimiter* _rate_limiter, uint64_t _size_limit, std::function _progress_callback = []() {}) : src_path(std::move(_src_path)), dst_path(std::move(_dst_path)), contents(std::move(_contents)), src_env(_src_env), dst_env(_dst_env), + 
src_env_options(std::move(_src_env_options)), sync(_sync), rate_limiter(_rate_limiter), size_limit(_size_limit), @@ -471,7 +475,8 @@ class BackupEngineImpl : public BackupEngine { std::vector& backup_items_to_finish, BackupID backup_id, bool shared, const std::string& src_dir, const std::string& fname, // starts with "/" - RateLimiter* rate_limiter, uint64_t size_bytes, uint64_t size_limit = 0, + const EnvOptions& src_env_options, RateLimiter* rate_limiter, + uint64_t size_bytes, uint64_t size_limit = 0, bool shared_checksum = false, std::function progress_callback = []() {}, const std::string& contents = std::string()); @@ -479,9 +484,9 @@ class BackupEngineImpl : public BackupEngine { // backup state data BackupID latest_backup_id_; BackupID latest_valid_backup_id_; - std::map> backups_; - std::map>> corrupt_backups_; + std::map> backups_; + std::map>> + corrupt_backups_; std::unordered_map> backuped_file_infos_; std::atomic stop_backup_; @@ -492,10 +497,10 @@ class BackupEngineImpl : public BackupEngine { Env* backup_env_; // directories - unique_ptr backup_directory_; - unique_ptr shared_directory_; - unique_ptr meta_directory_; - unique_ptr private_directory_; + std::unique_ptr backup_directory_; + std::unique_ptr shared_directory_; + std::unique_ptr meta_directory_; + std::unique_ptr private_directory_; static const size_t kDefaultCopyFileBufferSize = 5 * 1024 * 1024LL; // 5MB size_t copy_file_buffer_size_; @@ -616,7 +621,7 @@ Status BackupEngineImpl::Initialize() { } assert(backups_.find(backup_id) == backups_.end()); backups_.insert(std::make_pair( - backup_id, unique_ptr(new BackupMeta( + backup_id, std::unique_ptr(new BackupMeta( GetBackupMetaFile(backup_id, false /* tmp */), GetBackupMetaFile(backup_id, true /* tmp */), &backuped_file_infos_, backup_env_)))); @@ -723,9 +728,10 @@ Status BackupEngineImpl::Initialize() { CopyOrCreateResult result; result.status = CopyOrCreateFile( work_item.src_path, work_item.dst_path, work_item.contents, - work_item.src_env, work_item.dst_env, work_item.sync, - work_item.rate_limiter, &result.size, &result.checksum_value, - work_item.size_limit, work_item.progress_callback); + work_item.src_env, work_item.dst_env, work_item.src_env_options, + work_item.sync, work_item.rate_limiter, &result.size, + &result.checksum_value, work_item.size_limit, + work_item.progress_callback); work_item.result.set_value(std::move(result)); } }); @@ -761,7 +767,7 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata( } auto ret = backups_.insert(std::make_pair( - new_backup_id, unique_ptr(new BackupMeta( + new_backup_id, std::unique_ptr(new BackupMeta( GetBackupMetaFile(new_backup_id, false /* tmp */), GetBackupMetaFile(new_backup_id, true /* tmp */), &backuped_file_infos_, backup_env_)))); @@ -796,8 +802,10 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata( if (s.ok()) { CheckpointImpl checkpoint(db); uint64_t sequence_number = 0; + DBOptions db_options = db->GetDBOptions(); + EnvOptions src_raw_env_options(db_options); s = checkpoint.CreateCustomCheckpoint( - db->GetDBOptions(), + db_options, [&](const std::string& /*src_dirname*/, const std::string& /*fname*/, FileType) { // custom checkpoint will switch to calling copy_file_cb after it sees @@ -815,11 +823,33 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata( if (type == kTableFile) { st = db_env_->GetFileSize(src_dirname + fname, &size_bytes); } + EnvOptions src_env_options; + switch (type) { + case kLogFile: + src_env_options = + db_env_->OptimizeForLogRead(src_raw_env_options); + break; + case 
kTableFile: + src_env_options = db_env_->OptimizeForCompactionTableRead( + src_raw_env_options, ImmutableDBOptions(db_options)); + break; + case kDescriptorFile: + src_env_options = + db_env_->OptimizeForManifestRead(src_raw_env_options); + break; + default: + // Other backed up files (like options file) are not read by live + // DB, so don't need to worry about avoiding mixing buffered and + // direct I/O. Just use plain defaults. + src_env_options = src_raw_env_options; + break; + } if (st.ok()) { st = AddBackupFileWorkItem( live_dst_paths, backup_items_to_finish, new_backup_id, options_.share_table_files && type == kTableFile, src_dirname, - fname, rate_limiter, size_bytes, size_limit_bytes, + fname, src_env_options, rate_limiter, size_bytes, + size_limit_bytes, options_.share_files_with_checksum && type == kTableFile, progress_callback); } @@ -829,8 +859,9 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata( Log(options_.info_log, "add file for backup %s", fname.c_str()); return AddBackupFileWorkItem( live_dst_paths, backup_items_to_finish, new_backup_id, - false /* shared */, "" /* src_dir */, fname, rate_limiter, - contents.size(), 0 /* size_limit */, false /* shared_checksum */, + false /* shared */, "" /* src_dir */, fname, + EnvOptions() /* src_env_options */, rate_limiter, contents.size(), + 0 /* size_limit */, false /* shared_checksum */, progress_callback, contents); } /* create_file_cb */, &sequence_number, flush_before_backup ? 0 : port::kMaxUint64); @@ -869,7 +900,7 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata( s = new_backup->StoreToFile(options_.sync); } if (s.ok() && options_.sync) { - unique_ptr backup_private_directory; + std::unique_ptr backup_private_directory; backup_env_->NewDirectory( GetAbsolutePath(GetPrivateFileRel(new_backup_id, false)), &backup_private_directory); @@ -1114,7 +1145,8 @@ Status BackupEngineImpl::RestoreDBFromBackup( dst.c_str()); CopyOrCreateWorkItem copy_or_create_work_item( GetAbsolutePath(file), dst, "" /* contents */, backup_env_, db_env_, - false, rate_limiter, 0 /* size_limit */); + EnvOptions() /* src_env_options */, false, rate_limiter, + 0 /* size_limit */); RestoreAfterCopyOrCreateWorkItem after_copy_or_create_work_item( copy_or_create_work_item.result.get_future(), file_info->checksum_value); @@ -1183,15 +1215,15 @@ Status BackupEngineImpl::VerifyBackup(BackupID backup_id) { Status BackupEngineImpl::CopyOrCreateFile( const std::string& src, const std::string& dst, const std::string& contents, - Env* src_env, Env* dst_env, bool sync, RateLimiter* rate_limiter, - uint64_t* size, uint32_t* checksum_value, uint64_t size_limit, - std::function progress_callback) { + Env* src_env, Env* dst_env, const EnvOptions& src_env_options, bool sync, + RateLimiter* rate_limiter, uint64_t* size, uint32_t* checksum_value, + uint64_t size_limit, std::function progress_callback) { assert(src.empty() != contents.empty()); Status s; - unique_ptr dst_file; - unique_ptr src_file; - EnvOptions env_options; - env_options.use_mmap_writes = false; + std::unique_ptr dst_file; + std::unique_ptr src_file; + EnvOptions dst_env_options; + dst_env_options.use_mmap_writes = false; // TODO:(gzh) maybe use direct reads/writes here if possible if (size != nullptr) { *size = 0; @@ -1205,18 +1237,18 @@ Status BackupEngineImpl::CopyOrCreateFile( size_limit = std::numeric_limits::max(); } - s = dst_env->NewWritableFile(dst, &dst_file, env_options); + s = dst_env->NewWritableFile(dst, &dst_file, dst_env_options); if (s.ok() && !src.empty()) { - s = 
src_env->NewSequentialFile(src, &src_file, env_options); + s = src_env->NewSequentialFile(src, &src_file, src_env_options); } if (!s.ok()) { return s; } - unique_ptr dest_writer( - new WritableFileWriter(std::move(dst_file), dst, env_options)); - unique_ptr src_reader; - unique_ptr buf; + std::unique_ptr dest_writer( + new WritableFileWriter(std::move(dst_file), dst, dst_env_options)); + std::unique_ptr src_reader; + std::unique_ptr buf; if (!src.empty()) { src_reader.reset(new SequentialFileReader(std::move(src_file), src)); buf.reset(new char[copy_file_buffer_size_]); @@ -1276,9 +1308,10 @@ Status BackupEngineImpl::AddBackupFileWorkItem( std::unordered_set& live_dst_paths, std::vector& backup_items_to_finish, BackupID backup_id, bool shared, const std::string& src_dir, - const std::string& fname, RateLimiter* rate_limiter, uint64_t size_bytes, - uint64_t size_limit, bool shared_checksum, - std::function progress_callback, const std::string& contents) { + const std::string& fname, const EnvOptions& src_env_options, + RateLimiter* rate_limiter, uint64_t size_bytes, uint64_t size_limit, + bool shared_checksum, std::function progress_callback, + const std::string& contents) { assert(!fname.empty() && fname[0] == '/'); assert(contents.empty() != src_dir.empty()); @@ -1289,7 +1322,7 @@ Status BackupEngineImpl::AddBackupFileWorkItem( if (shared && shared_checksum) { // add checksum and file length to the file name - s = CalculateChecksum(src_dir + fname, db_env_, size_limit, + s = CalculateChecksum(src_dir + fname, db_env_, src_env_options, size_limit, &checksum_value); if (!s.ok()) { return s; @@ -1365,8 +1398,8 @@ Status BackupEngineImpl::AddBackupFileWorkItem( // the file is present and referenced by a backup ROCKS_LOG_INFO(options_.info_log, "%s already present, calculate checksum", fname.c_str()); - s = CalculateChecksum(src_dir + fname, db_env_, size_limit, - &checksum_value); + s = CalculateChecksum(src_dir + fname, db_env_, src_env_options, + size_limit, &checksum_value); } } live_dst_paths.insert(final_dest_path); @@ -1376,8 +1409,8 @@ Status BackupEngineImpl::AddBackupFileWorkItem( copy_dest_path->c_str()); CopyOrCreateWorkItem copy_or_create_work_item( src_dir.empty() ? 
"" : src_dir + fname, *copy_dest_path, contents, - db_env_, backup_env_, options_.sync, rate_limiter, size_limit, - progress_callback); + db_env_, backup_env_, src_env_options, options_.sync, rate_limiter, + size_limit, progress_callback); BackupAfterCopyOrCreateWorkItem after_copy_or_create_work_item( copy_or_create_work_item.result.get_future(), shared, need_to_copy, backup_env_, temp_dest_path, final_dest_path, dst_relative); @@ -1399,6 +1432,7 @@ Status BackupEngineImpl::AddBackupFileWorkItem( } Status BackupEngineImpl::CalculateChecksum(const std::string& src, Env* src_env, + const EnvOptions& src_env_options, uint64_t size_limit, uint32_t* checksum_value) { *checksum_value = 0; @@ -1406,17 +1440,13 @@ Status BackupEngineImpl::CalculateChecksum(const std::string& src, Env* src_env, size_limit = std::numeric_limits::max(); } - EnvOptions env_options; - env_options.use_mmap_writes = false; - env_options.use_direct_reads = false; - std::unique_ptr src_file; - Status s = src_env->NewSequentialFile(src, &src_file, env_options); + Status s = src_env->NewSequentialFile(src, &src_file, src_env_options); if (!s.ok()) { return s; } - unique_ptr src_reader( + std::unique_ptr src_reader( new SequentialFileReader(std::move(src_file), src)); std::unique_ptr buf(new char[copy_file_buffer_size_]); Slice data; @@ -1634,15 +1664,15 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile( const std::unordered_map& abs_path_to_size) { assert(Empty()); Status s; - unique_ptr backup_meta_file; + std::unique_ptr backup_meta_file; s = env_->NewSequentialFile(meta_filename_, &backup_meta_file, EnvOptions()); if (!s.ok()) { return s; } - unique_ptr backup_meta_reader( + std::unique_ptr backup_meta_reader( new SequentialFileReader(std::move(backup_meta_file), meta_filename_)); - unique_ptr buf(new char[max_backup_meta_file_size_ + 1]); + std::unique_ptr buf(new char[max_backup_meta_file_size_ + 1]); Slice data; s = backup_meta_reader->Read(max_backup_meta_file_size_, &data, buf.get()); @@ -1736,7 +1766,7 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile( Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) { Status s; - unique_ptr backup_meta_file; + std::unique_ptr backup_meta_file; EnvOptions env_options; env_options.use_mmap_writes = false; env_options.use_direct_writes = false; @@ -1745,7 +1775,7 @@ Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) { return s; } - unique_ptr buf(new char[max_backup_meta_file_size_]); + std::unique_ptr buf(new char[max_backup_meta_file_size_]); size_t len = 0, buf_size = max_backup_meta_file_size_; len += snprintf(buf.get(), buf_size, "%" PRId64 "\n", timestamp_); len += snprintf(buf.get() + len, buf_size - len, "%" PRIu64 "\n", @@ -1762,7 +1792,8 @@ Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) { else if (len + hex_meta_strlen >= buf_size) { backup_meta_file->Append(Slice(buf.get(), len)); buf.reset(); - unique_ptr new_reset_buf(new char[max_backup_meta_file_size_]); + std::unique_ptr new_reset_buf( + new char[max_backup_meta_file_size_]); buf.swap(new_reset_buf); len = 0; } @@ -1776,7 +1807,7 @@ Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) { "%" ROCKSDB_PRIszt "\n", files_.size()) >= buf_size) { backup_meta_file->Append(Slice(buf.get(), len)); buf.reset(); - unique_ptr new_reset_buf(new char[max_backup_meta_file_size_]); + std::unique_ptr new_reset_buf(new char[max_backup_meta_file_size_]); buf.swap(new_reset_buf); len = 0; } @@ -1794,7 +1825,8 @@ Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) { if (newlen >= 
buf_size) { backup_meta_file->Append(Slice(buf.get(), len)); buf.reset(); - unique_ptr new_reset_buf(new char[max_backup_meta_file_size_]); + std::unique_ptr new_reset_buf( + new char[max_backup_meta_file_size_]); buf.swap(new_reset_buf); len = 0; } diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 9fdc058fd03..26ff00e91a1 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -179,7 +179,8 @@ class TestEnv : public EnvWrapper { bool fail_reads_; }; - Status NewSequentialFile(const std::string& f, unique_ptr* r, + Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, const EnvOptions& options) override { MutexLock l(&mutex_); if (dummy_sequential_file_) { @@ -187,11 +188,18 @@ class TestEnv : public EnvWrapper { new TestEnv::DummySequentialFile(dummy_sequential_file_fail_reads_)); return Status::OK(); } else { - return EnvWrapper::NewSequentialFile(f, r, options); + Status s = EnvWrapper::NewSequentialFile(f, r, options); + if (s.ok()) { + if ((*r)->use_direct_io()) { + ++num_direct_seq_readers_; + } + ++num_seq_readers_; + } + return s; } } - Status NewWritableFile(const std::string& f, unique_ptr* r, + Status NewWritableFile(const std::string& f, std::unique_ptr* r, const EnvOptions& options) override { MutexLock l(&mutex_); written_files_.push_back(f); @@ -199,7 +207,28 @@ class TestEnv : public EnvWrapper { return Status::NotSupported("Sorry, can't do this"); } limit_written_files_--; - return EnvWrapper::NewWritableFile(f, r, options); + Status s = EnvWrapper::NewWritableFile(f, r, options); + if (s.ok()) { + if ((*r)->use_direct_io()) { + ++num_direct_writers_; + } + ++num_writers_; + } + return s; + } + + virtual Status NewRandomAccessFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) override { + MutexLock l(&mutex_); + Status s = EnvWrapper::NewRandomAccessFile(fname, result, options); + if (s.ok()) { + if ((*result)->use_direct_io()) { + ++num_direct_rand_readers_; + } + ++num_rand_readers_; + } + return s; } virtual Status DeleteFile(const std::string& fname) override { @@ -308,13 +337,30 @@ class TestEnv : public EnvWrapper { void SetNewDirectoryFailure(bool fail) { new_directory_failure_ = fail; } virtual Status NewDirectory(const std::string& name, - unique_ptr* result) override { + std::unique_ptr* result) override { if (new_directory_failure_) { return Status::IOError("SimulatedFailure"); } return EnvWrapper::NewDirectory(name, result); } + void ClearFileOpenCounters() { + MutexLock l(&mutex_); + num_rand_readers_ = 0; + num_direct_rand_readers_ = 0; + num_seq_readers_ = 0; + num_direct_seq_readers_ = 0; + num_writers_ = 0; + num_direct_writers_ = 0; + } + + int num_rand_readers() { return num_rand_readers_; } + int num_direct_rand_readers() { return num_direct_rand_readers_; } + int num_seq_readers() { return num_seq_readers_; } + int num_direct_seq_readers() { return num_direct_seq_readers_; } + int num_writers() { return num_writers_; } + int num_direct_writers() { return num_direct_writers_; } + private: port::Mutex mutex_; bool dummy_sequential_file_ = false; @@ -328,6 +374,15 @@ class TestEnv : public EnvWrapper { bool get_children_failure_ = false; bool create_dir_if_missing_failure_ = false; bool new_directory_failure_ = false; + + // Keeps track of how many files of each type were successfully opened, and + // out of those, how many were opened with direct I/O. 
+ std::atomic num_rand_readers_; + std::atomic num_direct_rand_readers_; + std::atomic num_seq_readers_; + std::atomic num_direct_seq_readers_; + std::atomic num_writers_; + std::atomic num_direct_writers_; }; // TestEnv class FileManager : public EnvWrapper { @@ -427,7 +482,7 @@ class FileManager : public EnvWrapper { } Status WriteToFile(const std::string& fname, const std::string& data) { - unique_ptr file; + std::unique_ptr file; EnvOptions env_options; env_options.use_mmap_writes = false; Status s = EnvWrapper::NewWritableFile(fname, &file, env_options); @@ -620,22 +675,22 @@ class BackupableDBTest : public testing::Test { std::shared_ptr logger_; // envs - unique_ptr db_chroot_env_; - unique_ptr backup_chroot_env_; - unique_ptr test_db_env_; - unique_ptr test_backup_env_; - unique_ptr file_manager_; + std::unique_ptr db_chroot_env_; + std::unique_ptr backup_chroot_env_; + std::unique_ptr test_db_env_; + std::unique_ptr test_backup_env_; + std::unique_ptr file_manager_; // all the dbs! DummyDB* dummy_db_; // BackupableDB owns dummy_db_ - unique_ptr db_; - unique_ptr backup_engine_; + std::unique_ptr db_; + std::unique_ptr backup_engine_; // options Options options_; protected: - unique_ptr backupable_options_; + std::unique_ptr backupable_options_; }; // BackupableDBTest void AppendPath(const std::string& path, std::vector& v) { @@ -1633,6 +1688,59 @@ TEST_F(BackupableDBTest, WriteOnlyEngineNoSharedFileDeletion) { AssertBackupConsistency(i + 1, 0, (i + 1) * kNumKeys); } } + +TEST_P(BackupableDBTestWithParam, BackupUsingDirectIO) { + // Tests direct I/O on the backup engine's reads and writes on the DB env and + // backup env + // We use ChrootEnv underneath so the below line checks for direct I/O support + // in the chroot directory, not the true filesystem root. + if (!test::IsDirectIOSupported(test_db_env_.get(), "/")) { + return; + } + const int kNumKeysPerBackup = 100; + const int kNumBackups = 3; + options_.use_direct_reads = true; + OpenDBAndBackupEngine(true /* destroy_old_data */); + for (int i = 0; i < kNumBackups; ++i) { + FillDB(db_.get(), i * kNumKeysPerBackup /* from */, + (i + 1) * kNumKeysPerBackup /* to */); + ASSERT_OK(db_->Flush(FlushOptions())); + + // Clear the file open counters and then do a bunch of backup engine ops. + // For all ops, files should be opened in direct mode. + test_backup_env_->ClearFileOpenCounters(); + test_db_env_->ClearFileOpenCounters(); + CloseBackupEngine(); + OpenBackupEngine(); + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), + false /* flush_before_backup */)); + ASSERT_OK(backup_engine_->VerifyBackup(i + 1)); + CloseBackupEngine(); + OpenBackupEngine(); + std::vector backup_infos; + backup_engine_->GetBackupInfo(&backup_infos); + ASSERT_EQ(static_cast(i + 1), backup_infos.size()); + + // Verify backup engine always opened files with direct I/O + ASSERT_EQ(0, test_db_env_->num_writers()); + ASSERT_EQ(0, test_db_env_->num_rand_readers()); + ASSERT_GT(test_db_env_->num_direct_seq_readers(), 0); + // Currently the DB doesn't support reading WALs or manifest with direct + // I/O, so subtract two. 
+ ASSERT_EQ(test_db_env_->num_seq_readers() - 2, + test_db_env_->num_direct_seq_readers()); + ASSERT_EQ(0, test_db_env_->num_rand_readers()); + } + CloseDBAndBackupEngine(); + + for (int i = 0; i < kNumBackups; ++i) { + AssertBackupConsistency(i + 1 /* backup_id */, + i * kNumKeysPerBackup /* start_exist */, + (i + 1) * kNumKeysPerBackup /* end_exist */, + (i + 2) * kNumKeysPerBackup /* end */); + } +} + } // anon namespace } // namespace rocksdb diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 1a32bd562eb..bf46bf6b1c0 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -26,6 +26,7 @@ #include "util/cast_util.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" +#include "util/file_util.h" #include "util/filename.h" #include "util/logging.h" #include "util/mutexlock.h" @@ -404,82 +405,91 @@ std::shared_ptr BlobDBImpl::FindBlobFileLocked( return (b1 || b2) ? nullptr : (*finditr); } -std::shared_ptr BlobDBImpl::CheckOrCreateWriterLocked( - const std::shared_ptr& bfile) { - std::shared_ptr writer = bfile->GetWriter(); - if (writer) return writer; - - Status s = CreateWriterLocked(bfile); - if (!s.ok()) return nullptr; - - writer = bfile->GetWriter(); - return writer; +Status BlobDBImpl::CheckOrCreateWriterLocked( + const std::shared_ptr& blob_file, + std::shared_ptr* writer) { + assert(writer != nullptr); + *writer = blob_file->GetWriter(); + if (*writer != nullptr) { + return Status::OK(); + } + Status s = CreateWriterLocked(blob_file); + if (s.ok()) { + *writer = blob_file->GetWriter(); + } + return s; } -std::shared_ptr BlobDBImpl::SelectBlobFile() { +Status BlobDBImpl::SelectBlobFile(std::shared_ptr* blob_file) { + assert(blob_file != nullptr); { ReadLock rl(&mutex_); if (open_non_ttl_file_ != nullptr) { - return open_non_ttl_file_; + *blob_file = open_non_ttl_file_; + return Status::OK(); } } // CHECK again WriteLock wl(&mutex_); if (open_non_ttl_file_ != nullptr) { - return open_non_ttl_file_; + *blob_file = open_non_ttl_file_; + return Status::OK(); } - std::shared_ptr bfile = NewBlobFile("SelectBlobFile"); - assert(bfile); + *blob_file = NewBlobFile("SelectBlobFile"); + assert(*blob_file != nullptr); // file not visible, hence no lock - std::shared_ptr writer = CheckOrCreateWriterLocked(bfile); - if (!writer) { + std::shared_ptr writer; + Status s = CheckOrCreateWriterLocked(*blob_file, &writer); + if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, - "Failed to get writer from blob file: %s", - bfile->PathName().c_str()); - return nullptr; + "Failed to get writer from blob file: %s, error: %s", + (*blob_file)->PathName().c_str(), s.ToString().c_str()); + return s; } - bfile->file_size_ = BlobLogHeader::kSize; - bfile->header_.compression = bdb_options_.compression; - bfile->header_.has_ttl = false; - bfile->header_.column_family_id = + (*blob_file)->file_size_ = BlobLogHeader::kSize; + (*blob_file)->header_.compression = bdb_options_.compression; + (*blob_file)->header_.has_ttl = false; + (*blob_file)->header_.column_family_id = reinterpret_cast(DefaultColumnFamily())->GetID(); - bfile->header_valid_ = true; - bfile->SetColumnFamilyId(bfile->header_.column_family_id); - bfile->SetHasTTL(false); - bfile->SetCompression(bdb_options_.compression); + (*blob_file)->header_valid_ = true; + (*blob_file)->SetColumnFamilyId((*blob_file)->header_.column_family_id); + (*blob_file)->SetHasTTL(false); + (*blob_file)->SetCompression(bdb_options_.compression); - Status s = writer->WriteHeader(bfile->header_); 
+ s = writer->WriteHeader((*blob_file)->header_); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "Failed to write header to new blob file: %s" " status: '%s'", - bfile->PathName().c_str(), s.ToString().c_str()); - return nullptr; + (*blob_file)->PathName().c_str(), s.ToString().c_str()); + return s; } - blob_files_.insert(std::make_pair(bfile->BlobFileNumber(), bfile)); - open_non_ttl_file_ = bfile; + blob_files_.insert( + std::make_pair((*blob_file)->BlobFileNumber(), *blob_file)); + open_non_ttl_file_ = *blob_file; total_blob_size_ += BlobLogHeader::kSize; - return bfile; + return s; } -std::shared_ptr BlobDBImpl::SelectBlobFileTTL(uint64_t expiration) { +Status BlobDBImpl::SelectBlobFileTTL(uint64_t expiration, + std::shared_ptr* blob_file) { + assert(blob_file != nullptr); assert(expiration != kNoExpiration); uint64_t epoch_read = 0; - std::shared_ptr bfile; { ReadLock rl(&mutex_); - bfile = FindBlobFileLocked(expiration); + *blob_file = FindBlobFileLocked(expiration); epoch_read = epoch_of_.load(); } - if (bfile) { - assert(!bfile->Immutable()); - return bfile; + if (*blob_file != nullptr) { + assert(!(*blob_file)->Immutable()); + return Status::OK(); } uint64_t exp_low = @@ -487,61 +497,66 @@ std::shared_ptr BlobDBImpl::SelectBlobFileTTL(uint64_t expiration) { uint64_t exp_high = exp_low + bdb_options_.ttl_range_secs; ExpirationRange expiration_range = std::make_pair(exp_low, exp_high); - bfile = NewBlobFile("SelectBlobFileTTL"); - assert(bfile); + *blob_file = NewBlobFile("SelectBlobFileTTL"); + assert(*blob_file != nullptr); ROCKS_LOG_INFO(db_options_.info_log, "New blob file TTL range: %s %d %d", - bfile->PathName().c_str(), exp_low, exp_high); + (*blob_file)->PathName().c_str(), exp_low, exp_high); LogFlush(db_options_.info_log); // we don't need to take lock as no other thread is seeing bfile yet - std::shared_ptr writer = CheckOrCreateWriterLocked(bfile); - if (!writer) { - ROCKS_LOG_ERROR(db_options_.info_log, - "Failed to get writer from blob file with TTL: %s", - bfile->PathName().c_str()); - return nullptr; + std::shared_ptr writer; + Status s = CheckOrCreateWriterLocked(*blob_file, &writer); + if (!s.ok()) { + ROCKS_LOG_ERROR( + db_options_.info_log, + "Failed to get writer from blob file with TTL: %s, error: %s", + (*blob_file)->PathName().c_str(), s.ToString().c_str()); + return s; } - bfile->header_.expiration_range = expiration_range; - bfile->header_.compression = bdb_options_.compression; - bfile->header_.has_ttl = true; - bfile->header_.column_family_id = + (*blob_file)->header_.expiration_range = expiration_range; + (*blob_file)->header_.compression = bdb_options_.compression; + (*blob_file)->header_.has_ttl = true; + (*blob_file)->header_.column_family_id = reinterpret_cast(DefaultColumnFamily())->GetID(); - ; - bfile->header_valid_ = true; - bfile->SetColumnFamilyId(bfile->header_.column_family_id); - bfile->SetHasTTL(true); - bfile->SetCompression(bdb_options_.compression); - bfile->file_size_ = BlobLogHeader::kSize; + (*blob_file)->header_valid_ = true; + (*blob_file)->SetColumnFamilyId((*blob_file)->header_.column_family_id); + (*blob_file)->SetHasTTL(true); + (*blob_file)->SetCompression(bdb_options_.compression); + (*blob_file)->file_size_ = BlobLogHeader::kSize; // set the first value of the range, since that is // concrete at this time. 
also necessary to add to open_ttl_files_ - bfile->expiration_range_ = expiration_range; + (*blob_file)->expiration_range_ = expiration_range; WriteLock wl(&mutex_); // in case the epoch has shifted in the interim, then check // check condition again - should be rare. if (epoch_of_.load() != epoch_read) { - auto bfile2 = FindBlobFileLocked(expiration); - if (bfile2) return bfile2; + std::shared_ptr blob_file2 = FindBlobFileLocked(expiration); + if (blob_file2 != nullptr) { + *blob_file = std::move(blob_file2); + return Status::OK(); + } } - Status s = writer->WriteHeader(bfile->header_); + s = writer->WriteHeader((*blob_file)->header_); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "Failed to write header to new blob file: %s" " status: '%s'", - bfile->PathName().c_str(), s.ToString().c_str()); - return nullptr; + (*blob_file)->PathName().c_str(), s.ToString().c_str()); + return s; } - blob_files_.insert(std::make_pair(bfile->BlobFileNumber(), bfile)); - open_ttl_files_.insert(bfile); + blob_files_.insert( + std::make_pair((*blob_file)->BlobFileNumber(), *blob_file)); + open_ttl_files_.insert(*blob_file); total_blob_size_ += BlobLogHeader::kSize; epoch_of_++; - return bfile; + return s; } class BlobDBImpl::BlobInserter : public WriteBatch::Handler { @@ -695,36 +710,41 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& /*options*/, return s; } - std::shared_ptr bfile = (expiration != kNoExpiration) - ? SelectBlobFileTTL(expiration) - : SelectBlobFile(); - assert(bfile != nullptr); - assert(bfile->compression() == bdb_options_.compression); - - s = AppendBlob(bfile, headerbuf, key, value_compressed, expiration, - &index_entry); - if (expiration == kNoExpiration) { - RecordTick(statistics_, BLOB_DB_WRITE_BLOB); + std::shared_ptr blob_file; + if (expiration != kNoExpiration) { + s = SelectBlobFileTTL(expiration, &blob_file); } else { - RecordTick(statistics_, BLOB_DB_WRITE_BLOB_TTL); + s = SelectBlobFile(&blob_file); + } + if (s.ok()) { + assert(blob_file != nullptr); + assert(blob_file->compression() == bdb_options_.compression); + s = AppendBlob(blob_file, headerbuf, key, value_compressed, expiration, + &index_entry); } - if (s.ok()) { if (expiration != kNoExpiration) { - bfile->ExtendExpirationRange(expiration); + blob_file->ExtendExpirationRange(expiration); } - s = CloseBlobFileIfNeeded(bfile); - if (s.ok()) { - s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key, - index_entry); + s = CloseBlobFileIfNeeded(blob_file); + } + if (s.ok()) { + s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key, + index_entry); + } + if (s.ok()) { + if (expiration == kNoExpiration) { + RecordTick(statistics_, BLOB_DB_WRITE_BLOB); + } else { + RecordTick(statistics_, BLOB_DB_WRITE_BLOB_TTL); } } else { ROCKS_LOG_ERROR(db_options_.info_log, "Failed to append blob to FILE: %s: KEY: %s VALSZ: %d" " status: '%s' blob_file: '%s'", - bfile->PathName().c_str(), key.ToString().c_str(), + blob_file->PathName().c_str(), key.ToString().c_str(), value.size(), s.ToString().c_str(), - bfile->DumpState().c_str()); + blob_file->DumpState().c_str()); } } @@ -867,9 +887,10 @@ Status BlobDBImpl::AppendBlob(const std::shared_ptr& bfile, uint64_t key_offset = 0; { WriteLock lockbfile_w(&bfile->mutex_); - std::shared_ptr writer = CheckOrCreateWriterLocked(bfile); - if (!writer) { - return Status::IOError("Failed to create blob writer"); + std::shared_ptr writer; + s = CheckOrCreateWriterLocked(bfile, &writer); + if (!s.ok()) { + return s; } // write the blob to the blob log. 
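
The BlobDB hunks above change SelectBlobFile, SelectBlobFileTTL, and CheckOrCreateWriterLocked from returning a possibly-null std::shared_ptr to returning a Status with the blob file (or writer) handed back through an out-parameter, so a failure to open a blob file or write its header reaches the Put path as a concrete error instead of a bare nullptr. A minimal sketch of that error-propagation pattern, using hypothetical names (BlobLog, OpenBlobLog, PutValue) rather than the real BlobDBImpl members:

    #include <cassert>
    #include <memory>
    #include <string>
    #include "rocksdb/status.h"

    using rocksdb::Status;

    struct BlobLog {                 // stand-in for the real BlobFile class
      std::string path;
    };

    // The out-parameter is populated only on success; every failure carries
    // a concrete Status instead of being collapsed into a null pointer.
    Status OpenBlobLog(const std::string& path,
                       std::shared_ptr<BlobLog>* blob_log) {
      assert(blob_log != nullptr);
      if (path.empty()) {
        return Status::IOError("cannot open blob log with empty path");
      }
      *blob_log = std::make_shared<BlobLog>();
      (*blob_log)->path = path;
      return Status::OK();
    }

    Status PutValue(const std::string& path) {
      std::shared_ptr<BlobLog> blob_log;
      Status s = OpenBlobLog(path, &blob_log);
      if (!s.ok()) {
        return s;  // caller can check IsIOError() instead of crashing on null
      }
      // ... append the value to *blob_log ...
      return s;
    }

This is the same shape the new PutIOError test relies on: once the fault-injection env marks the filesystem inactive, Put() is expected to surface IsIOError() rather than trip an assertion on a missing writer.
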
@@ -1459,8 +1480,7 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr& bfptr, return s; } - auto* cfh = - db_impl_->GetColumnFamilyHandleUnlocked(bfptr->column_family_id()); + auto cfh = db_impl_->DefaultColumnFamily(); auto* cfd = reinterpret_cast(cfh)->cfd(); auto column_family_id = cfd->GetID(); bool has_ttl = header.has_ttl; @@ -1575,7 +1595,13 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr& bfptr, reason += bfptr->PathName(); newfile = NewBlobFile(reason); - new_writer = CheckOrCreateWriterLocked(newfile); + s = CheckOrCreateWriterLocked(newfile, &new_writer); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to open file %s for writer, error: %s", + newfile->PathName().c_str(), s.ToString().c_str()); + break; + } // Can't use header beyond this point newfile->header_ = std::move(header); newfile->header_valid_ = true; @@ -1720,7 +1746,8 @@ std::pair BlobDBImpl::DeleteObsoleteFiles(bool aborted) { bfile->PathName().c_str()); blob_files_.erase(bfile->BlobFileNumber()); - Status s = env_->DeleteFile(bfile->PathName()); + Status s = DeleteDBFile(&(db_impl_->immutable_db_options()), + bfile->PathName(), blob_dir_, true); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "File failed to be deleted as obsolete %s", @@ -1810,7 +1837,7 @@ Status DestroyBlobDB(const std::string& dbname, const Options& options, uint64_t number; FileType type; if (ParseFileName(f, &number, &type) && type == kBlobFile) { - Status del = env->DeleteFile(blobdir + "/" + f); + Status del = DeleteDBFile(&soptions, blobdir + "/" + f, blobdir, true); if (status.ok() && !del.ok()) { status = del; } diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h index 4296d5c6abb..8d5148def61 100644 --- a/utilities/blob_db/blob_db_impl.h +++ b/utilities/blob_db/blob_db_impl.h @@ -255,10 +255,11 @@ class BlobDBImpl : public BlobDB { // find an existing blob log file based on the expiration unix epoch // if such a file does not exist, return nullptr - std::shared_ptr SelectBlobFileTTL(uint64_t expiration); + Status SelectBlobFileTTL(uint64_t expiration, + std::shared_ptr* blob_file); // find an existing blob log file to append the value to - std::shared_ptr SelectBlobFile(); + Status SelectBlobFile(std::shared_ptr* blob_file); std::shared_ptr FindBlobFileLocked(uint64_t expiration) const; @@ -309,8 +310,8 @@ class BlobDBImpl : public BlobDB { // returns a Writer object for the file. If writer is not // already present, creates one. Needs Write Mutex to be held - std::shared_ptr CheckOrCreateWriterLocked( - const std::shared_ptr& bfile); + Status CheckOrCreateWriterLocked(const std::shared_ptr& blob_file, + std::shared_ptr* writer); // Iterate through keys and values on Blob and write into // separate file the remaining blobs and delete/update pointers @@ -347,7 +348,8 @@ class BlobDBImpl : public BlobDB { ColumnFamilyOptions cf_options_; EnvOptions env_options_; - // Raw pointer of statistic. db_options_ has a shared_ptr to hold ownership. + // Raw pointer of statistic. db_options_ has a std::shared_ptr to hold + // ownership. 
Statistics* statistics_; // by default this is "blob_dir" under dbname_ diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc index cf8f1217aa0..d9cca123e96 100644 --- a/utilities/blob_db/blob_db_test.cc +++ b/utilities/blob_db/blob_db_test.cc @@ -18,6 +18,7 @@ #include "util/cast_util.h" #include "util/fault_injection_test_env.h" #include "util/random.h" +#include "util/sst_file_manager_impl.h" #include "util/string_util.h" #include "util/sync_point.h" #include "util/testharness.h" @@ -374,6 +375,19 @@ TEST_F(BlobDBTest, GetIOError) { fault_injection_env_->SetFilesystemActive(true); } +TEST_F(BlobDBTest, PutIOError) { + Options options; + options.env = fault_injection_env_.get(); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; // Make sure value write to blob file + bdb_options.disable_background_tasks = true; + Open(bdb_options, options); + fault_injection_env_->SetFilesystemActive(false, Status::IOError()); + ASSERT_TRUE(Put("foo", "v1").IsIOError()); + fault_injection_env_->SetFilesystemActive(true, Status::IOError()); + ASSERT_OK(Put("bar", "v1")); +} + TEST_F(BlobDBTest, WriteBatch) { Random rnd(301); BlobDBOptions bdb_options; @@ -749,6 +763,52 @@ TEST_F(BlobDBTest, ReadWhileGC) { } } +TEST_F(BlobDBTest, SstFileManager) { + // run the same test for Get(), MultiGet() and Iterator each. + std::shared_ptr sst_file_manager( + NewSstFileManager(mock_env_.get())); + sst_file_manager->SetDeleteRateBytesPerSecond(1); + SstFileManagerImpl *sfm = + static_cast(sst_file_manager.get()); + + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + Options db_options; + + int files_deleted_directly = 0; + int files_scheduled_to_delete = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleFileDeletion", + [&](void * /*arg*/) { files_scheduled_to_delete++; }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::DeleteFile", + [&](void * /*arg*/) { files_deleted_directly++; }); + SyncPoint::GetInstance()->EnableProcessing(); + db_options.sst_file_manager = sst_file_manager; + + Open(bdb_options, db_options); + + // Create one obselete file and clean it. + blob_db_->Put(WriteOptions(), "foo", "bar"); + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + std::shared_ptr bfile = blob_files[0]; + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile)); + GCStats gc_stats; + ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(bfile, &gc_stats)); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + + // Even if SSTFileManager is not set, DB is creating a dummy one. + ASSERT_EQ(1, files_scheduled_to_delete); + ASSERT_EQ(0, files_deleted_directly); + Destroy(); + // Make sure that DestroyBlobDB() also goes through delete scheduler. 
+ ASSERT_GE(2, files_scheduled_to_delete); + ASSERT_EQ(0, files_deleted_directly); + SyncPoint::GetInstance()->DisableProcessing(); + sfm->WaitForEmptyTrash(); +} + TEST_F(BlobDBTest, SnapshotAndGarbageCollection) { BlobDBOptions bdb_options; bdb_options.min_blob_size = 0; diff --git a/utilities/blob_db/blob_dump_tool.h b/utilities/blob_db/blob_dump_tool.h index e91feffa794..ff4672fd3f3 100644 --- a/utilities/blob_db/blob_dump_tool.h +++ b/utilities/blob_db/blob_dump_tool.h @@ -33,7 +33,7 @@ class BlobDumpTool { private: std::unique_ptr reader_; - std::unique_ptr buffer_; + std::unique_ptr buffer_; size_t buffer_size_; Status Read(uint64_t offset, size_t size, Slice* result); diff --git a/utilities/blob_db/blob_log_format.h b/utilities/blob_db/blob_log_format.h index 3e1b686aa12..fcc042f06db 100644 --- a/utilities/blob_db/blob_log_format.h +++ b/utilities/blob_db/blob_log_format.h @@ -10,7 +10,9 @@ #ifndef ROCKSDB_LITE #include +#include #include + #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" @@ -106,8 +108,8 @@ struct BlobLogRecord { uint32_t blob_crc = 0; Slice key; Slice value; - std::string key_buf; - std::string value_buf; + std::unique_ptr key_buf; + std::unique_ptr value_buf; uint64_t record_size() const { return kHeaderSize + key_size + value_size; } diff --git a/utilities/blob_db/blob_log_reader.cc b/utilities/blob_db/blob_log_reader.cc index 4996d987b63..0f098f2d45c 100644 --- a/utilities/blob_db/blob_log_reader.cc +++ b/utilities/blob_db/blob_log_reader.cc @@ -24,10 +24,9 @@ Reader::Reader(unique_ptr&& file_reader, Env* env, buffer_(), next_byte_(0) {} -Status Reader::ReadSlice(uint64_t size, Slice* slice, std::string* buf) { +Status Reader::ReadSlice(uint64_t size, Slice* slice, char* buf) { StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); - buf->reserve(static_cast(size)); - Status s = file_->Read(next_byte_, static_cast(size), slice, &(*buf)[0]); + Status s = file_->Read(next_byte_, static_cast(size), slice, buf); next_byte_ += size; if (!s.ok()) { return s; @@ -42,7 +41,7 @@ Status Reader::ReadSlice(uint64_t size, Slice* slice, std::string* buf) { Status Reader::ReadHeader(BlobLogHeader* header) { assert(file_.get() != nullptr); assert(next_byte_ == 0); - Status s = ReadSlice(BlobLogHeader::kSize, &buffer_, &backing_store_); + Status s = ReadSlice(BlobLogHeader::kSize, &buffer_, header_buf_); if (!s.ok()) { return s; } @@ -56,7 +55,7 @@ Status Reader::ReadHeader(BlobLogHeader* header) { Status Reader::ReadRecord(BlobLogRecord* record, ReadLevel level, uint64_t* blob_offset) { - Status s = ReadSlice(BlobLogRecord::kHeaderSize, &buffer_, &backing_store_); + Status s = ReadSlice(BlobLogRecord::kHeaderSize, &buffer_, header_buf_); if (!s.ok()) { return s; } @@ -80,14 +79,18 @@ Status Reader::ReadRecord(BlobLogRecord* record, ReadLevel level, break; case kReadHeaderKey: - s = ReadSlice(record->key_size, &record->key, &record->key_buf); + record->key_buf.reset(new char[record->key_size]); + s = ReadSlice(record->key_size, &record->key, record->key_buf.get()); next_byte_ += record->value_size; break; case kReadHeaderKeyBlob: - s = ReadSlice(record->key_size, &record->key, &record->key_buf); + record->key_buf.reset(new char[record->key_size]); + s = ReadSlice(record->key_size, &record->key, record->key_buf.get()); if (s.ok()) { - s = ReadSlice(record->value_size, &record->value, &record->value_buf); + record->value_buf.reset(new char[record->value_size]); + s = ReadSlice(record->value_size, &record->value, + 
record->value_buf.get()); } if (s.ok()) { s = record->CheckBlobCRC(); diff --git a/utilities/blob_db/blob_log_reader.h b/utilities/blob_db/blob_log_reader.h index 4b780decd52..45e2e955145 100644 --- a/utilities/blob_db/blob_log_reader.h +++ b/utilities/blob_db/blob_log_reader.h @@ -60,19 +60,19 @@ class Reader { Status ReadRecord(BlobLogRecord* record, ReadLevel level = kReadHeader, uint64_t* blob_offset = nullptr); - Status ReadSlice(uint64_t size, Slice* slice, std::string* buf); - void ResetNextByte() { next_byte_ = 0; } uint64_t GetNextByte() const { return next_byte_; } private: + Status ReadSlice(uint64_t size, Slice* slice, char* buf); + const std::unique_ptr file_; Env* env_; Statistics* statistics_; - std::string backing_store_; Slice buffer_; + char header_buf_[BlobLogRecord::kHeaderSize]; // which byte to read next. For asserting proper usage uint64_t next_byte_; diff --git a/utilities/cassandra/cassandra_functional_test.cc b/utilities/cassandra/cassandra_functional_test.cc index 3e612b3ad6a..653e6da72b8 100644 --- a/utilities/cassandra/cassandra_functional_test.cc +++ b/utilities/cassandra/cassandra_functional_test.cc @@ -101,7 +101,7 @@ class TestCompactionFilterFactory : public CompactionFilterFactory { virtual std::unique_ptr CreateCompactionFilter( const CompactionFilter::Context& /*context*/) override { - return unique_ptr(new CassandraCompactionFilter( + return std::unique_ptr(new CassandraCompactionFilter( purge_ttl_on_expiration_, gc_grace_period_in_seconds_)); } diff --git a/utilities/cassandra/format.cc b/utilities/cassandra/format.cc index 4a22658de15..42cd7206b61 100644 --- a/utilities/cassandra/format.cc +++ b/utilities/cassandra/format.cc @@ -266,7 +266,7 @@ RowValue RowValue::ConvertExpiredColumnsToTombstones(bool* changed) const { std::static_pointer_cast(column); if(expiring_column->Expired()) { - shared_ptr tombstone = expiring_column->ToTombstone(); + std::shared_ptr tombstone = expiring_column->ToTombstone(); new_columns.push_back(tombstone); *changed = true; continue; diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index 48f9200fb64..9863ac1d564 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -133,7 +133,7 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir, s = db_->GetEnv()->RenameFile(full_private_path, checkpoint_dir); } if (s.ok()) { - unique_ptr checkpoint_directory; + std::unique_ptr checkpoint_directory; db_->GetEnv()->NewDirectory(checkpoint_dir, &checkpoint_directory); if (checkpoint_directory != nullptr) { s = checkpoint_directory->Fsync(); diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index 62c78faa8b4..b8436ccf590 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -164,6 +164,16 @@ class CheckpointTest : public testing::Test { return DB::OpenForReadOnly(options, dbname_, &db_); } + Status ReadOnlyReopenWithColumnFamilies(const std::vector& cfs, + const Options& options) { + std::vector column_families; + for (const auto& cf : cfs) { + column_families.emplace_back(cf, options); + } + return DB::OpenForReadOnly(options, dbname_, column_families, &handles_, + &db_); + } + Status TryReopen(const Options& options) { Close(); last_options_ = options; @@ -612,6 +622,69 @@ TEST_F(CheckpointTest, CheckpointWithUnsyncedDataDropped) { db_ = nullptr; } +TEST_F(CheckpointTest, CheckpointReadOnlyDB) { + ASSERT_OK(Put("foo", 
"foo_value")); + ASSERT_OK(Flush()); + Close(); + Options options = CurrentOptions(); + ASSERT_OK(ReadOnlyReopen(options)); + Checkpoint* checkpoint = nullptr; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + delete checkpoint; + checkpoint = nullptr; + Close(); + DB* snapshot_db = nullptr; + ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db)); + ReadOptions read_opts; + std::string get_result; + ASSERT_OK(snapshot_db->Get(read_opts, "foo", &get_result)); + ASSERT_EQ("foo_value", get_result); + delete snapshot_db; +} + +TEST_F(CheckpointTest, CheckpointReadOnlyDBWithMultipleColumnFamilies) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + for (int i = 0; i != 3; ++i) { + ASSERT_OK(Put(i, "foo", "foo_value")); + ASSERT_OK(Flush(i)); + } + Close(); + Status s = ReadOnlyReopenWithColumnFamilies( + {kDefaultColumnFamilyName, "pikachu", "eevee"}, options); + ASSERT_OK(s); + Checkpoint* checkpoint = nullptr; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + delete checkpoint; + checkpoint = nullptr; + Close(); + + std::vector column_families{ + {kDefaultColumnFamilyName, options}, + {"pikachu", options}, + {"eevee", options}}; + DB* snapshot_db = nullptr; + std::vector snapshot_handles; + s = DB::Open(options, snapshot_name_, column_families, &snapshot_handles, + &snapshot_db); + ASSERT_OK(s); + ReadOptions read_opts; + for (int i = 0; i != 3; ++i) { + std::string get_result; + s = snapshot_db->Get(read_opts, snapshot_handles[i], "foo", &get_result); + ASSERT_OK(s); + ASSERT_EQ("foo_value", get_result); + } + + for (auto snapshot_h : snapshot_handles) { + delete snapshot_h; + } + snapshot_handles.clear(); + delete snapshot_db; +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/utilities/column_aware_encoding_exp.cc b/utilities/column_aware_encoding_exp.cc index 988a59b3c77..c251c985ec6 100644 --- a/utilities/column_aware_encoding_exp.cc +++ b/utilities/column_aware_encoding_exp.cc @@ -88,7 +88,7 @@ class ColumnAwareEncodingExp { EnvOptions env_options; if (CompressionTypeSupported(compression_type)) { fprintf(stdout, "[%s]\n", FLAGS_compression_type.c_str()); - unique_ptr encoded_out_file; + std::unique_ptr encoded_out_file; std::unique_ptr env(NewMemEnv(Env::Default())); if (!FLAGS_encoded_file.empty()) { @@ -116,7 +116,7 @@ class ColumnAwareEncodingExp { uint64_t encode_time = sw.ElapsedNanosSafe(false /* reset */); fprintf(stdout, "Encode time: %" PRIu64 "\n", encode_time); if (decode) { - unique_ptr decoded_out_file; + std::unique_ptr decoded_out_file; if (!FLAGS_decoded_file.empty()) { env->NewWritableFile(FLAGS_decoded_file, &decoded_out_file, env_options); diff --git a/utilities/date_tiered/date_tiered_db_impl.cc b/utilities/date_tiered/date_tiered_db_impl.cc index 978bfb2e495..2574d379f2a 100644 --- a/utilities/date_tiered/date_tiered_db_impl.cc +++ b/utilities/date_tiered/date_tiered_db_impl.cc @@ -389,7 +389,7 @@ Iterator* DateTieredDBImpl::NewIterator(const ReadOptions& opts) { for (auto& item : handle_map_) { auto handle = item.second; builder.AddIterator(db_impl->NewInternalIterator( - arena, db_iter->GetRangeDelAggregator(), handle)); + arena, db_iter->GetRangeDelAggregator(), kMaxSequenceNumber, handle)); } auto internal_iter = builder.Finish(); db_iter->SetIterUnderDBIter(internal_iter); diff --git a/utilities/date_tiered/date_tiered_test.cc 
b/utilities/date_tiered/date_tiered_test.cc index 8e7fced58a0..35f15584e5a 100644 --- a/utilities/date_tiered/date_tiered_test.cc +++ b/utilities/date_tiered/date_tiered_test.cc @@ -13,6 +13,7 @@ #include "rocksdb/compaction_filter.h" #include "rocksdb/utilities/date_tiered_db.h" +#include "port/port.h" #include "util/logging.h" #include "util/string_util.h" #include "util/testharness.h" @@ -131,7 +132,7 @@ class DateTieredTest : public testing::Test { Options options_; KVMap::iterator kv_it_; const std::string kNewValue_ = "new_value"; - unique_ptr test_comp_filter_; + std::unique_ptr test_comp_filter_; }; // Puts a set of values and checks its presence using Get during ttl diff --git a/utilities/debug.cc b/utilities/debug.cc index e0c5f5566eb..72fcbf0f54d 100644 --- a/utilities/debug.cc +++ b/utilities/debug.cc @@ -19,9 +19,11 @@ Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key, DBImpl* idb = static_cast(db->GetRootDB()); auto icmp = InternalKeyComparator(idb->GetOptions().comparator); - RangeDelAggregator range_del_agg(icmp, {} /* snapshots */); + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); Arena arena; - ScopedArenaIterator iter(idb->NewInternalIterator(&arena, &range_del_agg)); + ScopedArenaIterator iter( + idb->NewInternalIterator(&arena, &range_del_agg, kMaxSequenceNumber)); if (!begin_key.empty()) { InternalKey ikey; diff --git a/utilities/document/json_document_test.cc b/utilities/document/json_document_test.cc index 977905b9156..9d79c41cf5d 100644 --- a/utilities/document/json_document_test.cc +++ b/utilities/document/json_document_test.cc @@ -249,21 +249,23 @@ TEST_F(JSONDocumentTest, OperatorEqualsTest) { ASSERT_TRUE(JSONDocument(static_cast(15)) == JSONDocument(static_cast(15))); - unique_ptr arrayWithInt8Doc(JSONDocument::ParseJSON("[8]")); + std::unique_ptr arrayWithInt8Doc( + JSONDocument::ParseJSON("[8]")); ASSERT_TRUE(arrayWithInt8Doc != nullptr); ASSERT_TRUE(arrayWithInt8Doc->IsArray()); ASSERT_TRUE((*arrayWithInt8Doc)[0].IsInt64()); ASSERT_TRUE((*arrayWithInt8Doc)[0] == JSONDocument(static_cast(8))); - unique_ptr arrayWithInt16Doc(JSONDocument::ParseJSON("[512]")); + std::unique_ptr arrayWithInt16Doc( + JSONDocument::ParseJSON("[512]")); ASSERT_TRUE(arrayWithInt16Doc != nullptr); ASSERT_TRUE(arrayWithInt16Doc->IsArray()); ASSERT_TRUE((*arrayWithInt16Doc)[0].IsInt64()); ASSERT_TRUE((*arrayWithInt16Doc)[0] == JSONDocument(static_cast(512))); - unique_ptr arrayWithInt32Doc( - JSONDocument::ParseJSON("[1000000]")); + std::unique_ptr arrayWithInt32Doc( + JSONDocument::ParseJSON("[1000000]")); ASSERT_TRUE(arrayWithInt32Doc != nullptr); ASSERT_TRUE(arrayWithInt32Doc->IsArray()); ASSERT_TRUE((*arrayWithInt32Doc)[0].IsInt64()); @@ -277,8 +279,8 @@ TEST_F(JSONDocumentTest, OperatorEqualsTest) { } TEST_F(JSONDocumentTest, JSONDocumentBuilderTest) { - unique_ptr parsedArray( - JSONDocument::ParseJSON("[1, [123, \"a\", \"b\"], {\"b\":\"c\"}]")); + std::unique_ptr parsedArray( + JSONDocument::ParseJSON("[1, [123, \"a\", \"b\"], {\"b\":\"c\"}]")); ASSERT_TRUE(parsedArray != nullptr); JSONDocumentBuilder builder; diff --git a/utilities/env_librados_test.cc b/utilities/env_librados_test.cc index 7d9b252ea41..fb10224e7d7 100644 --- a/utilities/env_librados_test.cc +++ b/utilities/env_librados_test.cc @@ -108,7 +108,7 @@ class EnvLibradosTest : public testing::Test { TEST_F(EnvLibradosTest, Basics) { uint64_t file_size; - unique_ptr writable_file; + std::unique_ptr writable_file; std::vector children; 
ASSERT_OK(env_->CreateDir("/dir")); @@ -150,8 +150,8 @@ TEST_F(EnvLibradosTest, Basics) { ASSERT_EQ(3U, file_size); // Check that opening non-existent file fails. - unique_ptr seq_file; - unique_ptr rand_file; + std::unique_ptr seq_file; + std::unique_ptr rand_file; ASSERT_TRUE( !env_->NewSequentialFile("/dir/non_existent", &seq_file, soptions_).ok()); ASSERT_TRUE(!seq_file); @@ -169,9 +169,9 @@ TEST_F(EnvLibradosTest, Basics) { } TEST_F(EnvLibradosTest, ReadWrite) { - unique_ptr writable_file; - unique_ptr seq_file; - unique_ptr rand_file; + std::unique_ptr writable_file; + std::unique_ptr seq_file; + std::unique_ptr rand_file; Slice result; char scratch[100]; @@ -210,7 +210,7 @@ TEST_F(EnvLibradosTest, ReadWrite) { TEST_F(EnvLibradosTest, Locks) { FileLock* lock = nullptr; - unique_ptr writable_file; + std::unique_ptr writable_file; ASSERT_OK(env_->CreateDir("/dir")); @@ -229,7 +229,7 @@ TEST_F(EnvLibradosTest, Misc) { ASSERT_OK(env_->GetTestDirectory(&test_dir)); ASSERT_TRUE(!test_dir.empty()); - unique_ptr writable_file; + std::unique_ptr writable_file; ASSERT_TRUE(!env_->NewWritableFile("/a/b", &writable_file, soptions_).ok()); ASSERT_OK(env_->NewWritableFile("/a", &writable_file, soptions_)); @@ -249,14 +249,14 @@ TEST_F(EnvLibradosTest, LargeWrite) { write_data.append(1, 'h'); } - unique_ptr writable_file; + std::unique_ptr writable_file; ASSERT_OK(env_->CreateDir("/dir")); ASSERT_OK(env_->NewWritableFile("/dir/g", &writable_file, soptions_)); ASSERT_OK(writable_file->Append("foo")); ASSERT_OK(writable_file->Append(write_data)); writable_file.reset(); - unique_ptr seq_file; + std::unique_ptr seq_file; Slice result; ASSERT_OK(env_->NewSequentialFile("/dir/g", &seq_file, soptions_)); ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo". @@ -282,7 +282,7 @@ TEST_F(EnvLibradosTest, FrequentlySmallWrite) { write_data.append(1, 'h'); } - unique_ptr writable_file; + std::unique_ptr writable_file; ASSERT_OK(env_->CreateDir("/dir")); ASSERT_OK(env_->NewWritableFile("/dir/g", &writable_file, soptions_)); ASSERT_OK(writable_file->Append("foo")); @@ -292,7 +292,7 @@ TEST_F(EnvLibradosTest, FrequentlySmallWrite) { } writable_file.reset(); - unique_ptr seq_file; + std::unique_ptr seq_file; Slice result; ASSERT_OK(env_->NewSequentialFile("/dir/g", &seq_file, soptions_)); ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo". @@ -317,7 +317,7 @@ TEST_F(EnvLibradosTest, Truncate) { write_data.append(1, 'h'); } - unique_ptr writable_file; + std::unique_ptr writable_file; ASSERT_OK(env_->CreateDir("/dir")); ASSERT_OK(env_->NewWritableFile("/dir/g", &writable_file, soptions_)); ASSERT_OK(writable_file->Append(write_data)); @@ -801,7 +801,7 @@ class EnvLibradosMutipoolTest : public testing::Test { TEST_F(EnvLibradosMutipoolTest, Basics) { uint64_t file_size; - unique_ptr writable_file; + std::unique_ptr writable_file; std::vector children; std::vector v = {"/tmp/dir1", "/tmp/dir2", "/tmp/dir3", "/tmp/dir4", "dir"}; @@ -850,8 +850,8 @@ TEST_F(EnvLibradosMutipoolTest, Basics) { ASSERT_EQ(3U, file_size); // Check that opening non-existent file fails. - unique_ptr seq_file; - unique_ptr rand_file; + std::unique_ptr seq_file; + std::unique_ptr rand_file; ASSERT_TRUE( !env_->NewSequentialFile(dir_non_existent.c_str(), &seq_file, soptions_).ok()); ASSERT_TRUE(!seq_file); diff --git a/utilities/env_mirror.cc b/utilities/env_mirror.cc index d14de97d00d..327d8e16228 100644 --- a/utilities/env_mirror.cc +++ b/utilities/env_mirror.cc @@ -16,7 +16,7 @@ namespace rocksdb { // Env's. 
This is useful for debugging purposes. class SequentialFileMirror : public SequentialFile { public: - unique_ptr a_, b_; + std::unique_ptr a_, b_; std::string fname; explicit SequentialFileMirror(std::string f) : fname(f) {} @@ -60,7 +60,7 @@ class SequentialFileMirror : public SequentialFile { class RandomAccessFileMirror : public RandomAccessFile { public: - unique_ptr a_, b_; + std::unique_ptr a_, b_; std::string fname; explicit RandomAccessFileMirror(std::string f) : fname(f) {} @@ -95,7 +95,7 @@ class RandomAccessFileMirror : public RandomAccessFile { class WritableFileMirror : public WritableFile { public: - unique_ptr a_, b_; + std::unique_ptr a_, b_; std::string fname; explicit WritableFileMirror(std::string f) : fname(f) {} @@ -191,7 +191,7 @@ class WritableFileMirror : public WritableFile { }; Status EnvMirror::NewSequentialFile(const std::string& f, - unique_ptr* r, + std::unique_ptr* r, const EnvOptions& options) { if (f.find("/proc/") == 0) { return a_->NewSequentialFile(f, r, options); @@ -208,7 +208,7 @@ Status EnvMirror::NewSequentialFile(const std::string& f, } Status EnvMirror::NewRandomAccessFile(const std::string& f, - unique_ptr* r, + std::unique_ptr* r, const EnvOptions& options) { if (f.find("/proc/") == 0) { return a_->NewRandomAccessFile(f, r, options); @@ -225,7 +225,7 @@ Status EnvMirror::NewRandomAccessFile(const std::string& f, } Status EnvMirror::NewWritableFile(const std::string& f, - unique_ptr* r, + std::unique_ptr* r, const EnvOptions& options) { if (f.find("/proc/") == 0) return a_->NewWritableFile(f, r, options); WritableFileMirror* mf = new WritableFileMirror(f); @@ -241,7 +241,7 @@ Status EnvMirror::NewWritableFile(const std::string& f, Status EnvMirror::ReuseWritableFile(const std::string& fname, const std::string& old_fname, - unique_ptr* r, + std::unique_ptr* r, const EnvOptions& options) { if (fname.find("/proc/") == 0) return a_->ReuseWritableFile(fname, old_fname, r, options); diff --git a/utilities/env_mirror_test.cc b/utilities/env_mirror_test.cc index 2bf8ec8583a..812595ca1ee 100644 --- a/utilities/env_mirror_test.cc +++ b/utilities/env_mirror_test.cc @@ -32,7 +32,7 @@ class EnvMirrorTest : public testing::Test { TEST_F(EnvMirrorTest, Basics) { uint64_t file_size; - unique_ptr writable_file; + std::unique_ptr writable_file; std::vector children; ASSERT_OK(env_->CreateDir("/dir")); @@ -91,8 +91,8 @@ TEST_F(EnvMirrorTest, Basics) { ASSERT_EQ(3U, file_size); // Check that opening non-existent file fails. - unique_ptr seq_file; - unique_ptr rand_file; + std::unique_ptr seq_file; + std::unique_ptr rand_file; ASSERT_TRUE( !env_->NewSequentialFile("/dir/non_existent", &seq_file, soptions_).ok()); ASSERT_TRUE(!seq_file); @@ -110,9 +110,9 @@ TEST_F(EnvMirrorTest, Basics) { } TEST_F(EnvMirrorTest, ReadWrite) { - unique_ptr writable_file; - unique_ptr seq_file; - unique_ptr rand_file; + std::unique_ptr writable_file; + std::unique_ptr seq_file; + std::unique_ptr rand_file; Slice result; char scratch[100]; @@ -162,7 +162,7 @@ TEST_F(EnvMirrorTest, Misc) { ASSERT_OK(env_->GetTestDirectory(&test_dir)); ASSERT_TRUE(!test_dir.empty()); - unique_ptr writable_file; + std::unique_ptr writable_file; ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_)); // These are no-ops, but we test they return success. 
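
Every Env subclass touched in this patch (TestEnv, EnvMirror, TimedEnv) now spells its smart-pointer parameters as std::unique_ptr, matching the signatures declared on rocksdb::Env, and the backup test counts file opens by intercepting those entry points. A small wrapper written against the same signatures, under the hypothetical name CountingEnv, illustrates the pattern; it is a sketch, not part of the patch:

    #include <atomic>
    #include <memory>
    #include <string>
    #include "rocksdb/env.h"

    // Delegates to the wrapped Env and counts successful sequential-file
    // opens, similar in spirit to the instrumented TestEnv above.
    class CountingEnv : public rocksdb::EnvWrapper {
     public:
      explicit CountingEnv(rocksdb::Env* base) : EnvWrapper(base) {}

      rocksdb::Status NewSequentialFile(
          const std::string& fname,
          std::unique_ptr<rocksdb::SequentialFile>* result,
          const rocksdb::EnvOptions& options) override {
        rocksdb::Status s = EnvWrapper::NewSequentialFile(fname, result, options);
        if (s.ok()) {
          ++num_seq_files_;
        }
        return s;
      }

      int num_seq_files() const { return num_seq_files_.load(); }

     private:
      std::atomic<int> num_seq_files_{0};
    };

Pointing Options::env at such a wrapper lets a test assert on num_seq_files() in the same way BackupUsingDirectIO asserts on num_direct_seq_readers().
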
@@ -181,13 +181,13 @@ TEST_F(EnvMirrorTest, LargeWrite) { write_data.append(1, static_cast(i)); } - unique_ptr writable_file; + std::unique_ptr writable_file; ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); ASSERT_OK(writable_file->Append("foo")); ASSERT_OK(writable_file->Append(write_data)); writable_file.reset(); - unique_ptr seq_file; + std::unique_ptr seq_file; Slice result; ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_)); ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo". diff --git a/utilities/env_timed.cc b/utilities/env_timed.cc index 6afd45bf999..86455ee65c0 100644 --- a/utilities/env_timed.cc +++ b/utilities/env_timed.cc @@ -18,21 +18,21 @@ class TimedEnv : public EnvWrapper { explicit TimedEnv(Env* base_env) : EnvWrapper(base_env) {} virtual Status NewSequentialFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { PERF_TIMER_GUARD(env_new_sequential_file_nanos); return EnvWrapper::NewSequentialFile(fname, result, options); } virtual Status NewRandomAccessFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { PERF_TIMER_GUARD(env_new_random_access_file_nanos); return EnvWrapper::NewRandomAccessFile(fname, result, options); } virtual Status NewWritableFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { PERF_TIMER_GUARD(env_new_writable_file_nanos); return EnvWrapper::NewWritableFile(fname, result, options); @@ -40,21 +40,21 @@ class TimedEnv : public EnvWrapper { virtual Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { PERF_TIMER_GUARD(env_reuse_writable_file_nanos); return EnvWrapper::ReuseWritableFile(fname, old_fname, result, options); } virtual Status NewRandomRWFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override { PERF_TIMER_GUARD(env_new_random_rw_file_nanos); return EnvWrapper::NewRandomRWFile(fname, result, options); } virtual Status NewDirectory(const std::string& name, - unique_ptr* result) override { + std::unique_ptr* result) override { PERF_TIMER_GUARD(env_new_directory_nanos); return EnvWrapper::NewDirectory(name, result); } @@ -131,7 +131,7 @@ class TimedEnv : public EnvWrapper { } virtual Status NewLogger(const std::string& fname, - shared_ptr* result) override { + std::shared_ptr* result) override { PERF_TIMER_GUARD(env_new_logger_nanos); return EnvWrapper::NewLogger(fname, result); } diff --git a/utilities/geodb/geodb_impl.cc b/utilities/geodb/geodb_impl.cc index 97c4da0f736..9150b16b2c5 100644 --- a/utilities/geodb/geodb_impl.cc +++ b/utilities/geodb/geodb_impl.cc @@ -222,7 +222,7 @@ GeoIterator* GeoDBImpl::SearchRadial(const GeoPosition& pos, Iterator* iter = db_->NewIterator(ReadOptions()); // Process each prospective quadkey - for (std::string qid : qids) { + for (const std::string& qid : qids) { // The user is interested in only these many objects. 
if (number_of_values == 0) { break; diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc index bf830190c6a..4c12f1a67d2 100644 --- a/utilities/options/options_util_test.cc +++ b/utilities/options/options_util_test.cc @@ -104,8 +104,8 @@ class DummyTableFactory : public TableFactory { virtual Status NewTableReader( const TableReaderOptions& /*table_reader_options*/, - unique_ptr&& /*file*/, uint64_t /*file_size*/, - unique_ptr* /*table_reader*/, + std::unique_ptr&& /*file*/, + uint64_t /*file_size*/, std::unique_ptr* /*table_reader*/, bool /*prefetch_index_and_filter_in_cache*/) const override { return Status::NotSupported(); } diff --git a/utilities/persistent_cache/block_cache_tier.cc b/utilities/persistent_cache/block_cache_tier.cc index 1ebf8ae6b3a..f7f72df6dfc 100644 --- a/utilities/persistent_cache/block_cache_tier.cc +++ b/utilities/persistent_cache/block_cache_tier.cc @@ -263,7 +263,7 @@ Status BlockCacheTier::InsertImpl(const Slice& key, const Slice& data) { return Status::OK(); } -Status BlockCacheTier::Lookup(const Slice& key, unique_ptr* val, +Status BlockCacheTier::Lookup(const Slice& key, std::unique_ptr* val, size_t* size) { StopWatchNano timer(opt_.env, /*auto_start=*/ true); @@ -287,7 +287,7 @@ Status BlockCacheTier::Lookup(const Slice& key, unique_ptr* val, assert(file->refs_); - unique_ptr scratch(new char[lba.size_]); + std::unique_ptr scratch(new char[lba.size_]); Slice blk_key; Slice blk_val; @@ -369,7 +369,7 @@ bool BlockCacheTier::Reserve(const size_t size) { const double retain_fac = (100 - kEvictPct) / static_cast(100); while (size + size_ > opt_.cache_size * retain_fac) { - unique_ptr f(metadata_.Evict()); + std::unique_ptr f(metadata_.Evict()); if (!f) { // nothing is evictable return false; diff --git a/utilities/persistent_cache/block_cache_tier_file.h b/utilities/persistent_cache/block_cache_tier_file.h index ef5dbab0408..e38b6c9a1d3 100644 --- a/utilities/persistent_cache/block_cache_tier_file.h +++ b/utilities/persistent_cache/block_cache_tier_file.h @@ -149,7 +149,7 @@ class RandomAccessCacheFile : public BlockCacheFile { public: explicit RandomAccessCacheFile(Env* const env, const std::string& dir, const uint32_t cache_id, - const shared_ptr& log) + const std::shared_ptr& log) : BlockCacheFile(env, dir, cache_id), log_(log) {} virtual ~RandomAccessCacheFile() {} diff --git a/utilities/persistent_cache/persistent_cache_bench.cc b/utilities/persistent_cache/persistent_cache_bench.cc index 7d26c3a7de3..64d75c7a518 100644 --- a/utilities/persistent_cache/persistent_cache_bench.cc +++ b/utilities/persistent_cache/persistent_cache_bench.cc @@ -251,7 +251,7 @@ class CacheTierBenchmark { // create data for a key by filling with a certain pattern std::unique_ptr NewBlock(const uint64_t val) { - unique_ptr data(new char[FLAGS_iosize]); + std::unique_ptr data(new char[FLAGS_iosize]); memset(data.get(), val % 255, FLAGS_iosize); return data; } diff --git a/utilities/persistent_cache/persistent_cache_test.h b/utilities/persistent_cache/persistent_cache_test.h index 37e842f2e2a..ad99ea864bd 100644 --- a/utilities/persistent_cache/persistent_cache_test.h +++ b/utilities/persistent_cache/persistent_cache_test.h @@ -157,7 +157,7 @@ class PersistentCacheTierTest : public testing::Test { memset(edata, '0' + (i % 10), sizeof(edata)); auto k = prefix + PaddedNumber(i, /*count=*/8); Slice key(k); - unique_ptr block; + std::unique_ptr block; size_t block_size; if (eviction_enabled) { @@ -210,7 +210,7 @@ class PersistentCacheTierTest : 
public testing::Test { } const std::string path_; - shared_ptr log_; + std::shared_ptr log_; std::shared_ptr cache_; std::atomic key_{0}; size_t max_keys_ = 0; diff --git a/utilities/spatialdb/spatial_db.cc b/utilities/spatialdb/spatial_db.cc index 627eb9de6e4..b34976eb818 100644 --- a/utilities/spatialdb/spatial_db.cc +++ b/utilities/spatialdb/spatial_db.cc @@ -473,7 +473,7 @@ class SpatialIndexCursor : public Cursor { } - unique_ptr value_getter_; + std::unique_ptr value_getter_; bool valid_; Status status_; diff --git a/utilities/spatialdb/spatial_db_test.cc b/utilities/spatialdb/spatial_db_test.cc index 783b347d0a8..cb92af8b1a0 100644 --- a/utilities/spatialdb/spatial_db_test.cc +++ b/utilities/spatialdb/spatial_db_test.cc @@ -94,7 +94,7 @@ TEST_F(SpatialDBTest, FeatureSetSerializeTest) { ASSERT_EQ(deserialized.Get("m").get_double(), 3.25); // corrupted serialization - serialized = serialized.substr(0, serialized.size() - 4); + serialized = serialized.substr(0, serialized.size() - 1); deserialized.Clear(); ASSERT_TRUE(!deserialized.Deserialize(serialized)); } diff --git a/utilities/trace/file_trace_reader_writer.cc b/utilities/trace/file_trace_reader_writer.cc index 36baefc7bc2..4a81516a8b7 100644 --- a/utilities/trace/file_trace_reader_writer.cc +++ b/utilities/trace/file_trace_reader_writer.cc @@ -83,16 +83,18 @@ Status FileTraceWriter::Write(const Slice& data) { return file_writer_->Append(data); } +uint64_t FileTraceWriter::GetFileSize() { return file_writer_->GetFileSize(); } + Status NewFileTraceReader(Env* env, const EnvOptions& env_options, const std::string& trace_filename, std::unique_ptr* trace_reader) { - unique_ptr trace_file; + std::unique_ptr trace_file; Status s = env->NewRandomAccessFile(trace_filename, &trace_file, env_options); if (!s.ok()) { return s; } - unique_ptr file_reader; + std::unique_ptr file_reader; file_reader.reset( new RandomAccessFileReader(std::move(trace_file), trace_filename)); trace_reader->reset(new FileTraceReader(std::move(file_reader))); @@ -102,13 +104,13 @@ Status NewFileTraceReader(Env* env, const EnvOptions& env_options, Status NewFileTraceWriter(Env* env, const EnvOptions& env_options, const std::string& trace_filename, std::unique_ptr* trace_writer) { - unique_ptr trace_file; + std::unique_ptr trace_file; Status s = env->NewWritableFile(trace_filename, &trace_file, env_options); if (!s.ok()) { return s; } - unique_ptr file_writer; + std::unique_ptr file_writer; file_writer.reset(new WritableFileWriter(std::move(trace_file), trace_filename, env_options)); trace_writer->reset(new FileTraceWriter(std::move(file_writer))); diff --git a/utilities/trace/file_trace_reader_writer.h b/utilities/trace/file_trace_reader_writer.h index b363a3f09f7..863f5d9d061 100644 --- a/utilities/trace/file_trace_reader_writer.h +++ b/utilities/trace/file_trace_reader_writer.h @@ -22,7 +22,7 @@ class FileTraceReader : public TraceReader { virtual Status Close() override; private: - unique_ptr file_reader_; + std::unique_ptr file_reader_; Slice result_; size_t offset_; char* const buffer_; @@ -39,9 +39,10 @@ class FileTraceWriter : public TraceWriter { virtual Status Write(const Slice& data) override; virtual Status Close() override; + virtual uint64_t GetFileSize() override; private: - unique_ptr file_writer_; + std::unique_ptr file_writer_; }; } // namespace rocksdb diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc index 67a333f3b08..d895d9d9357 100644 --- 
a/utilities/transactions/pessimistic_transaction.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -232,7 +232,7 @@ Status WriteCommittedTxn::PrepareInternal() { WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(), name_); Status s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(), - /*callback*/ nullptr, &log_number_, /*log ref*/ 0, + /*callback*/ nullptr, &log_number_, /*log_ref*/ 0, /* disable_memtable*/ true); return s; } @@ -322,12 +322,27 @@ Status PessimisticTransaction::Commit() { } Status WriteCommittedTxn::CommitWithoutPrepareInternal() { - Status s = db_->Write(write_options_, GetWriteBatch()->GetWriteBatch()); + uint64_t seq_used = kMaxSequenceNumber; + auto s = + db_impl_->WriteImpl(write_options_, GetWriteBatch()->GetWriteBatch(), + /*callback*/ nullptr, /*log_used*/ nullptr, + /*log_ref*/ 0, /*disable_memtable*/ false, &seq_used); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + if (s.ok()) { + SetId(seq_used); + } return s; } Status WriteCommittedTxn::CommitBatchInternal(WriteBatch* batch, size_t) { - Status s = db_->Write(write_options_, batch); + uint64_t seq_used = kMaxSequenceNumber; + auto s = db_impl_->WriteImpl(write_options_, batch, /*callback*/ nullptr, + /*log_used*/ nullptr, /*log_ref*/ 0, + /*disable_memtable*/ false, &seq_used); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + if (s.ok()) { + SetId(seq_used); + } return s; } @@ -345,8 +360,15 @@ Status WriteCommittedTxn::CommitInternal() { // in non recovery mode and simply insert the values WriteBatchInternal::Append(working_batch, GetWriteBatch()->GetWriteBatch()); - auto s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr, - log_number_); + uint64_t seq_used = kMaxSequenceNumber; + auto s = + db_impl_->WriteImpl(write_options_, working_batch, /*callback*/ nullptr, + /*log_used*/ nullptr, /*log_ref*/ log_number_, + /*disable_memtable*/ false, &seq_used); + assert(!s.ok() || seq_used != kMaxSequenceNumber); + if (s.ok()) { + SetId(seq_used); + } return s; } diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index 6b016ef72a8..8eb21777a99 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -146,7 +146,9 @@ Status PessimisticTransactionDB::Initialize( assert(real_trx); real_trx->SetLogNumber(batch_info.log_number_); assert(seq != kMaxSequenceNumber); - real_trx->SetId(seq); + if (GetTxnDBOptions().write_policy != WRITE_COMMITTED) { + real_trx->SetId(seq); + } s = real_trx->SetName(recovered_trx->name_); if (!s.ok()) { diff --git a/utilities/transactions/transaction_lock_mgr.cc b/utilities/transactions/transaction_lock_mgr.cc index d285fd30ed4..8086f7c7c07 100644 --- a/utilities/transactions/transaction_lock_mgr.cc +++ b/utilities/transactions/transaction_lock_mgr.cc @@ -104,7 +104,7 @@ void DeadlockInfoBuffer::AddNewPath(DeadlockPath path) { return; } - paths_buffer_[buffer_idx_] = path; + paths_buffer_[buffer_idx_] = std::move(path); buffer_idx_ = (buffer_idx_ + 1) % paths_buffer_.size(); } @@ -222,9 +222,9 @@ void TransactionLockMgr::RemoveColumnFamily(uint32_t column_family_id) { } } -// Look up the LockMap shared_ptr for a given column_family_id. +// Look up the LockMap std::shared_ptr for a given column_family_id. // Note: The LockMap is only valid as long as the caller is still holding on -// to the returned shared_ptr. +// to the returned std::shared_ptr. 
std::shared_ptr TransactionLockMgr::GetLockMap( uint32_t column_family_id) { // First check thread-local cache @@ -494,8 +494,8 @@ bool TransactionLockMgr::IncrementWaiters( auto extracted_info = wait_txn_map_.Get(queue_values[head]); path.push_back({queue_values[head], extracted_info.m_cf_id, - extracted_info.m_waiting_key, - extracted_info.m_exclusive}); + extracted_info.m_exclusive, + extracted_info.m_waiting_key}); head = queue_parents[head]; } env->GetCurrentTime(&deadlock_time); diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index f49c9225741..0968b9a3493 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -606,6 +606,7 @@ TEST_P(TransactionTest, DeadlockCycleShared) { } } +#ifndef ROCKSDB_VALGRIND_RUN TEST_P(TransactionStressTest, DeadlockCycle) { WriteOptions write_options; ReadOptions read_options; @@ -768,6 +769,7 @@ TEST_P(TransactionStressTest, DeadlockStress) { t.join(); } } +#endif // ROCKSDB_VALGRIND_RUN TEST_P(TransactionTest, CommitTimeBatchFailTest) { WriteOptions write_options; @@ -1097,6 +1099,7 @@ TEST_P(TransactionTest, TwoPhaseEmptyWriteTest) { } } +#ifndef ROCKSDB_VALGRIND_RUN TEST_P(TransactionStressTest, TwoPhaseExpirationTest) { Status s; @@ -1334,6 +1337,7 @@ TEST_P(TransactionTest, PersistentTwoPhaseTransactionTest) { // deleting transaction should unregister transaction ASSERT_EQ(db->GetTransactionByName("xid"), nullptr); } +#endif // ROCKSDB_VALGRIND_RUN // TODO this test needs to be updated with serial commits TEST_P(TransactionTest, DISABLED_TwoPhaseMultiThreadTest) { diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 127f8cc8648..1d645d237fc 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -731,6 +731,71 @@ TEST_P(WritePreparedTransactionTest, MaybeUpdateOldCommitMap) { MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false); } +// Reproduce the bug with two snapshots with the same sequence number and test +// that the release of the first snapshot will not affect the reads by the other +// snapshot +TEST_P(WritePreparedTransactionTest, DoubleSnapshot) { + TransactionOptions txn_options; + Status s; + + // Insert initial value + ASSERT_OK(db->Put(WriteOptions(), "key", "value1")); + + WritePreparedTxnDB* wp_db = dynamic_cast(db); + Transaction* txn = + wp_db->BeginTransaction(WriteOptions(), txn_options, nullptr); + ASSERT_OK(txn->SetName("txn")); + ASSERT_OK(txn->Put("key", "value2")); + ASSERT_OK(txn->Prepare()); + // Three snapshots with the same seq number + const Snapshot* snapshot0 = wp_db->GetSnapshot(); + const Snapshot* snapshot1 = wp_db->GetSnapshot(); + const Snapshot* snapshot2 = wp_db->GetSnapshot(); + ASSERT_OK(txn->Commit()); + SequenceNumber cache_size = wp_db->COMMIT_CACHE_SIZE; + SequenceNumber overlap_seq = txn->GetId() + cache_size; + delete txn; + + // 4th snapshot with a larger seq + const Snapshot* snapshot3 = wp_db->GetSnapshot(); + // Cause an eviction to advance max evicted seq number + // This also fetches the 4 snapshots from db since their seq is lower than the + // new max + wp_db->AddCommitted(overlap_seq, overlap_seq); + + ReadOptions ropt; + // It should see the value before commit + ropt.snapshot = snapshot2; + PinnableSlice pinnable_val; + s = wp_db->Get(ropt, wp_db->DefaultColumnFamily(), "key", &pinnable_val); + ASSERT_OK(s); +
ASSERT_TRUE(pinnable_val == "value1"); + pinnable_val.Reset(); + + wp_db->ReleaseSnapshot(snapshot1); + + // It should still see the value before commit + s = wp_db->Get(ropt, wp_db->DefaultColumnFamily(), "key", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == "value1"); + pinnable_val.Reset(); + + // Cause an eviction to advance max evicted seq number and trigger updating + // the snapshot list + overlap_seq += cache_size; + wp_db->AddCommitted(overlap_seq, overlap_seq); + + // It should still see the value before commit + s = wp_db->Get(ropt, wp_db->DefaultColumnFamily(), "key", &pinnable_val); + ASSERT_OK(s); + ASSERT_TRUE(pinnable_val == "value1"); + pinnable_val.Reset(); + + wp_db->ReleaseSnapshot(snapshot0); + wp_db->ReleaseSnapshot(snapshot2); + wp_db->ReleaseSnapshot(snapshot3); +} + // Test that the entries in old_commit_map_ get garbage collected properly TEST_P(WritePreparedTransactionTest, OldCommitMapGC) { const size_t snapshot_cache_bits = 0; @@ -816,6 +881,7 @@ TEST_P(WritePreparedTransactionTest, CheckAgainstSnapshotsTest) { std::vector snapshots = {100l, 200l, 300l, 400l, 500l, 600l, 700l, 800l, 900l}; const size_t snapshot_cache_bits = 2; + const uint64_t cache_size = 1ul << snapshot_cache_bits; // Safety check to express the intended size in the test. Can be adjusted if // the snapshots lists changed. assert((1ul << snapshot_cache_bits) * 2 + 1 == snapshots.size()); @@ -843,6 +909,57 @@ TEST_P(WritePreparedTransactionTest, CheckAgainstSnapshotsTest) { commit_entry.prep_seq <= snapshots.back(); ASSERT_EQ(expect_update, !wp_db->old_commit_map_empty_); } + + // Test that search will include multiple snapshot from snapshot cache + { + // exclude first and last item in the cache + CommitEntry commit_entry = {snapshots.front() + 1, + snapshots[cache_size - 1] - 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), cache_size - 2); + } + + // Test that search will include multiple snapshot from old snapshots + { + // include two in the middle + CommitEntry commit_entry = {snapshots[cache_size] + 1, + snapshots[cache_size + 2] + 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), 2); + } + + // Test that search will include both snapshot cache and old snapshots + // Case 1: includes all in snapshot cache + { + CommitEntry commit_entry = {snapshots.front() - 1, snapshots.back() + 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size()); + } + + // Case 2: includes all snapshot caches except the smallest + { + CommitEntry commit_entry = {snapshots.front() + 1, snapshots.back() + 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size() - 1); + } + + // Case 3: includes only the largest of snapshot cache + { + CommitEntry commit_entry = {snapshots[cache_size - 1] - 1, + snapshots.back() + 1}; + wp_db->old_commit_map_empty_ = true; // reset + wp_db->old_commit_map_.clear(); + wp_db->CheckAgainstSnapshots(commit_entry); + ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size() - cache_size + 1); + } } // This test is too slow for travis diff --git 
a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 2d8e4fcee1d..ca728d50713 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -379,9 +379,9 @@ void WritePreparedTxnDB::Init(const TransactionDBOptions& /* unused */) { // around. INC_STEP_FOR_MAX_EVICTED = std::max(COMMIT_CACHE_SIZE / 100, static_cast(1)); - snapshot_cache_ = unique_ptr[]>( + snapshot_cache_ = std::unique_ptr[]>( new std::atomic[SNAPSHOT_CACHE_SIZE] {}); - commit_cache_ = unique_ptr[]>( + commit_cache_ = std::unique_ptr[]>( new std::atomic[COMMIT_CACHE_SIZE] {}); } @@ -554,12 +554,6 @@ const std::vector WritePreparedTxnDB::GetSnapshotListFromDB( return db_impl_->snapshots().GetAll(nullptr, max); } -void WritePreparedTxnDB::ReleaseSnapshot(const Snapshot* snapshot) { - auto snap_seq = snapshot->GetSequenceNumber(); - ReleaseSnapshotInternal(snap_seq); - db_impl_->ReleaseSnapshot(snapshot); -} - void WritePreparedTxnDB::ReleaseSnapshotInternal( const SequenceNumber snap_seq) { // relax is enough since max increases monotonically, i.e., if snap_seq < @@ -572,14 +566,16 @@ void WritePreparedTxnDB::ReleaseSnapshotInternal( bool need_gc = false; { WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD); - ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead"); + ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead for %" PRIu64, + snap_seq); ReadLock rl(&old_commit_map_mutex_); auto prep_set_entry = old_commit_map_.find(snap_seq); need_gc = prep_set_entry != old_commit_map_.end(); } if (need_gc) { WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD); - ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead"); + ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead for %" PRIu64, + snap_seq); WriteLock wl(&old_commit_map_mutex_); old_commit_map_.erase(snap_seq); old_commit_map_empty_.store(old_commit_map_.empty(), @@ -588,6 +584,33 @@ void WritePreparedTxnDB::ReleaseSnapshotInternal( } } +void WritePreparedTxnDB::CleanupReleasedSnapshots( + const std::vector& new_snapshots, + const std::vector& old_snapshots) { + auto newi = new_snapshots.begin(); + auto oldi = old_snapshots.begin(); + for (; newi != new_snapshots.end() && oldi != old_snapshots.end();) { + assert(*newi >= *oldi); // cannot have new snapshots with lower seq + if (*newi == *oldi) { // still not released + auto value = *newi; + while (newi != new_snapshots.end() && *newi == value) { + newi++; + } + while (oldi != old_snapshots.end() && *oldi == value) { + oldi++; + } + } else { + assert(*newi > *oldi); // *oldi is released + ReleaseSnapshotInternal(*oldi); + oldi++; + } + } + // Everything remained in old_snapshots is released and must be cleaned up + for (; oldi != old_snapshots.end(); oldi++) { + ReleaseSnapshotInternal(*oldi); + } +} + void WritePreparedTxnDB::UpdateSnapshots( const std::vector& snapshots, const SequenceNumber& version) { @@ -636,6 +659,12 @@ void WritePreparedTxnDB::UpdateSnapshots( // Update the size at the end. Otherwise a parallel reader might read // items that are not set yet. snapshots_total_.store(snapshots.size(), std::memory_order_release); + + // Note: this must be done after the snapshots data structures are updated + // with the new list of snapshots. 
+ CleanupReleasedSnapshots(snapshots, snapshots_all_); + snapshots_all_ = snapshots; + TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:end"); TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:end"); } @@ -654,13 +683,20 @@ void WritePreparedTxnDB::CheckAgainstSnapshots(const CommitEntry& evicted) { // place before gets overwritten the reader that reads bottom-up will // eventully see it. const bool next_is_larger = true; - SequenceNumber snapshot_seq = kMaxSequenceNumber; + // We will set to true if the border line snapshot suggests that. + bool search_larger_list = false; size_t ip1 = std::min(cnt, SNAPSHOT_CACHE_SIZE); for (; 0 < ip1; ip1--) { - snapshot_seq = snapshot_cache_[ip1 - 1].load(std::memory_order_acquire); + SequenceNumber snapshot_seq = + snapshot_cache_[ip1 - 1].load(std::memory_order_acquire); TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:", ++sync_i); TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:", sync_i); + if (ip1 == SNAPSHOT_CACHE_SIZE) { // border line snapshot + // snapshot_seq < commit_seq => larger_snapshot_seq <= commit_seq + // then later also continue the search to larger snapshots + search_larger_list = snapshot_seq < evicted.commit_seq; + } if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq, snapshot_seq, !next_is_larger)) { break; @@ -675,17 +711,20 @@ void WritePreparedTxnDB::CheckAgainstSnapshots(const CommitEntry& evicted) { #endif TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:end"); TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:end"); - if (UNLIKELY(SNAPSHOT_CACHE_SIZE < cnt && ip1 == SNAPSHOT_CACHE_SIZE && - snapshot_seq < evicted.prep_seq)) { + if (UNLIKELY(SNAPSHOT_CACHE_SIZE < cnt && search_larger_list)) { // Then access the less efficient list of snapshots_ WPRecordTick(TXN_SNAPSHOT_MUTEX_OVERHEAD); - ROCKS_LOG_WARN(info_log_, "snapshots_mutex_ overhead"); + ROCKS_LOG_WARN(info_log_, + "snapshots_mutex_ overhead for <%" PRIu64 ",%" PRIu64 + "> with %" ROCKSDB_PRIszt " snapshots", + evicted.prep_seq, evicted.commit_seq, cnt); ReadLock rl(&snapshots_mutex_); // Items could have moved from the snapshots_ to snapshot_cache_ before // accquiring the lock. To make sure that we do not miss a valid snapshot, // read snapshot_cache_ again while holding the lock. 
for (size_t i = 0; i < SNAPSHOT_CACHE_SIZE; i++) { - snapshot_seq = snapshot_cache_[i].load(std::memory_order_acquire); + SequenceNumber snapshot_seq = + snapshot_cache_[i].load(std::memory_order_acquire); if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq, snapshot_seq, next_is_larger)) { break; @@ -713,7 +752,10 @@ bool WritePreparedTxnDB::MaybeUpdateOldCommitMap( // then snapshot_seq < commit_seq if (prep_seq <= snapshot_seq) { // overlapping range WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD); - ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead"); + ROCKS_LOG_WARN(info_log_, + "old_commit_map_mutex_ overhead for %" PRIu64 + " commit entry: <%" PRIu64 ",%" PRIu64 ">", + snapshot_seq, prep_seq, commit_seq); WriteLock wl(&old_commit_map_mutex_); old_commit_map_empty_.store(false, std::memory_order_release); auto& vec = old_commit_map_[snapshot_seq]; diff --git a/utilities/transactions/write_prepared_txn_db.h b/utilities/transactions/write_prepared_txn_db.h index ec76e271634..e0263d4f7b9 100644 --- a/utilities/transactions/write_prepared_txn_db.h +++ b/utilities/transactions/write_prepared_txn_db.h @@ -112,8 +112,6 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { const std::vector& column_families, std::vector* iterators) override; - virtual void ReleaseSnapshot(const Snapshot* snapshot) override; - // Check whether the transaction that wrote the value with sequence number seq // is visible to the snapshot with sequence number snapshot_seq. // Returns true if commit_seq <= snapshot_seq @@ -222,7 +220,6 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { // rare case and it is ok to pay the cost of mutex ReadLock for such old, // reading transactions. WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD); - ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead"); ReadLock rl(&old_commit_map_mutex_); auto prep_set_entry = old_commit_map_.find(snapshot_seq); bool found = prep_set_entry != old_commit_map_.end(); @@ -380,6 +377,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { friend class WritePreparedTransactionTest_AdvanceMaxEvictedSeqWithDuplicatesTest_Test; friend class WritePreparedTransactionTest_BasicRecoveryTest_Test; + friend class WritePreparedTransactionTest_DoubleSnapshot_Test; friend class WritePreparedTransactionTest_IsInSnapshotEmptyMapTest_Test; friend class WritePreparedTransactionTest_OldCommitMapGC_Test; friend class WritePreparedTransactionTest_RollbackTest_Test; @@ -519,6 +517,11 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { // version value. void UpdateSnapshots(const std::vector& snapshots, const SequenceNumber& version); + // Check the new list of snapshots against the old one to see if any of the + // snapshots have been released, and do the cleanup for the released ones. + void CleanupReleasedSnapshots( + const std::vector& new_snapshots, + const std::vector& old_snapshots); // Check an evicted entry against live snapshots to see if it should be kept // around or it can be safely discarded (and hence assume committed for all @@ -549,10 +552,14 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { static const size_t DEF_SNAPSHOT_CACHE_BITS = static_cast(7); const size_t SNAPSHOT_CACHE_BITS; const size_t SNAPSHOT_CACHE_SIZE; - unique_ptr[]> snapshot_cache_; + std::unique_ptr[]> snapshot_cache_; // 2nd list for storing snapshots. The list sorted in ascending order. // Thread-safety is provided with snapshots_mutex_.
std::vector snapshots_; + // The list of all snapshots: snapshots_ + snapshot_cache_. This list, although + // redundant, simplifies the CleanupReleasedSnapshots implementation. + // Thread-safety is provided with snapshots_mutex_. + std::vector snapshots_all_; // The version of the latest list of snapshots. This can be used to avoid // rewriting a list that is concurrently updated with a more recent version. SequenceNumber snapshots_version_ = 0; @@ -567,7 +574,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { const CommitEntry64bFormat FORMAT; // commit_cache_ must be initialized to zero to tell apart an empty index from // a filled one. Thread-safety is provided with commit_cache_mutex_. - unique_ptr[]> commit_cache_; + std::unique_ptr[]> commit_cache_; // The largest evicted *commit* sequence number from the commit_cache_. If a // seq is smaller than max_evicted_seq_ it might or might not be present in // commit_cache_. So commit_cache_ must first be checked before consulting diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index ee7b317aafd..f434d185700 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -370,14 +370,14 @@ class TtlTest : public testing::Test { static const int64_t kSampleSize_ = 100; std::string dbname_; DBWithTTL* db_ttl_; - unique_ptr env_; + std::unique_ptr env_; private: Options options_; KVMap kvmap_; KVMap::iterator kv_it_; const std::string kNewValue_ = "new_value"; - unique_ptr test_comp_filter_; + std::unique_ptr test_comp_filter_; }; // class TtlTest // If TTL is non positive or not provided, the behaviour is TTL = infinity
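Aside for readers of this patch: the released-snapshot detection added to write_prepared_txn_db.cc above is essentially a merge-style walk over two sorted lists of sequence numbers (the previous and the current snapshot lists). The following is a minimal standalone sketch of that walk, not code from the patch; the name FindReleasedSnapshots and the main() driver are illustrative only, and the sketch collects the released sequence numbers into a vector instead of calling ReleaseSnapshotInternal on each one as the patch does.

// Standalone illustration: given the old and new snapshot lists, both sorted
// in ascending order and possibly containing duplicate sequence numbers,
// return the entries that appear only in the old list, i.e., the snapshots
// that were released in between.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

using SequenceNumber = uint64_t;

std::vector<SequenceNumber> FindReleasedSnapshots(
    const std::vector<SequenceNumber>& new_snapshots,
    const std::vector<SequenceNumber>& old_snapshots) {
  std::vector<SequenceNumber> released;
  auto newi = new_snapshots.begin();
  auto oldi = old_snapshots.begin();
  while (newi != new_snapshots.end() && oldi != old_snapshots.end()) {
    // Snapshots taken after the old list was captured have higher sequence
    // numbers, so the new iterator can never point below the old one.
    assert(*newi >= *oldi);
    if (*newi == *oldi) {
      // Still alive; skip all duplicates of this value on both sides.
      const SequenceNumber value = *newi;
      while (newi != new_snapshots.end() && *newi == value) ++newi;
      while (oldi != old_snapshots.end() && *oldi == value) ++oldi;
    } else {
      // *oldi has no counterpart in the new list, so it was released.
      released.push_back(*oldi);
      ++oldi;
    }
  }
  // Anything left over in the old list was released as well.
  for (; oldi != old_snapshots.end(); ++oldi) {
    released.push_back(*oldi);
  }
  return released;
}

int main() {
  // Old list had snapshots at seq 10 (twice), 20, and 30; the new list kept
  // 10 and 30 and gained a newer snapshot at 40.
  std::vector<SequenceNumber> old_list = {10, 10, 20, 30};
  std::vector<SequenceNumber> new_list = {10, 30, 40};
  for (SequenceNumber s : FindReleasedSnapshots(new_list, old_list)) {
    std::cout << "released: " << s << "\n";  // prints 20
  }
  return 0;
}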