From b519098b63b397f0ee30f43ae5e85b7dc87db549 Mon Sep 17 00:00:00 2001 From: Bryan Beaudreault Date: Wed, 10 Jan 2024 09:59:53 -0500 Subject: [PATCH 01/37] HubSpot Edit: Add HubSpot build setup --- .blazar.yaml | 25 + .build-jdk17 | 0 build-scripts/prepare_environment.sh | 97 ++++ hbase-rpm/.blazar.yaml | 30 ++ hbase-rpm/build.sh | 51 ++ hbase-rpm/hbase.spec | 131 +++++ hbase-rpm/sources/hbase.1 | 88 ++++ hbase-rpm/sources/install_hbase.sh | 180 +++++++ hubspot-client-bundles/.blazar.yaml | 24 + hubspot-client-bundles/.build-jdk17 | 0 hubspot-client-bundles/README.md | 59 +++ .../hbase-backup-restore-bundle/.blazar.yaml | 26 + .../hbase-backup-restore-bundle/.build-jdk17 | 0 .../hbase-backup-restore-bundle/pom.xml | 119 +++++ .../hbase-client-bundle/.blazar.yaml | 25 + .../hbase-client-bundle/.build-jdk17 | 0 .../hbase-client-bundle/pom.xml | 127 +++++ .../hbase-mapreduce-bundle/.blazar.yaml | 25 + .../hbase-mapreduce-bundle/.build-jdk17 | 0 .../hbase-mapreduce-bundle/pom.xml | 251 ++++++++++ .../hbase-server-it-bundle/.blazar.yaml | 26 + .../hbase-server-it-bundle/.build-jdk17 | 0 .../hbase-server-it-bundle/pom.xml | 168 +++++++ hubspot-client-bundles/pom.xml | 458 ++++++++++++++++++ 24 files changed, 1910 insertions(+) create mode 100644 .blazar.yaml create mode 100644 .build-jdk17 create mode 100755 build-scripts/prepare_environment.sh create mode 100644 hbase-rpm/.blazar.yaml create mode 100755 hbase-rpm/build.sh create mode 100644 hbase-rpm/hbase.spec create mode 100644 hbase-rpm/sources/hbase.1 create mode 100755 hbase-rpm/sources/install_hbase.sh create mode 100644 hubspot-client-bundles/.blazar.yaml create mode 100644 hubspot-client-bundles/.build-jdk17 create mode 100644 hubspot-client-bundles/README.md create mode 100644 hubspot-client-bundles/hbase-backup-restore-bundle/.blazar.yaml create mode 100644 hubspot-client-bundles/hbase-backup-restore-bundle/.build-jdk17 create mode 100644 hubspot-client-bundles/hbase-backup-restore-bundle/pom.xml create mode 100644 hubspot-client-bundles/hbase-client-bundle/.blazar.yaml create mode 100644 hubspot-client-bundles/hbase-client-bundle/.build-jdk17 create mode 100644 hubspot-client-bundles/hbase-client-bundle/pom.xml create mode 100644 hubspot-client-bundles/hbase-mapreduce-bundle/.blazar.yaml create mode 100644 hubspot-client-bundles/hbase-mapreduce-bundle/.build-jdk17 create mode 100644 hubspot-client-bundles/hbase-mapreduce-bundle/pom.xml create mode 100644 hubspot-client-bundles/hbase-server-it-bundle/.blazar.yaml create mode 100644 hubspot-client-bundles/hbase-server-it-bundle/.build-jdk17 create mode 100644 hubspot-client-bundles/hbase-server-it-bundle/pom.xml create mode 100644 hubspot-client-bundles/pom.xml diff --git a/.blazar.yaml b/.blazar.yaml new file mode 100644 index 000000000000..e034ada7508d --- /dev/null +++ b/.blazar.yaml @@ -0,0 +1,25 @@ +buildpack: + name: Blazar-Buildpack-Java-single-module + +env: + MAVEN_PHASE: "package assembly:single deploy" + HADOOP_DEP_VERSION: "3.3.6-hubspot-SNAPSHOT" + MAVEN_BUILD_ARGS: "-Phadoop-3.0 -Dhadoop.profile=3.0 -Dhadoop-three.version=$HADOOP_DEP_VERSION -Dgpg.skip=true -DskipTests -DdeployAtEnd -pl hbase-assembly -am -T1C" + + # Below variables are generated in prepare_environment.sh. 
+ # The build environment requires environment variables to be explicitly defined before they may + # be modified by the `write-build-env-var` utilty script to persist changes to an environment variable + # throughout a build + REPO_NAME: "" + SET_VERSION: "" + HBASE_VERSION: "" + PKG_RELEASE: "" + FULL_BUILD_VERSION: "" + +before: + - description: "Prepare build environment" + commands: + - $WORKSPACE/build-scripts/prepare_environment.sh + +provides: + - hbase diff --git a/.build-jdk17 b/.build-jdk17 new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/build-scripts/prepare_environment.sh b/build-scripts/prepare_environment.sh new file mode 100755 index 000000000000..65842dcd4d17 --- /dev/null +++ b/build-scripts/prepare_environment.sh @@ -0,0 +1,97 @@ +# +# Generates the appropriate environment vars so that we: +# - build against the right version of hadoop, and properly set up maven +# - generate the correct maven version based on the branches +# - upload RPMs with the correct release based on the branch, and to the right yum repo +# +# Since we need to distribute .blazar.yaml to all sub-modules of the project, we define our constants once +# in this script which can be re-used by every .blazar.yaml. +# +set -ex +printenv + +# We base the expected main branch and resulting maven version for clients on the hbase minor version +# The reason for this is hbase re-branches for each minor release (2.4, 2.5, 2.6, etc). At each re-branch +# the histories diverge. So we'll need to create our own fork of each new minor release branch. +# The convention is a fork named "hubspot-$minorVersion", and the maven coordinates "$minorVersion-hubspot-SNAPSHOT" +MINOR_VERSION="2.6" +MAIN_BRANCH="hubspot-${MINOR_VERSION}" + +# +# Validate inputs from blazar +# + +if [ -z "$WORKSPACE" ]; then + echo "Missing env var \$WORKSPACE" + exit 1 +fi +if [ -z "$GIT_BRANCH" ]; then + echo "Missing env var \$GIT_BRANCH" + exit 1 +fi +if [ -z "$BUILD_COMMAND_RC_FILE" ]; then + echo "Missing env var \$BUILD_COMMAND_RC_FILE" + exit 1 +fi + +# +# Extract current hbase version from root pom.xml +# + +# the pom.xml has an invalid xml namespace, so just remove that so xmllint can parse it. +cat $WORKSPACE/pom.xml | sed '2 s/xmlns=".*"//g' > pom.xml.tmp +HBASE_VERSION=$(echo "cat /project/properties/revision/text()" | xmllint --nocdata --shell pom.xml.tmp | sed '1d;$d') +rm pom.xml.tmp + +# sanity check that we've got some that looks right. it wouldn't be the end of the world if we got it wrong, but +# will help avoid confusion. +if [[ ! "$HBASE_VERSION" =~ 2\.[0-9]+\.[0-9]+ ]]; then + echo "Unexpected HBASE_Version extracted from pom.xml. Got $HBASE_VERSION but expected a string like '2.4.3', with 3 numbers separated by decimals, the first number being 2." + exit 1 +fi + +# +# Generate branch-specific env vars +# We are going to generate the maven version and the RPM release here: +# - For the maven version, we need to special case our main branch +# - For RPM, we want our final version to be: +# main branch: {hbase_version}-hs.{build_number}.el6 +# other branches: {hbase_version}-hs~{branch_name}.{build_number}.el6, where branch_name substitutes underscore for non-alpha-numeric characters +# + +echo "Git branch $GIT_BRANCH. Detecting appropriate version override and RPM release." 
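+
+# Worked example (hypothetical branch name and build number, for illustration only):
+#   on a branch named "my-feature" with BUILD_NUMBER=42, the logic below yields
+#     SET_VERSION=2.6-my-feature-SNAPSHOT
+#     RELEASE=hs~my_feature.42
+#     FULL_BUILD_VERSION=${HBASE_VERSION}-hs~my_feature.42
+#   on the main branch (hubspot-2.6) it yields
+#     SET_VERSION=2.6-hubspot-SNAPSHOT and RELEASE=hs.42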
+ +RELEASE="hs" + +if [[ "$GIT_BRANCH" = "$MAIN_BRANCH" ]]; then + SET_VERSION="${MINOR_VERSION}-hubspot-SNAPSHOT" + REPO_NAME="AnyLinuxVersion_hs-hbase" +elif [[ "$GIT_BRANCH" != "hubspot" ]]; then + SET_VERSION="${MINOR_VERSION}-${GIT_BRANCH}-SNAPSHOT" + RELEASE="${RELEASE}~${GIT_BRANCH//[^[:alnum:]]/_}" + REPO_NAME="AnyLinuxVersion_hs-hbase-develop" +else + echo "Invalid git branch $GIT_BRANCH" + exit 1 +fi + +RELEASE="${RELEASE}.${BUILD_NUMBER}" +FULL_BUILD_VERSION="${HBASE_VERSION}-${RELEASE}" + +# SET_VERSION is not the most intuitive name, but it's required for set-maven-versions script +write-build-env-var SET_VERSION "$SET_VERSION" +write-build-env-var HBASE_VERSION "$HBASE_VERSION" +write-build-env-var PKG_RELEASE "$RELEASE" +write-build-env-var FULL_BUILD_VERSION "$FULL_BUILD_VERSION" +write-build-env-var REPO_NAME "$REPO_NAME" +# Adding this value as versioninfo.version ensures we have the same value as would normally +# show up in a non-hubspot hbase build. Otherwise due to set-maven-versions we'd end up +# with 2.6-hubspot-SNAPSHOT which is not very useful as a point of reference. +# Another option would be to pass in our FULL_BUILD_VERSION but that might cause some funniness +# with the expectations in VersionInfo.compareVersion(). +write-build-env-var MAVEN_BUILD_ARGS "$MAVEN_BUILD_ARGS -Dversioninfo.version=$HBASE_VERSION" + +echo "Building HBase version $HBASE_VERSION" +echo "Will deploy to nexus with version $SET_VERSION" +echo "Will create rpm with version $FULL_BUILD_VERSION" +echo "Will run maven with extra args $MAVEN_BUILD_ARGS" diff --git a/hbase-rpm/.blazar.yaml b/hbase-rpm/.blazar.yaml new file mode 100644 index 000000000000..a1bfcb2ae17b --- /dev/null +++ b/hbase-rpm/.blazar.yaml @@ -0,0 +1,30 @@ +buildpack: + name: Buildpack-RPMs + +env: + RPM_BUILD_COMMAND: ./build.sh + # Below variables are generated in prepare_environment.sh. + # The build environment requires environment variables to be explicitly defined before they may + # be modified by the `write-build-env-var` utilty script to persist changes to an environment variable + # throughout a build + REPO_NAME: "" + SET_VERSION: "" + HBASE_VERSION: "" + PKG_RELEASE: "" + FULL_BUILD_VERSION: "" + MAVEN_BUILD_ARGS: "" + +enableBuildTargets: + - almalinux9_amd64 + +depends: + - hbase + +before: + - description: "Prepare build environment" + commands: + - $WORKSPACE/build-scripts/prepare_environment.sh + +stepActivation: + uploadRpms: + branchRegexes: ['.*'] diff --git a/hbase-rpm/build.sh b/hbase-rpm/build.sh new file mode 100755 index 000000000000..b527ca732913 --- /dev/null +++ b/hbase-rpm/build.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -e +set -x + +ROOT_DIR="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" + +for iv in HBASE_VERSION SET_VERSION PKG_RELEASE; do + if [[ "X${!iv}" = "X" ]]; then + echo "Must specifiy $iv" + exit 1 + fi +done + +# Setup build dir +BUILD_DIR="${ROOT_DIR}/build" +rm -rf $BUILD_DIR +mkdir -p ${BUILD_DIR}/{SOURCES,SPECS,RPMS} +cp -a $ROOT_DIR/sources/* ${BUILD_DIR}/SOURCES/ +cp $ROOT_DIR/hbase.spec ${BUILD_DIR}/SPECS/ + +# Download bin tar built by hbase-assembly +SOURCES_DIR=$BUILD_DIR/SOURCES +mvn dependency:copy \ + -Dartifact=org.apache.hbase:hbase-assembly:${SET_VERSION}:tar.gz:bin \ + -DoutputDirectory=$SOURCES_DIR \ + -DlocalRepositoryDirectory=$SOURCES_DIR \ + -Dtransitive=false +INPUT_TAR=`ls -d $SOURCES_DIR/hbase-assembly-*.tar.gz` + +if [[ $HBASE_VERSION == *"-SNAPSHOT" ]]; then + # unreleased verion. do i want to denote that in the rpm release somehow? 
+ # it can't be in the version, so strip here + HBASE_VERSION=${HBASE_VERSION//-SNAPSHOT/} +fi + +rpmbuild \ + --define "_topdir $BUILD_DIR" \ + --define "input_tar $INPUT_TAR" \ + --define "hbase_version ${HBASE_VERSION}" \ + --define "maven_version ${SET_VERSION}" \ + --define "release ${PKG_RELEASE}%{?dist}" \ + -bb \ + $BUILD_DIR/SPECS/hbase.spec + +if [[ -d $RPMS_OUTPUT_DIR ]]; then + mkdir -p $RPMS_OUTPUT_DIR + + # Move rpms to output dir for upload + + find ${BUILD_DIR}/RPMS -name "*.rpm" -exec mv {} $RPMS_OUTPUT_DIR/ \; +fi diff --git a/hbase-rpm/hbase.spec b/hbase-rpm/hbase.spec new file mode 100644 index 000000000000..107c92636f06 --- /dev/null +++ b/hbase-rpm/hbase.spec @@ -0,0 +1,131 @@ +# taken from hbase.spec in https://github.com/apache/bigtop/ +# greatly modified to simplify and fix dependencies to work in the hubspot environment + +%define hadoop_major_version 3.2 +%define hbase_major_version 2.4 +%define etc_hbase_conf %{_sysconfdir}/hbase/conf +%define etc_hbase_conf_dist %{etc_hbase_conf}.dist +%define hbase_home /usr/lib/hbase +%define bin_hbase %{hbase_home}/bin +%define lib_hbase %{hbase_home}/lib +%define conf_hbase %{hbase_home}/conf +%define logs_hbase %{hbase_home}/logs +%define pids_hbase %{hbase_home}/pids +%define man_dir %{_mandir} +%define hbase_username hbase +%define hadoop_home /usr/lib/hadoop +%define zookeeper_home /usr/lib/zookeeper + +# FIXME: brp-repack-jars uses unzip to expand jar files +# Unfortunately guice-2.0.jar pulled by ivy contains some files and directories without any read permission +# and make whole process to fail. +# So for now brp-repack-jars is being deactivated until this is fixed. +# See BIGTOP-294 +%define __os_install_post \ + %{_rpmconfigdir}/brp-compress ; \ + %{_rpmconfigdir}/brp-strip-static-archive %{__strip} ; \ + %{_rpmconfigdir}/brp-strip-comment-note %{__strip} %{__objdump} ; \ + /usr/lib/rpm/brp-python-bytecompile ; \ + %{nil} + +%define doc_hbase %{_docdir}/hbase-%{hbase_version} +%global initd_dir %{_sysconfdir}/rc.d/init.d +%define alternatives_cmd alternatives + +# Disable debuginfo package +%define debug_package %{nil} + +# HubSpot: use zstd because it decompresses much faster +%define _binary_payload w19.zstdio +%define _source_payload w19.zstdio + +Name: hbase +Version: %{hbase_version} +Release: %{release} +BuildArch: noarch +Summary: HBase is the Hadoop database. Use it when you need random, realtime read/write access to your Big Data. This project's goal is the hosting of very large tables -- billions of rows X millions of columns -- atop clusters of commodity hardware. +URL: http://hbase.apache.org/ +Group: Systems/Daemons +Buildroot: %{_topdir}/INSTALL/hbase-%{maven_version} +License: ASL 2.0 +Source0: %{input_tar} +Source1: install_hbase.sh + +Requires: coreutils, /usr/sbin/useradd, /sbin/chkconfig, /sbin/service +Requires: hadoop >= %{hadoop_major_version} + +AutoReq: no + +%description +HBase is an open-source, distributed, column-oriented store modeled after Google' Bigtable: A Distributed Storage System for Structured Data by Chang et al. Just as Bigtable leverages the distributed data storage provided by the Google File System, HBase provides Bigtable-like capabilities on top of Hadoop. 
HBase includes: + + * Convenient base classes for backing Hadoop MapReduce jobs with HBase tables + * Query predicate push down via server side scan and get filters + * Optimizations for real time queries + * A high performance Thrift gateway + * A REST-ful Web service gateway that supports XML, Protobuf, and binary data encoding options + * Cascading source and sink modules + * Extensible jruby-based (JIRB) shell + * Support for exporting metrics via the Hadoop metrics subsystem to files or Ganglia; or via JMX + +%prep +%setup -n hbase-%{maven_version} + +%install +%__rm -rf $RPM_BUILD_ROOT +bash %{SOURCE1} \ + --input-tar=%{SOURCE0} \ + --doc-dir=%{doc_hbase} \ + --conf-dir=%{etc_hbase_conf_dist} \ + --prefix=$RPM_BUILD_ROOT + +%__install -d -m 0755 $RPM_BUILD_ROOT/%{initd_dir}/ + +%__install -d -m 0755 %{buildroot}/%{_localstatedir}/log/hbase +ln -s %{_localstatedir}/log/hbase %{buildroot}/%{logs_hbase} + +%__install -d -m 0755 %{buildroot}/%{_localstatedir}/run/hbase +ln -s %{_localstatedir}/run/hbase %{buildroot}/%{pids_hbase} + +%__install -d -m 0755 %{buildroot}/%{_localstatedir}/lib/hbase + +%__install -d -m 0755 $RPM_BUILD_ROOT/usr/bin + +# Pull hadoop from its packages +rm -f $RPM_BUILD_ROOT/%{lib_hbase}/{hadoop,slf4j-log4j12-}*.jar + +ln -f -s %{hadoop_home}/client/hadoop-annotations.jar $RPM_BUILD_ROOT/%{lib_hbase} +ln -f -s %{hadoop_home}/client/hadoop-auth.jar $RPM_BUILD_ROOT/%{lib_hbase} +ln -f -s %{hadoop_home}/client/hadoop-common.jar $RPM_BUILD_ROOT/%{lib_hbase} +ln -f -s %{hadoop_home}/client/hadoop-hdfs-client.jar $RPM_BUILD_ROOT/%{lib_hbase} +ln -f -s %{hadoop_home}/client/hadoop-mapreduce-client-common.jar $RPM_BUILD_ROOT/%{lib_hbase} +ln -f -s %{hadoop_home}/client/hadoop-mapreduce-client-core.jar $RPM_BUILD_ROOT/%{lib_hbase} +ln -f -s %{hadoop_home}/client/hadoop-mapreduce-client-jobclient.jar $RPM_BUILD_ROOT/%{lib_hbase} +ln -f -s %{hadoop_home}/client/hadoop-yarn-api.jar $RPM_BUILD_ROOT/%{lib_hbase} +ln -f -s %{hadoop_home}/client/hadoop-yarn-client.jar $RPM_BUILD_ROOT/%{lib_hbase} +ln -f -s %{hadoop_home}/client/hadoop-yarn-common.jar $RPM_BUILD_ROOT/%{lib_hbase} + +%pre +getent group hbase 2>/dev/null >/dev/null || /usr/sbin/groupadd -r hbase +getent passwd hbase 2>&1 > /dev/null || /usr/sbin/useradd -c "HBase" -s /sbin/nologin -g hbase -r -d /var/lib/hbase hbase 2> /dev/null || : + +%post +%{alternatives_cmd} --install %{etc_hbase_conf} %{name}-conf %{etc_hbase_conf_dist} 30 + +%files +%defattr(-,hbase,hbase) +%{logs_hbase} +%{pids_hbase} +%dir %{_localstatedir}/log/hbase +%dir %{_localstatedir}/run/hbase +%dir %{_localstatedir}/lib/hbase + +%defattr(-,root,root) +%{hbase_home} +%{hbase_home}/hbase-*.jar +/usr/bin/hbase +%config(noreplace) %{etc_hbase_conf_dist} + +# files from doc package +%defattr(-,root,root) +%doc %{doc_hbase}/ diff --git a/hbase-rpm/sources/hbase.1 b/hbase-rpm/sources/hbase.1 new file mode 100644 index 000000000000..349218fe1d87 --- /dev/null +++ b/hbase-rpm/sources/hbase.1 @@ -0,0 +1,88 @@ +.\" Licensed to the Apache Software Foundation (ASF) under one or more +.\" contributor license agreements. See the NOTICE file distributed with +.\" this work for additional information regarding copyright ownership. +.\" The ASF licenses this file to You under the Apache License, Version 2.0 +.\" (the "License"); you may not use this file except in compliance with +.\" the License. 
You may obtain a copy of the License at +.\" +.\" http://www.apache.org/licenses/LICENSE-2.0 +.\" +.\" Unless required by applicable law or agreed to in writing, software +.\" distributed under the License is distributed on an "AS IS" BASIS, +.\" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.\" See the License for the specific language governing permissions and +.\" limitations under the License. +.\" +.\" Process this file with +.\" groff -man -Tascii hbase.1 +.\" +.TH hbase 1 "October 2010 " Linux "User Manuals" + +.SH NAME +HBase \- HBase is the Hadoop database. + +.SH SYNOPSIS + +.B hbase +\fICOMMAND\fR + +.SH DESCRIPTION + +HBase is the Hadoop database. Use it when you need random, realtime +read/write access to your Big Data. This project's goal is the hosting +of very large tables -- billions of rows X millions of columns -- atop +clusters of commodity hardware. + +HBase is an open-source, distributed, versioned, column-oriented store +modeled after Google's Bigtable: A Distributed Storage System for +Structured Data by Chang et al. Just as Bigtable leverages the +distributed data storage provided by the Google File System, HBase +provides Bigtable-like capabilities on top of Hadoop. + +For more information about HBase, see http://hbase.apache.org. + +\fICOMMAND\fR may be one of the following: + shell run the HBase shell + shell-tests run the HBase shell tests + zkcli run the ZooKeeper shell + master run an HBase HMaster node + regionserver run an HBase HRegionServer node + zookeeper run a Zookeeper server + rest run an HBase REST server + thrift run an HBase Thrift server + avro run an HBase Avro server + migrate upgrade an hbase.rootdir + hbck run the hbase 'fsck' tool + or + CLASSNAME run the class named CLASSNAME + +Most commands print help when invoked w/o parameters or with --help. + +.SH ENVIRONMENT + +.IP JAVA_HOME +The java implementation to use. Overrides JAVA_HOME. + +.IP HBASE_CLASSPATH +Extra Java CLASSPATH entries. + +.IP HBASE_HEAPSIZE +The maximum amount of heap to use, in MB. Default is 1000. + +.IP HBASE_OPTS +Extra Java runtime options. + +.IP HBASE_CONF_DIR +Alternate conf dir. Default is ${HBASE_HOME}/conf. + +.IP HBASE_ROOT_LOGGER +The root appender. Default is INFO,console + +.IP HIVE_OPT +Extra Java runtime options. + +.IP HADOOP_HOME +Optionally, the Hadoop home to run with. + +.SH COPYRIGHT +Copyright (C) 2010 The Apache Software Foundation. All rights reserved. diff --git a/hbase-rpm/sources/install_hbase.sh b/hbase-rpm/sources/install_hbase.sh new file mode 100755 index 000000000000..95265d2100c8 --- /dev/null +++ b/hbase-rpm/sources/install_hbase.sh @@ -0,0 +1,180 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
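+
+# Illustrative invocation (hypothetical tarball path and version; mirrors the %install call in hbase.spec):
+#   bash install_hbase.sh \
+#     --input-tar=build/SOURCES/hbase-assembly-2.6.0-bin.tar.gz \
+#     --doc-dir=/usr/share/doc/hbase-2.6.0 \
+#     --conf-dir=/etc/hbase/conf.dist \
+#     --prefix=$RPM_BUILD_ROOT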
+ +set -ex + +usage() { + echo " +usage: $0 + Required not-so-options: + --mvn-target-dir=DIR path to the output of the mvn assembly + --prefix=PREFIX path to install into + + Optional options: + --doc-dir=DIR path to install docs into [/usr/share/doc/hbase] + --lib-dir=DIR path to install hbase home [/usr/lib/hbase] + --installed-lib-dir=DIR path where lib-dir will end up on target system + --bin-dir=DIR path to install bins [/usr/bin] + --examples-dir=DIR path to install examples [doc-dir/examples] + ... [ see source for more similar options ] + " + exit 1 +} + +OPTS=$(getopt \ + -n $0 \ + -o '' \ + -l 'prefix:' \ + -l 'doc-dir:' \ + -l 'lib-dir:' \ + -l 'installed-lib-dir:' \ + -l 'bin-dir:' \ + -l 'examples-dir:' \ + -l 'conf-dir:' \ + -l 'input-tar:' -- "$@") + +if [ $? != 0 ] ; then + usage +fi + +eval set -- "$OPTS" +while true ; do + case "$1" in + --prefix) + PREFIX=$2 ; shift 2 + ;; + --input-tar) + INPUT_TAR=$2 ; shift 2 + ;; + --doc-dir) + DOC_DIR=$2 ; shift 2 + ;; + --lib-dir) + LIB_DIR=$2 ; shift 2 + ;; + --bin-dir) + BIN_DIR=$2 ; shift 2 + ;; + --examples-dir) + EXAMPLES_DIR=$2 ; shift 2 + ;; + --conf-dir) + CONF_DIR=$2 ; shift 2 + ;; + --) + shift ; break + ;; + *) + echo "Unknown option: $1" + usage + exit 1 + ;; + esac +done + +for var in PREFIX INPUT_TAR ; do + if [ -z "$(eval "echo \$$var")" ]; then + echo Missing param: $var + usage + fi +done + +MAN_DIR=${MAN_DIR:-/usr/share/man/man1} +DOC_DIR=${DOC_DIR:-/usr/share/doc/hbase} +LIB_DIR=${LIB_DIR:-/usr/lib/hbase} + +BIN_DIR=${BIN_DIR:-/usr/lib/hbase/bin} +ETC_DIR=${ETC_DIR:-/etc/hbase} +CONF_DIR=${CONF_DIR:-${ETC_DIR}/conf.dist} +THRIFT_DIR=${THRIFT_DIR:-${LIB_DIR}/include/thrift} + +EXTRACT_DIR=extracted +rm -rf $EXTRACT_DIR +mkdir $EXTRACT_DIR + +version_part=$SET_VERSION +if [ -z "$version_part" ]; then + version_part=$HBASE_VERSION +fi + +tar -C $EXTRACT_DIR --strip-components=1 -xzf $INPUT_TAR + +# we do not need the shaded clients in our rpm. they bloat the size and cause classpath issues for hbck2. +rm -rf $EXTRACT_DIR/lib/shaded-clients + +install -d -m 0755 $PREFIX/$LIB_DIR +install -d -m 0755 $PREFIX/$LIB_DIR/lib +install -d -m 0755 $PREFIX/$DOC_DIR +install -d -m 0755 $PREFIX/$BIN_DIR +install -d -m 0755 $PREFIX/$ETC_DIR +install -d -m 0755 $PREFIX/$MAN_DIR +install -d -m 0755 $PREFIX/$THRIFT_DIR + +cp -ra $EXTRACT_DIR/lib/* ${PREFIX}/${LIB_DIR}/lib/ +cp $EXTRACT_DIR/lib/hbase*.jar $PREFIX/$LIB_DIR + +# We do not currently run "mvn site", so do not have a docs dir. +# Only copy contents if dir exists +if [ -n "$(ls -A $EXTRACT_DIR/docs 2>/dev/null)" ]; then + cp -a $EXTRACT_DIR/docs/* $PREFIX/$DOC_DIR + cp $EXTRACT_DIR/*.txt $PREFIX/$DOC_DIR/ +else + echo "Doc generation is currently disabled in our RPM build. If this is an issue, it should be possible to enable them with some work. See https://git.hubteam.com/HubSpot/apache-hbase/blob/hubspot-2/rpm/sources/do-component-build#L17-L24 for details." 
> $PREFIX/$DOC_DIR/README.txt +fi + +cp -a $EXTRACT_DIR/conf $PREFIX/$CONF_DIR +cp -a $EXTRACT_DIR/bin/* $PREFIX/$BIN_DIR + +# Purge scripts that don't work with packages +for file in rolling-restart.sh graceful_stop.sh local-regionservers.sh \ + master-backup.sh regionservers.sh zookeepers.sh hbase-daemons.sh \ + start-hbase.sh stop-hbase.sh local-master-backup.sh ; do + rm -f $PREFIX/$BIN_DIR/$file +done + + +ln -s $ETC_DIR/conf $PREFIX/$LIB_DIR/conf + +# Make a symlink of hbase.jar to hbase-version.jar +pushd `pwd` +cd $PREFIX/$LIB_DIR +for i in `ls hbase*jar | grep -v tests.jar` +do + ln -s $i `echo $i | sed -n 's/\(.*\)\(-[0-9].*\)\(.jar\)/\1\3/p'` +done +popd + +wrapper=$PREFIX/usr/bin/hbase +mkdir -p `dirname $wrapper` +cat > $wrapper < dependencies.sorted` to get a file that can be compared with another such-processed file +4. Make the change you want in the bundle, then `mvn clean install` +5. Re-run steps 2 and 3, outputting to a new file +6. Run `comm -13 first second` to see what might be newly added after your change, or `comm -23` to see what might have been removed +7. If trying to track a specific dependency from the list, go back here and run `mvn dependency:tree -Dincludes=`. This might show you what dependency you need to add an exclusion to + +This ends up being pretty iterative and trial/error, but can eventually get to a jar which has what you want (and doesn't what you don't). diff --git a/hubspot-client-bundles/hbase-backup-restore-bundle/.blazar.yaml b/hubspot-client-bundles/hbase-backup-restore-bundle/.blazar.yaml new file mode 100644 index 000000000000..9399e5dc0aa4 --- /dev/null +++ b/hubspot-client-bundles/hbase-backup-restore-bundle/.blazar.yaml @@ -0,0 +1,26 @@ +buildpack: + name: Blazar-Buildpack-Java + +env: + # Below variables are generated in prepare_environment.sh. 
+ # The build environment requires environment variables to be explicitly defined before they may + # be modified by the `write-build-env-var` utilty script to persist changes to an environment variable + # throughout a build + REPO_NAME: "" + SET_VERSION: "" + HBASE_VERSION: "" + PKG_RELEASE: "" + FULL_BUILD_VERSION: "" + MAVEN_BUILD_ARGS: "" + +before: + - description: "Prepare build environment" + commands: + - $WORKSPACE/build-scripts/prepare_environment.sh + +depends: + - hubspot-client-bundles + - hbase-client-bundle + - hbase-mapreduce-bundle +provides: + - hbase-backup-restore-bundle diff --git a/hubspot-client-bundles/hbase-backup-restore-bundle/.build-jdk17 b/hubspot-client-bundles/hbase-backup-restore-bundle/.build-jdk17 new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/hubspot-client-bundles/hbase-backup-restore-bundle/pom.xml b/hubspot-client-bundles/hbase-backup-restore-bundle/pom.xml new file mode 100644 index 000000000000..9707d9d8118d --- /dev/null +++ b/hubspot-client-bundles/hbase-backup-restore-bundle/pom.xml @@ -0,0 +1,119 @@ + + + 4.0.0 + + + com.hubspot.hbase + hubspot-client-bundles + ${revision} + + + hbase-backup-restore-bundle + + + + com.hubspot.hbase + hbase-client-bundle + + + + commons-io + commons-io + + + + + com.hubspot.hbase + hbase-mapreduce-bundle + + + + commons-io + commons-io + + + + + org.apache.hbase + hbase-backup + + + + commons-io + commons-io + + + + org.apache.hbase + * + + + + commons-logging + commons-logging + + + javax.servlet.jsp + * + + + javax.servlet + * + + + org.glassfish.web + * + + + org.jamon + jamon-runtime + + + io.netty + * + + + org.slf4j + slf4j-log4j12 + + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + create-bundle-with-relocations + + + + org.apache.hbase:* + + io.opentelemetry:opentelemetry-api + io.opentelemetry:opentelemetry-context + com.google.protobuf:protobuf-java + io.dropwizard.metrics:metrics-core + + + + + org.apache.kerby:* + + krb5-template.conf + krb5_udp-template.conf + ccache.txt + keytab.txt + + + + + + + + + + diff --git a/hubspot-client-bundles/hbase-client-bundle/.blazar.yaml b/hubspot-client-bundles/hbase-client-bundle/.blazar.yaml new file mode 100644 index 000000000000..300be28892e8 --- /dev/null +++ b/hubspot-client-bundles/hbase-client-bundle/.blazar.yaml @@ -0,0 +1,25 @@ +buildpack: + name: Blazar-Buildpack-Java + +env: + # Below variables are generated in prepare_environment.sh. 
+ # The build environment requires environment variables to be explicitly defined before they may + # be modified by the `write-build-env-var` utilty script to persist changes to an environment variable + # throughout a build + REPO_NAME: "" + SET_VERSION: "" + HBASE_VERSION: "" + PKG_RELEASE: "" + FULL_BUILD_VERSION: "" + MAVEN_BUILD_ARGS: "" + +before: + - description: "Prepare build environment" + commands: + - $WORKSPACE/build-scripts/prepare_environment.sh + +depends: + - hubspot-client-bundles +provides: + - hbase-client-bundle + diff --git a/hubspot-client-bundles/hbase-client-bundle/.build-jdk17 b/hubspot-client-bundles/hbase-client-bundle/.build-jdk17 new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/hubspot-client-bundles/hbase-client-bundle/pom.xml b/hubspot-client-bundles/hbase-client-bundle/pom.xml new file mode 100644 index 000000000000..24ce44daf93a --- /dev/null +++ b/hubspot-client-bundles/hbase-client-bundle/pom.xml @@ -0,0 +1,127 @@ + + + 4.0.0 + + + com.hubspot.hbase + hubspot-client-bundles + ${revision} + + + hbase-client-bundle + + + + org.apache.hbase + hbase-openssl + + + + org.apache.hbase + hbase-client + + + + org.apache.hbase + hbase-hadoop-compat + + + org.apache.hbase + hbase-hadoop2-compat + + + + commons-logging + commons-logging + + + org.jruby.joni + joni + + + org.jruby.jcodings + jcodings + + + org.slf4j + slf4j-log4j12 + + + + + org.apache.hbase + hbase-endpoint + + + * + * + + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + create-bundle-with-relocations + + + + + org.apache.hbase:hbase-client + org.apache.hbase:hbase-common + org.apache.hbase:hbase-logging + org.apache.hbase:hbase-protocol + org.apache.hbase:hbase-protocol-shaded + org.apache.hbase:hbase-openssl + + org.apache.hbase.thirdparty:* + + org.apache.hbase:hbase-endpoint + + + + com.google.protobuf:protobuf-java + + io.dropwizard.metrics:metrics-core + + commons-io:commons-io + + + + + org.apache.hbase:hbase-endpoint + + org/apache/hadoop/hbase/client/coprocessor/** + org/apache/hadoop/hbase/protobuf/generated/** + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + + + + diff --git a/hubspot-client-bundles/hbase-mapreduce-bundle/.blazar.yaml b/hubspot-client-bundles/hbase-mapreduce-bundle/.blazar.yaml new file mode 100644 index 000000000000..5c020e374927 --- /dev/null +++ b/hubspot-client-bundles/hbase-mapreduce-bundle/.blazar.yaml @@ -0,0 +1,25 @@ +buildpack: + name: Blazar-Buildpack-Java + +env: + # Below variables are generated in prepare_environment.sh. 
+ # The build environment requires environment variables to be explicitly defined before they may + # be modified by the `write-build-env-var` utilty script to persist changes to an environment variable + # throughout a build + REPO_NAME: "" + SET_VERSION: "" + HBASE_VERSION: "" + PKG_RELEASE: "" + FULL_BUILD_VERSION: "" + MAVEN_BUILD_ARGS: "" + +before: + - description: "Prepare build environment" + commands: + - $WORKSPACE/build-scripts/prepare_environment.sh + +depends: + - hubspot-client-bundles + - hbase-client-bundle +provides: + - hbase-mapreduce-bundle diff --git a/hubspot-client-bundles/hbase-mapreduce-bundle/.build-jdk17 b/hubspot-client-bundles/hbase-mapreduce-bundle/.build-jdk17 new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/hubspot-client-bundles/hbase-mapreduce-bundle/pom.xml b/hubspot-client-bundles/hbase-mapreduce-bundle/pom.xml new file mode 100644 index 000000000000..233e33750fe1 --- /dev/null +++ b/hubspot-client-bundles/hbase-mapreduce-bundle/pom.xml @@ -0,0 +1,251 @@ + + + 4.0.0 + + + com.hubspot.hbase + hubspot-client-bundles + ${revision} + + + hbase-mapreduce-bundle + + + + + com.hubspot.hbase + hbase-client-bundle + + + + commons-io + commons-io + + + + + + org.apache.hbase + hbase-mapreduce + + + org.apache.hbase + hbase-client + + + org.apache.hbase + hbase-common + + + org.apache.hbase + hbase-annotations + + + org.apache.hbase + hbase-protocol + + + org.apache.hbase + hbase-protocol-shaded + + + org.apache.hbase + hbase-logging + + + com.google.protobuf + protobuf-java + + + org.apache.hbase.thirdparty + hbase-shaded-gson + + + org.apache.hbase.thirdparty + hbase-shaded-protobuf + + + org.apache.hbase.thirdparty + hbase-unsafe + + + org.apache.hbase.thirdparty + hbase-shaded-miscellaneous + + + org.apache.hbase.thirdparty + hbase-shaded-netty + + + + commons-logging + commons-logging + + + + commons-io + commons-io + + + com.sun.jersey + * + + + tomcat + jasper-runtime + + + org.mortbay.jetty + * + + + org.slf4j + slf4j-log4j12 + + + + + org.apache.hbase + hbase-server + + + org.apache.hbase + hbase-client + + + org.apache.hbase + hbase-common + + + org.apache.hbase + hbase-annotations + + + org.apache.hbase + hbase-protocol + + + org.apache.hbase + hbase-protocol-shaded + + + org.apache.hbase + hbase-logging + + + com.google.protobuf + protobuf-java + + + org.apache.hbase.thirdparty + hbase-shaded-gson + + + org.apache.hbase.thirdparty + hbase-shaded-protobuf + + + org.apache.hbase.thirdparty + hbase-unsafe + + + org.apache.hbase.thirdparty + hbase-shaded-miscellaneous + + + org.apache.hbase.thirdparty + hbase-shaded-netty + + + + + commons-logging + commons-logging + + + + commons-io + commons-io + + + javax.servlet.jsp + * + + + javax.servlet + * + + + org.glassfish.web + * + + + org.jamon + jamon-runtime + + + io.netty + * + + + org.slf4j + slf4j-log4j12 + + + org.glassfish.hk2.external + jakarta.inject + + + jakarta.ws.rs + jakarta.ws.rs-api + + + + + org.apache.hbase + hbase-compression-zstd + + + org.apache.hbase + * + + + + commons-io + commons-io + + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + create-bundle-with-relocations + + + + + org.apache.hbase:* + + org.apache.hbase.thirdparty:* + + + + + + + + + diff --git a/hubspot-client-bundles/hbase-server-it-bundle/.blazar.yaml b/hubspot-client-bundles/hbase-server-it-bundle/.blazar.yaml new file mode 100644 index 000000000000..26db8c8066b3 --- /dev/null +++ b/hubspot-client-bundles/hbase-server-it-bundle/.blazar.yaml @@ -0,0 +1,26 @@ +buildpack: + name: 
Blazar-Buildpack-Java + +env: + # Below variables are generated in prepare_environment.sh. + # The build environment requires environment variables to be explicitly defined before they may + # be modified by the `write-build-env-var` utilty script to persist changes to an environment variable + # throughout a build + YUM_REPO_UPLOAD_OVERRIDE_CENTOS_8: "" + SET_VERSION: "" + HBASE_VERSION: "" + PKG_RELEASE: "" + FULL_BUILD_VERSION: "" + MAVEN_BUILD_ARGS: "" + REPO_NAME: "" + +before: + - description: "Prepare build environment" + commands: + - $WORKSPACE/build-scripts/prepare_environment.sh + +depends: + - hbase +provides: + - hbase-server-it-bundle + diff --git a/hubspot-client-bundles/hbase-server-it-bundle/.build-jdk17 b/hubspot-client-bundles/hbase-server-it-bundle/.build-jdk17 new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/hubspot-client-bundles/hbase-server-it-bundle/pom.xml b/hubspot-client-bundles/hbase-server-it-bundle/pom.xml new file mode 100644 index 000000000000..fa617258f82d --- /dev/null +++ b/hubspot-client-bundles/hbase-server-it-bundle/pom.xml @@ -0,0 +1,168 @@ + + + 4.0.0 + + + com.hubspot.hbase + hubspot-client-bundles + ${revision} + + + hbase-server-it-bundle + + + + org.apache.hbase + hbase-it + test-jar + + + + commons-logging + commons-logging + + + javax.servlet.jsp + * + + + javax.servlet + * + + + org.glassfish.web + * + + + org.jamon + jamon-runtime + + + io.netty + * + + + org.slf4j + slf4j-log4j12 + + + + + org.apache.hbase + hbase-server + test-jar + ${project.version} + + + + commons-logging + commons-logging + + + javax.servlet.jsp + * + + + javax.servlet + * + + + org.glassfish.web + * + + + org.jamon + jamon-runtime + + + io.netty + * + + + org.slf4j + slf4j-log4j12 + + + + + org.apache.hbase + hbase-testing-util + ${project.version} + + + + commons-logging + commons-logging + + + javax.servlet.jsp + * + + + javax.servlet + * + + + org.glassfish.web + * + + + org.jamon + jamon-runtime + + + io.netty + * + + + org.slf4j + slf4j-log4j12 + + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + create-bundle-with-relocations + + + + org.apache.hbase:* + + junit:junit + + commons-io:commons-io + + org.apache.hbase.thirdparty:* + com.google.protobuf:protobuf-java + + io.opentelemetry:opentelemetry-api + io.opentelemetry:opentelemetry-context + com.google.protobuf:protobuf-java + io.dropwizard.metrics:metrics-core + + + + + org.apache.kerby:* + + krb5-template.conf + krb5_udp-template.conf + ccache.txt + keytab.txt + + + + + + + + + + diff --git a/hubspot-client-bundles/pom.xml b/hubspot-client-bundles/pom.xml new file mode 100644 index 000000000000..0105ffd28c43 --- /dev/null +++ b/hubspot-client-bundles/pom.xml @@ -0,0 +1,458 @@ + + + 4.0.0 + + com.hubspot.hbase + hubspot-client-bundles + ${revision} + pom + Bundled versions of the hbase client + + + hbase-client-bundle + hbase-mapreduce-bundle + hbase-backup-restore-bundle + hbase-server-it-bundle + + + + org.apache.hadoop.hbase.shaded + + 3.6.3-shaded-SNAPSHOT + + 2.6-hubspot-SNAPSHOT + + + + + + + com.hubspot.hbase + hbase-client-bundle + ${project.version} + + + org.apache.hbase + hbase-client + + + + + com.hubspot.hbase + hbase-mapreduce-bundle + ${project.version} + + + com.hubspot.hbase + hbase-backup-restore-bundle + ${project.version} + + + + + org.apache.zookeeper + zookeeper + ${zookeeper.version} + + + org.apache.hbase + hbase-openssl + ${project.version} + + + org.apache.hbase + hbase-compression-zstd + ${project.version} + + + org.apache.hbase + hbase-client + 
${project.version} + + + + javax.activation + javax.activation-api + + + javax.annotation + javax.annotation-api + + + org.slf4j + slf4j-reload4j + + + com.google.code.findbugs + jsr305 + + + com.sun.jersey + jersey-servlet + + + com.sun.jersey.contribs + jersey-guice + + + com.github.pjfanning + jersey-json + + + org.apache.avro + avro + + + org.eclipse.jetty + jetty-client + + + com.google.j2objc + j2objc-annotations + + + + + org.apache.hbase + hbase-server + ${project.version} + + + javax.activation + javax.activation-api + + + javax.annotation + javax.annotation-api + + + org.slf4j + slf4j-reload4j + + + com.google.code.findbugs + jsr305 + + + com.sun.jersey + jersey-servlet + + + com.sun.jersey.contribs + jersey-guice + + + com.github.pjfanning + jersey-json + + + org.apache.avro + avro + + + org.eclipse.jetty + jetty-client + + + com.google.j2objc + j2objc-annotations + + + + + org.apache.hbase + hbase-mapreduce + ${project.version} + + + javax.activation + javax.activation-api + + + javax.annotation + javax.annotation-api + + + org.slf4j + slf4j-reload4j + + + com.google.code.findbugs + jsr305 + + + com.sun.jersey + jersey-servlet + + + com.sun.jersey.contribs + jersey-guice + + + com.github.pjfanning + jersey-json + + + org.apache.avro + avro + + + org.eclipse.jetty + jetty-client + + + com.google.j2objc + j2objc-annotations + + + + + org.apache.hbase + hbase-endpoint + ${project.version} + + + javax.activation + javax.activation-api + + + javax.annotation + javax.annotation-api + + + org.slf4j + slf4j-reload4j + + + com.google.code.findbugs + jsr305 + + + com.sun.jersey + jersey-servlet + + + com.sun.jersey.contribs + jersey-guice + + + com.github.pjfanning + jersey-json + + + org.apache.avro + avro + + + org.eclipse.jetty + jetty-client + + + com.google.j2objc + j2objc-annotations + + + + + org.apache.hbase + hbase-backup + ${project.version} + + + javax.activation + javax.activation-api + + + javax.annotation + javax.annotation-api + + + org.slf4j + slf4j-reload4j + + + com.google.code.findbugs + jsr305 + + + com.sun.jersey + jersey-servlet + + + com.sun.jersey.contribs + jersey-guice + + + com.github.pjfanning + jersey-json + + + org.apache.avro + avro + + + org.eclipse.jetty + jetty-client + + + com.google.j2objc + j2objc-annotations + + + + + org.apache.hbase + hbase-hadoop2-compat + ${project.version} + + + javax.activation + javax.activation-api + + + javax.annotation + javax.annotation-api + + + org.slf4j + slf4j-reload4j + + + com.google.code.findbugs + jsr305 + + + com.sun.jersey + jersey-servlet + + + com.sun.jersey.contribs + jersey-guice + + + com.github.pjfanning + jersey-json + + + org.apache.avro + avro + + + org.eclipse.jetty + jetty-client + + + com.google.j2objc + j2objc-annotations + + + + + org.apache.hbase + hbase-it + test-jar + ${project.version} + + + javax.activation + javax.activation-api + + + javax.annotation + javax.annotation-api + + + org.slf4j + slf4j-reload4j + + + com.google.code.findbugs + jsr305 + + + com.sun.jersey + jersey-servlet + + + com.sun.jersey.contribs + jersey-guice + + + com.github.pjfanning + jersey-json + + + org.apache.avro + avro + + + org.eclipse.jetty + jetty-client + + + com.google.j2objc + j2objc-annotations + + + + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.6.0 + + + create-bundle-with-relocations + + shade + + package + + + true + true + true + true + true + + + com.google.protobuf + ${shade.prefix}.com.google.protobuf + + + com.codahale.metrics + ${shade.prefix}.com.codahale.metrics + + + 
org.apache.commons.io + ${shade.prefix}.org.apache.commons.io + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + + + + + + + + From e5916dc018dc7e62df3ea422c362f6f19d567a15 Mon Sep 17 00:00:00 2001 From: Bryan Beaudreault Date: Mon, 12 Feb 2024 12:01:54 -0500 Subject: [PATCH 02/37] HubSpot Edit: HBASE-28365: ChaosMonkey batch suspend/resume action assume shell implementation (not yet written upstream) --- .../chaos/actions/RollingBatchSuspendResumeRsAction.java | 4 ++++ .../hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java index 559dec829ee3..78c78c531060 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchSuspendResumeRsAction.java @@ -97,6 +97,8 @@ public void perform() throws Exception { suspendRs(server); } catch (Shell.ExitCodeException e) { LOG.warn("Problem suspending but presume successful; code={}", e.getExitCode(), e); + } catch (Exception e) { + LOG.warn("Problem suspending but presume successful", e); } suspendedServers.add(server); break; @@ -106,6 +108,8 @@ public void perform() throws Exception { resumeRs(server); } catch (Shell.ExitCodeException e) { LOG.info("Problem resuming, will retry; code={}", e.getExitCode(), e); + } catch (Exception e) { + LOG.warn("Problem resulting, will retry", e); } break; } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java index fb8ab209c3a1..756f0d3846a6 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/monkies/PolicyBasedChaosMonkey.java @@ -86,7 +86,6 @@ private static ExecutorService buildMonkeyThreadPool(final int size) { return Executors.newFixedThreadPool(size, new ThreadFactoryBuilder().setDaemon(false) .setNameFormat("ChaosMonkey-%d").setUncaughtExceptionHandler((t, e) -> { LOG.error("Uncaught exception in thread {}", t.getName(), e); - throw new RuntimeException(e); }).build()); } From 991513e5d0071d7bccbb7cdcb160c4f4ab281c6a Mon Sep 17 00:00:00 2001 From: Bryan Beaudreault Date: Sat, 17 Feb 2024 11:40:11 -0500 Subject: [PATCH 03/37] HubSpot Edit: Add retries to verify step of ITBLL --- .../test/IntegrationTestBigLinkedList.java | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/test/IntegrationTestBigLinkedList.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/test/IntegrationTestBigLinkedList.java index c1854d87c199..2c4dd96eedab 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/test/IntegrationTestBigLinkedList.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/test/IntegrationTestBigLinkedList.java @@ -1532,9 +1532,20 @@ protected void runVerify(String outputDir, int numReducers, long expectedNumNode Verify verify = new Verify(); verify.setConf(getConf()); - int retCode = verify.run(iterationOutput, numReducers); - if (retCode > 0) { - throw new RuntimeException("Verify.run failed with return code: " + retCode); + + int retries = 
getConf().getInt("hbase.itbll.verify.retries", 1); + + while (true) { + int retCode = verify.run(iterationOutput, numReducers); + if (retCode > 0) { + if (retries-- > 0) { + LOG.warn("Verify.run failed with return code: {}. Will retry", retries); + } else { + throw new RuntimeException("Verify.run failed with return code: " + retCode); + } + } else { + break; + } } if (!verify.verify(expectedNumNodes)) { From 0e45bd542ec59ec79014674a37e5f5a5df486c13 Mon Sep 17 00:00:00 2001 From: Charles Connell Date: Fri, 2 Feb 2024 09:17:58 -0500 Subject: [PATCH 04/37] HubSpot Edit: Add an hbase-site.xml to our bundles that configures ZStdCodec Co-authored-by: Charles Connell --- .../src/main/resources/hbase-site.xml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 hubspot-client-bundles/hbase-mapreduce-bundle/src/main/resources/hbase-site.xml diff --git a/hubspot-client-bundles/hbase-mapreduce-bundle/src/main/resources/hbase-site.xml b/hubspot-client-bundles/hbase-mapreduce-bundle/src/main/resources/hbase-site.xml new file mode 100644 index 000000000000..629c6f84f30e --- /dev/null +++ b/hubspot-client-bundles/hbase-mapreduce-bundle/src/main/resources/hbase-site.xml @@ -0,0 +1,12 @@ + + + + + + hbase.io.compress.zstd.codec + org.apache.hadoop.hbase.io.compress.zstd.ZstdCodec + + From 4af1ad32d3a176be8e13d5ef1dca5b8d4da54c69 Mon Sep 17 00:00:00 2001 From: Bryan Beaudreault Date: Fri, 19 Apr 2024 10:28:56 -0400 Subject: [PATCH 05/37] HubSpot Edit: Add hdfs stats for local and remote rack bytes read --- .../MetricsRegionServerSource.java | 8 ++++++ .../MetricsRegionServerWrapper.java | 4 +++ .../MetricsRegionServerSourceImpl.java | 4 +++ .../MetricsRegionServerWrapperImpl.java | 25 +++++++++++++++++++ .../MetricsRegionServerWrapperStub.java | 10 ++++++++ 5 files changed, 51 insertions(+) diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSource.java index c68809a1fddb..c23c222edc54 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSource.java @@ -533,6 +533,14 @@ public interface MetricsRegionServerSource extends BaseSource, JvmPauseMonitorSo String ZEROCOPY_BYTES_READ = "zeroCopyBytesRead"; String ZEROCOPY_BYTES_READ_DESC = "The number of bytes read through HDFS zero copy"; + String LOCAL_RACK_BYTES_READ = "localRackBytesRead"; + String LOCAL_RACK_BYTES_READ_DESC = + "The number of bytes read from the same rack of the RegionServer, but not the local HDFS DataNode"; + + String REMOTE_RACK_BYTES_READ = "remoteRackBytesRead"; + String REMOTE_RACK_BYTES_READ_DESC = + "The number of bytes read from a different rack from that of the RegionServer"; + String BLOCKED_REQUESTS_COUNT = "blockedRequestCount"; String BLOCKED_REQUESTS_COUNT_DESC = "The number of blocked requests because of memstore size is " + "larger than blockingMemStoreSize"; diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapper.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapper.java index 10e71d091f59..67d31ffe64c4 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapper.java +++ 
b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapper.java @@ -544,6 +544,10 @@ public interface MetricsRegionServerWrapper { /** Returns Number of bytes read from the local HDFS DataNode. */ long getLocalBytesRead(); + long getLocalRackBytesRead(); + + long getRemoteRackBytesRead(); + /** Returns Number of bytes read locally through HDFS short circuit. */ long getShortCircuitBytesRead(); diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSourceImpl.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSourceImpl.java index e0429cfb55d1..b42a02d0e659 100644 --- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSourceImpl.java +++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSourceImpl.java @@ -560,6 +560,10 @@ private MetricsRecordBuilder addGaugesToMetricsRecordBuilder(MetricsRecordBuilde PERCENT_FILES_LOCAL_SECONDARY_REGIONS_DESC), rsWrap.getPercentFileLocalSecondaryRegions()) .addGauge(Interns.info(TOTAL_BYTES_READ, TOTAL_BYTES_READ_DESC), rsWrap.getTotalBytesRead()) .addGauge(Interns.info(LOCAL_BYTES_READ, LOCAL_BYTES_READ_DESC), rsWrap.getLocalBytesRead()) + .addGauge(Interns.info(LOCAL_RACK_BYTES_READ, LOCAL_RACK_BYTES_READ_DESC), + rsWrap.getLocalRackBytesRead()) + .addGauge(Interns.info(REMOTE_RACK_BYTES_READ, REMOTE_RACK_BYTES_READ_DESC), + rsWrap.getRemoteRackBytesRead()) .addGauge(Interns.info(SHORTCIRCUIT_BYTES_READ, SHORTCIRCUIT_BYTES_READ_DESC), rsWrap.getShortCircuitBytesRead()) .addGauge(Interns.info(ZEROCOPY_BYTES_READ, ZEROCOPY_BYTES_READ_DESC), diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperImpl.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperImpl.java index 2bd396242a17..a256e8827a39 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperImpl.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperImpl.java @@ -29,6 +29,8 @@ import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.fs.GlobalStorageStatistics; +import org.apache.hadoop.fs.StorageStatistics; import org.apache.hadoop.hbase.CompatibilitySingletonFactory; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HDFSBlocksDistribution; @@ -1052,6 +1054,29 @@ public long getLocalBytesRead() { return FSDataInputStreamWrapper.getLocalBytesRead(); } + @Override + public long getLocalRackBytesRead() { + return getGlobalStorageStatistic("bytesReadDistanceOfOneOrTwo"); + } + + @Override + public long getRemoteRackBytesRead() { + return getGlobalStorageStatistic("bytesReadDistanceOfThreeOrFour") + + getGlobalStorageStatistic("bytesReadDistanceOfFiveOrLarger"); + } + + private static long getGlobalStorageStatistic(String name) { + StorageStatistics stats = GlobalStorageStatistics.INSTANCE.get("hdfs"); + if (stats == null) { + return 0; + } + Long val = stats.getLong(name); + if (val == null) { + return 0; + } + return val; + } + @Override public long getShortCircuitBytesRead() { return FSDataInputStreamWrapper.getShortCircuitBytesRead(); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperStub.java 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperStub.java index 0e77ae89fef2..84654784c58d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperStub.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerWrapperStub.java @@ -537,6 +537,16 @@ public long getLocalBytesRead() { return 0; } + @Override + public long getLocalRackBytesRead() { + return 0; + } + + @Override + public long getRemoteRackBytesRead() { + return 0; + } + @Override public long getShortCircuitBytesRead() { return 0; From 2709ece46ea7383359ddc0f9407a5ab1c413c625 Mon Sep 17 00:00:00 2001 From: Bryan Beaudreault Date: Thu, 18 Apr 2024 08:54:07 -0400 Subject: [PATCH 06/37] HubSpot Edit: Basic healthcheck servlets --- .../apache/hadoop/hbase/master/HMaster.java | 6 + .../master/http/MasterHealthServlet.java | 48 ++++++++ .../hbase/monitoring/HealthCheckServlet.java | 103 ++++++++++++++++++ .../hbase/regionserver/HRegionServer.java | 12 +- .../regionserver/http/RSHealthServlet.java | 95 ++++++++++++++++ 5 files changed, 263 insertions(+), 1 deletion(-) create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/master/http/MasterHealthServlet.java create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/monitoring/HealthCheckServlet.java create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/http/RSHealthServlet.java diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 9cafbb7cbf9e..21da55d7757b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -137,6 +137,7 @@ import org.apache.hadoop.hbase.master.cleaner.SnapshotCleanerChore; import org.apache.hadoop.hbase.master.hbck.HbckChore; import org.apache.hadoop.hbase.master.http.MasterDumpServlet; +import org.apache.hadoop.hbase.master.http.MasterHealthServlet; import org.apache.hadoop.hbase.master.http.MasterRedirectServlet; import org.apache.hadoop.hbase.master.http.MasterStatusServlet; import org.apache.hadoop.hbase.master.http.api_v1.ResourceConfigFactory; @@ -775,6 +776,11 @@ protected Class getDumpServlet() { return MasterDumpServlet.class; } + @Override + protected Class getHealthServlet() { + return MasterHealthServlet.class; + } + @Override public MetricsMaster getMasterMetrics() { return metricsMaster; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/http/MasterHealthServlet.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/http/MasterHealthServlet.java new file mode 100644 index 000000000000..99f2f08ac8bd --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/http/MasterHealthServlet.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.http; + +import java.io.IOException; +import java.util.EnumSet; +import java.util.Optional; +import javax.servlet.http.HttpServletRequest; +import org.apache.hadoop.hbase.ClusterMetrics; +import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.monitoring.HealthCheckServlet; +import org.apache.yetus.audience.InterfaceAudience; + +@InterfaceAudience.Private +public class MasterHealthServlet extends HealthCheckServlet { + + public MasterHealthServlet() { + super(HMaster.MASTER); + } + + @Override + protected Optional check(HMaster master, HttpServletRequest req, Connection conn) + throws IOException { + + if (master.isActiveMaster() && master.isOnline()) { + // this will fail if there is a problem with the active master + conn.getAdmin().getClusterMetrics(EnumSet.of(ClusterMetrics.Option.CLUSTER_ID)); + } + + return Optional.empty(); + } +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/monitoring/HealthCheckServlet.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/monitoring/HealthCheckServlet.java new file mode 100644 index 000000000000..8d09089b0c64 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/monitoring/HealthCheckServlet.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.hadoop.hbase.monitoring;
+
+import java.io.IOException;
+import java.util.Optional;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.client.Connection;
+import org.apache.hadoop.hbase.client.ConnectionFactory;
+import org.apache.hadoop.hbase.client.RpcConnectionRegistry;
+import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.yetus.audience.InterfaceAudience;
+
+@InterfaceAudience.Private
+public abstract class HealthCheckServlet<T extends HRegionServer> extends HttpServlet {
+
+  private static final String CLIENT_RPC_TIMEOUT = "healthcheck.hbase.client.rpc.timeout";
+  private static final int CLIENT_RPC_TIMEOUT_DEFAULT = 5000;
+  private static final String CLIENT_RETRIES = "healthcheck.hbase.client.retries";
+  private static final int CLIENT_RETRIES_DEFAULT = 2;
+  private static final String CLIENT_OPERATION_TIMEOUT =
+    "healthcheck.hbase.client.operation.timeout";
+  private static final int CLIENT_OPERATION_TIMEOUT_DEFAULT = 15000;
+
+  private final String serverLookupKey;
+
+  public HealthCheckServlet(String serverLookupKey) {
+    this.serverLookupKey = serverLookupKey;
+  }
+
+  @SuppressWarnings("unchecked")
+  @Override
+  protected void doGet(HttpServletRequest req, HttpServletResponse resp)
+    throws ServletException, IOException {
+    T server = (T) getServletContext().getAttribute(serverLookupKey);
+    try {
+      Optional<String> message = check(server, req);
+      resp.setStatus(200);
+      resp.getWriter().write(message.orElse("ok"));
+    } catch (Exception e) {
+      resp.setStatus(500);
+      resp.getWriter().write(e.toString());
+    } finally {
+      resp.getWriter().close();
+    }
+  }
+
+  private Optional<String> check(T server, HttpServletRequest req) throws IOException {
+    if (server == null) {
+      throw new IOException("Unable to get access to " + serverLookupKey);
+    }
+    if (server.isAborted() || server.isStopped() || server.isStopping() || server.isKilled()) {
+      throw new IOException("The " + serverLookupKey + " is stopping!");
+    }
+    if (!server.getRpcServer().isStarted()) {
+      throw new IOException("The " + serverLookupKey + "'s RpcServer is not started");
+    }
+
+    Configuration conf = new Configuration(server.getConfiguration());
+    conf.set(HConstants.CLIENT_CONNECTION_REGISTRY_IMPL_CONF_KEY,
+      RpcConnectionRegistry.class.getName());
+    conf.set(RpcConnectionRegistry.BOOTSTRAP_NODES, server.getServerName().getAddress().toString());
+    conf.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY,
+      conf.getInt(CLIENT_RPC_TIMEOUT, CLIENT_RPC_TIMEOUT_DEFAULT));
+    conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER,
+      conf.getInt(CLIENT_RETRIES, CLIENT_RETRIES_DEFAULT));
+    conf.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT,
+      conf.getInt(CLIENT_OPERATION_TIMEOUT, CLIENT_OPERATION_TIMEOUT_DEFAULT));
+
+    try (Connection conn = ConnectionFactory.createConnection(conf)) {
+      // this will fail if the server is not accepting requests
+      if (conn.getClusterId() == null) {
+        throw new IOException("Could not retrieve clusterId from self via rpc");
+      }
+
+      return check(server, req, conn);
+    }
+  }
+
+  protected abstract Optional<String> check(T server, HttpServletRequest req, Connection conn)
+    throws IOException;
+}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 351b4fef191e..89f528af7573 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -169,6 +169,7 @@ import org.apache.hadoop.hbase.regionserver.handler.RSProcedureHandler; import org.apache.hadoop.hbase.regionserver.handler.RegionReplicaFlushHandler; import org.apache.hadoop.hbase.regionserver.http.RSDumpServlet; +import org.apache.hadoop.hbase.regionserver.http.RSHealthServlet; import org.apache.hadoop.hbase.regionserver.http.RSStatusServlet; import org.apache.hadoop.hbase.regionserver.throttle.FlushThroughputControllerFactory; import org.apache.hadoop.hbase.regionserver.throttle.ThroughputController; @@ -383,7 +384,7 @@ public class HRegionServer extends Thread // A state before we go into stopped state. At this stage we're closing user // space regions. - private boolean stopping = false; + private volatile boolean stopping = false; private volatile boolean killed = false; private volatile boolean shutDown = false; @@ -864,6 +865,10 @@ protected Class getDumpServlet() { return RSDumpServlet.class; } + protected Class getHealthServlet() { + return RSHealthServlet.class; + } + /** * Used by {@link RSDumpServlet} to generate debugging information. */ @@ -2466,6 +2471,7 @@ private void putUpWebUI() throws IOException { try { this.infoServer = new InfoServer(getProcessName(), addr, port, false, this.conf); infoServer.addPrivilegedServlet("dump", "/dump", getDumpServlet()); + infoServer.addPrivilegedServlet("health", "/health", getHealthServlet()); configureInfoServer(); this.infoServer.start(); break; @@ -3193,6 +3199,10 @@ public boolean isStopping() { return this.stopping; } + public boolean isKilled() { + return this.killed; + } + @Override public Configuration getConfiguration() { return conf; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/http/RSHealthServlet.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/http/RSHealthServlet.java new file mode 100644 index 000000000000..bc0f35193389 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/http/RSHealthServlet.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.hadoop.hbase.regionserver.http;
+
+import java.io.IOException;
+import java.time.Duration;
+import java.time.Instant;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import javax.servlet.http.HttpServletRequest;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.hbase.client.Connection;
+import org.apache.hadoop.hbase.monitoring.HealthCheckServlet;
+import org.apache.hadoop.hbase.regionserver.HRegion;
+import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.yetus.audience.InterfaceAudience;
+
+@InterfaceAudience.Private
+public class RSHealthServlet extends HealthCheckServlet<HRegionServer> {
+
+  private final Map<String, Instant> regionUnavailableSince = new ConcurrentHashMap<>();
+
+  public RSHealthServlet() {
+    super(HRegionServer.REGIONSERVER);
+  }
+
+  @Override
+  protected Optional<String> check(HRegionServer regionServer, HttpServletRequest req,
+    Connection conn) throws IOException {
+    long maxUnavailableMillis = Optional.ofNullable(req.getParameter("maxUnavailableMillis"))
+      .filter(StringUtils::isNumeric).map(Long::parseLong).orElse(Long.MAX_VALUE);
+
+    Instant oldestUnavailableSince = Instant.MAX;
+    String longestUnavailableRegion = null;
+    int unavailableCount = 0;
+
+    synchronized (regionUnavailableSince) {
+      Set<String> regionsPreviouslyUnavailable = new HashSet<>(regionUnavailableSince.keySet());
+
+      for (HRegion region : regionServer.getOnlineRegionsLocalContext()) {
+        regionsPreviouslyUnavailable.remove(region.getRegionInfo().getEncodedName());
+        if (!region.isAvailable()) {
+          unavailableCount++;
+          Instant unavailableSince = regionUnavailableSince
+            .computeIfAbsent(region.getRegionInfo().getEncodedName(), k -> Instant.now());
+
+          if (unavailableSince.isBefore(oldestUnavailableSince)) {
+            oldestUnavailableSince = unavailableSince;
+            longestUnavailableRegion = region.getRegionInfo().getEncodedName();
+          }
+
+        } else {
+          regionUnavailableSince.remove(region.getRegionInfo().getEncodedName());
+        }
+      }
+
+      regionUnavailableSince.keySet().removeAll(regionsPreviouslyUnavailable);
+    }
+
+    String message = "ok";
+
+    if (unavailableCount > 0) {
+      Duration longestUnavailableRegionTime =
+        Duration.between(oldestUnavailableSince, Instant.now());
+      if (longestUnavailableRegionTime.toMillis() > maxUnavailableMillis) {
+        throw new IOException("Region " + longestUnavailableRegion
+          + " has been unavailable too long, since " + oldestUnavailableSince);
+      }
+
+      message += " - unavailableRegions: " + unavailableCount + ", longestUnavailableDuration: "
+        + longestUnavailableRegionTime + ", longestUnavailableRegion: " + longestUnavailableRegion;
+    }
+
+    return Optional.of(message);
+
+  }
+}

From 2fe5815d344b31262a9f9c7df2466d41fee7c25d Mon Sep 17 00:00:00 2001
From: Bryan Beaudreault
Date: Thu, 7 Mar 2024 16:57:11 -0500
Subject: [PATCH 07/37] HubSpot Edit: More info when interrupted while waiting on actions

---
 .../hbase/client/AsyncRequestFutureImpl.java  | 63 +++++++++++++++----
 .../hbase/client/MultiServerCallable.java     |  9 ++-
 2 files changed, 60 insertions(+), 12 deletions(-)

diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncRequestFutureImpl.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncRequestFutureImpl.java
index b34ef863d565..f3e9c0ed0178 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncRequestFutureImpl.java
+++
b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncRequestFutureImpl.java @@ -23,6 +23,7 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.Comparator; import java.util.Date; import java.util.HashMap; import java.util.List; @@ -34,6 +35,7 @@ import java.util.concurrent.RejectedExecutionException; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.HBaseServerException; import org.apache.hadoop.hbase.HConstants; @@ -52,6 +54,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hbase.thirdparty.com.google.common.base.Strings; + /** * The context, and return value, for a single submit/submitAll call. Note on how this class (one AP * submit) works. Initially, all requests are split into groups by server; request is sent to each @@ -195,7 +199,7 @@ public void run() { try { // setup the callable based on the actions, if we don't have one already from the request if (callable == null) { - callable = createCallable(server, tableName, multiAction); + callable = createCallable(server, tableName, multiAction, numAttempt); } RpcRetryingCaller caller = asyncProcess.createCaller(callable, rpcTimeout); @@ -387,10 +391,8 @@ public AsyncRequestFutureImpl(AsyncProcessTask task, List actions, long } else { this.replicaGetIndices = null; } - this.callsInProgress = !hasAnyReplicaGets - ? null - : Collections - .newSetFromMap(new ConcurrentHashMap()); + this.callsInProgress = + Collections.newSetFromMap(new ConcurrentHashMap()); this.asyncProcess = asyncProcess; this.errorsByServer = createServerErrorTracker(); this.errors = new BatchErrors(); @@ -540,7 +542,12 @@ private HRegionLocation getReplicaLocationOrFail(Action action) { private void manageLocationError(Action action, Exception ex) { String msg = - "Cannot get replica " + action.getReplicaId() + " location for " + action.getAction(); + "Cannot get replica " + action.getReplicaId() + " location for " + action.getAction() + ": "; + if (ex instanceof OperationTimeoutExceededException) { + msg += "Operation timeout exceeded."; + } else { + msg += ex == null ? 
"null cause" : ex.toString(); + } LOG.error(msg); if (ex == null) { ex = new IOException(msg); @@ -1247,20 +1254,31 @@ private String buildDetailedErrorMsg(String string, int index) { @Override public void waitUntilDone() throws InterruptedIOException { + long startTime = EnvironmentEdgeManager.currentTime(); try { if (this.operationTimeout > 0) { // the worker thread maybe over by some exception without decrement the actionsInProgress, // then the guarantee of operationTimeout will be broken, so we should set cutoff to avoid // stuck here forever - long cutoff = (EnvironmentEdgeManager.currentTime() + this.operationTimeout) * 1000L; + long cutoff = (startTime + this.operationTimeout) * 1000L; if (!waitUntilDone(cutoff)) { - throw new SocketTimeoutException("time out before the actionsInProgress changed to zero"); + String msg = "time out before the actionsInProgress changed to zero, with " + + actionsInProgress.get() + " remaining" + getServersInProgress(); + + throw new SocketTimeoutException(msg); } } else { waitUntilDone(Long.MAX_VALUE); } } catch (InterruptedException iex) { - throw new InterruptedIOException(iex.getMessage()); + long duration = EnvironmentEdgeManager.currentTime() - startTime; + String message = "Interrupted after waiting " + duration + "ms of " + operationTimeout + + "ms operation timeout, with " + actionsInProgress.get() + " remaining" + + getServersInProgress(); + if (!Strings.isNullOrEmpty(iex.getMessage())) { + message += ": " + iex.getMessage(); + } + throw new InterruptedIOException(message); } finally { if (callsInProgress != null) { for (CancellableRegionServerCallable clb : callsInProgress) { @@ -1270,6 +1288,29 @@ public void waitUntilDone() throws InterruptedIOException { } } + private String getServersInProgress() { + if (callsInProgress != null) { + Map serversInProgress = new HashMap<>(callsInProgress.size()); + for (CancellableRegionServerCallable callable : callsInProgress) { + if (callable instanceof MultiServerCallable) { + MultiServerCallable multiServerCallable = (MultiServerCallable) callable; + int numAttempt = multiServerCallable.getNumAttempt(); + serversInProgress.compute(multiServerCallable.getServerName(), + (k, v) -> v == null ? numAttempt : Math.max(v, numAttempt)); + } + } + + if (serversInProgress.size() > 0) { + return " on servers: " + serversInProgress.entrySet().stream() + .sorted(Comparator.> comparingInt(Map.Entry::getValue) + .reversed()) + .map(entry -> entry.getKey() + "(" + entry.getValue() + " attempts)") + .collect(Collectors.joining(", ")); + } + } + return ""; + } + private boolean waitUntilDone(long cutoff) throws InterruptedException { boolean hasWait = cutoff != Long.MAX_VALUE; long lastLog = EnvironmentEdgeManager.currentTime(); @@ -1336,10 +1377,10 @@ private ConnectionImplementation.ServerErrorTracker createServerErrorTracker() { * Create a callable. Isolated to be easily overridden in the tests. 
*/ private MultiServerCallable createCallable(final ServerName server, TableName tableName, - final MultiAction multi) { + final MultiAction multi, int numAttempt) { return new MultiServerCallable(asyncProcess.connection, tableName, server, multi, asyncProcess.rpcFactory.newController(), rpcTimeout, tracker, multi.getPriority(), - requestAttributes); + requestAttributes, numAttempt); } private void updateResult(int index, Object result) { diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MultiServerCallable.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MultiServerCallable.java index 6ba0832b26e5..33933dd5684f 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MultiServerCallable.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MultiServerCallable.java @@ -48,14 +48,17 @@ @InterfaceAudience.Private class MultiServerCallable extends CancellableRegionServerCallable { private MultiAction multiAction; + private final int numAttempt; private boolean cellBlock; MultiServerCallable(final ClusterConnection connection, final TableName tableName, final ServerName location, final MultiAction multi, RpcController rpcController, int rpcTimeout, - RetryingTimeTracker tracker, int priority, Map requestAttributes) { + RetryingTimeTracker tracker, int priority, Map requestAttributes, + int numAttempt) { super(connection, tableName, null, rpcController, rpcTimeout, tracker, priority, requestAttributes); this.multiAction = multi; + this.numAttempt = numAttempt; // RegionServerCallable has HRegionLocation field, but this is a multi-region request. // Using region info from parent HRegionLocation would be a mistake for this class; so // we will store the server here, and throw if someone tries to obtain location/regioninfo. @@ -63,6 +66,10 @@ class MultiServerCallable extends CancellableRegionServerCallable this.cellBlock = isCellBlock(); } + public int getNumAttempt() { + return numAttempt; + } + public void reset(ServerName location, MultiAction multiAction) { this.location = new HRegionLocation(null, location); this.multiAction = multiAction; From cc0df81869bff02004efef80e7edbbc1f2140f5d Mon Sep 17 00:00:00 2001 From: Wellington Ramos Chevreuil Date: Wed, 19 Jun 2024 14:38:18 +0100 Subject: [PATCH 08/37] HubSpot Backport: HBASE-28596: Optimise BucketCache usage upon regions splits/merges. 
(will be in 2.7.0) --- .../hadoop/hbase/io/HalfStoreFileReader.java | 42 ++++++ .../hadoop/hbase/io/hfile/BlockCache.java | 11 ++ .../hadoop/hbase/io/hfile/BlockCacheUtil.java | 42 ++++++ .../hadoop/hbase/io/hfile/CacheConfig.java | 3 + .../hbase/io/hfile/CombinedBlockCache.java | 5 + .../hadoop/hbase/io/hfile/HFileBlock.java | 13 +- .../hbase/io/hfile/HFilePreadReader.java | 2 +- .../hbase/io/hfile/HFileReaderImpl.java | 42 +++--- .../hbase/io/hfile/bucket/BucketCache.java | 116 +++++++++++----- .../TransitRegionStateProcedure.java | 6 +- .../hbase/regionserver/StoreFileReader.java | 2 +- .../handler/UnassignRegionHandler.java | 8 +- .../hadoop/hbase/TestSplitWithCache.java | 130 ++++++++++++++++++ .../hbase/io/TestHalfStoreFileReader.java | 37 +++-- .../hadoop/hbase/io/hfile/TestPrefetch.java | 8 -- .../io/hfile/TestPrefetchWithBucketCache.java | 70 +++++++++- .../bucket/TestBucketCachePersister.java | 6 + 17 files changed, 447 insertions(+), 96 deletions(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/TestSplitWithCache.java diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/HalfStoreFileReader.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/HalfStoreFileReader.java index 2119a3e7cbef..3a4b0437bfca 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/HalfStoreFileReader.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/HalfStoreFileReader.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.Optional; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.IntConsumer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.Cell; @@ -29,6 +30,7 @@ import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.io.hfile.CacheConfig; import org.apache.hadoop.hbase.io.hfile.HFileInfo; +import org.apache.hadoop.hbase.io.hfile.HFileReaderImpl; import org.apache.hadoop.hbase.io.hfile.HFileScanner; import org.apache.hadoop.hbase.io.hfile.ReaderContext; import org.apache.hadoop.hbase.regionserver.StoreFileInfo; @@ -64,6 +66,8 @@ public class HalfStoreFileReader extends StoreFileReader { private boolean firstKeySeeked = false; + private AtomicBoolean closed = new AtomicBoolean(false); + /** * Creates a half file reader for a hfile referred to by an hfilelink. * @param context Reader context info @@ -349,4 +353,42 @@ public long getFilterEntries() { // Estimate the number of entries as half the original file; this may be wildly inaccurate. return super.getFilterEntries() / 2; } + + /** + * Overrides close method to handle cache evictions for the referred file. If evictionOnClose is + * true, we will seek to the block containing the splitCell and evict all blocks from offset 0 up + * to that block offset if this is a bottom half reader, or the from the split block offset up to + * the end of the file if this is a top half reader. + * @param evictOnClose true if it should evict the file blocks from the cache. 
+ */ + @Override + public void close(boolean evictOnClose) throws IOException { + if (closed.compareAndSet(false, true)) { + if (evictOnClose) { + final HFileReaderImpl.HFileScannerImpl s = + (HFileReaderImpl.HFileScannerImpl) super.getScanner(false, true, false); + final String reference = this.reader.getHFileInfo().getHFileContext().getHFileName(); + final String referred = StoreFileInfo.getReferredToRegionAndFile(reference).getSecond(); + s.seekTo(splitCell); + if (s.getCurBlock() != null) { + long offset = s.getCurBlock().getOffset(); + LOG.trace("Seeking to split cell in reader: {} for file: {} top: {}, split offset: {}", + this, reference, top, offset); + ((HFileReaderImpl) reader).getCacheConf().getBlockCache().ifPresent(cache -> { + int numEvictedReferred = top + ? cache.evictBlocksRangeByHfileName(referred, offset, Long.MAX_VALUE) + : cache.evictBlocksRangeByHfileName(referred, 0, offset); + int numEvictedReference = cache.evictBlocksByHfileName(reference); + LOG.trace( + "Closing reference: {}; referred file: {}; was top? {}; evicted for referred: {};" + + "evicted for reference: {}", + reference, referred, top, numEvictedReferred, numEvictedReference); + }); + } + reader.close(false); + } else { + reader.close(evictOnClose); + } + } + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java index 5b11035ebe73..a468752de5cb 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCache.java @@ -235,4 +235,15 @@ default Optional>> getFullyCachedFiles() { default Optional> getRegionCachedInfo() { return Optional.empty(); } + + /** + * Evict all blocks for the given file name between the passed offset values. + * @param hfileName The file for which blocks should be evicted. + * @param initOffset the initial offset for the range of blocks to be evicted. + * @param endOffset the end offset for the range of blocks to be evicted. + * @return number of blocks evicted. + */ + default int evictBlocksRangeByHfileName(String hfileName, long initOffset, long endOffset) { + return 0; + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCacheUtil.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCacheUtil.java index 65b886f80ed5..7324701efe58 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCacheUtil.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCacheUtil.java @@ -17,6 +17,8 @@ */ package org.apache.hadoop.hbase.io.hfile; +import static org.apache.hadoop.hbase.io.hfile.HFileBlock.FILL_HEADER; + import java.io.IOException; import java.nio.ByteBuffer; import java.util.HashSet; @@ -28,8 +30,10 @@ import java.util.concurrent.ConcurrentSkipListSet; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.metrics.impl.FastLongHistogram; +import org.apache.hadoop.hbase.nio.ByteBuff; import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.ChecksumType; import org.apache.hadoop.hbase.util.GsonUtil; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; @@ -258,6 +262,44 @@ public static int getMaxCachedBlocksByFile(Configuration conf) { return conf == null ? 
DEFAULT_MAX : conf.getInt("hbase.ui.blockcache.by.file.max", DEFAULT_MAX); } + /** + * Similarly to HFileBlock.Writer.getBlockForCaching(), creates a HFileBlock instance without + * checksum for caching. This is needed for when we cache blocks via readers (either prefetch or + * client read), otherwise we may fail equality comparison when checking against same block that + * may already have been cached at write time. + * @param cacheConf the related CacheConfig object. + * @param block the HFileBlock instance to be converted. + * @return the resulting HFileBlock instance without checksum. + */ + public static HFileBlock getBlockForCaching(CacheConfig cacheConf, HFileBlock block) { + // Calculate how many bytes we need for checksum on the tail of the block. + int numBytes = cacheConf.shouldCacheCompressed(block.getBlockType().getCategory()) + ? 0 + : (int) ChecksumUtil.numBytes(block.getOnDiskDataSizeWithHeader(), + block.getHFileContext().getBytesPerChecksum()); + ByteBuff buff = block.getBufferReadOnly(); + HFileBlockBuilder builder = new HFileBlockBuilder(); + return builder.withBlockType(block.getBlockType()) + .withOnDiskSizeWithoutHeader(block.getOnDiskSizeWithoutHeader()) + .withUncompressedSizeWithoutHeader(block.getUncompressedSizeWithoutHeader()) + .withPrevBlockOffset(block.getPrevBlockOffset()).withByteBuff(buff) + .withFillHeader(FILL_HEADER).withOffset(block.getOffset()).withNextBlockOnDiskSize(-1) + .withOnDiskDataSizeWithHeader(block.getOnDiskDataSizeWithHeader() + numBytes) + .withHFileContext(cloneContext(block.getHFileContext())) + .withByteBuffAllocator(cacheConf.getByteBuffAllocator()).withShared(!buff.hasArray()).build(); + } + + public static HFileContext cloneContext(HFileContext context) { + HFileContext newContext = new HFileContextBuilder().withBlockSize(context.getBlocksize()) + .withBytesPerCheckSum(0).withChecksumType(ChecksumType.NULL) // no checksums in cached data + .withCompression(context.getCompression()) + .withDataBlockEncoding(context.getDataBlockEncoding()) + .withHBaseCheckSum(context.isUseHBaseChecksum()).withCompressTags(context.isCompressTags()) + .withIncludesMvcc(context.isIncludesMvcc()).withIncludesTags(context.isIncludesTags()) + .withColumnFamily(context.getColumnFamily()).withTableName(context.getTableName()).build(); + return newContext; + } + /** * Use one of these to keep a running account of cached blocks by file. Throw it away when done. * This is different than metrics in that it is stats on current state of a cache. See diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CacheConfig.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CacheConfig.java index 34c97ee64daa..92d7f4eb8903 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CacheConfig.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CacheConfig.java @@ -72,6 +72,8 @@ public class CacheConfig implements ConfigurationObserver { */ public static final String EVICT_BLOCKS_ON_CLOSE_KEY = "hbase.rs.evictblocksonclose"; + public static final String EVICT_BLOCKS_ON_SPLIT_KEY = "hbase.rs.evictblocksonsplit"; + /** * Configuration key to prefetch all blocks of a given file into the block cache when the file is * opened. 
@@ -107,6 +109,7 @@ public class CacheConfig implements ConfigurationObserver { public static final boolean DEFAULT_CACHE_INDEXES_ON_WRITE = false; public static final boolean DEFAULT_CACHE_BLOOMS_ON_WRITE = false; public static final boolean DEFAULT_EVICT_ON_CLOSE = false; + public static final boolean DEFAULT_EVICT_ON_SPLIT = true; public static final boolean DEFAULT_CACHE_DATA_COMPRESSED = false; public static final boolean DEFAULT_PREFETCH_ON_OPEN = false; public static final boolean DEFAULT_CACHE_COMPACTED_BLOCKS_ON_WRITE = false; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java index 00dc8e4a5551..ef536f9e0be3 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java @@ -492,4 +492,9 @@ public Optional getBlockSize(BlockCacheKey key) { return l1Result.isPresent() ? l1Result : l2Cache.getBlockSize(key); } + @Override + public int evictBlocksRangeByHfileName(String hfileName, long initOffset, long endOffset) { + return l1Cache.evictBlocksRangeByHfileName(hfileName, initOffset, endOffset) + + l2Cache.evictBlocksRangeByHfileName(hfileName, initOffset, endOffset); + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java index 16bec1e95888..4c73fc2bcdc7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java @@ -697,7 +697,7 @@ public boolean isUnpacked() { * when block is returned to the cache. * @return the offset of this block in the file it was read from */ - long getOffset() { + public long getOffset() { if (offset < 0) { throw new IllegalStateException("HFile block offset not initialized properly"); } @@ -1205,16 +1205,7 @@ void writeBlock(BlockWritable bw, FSDataOutputStream out) throws IOException { * being wholesome (ECC memory or if file-backed, it does checksumming). */ HFileBlock getBlockForCaching(CacheConfig cacheConf) { - HFileContext newContext = new HFileContextBuilder().withBlockSize(fileContext.getBlocksize()) - .withBytesPerCheckSum(0).withChecksumType(ChecksumType.NULL) // no checksums in cached data - .withCompression(fileContext.getCompression()) - .withDataBlockEncoding(fileContext.getDataBlockEncoding()) - .withHBaseCheckSum(fileContext.isUseHBaseChecksum()) - .withCompressTags(fileContext.isCompressTags()) - .withIncludesMvcc(fileContext.isIncludesMvcc()) - .withIncludesTags(fileContext.isIncludesTags()) - .withColumnFamily(fileContext.getColumnFamily()).withTableName(fileContext.getTableName()) - .build(); + HFileContext newContext = BlockCacheUtil.cloneContext(fileContext); // Build the HFileBlock. 
HFileBlockBuilder builder = new HFileBlockBuilder(); ByteBuff buff; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePreadReader.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePreadReader.java index 926237314828..b95ce4bde556 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePreadReader.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePreadReader.java @@ -46,7 +46,7 @@ public HFilePreadReader(ReaderContext context, HFileInfo fileInfo, CacheConfig c }); // Prefetch file blocks upon open if requested - if (cacheConf.shouldPrefetchOnOpen() && cacheIfCompactionsOff() && shouldCache.booleanValue()) { + if (cacheConf.shouldPrefetchOnOpen() && shouldCache.booleanValue()) { PrefetchExecutor.request(path, new Runnable() { @Override public void run() { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java index b6a061043070..db2383db399d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java @@ -17,7 +17,6 @@ */ package org.apache.hadoop.hbase.io.hfile; -import static org.apache.hadoop.hbase.regionserver.CompactSplit.HBASE_REGION_SERVER_ENABLE_COMPACTION; import static org.apache.hadoop.hbase.trace.HBaseSemanticAttributes.BLOCK_CACHE_KEY_KEY; import io.opentelemetry.api.common.Attributes; @@ -42,14 +41,12 @@ import org.apache.hadoop.hbase.SizeCachedKeyValue; import org.apache.hadoop.hbase.SizeCachedNoTagsByteBufferKeyValue; import org.apache.hadoop.hbase.SizeCachedNoTagsKeyValue; -import org.apache.hadoop.hbase.io.HFileLink; import org.apache.hadoop.hbase.io.compress.Compression; import org.apache.hadoop.hbase.io.encoding.DataBlockEncoder; import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding; import org.apache.hadoop.hbase.io.encoding.HFileBlockDecodingContext; import org.apache.hadoop.hbase.nio.ByteBuff; import org.apache.hadoop.hbase.regionserver.KeyValueScanner; -import org.apache.hadoop.hbase.regionserver.StoreFileInfo; import org.apache.hadoop.hbase.util.ByteBufferUtils; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.IdLock; @@ -159,6 +156,10 @@ public BlockIndexNotLoadedException(Path path) { } } + public CacheConfig getCacheConf() { + return cacheConf; + } + private Optional toStringFirstKey() { return getFirstKey().map(CellUtil::getCellKeyAsString); } @@ -307,7 +308,7 @@ public NotSeekedException(Path path) { } } - protected static class HFileScannerImpl implements HFileScanner { + public static class HFileScannerImpl implements HFileScanner { private ByteBuff blockBuffer; protected final boolean cacheBlocks; protected final boolean pread; @@ -340,6 +341,11 @@ protected static class HFileScannerImpl implements HFileScanner { // Whether we returned a result for curBlock's size in recordBlockSize(). // gets reset whenever curBlock is changed. 
private boolean providedCurrentBlockSize = false; + + public HFileBlock getCurBlock() { + return curBlock; + } + // Previous blocks that were used in the course of the read protected final ArrayList prevBlocks = new ArrayList<>(); @@ -1292,8 +1298,6 @@ public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize, final bo BlockCacheKey cacheKey = new BlockCacheKey(path, dataBlockOffset, this.isPrimaryReplicaReader(), expectedBlockType); - boolean cacheable = cacheBlock && cacheIfCompactionsOff(); - boolean useLock = false; IdLock.Entry lockEntry = null; final Span span = Span.current(); @@ -1340,7 +1344,7 @@ public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize, final bo return cachedBlock; } - if (!useLock && cacheable && cacheConf.shouldLockOnCacheMiss(expectedBlockType)) { + if (!useLock && cacheBlock && cacheConf.shouldLockOnCacheMiss(expectedBlockType)) { // check cache again with lock useLock = true; continue; @@ -1351,7 +1355,7 @@ public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize, final bo span.addEvent("block cache miss", attributes); // Load block from filesystem. HFileBlock hfileBlock = fsBlockReader.readBlockData(dataBlockOffset, onDiskBlockSize, pread, - !isCompaction, shouldUseHeap(expectedBlockType, cacheable)); + !isCompaction, shouldUseHeap(expectedBlockType, cacheBlock)); try { validateBlockType(hfileBlock, expectedBlockType); } catch (IOException e) { @@ -1364,25 +1368,30 @@ public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize, final bo // Don't need the unpacked block back and we're storing the block in the cache compressed if (cacheOnly && cacheCompressed && cacheOnRead) { + HFileBlock blockNoChecksum = BlockCacheUtil.getBlockForCaching(cacheConf, hfileBlock); cacheConf.getBlockCache().ifPresent(cache -> { LOG.debug("Skipping decompression of block {} in prefetch", cacheKey); // Cache the block if necessary - if (cacheable && cacheConf.shouldCacheBlockOnRead(category)) { - cache.cacheBlock(cacheKey, hfileBlock, cacheConf.isInMemory(), cacheOnly); + if (cacheBlock && cacheConf.shouldCacheBlockOnRead(category)) { + cache.cacheBlock(cacheKey, blockNoChecksum, cacheConf.isInMemory(), cacheOnly); } }); if (updateCacheMetrics && hfileBlock.getBlockType().isData()) { HFile.DATABLOCK_READ_COUNT.increment(); } - return hfileBlock; + return blockNoChecksum; } HFileBlock unpacked = hfileBlock.unpack(hfileContext, fsBlockReader); + HFileBlock unpackedNoChecksum = BlockCacheUtil.getBlockForCaching(cacheConf, unpacked); // Cache the block if necessary cacheConf.getBlockCache().ifPresent(cache -> { - if (cacheable && cacheConf.shouldCacheBlockOnRead(category)) { + if (cacheBlock && cacheConf.shouldCacheBlockOnRead(category)) { // Using the wait on cache during compaction and prefetching. - cache.cacheBlock(cacheKey, cacheCompressed ? hfileBlock : unpacked, + cache.cacheBlock(cacheKey, + cacheCompressed + ? 
BlockCacheUtil.getBlockForCaching(cacheConf, hfileBlock) + : unpackedNoChecksum, cacheConf.isInMemory(), cacheOnly); } }); @@ -1394,7 +1403,7 @@ public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize, final bo HFile.DATABLOCK_READ_COUNT.increment(); } - return unpacked; + return unpackedNoChecksum; } } finally { if (lockEntry != null) { @@ -1716,9 +1725,4 @@ public int getMajorVersion() { public void unbufferStream() { fsBlockReader.unbufferStream(); } - - protected boolean cacheIfCompactionsOff() { - return (!StoreFileInfo.isReference(name) && !HFileLink.isHFileLink(name)) - || !conf.getBoolean(HBASE_REGION_SERVER_ENABLE_COMPACTION, true); - } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/bucket/BucketCache.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/bucket/BucketCache.java index 3b08655bcfb3..cd82af74108a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/bucket/BucketCache.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/bucket/BucketCache.java @@ -79,6 +79,7 @@ import org.apache.hadoop.hbase.nio.RefCnt; import org.apache.hadoop.hbase.protobuf.ProtobufMagic; import org.apache.hadoop.hbase.regionserver.HRegion; +import org.apache.hadoop.hbase.regionserver.StoreFileInfo; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.IdReadWriteLock; @@ -222,6 +223,8 @@ public class BucketCache implements BlockCache, HeapSize { // reset after a successful read/write. private volatile long ioErrorStartTime = -1; + private Configuration conf; + /** * A ReentrantReadWriteLock to lock on a particular block identified by offset. The purpose of * this is to avoid freeing the block which is being read. @@ -582,6 +585,30 @@ protected void cacheBlockWithWaitInternal(BlockCacheKey cacheKey, Cacheable cach } } + /** + * If the passed cache key relates to a reference (.), this method looks + * for the block from the referred file, in the cache. If present in the cache, the block for the + * referred file is returned, otherwise, this method returns null. It will also return null if the + * passed cache key doesn't relate to a reference. + * @param key the BlockCacheKey instance to look for in the cache. + * @return the cached block from the referred file, null if there's no such block in the cache or + * the passed key doesn't relate to a reference. + */ + public BucketEntry getBlockForReference(BlockCacheKey key) { + BucketEntry foundEntry = null; + String referredFileName = null; + if (StoreFileInfo.isReference(key.getHfileName())) { + referredFileName = StoreFileInfo.getReferredToRegionAndFile(key.getHfileName()).getSecond(); + } + if (referredFileName != null) { + BlockCacheKey convertedCacheKey = new BlockCacheKey(referredFileName, key.getOffset()); + foundEntry = backingMap.get(convertedCacheKey); + LOG.debug("Got a link/ref: {}. Related cacheKey: {}. Found entry: {}", key.getHfileName(), + convertedCacheKey, foundEntry); + } + return foundEntry; + } + /** * Get the buffer of the block with the specified key. 
* @param key block's cache key @@ -605,6 +632,9 @@ public Cacheable getBlock(BlockCacheKey key, boolean caching, boolean repeat, return re.getData(); } BucketEntry bucketEntry = backingMap.get(key); + if (bucketEntry == null) { + bucketEntry = getBlockForReference(key); + } if (bucketEntry != null) { long start = System.nanoTime(); ReentrantReadWriteLock lock = offsetLock.getLock(bucketEntry.offset()); @@ -613,7 +643,9 @@ public Cacheable getBlock(BlockCacheKey key, boolean caching, boolean repeat, // We can not read here even if backingMap does contain the given key because its offset // maybe changed. If we lock BlockCacheKey instead of offset, then we can only check // existence here. - if (bucketEntry.equals(backingMap.get(key))) { + if ( + bucketEntry.equals(backingMap.get(key)) || bucketEntry.equals(getBlockForReference(key)) + ) { // Read the block from IOEngine based on the bucketEntry's offset and length, NOTICE: the // block will use the refCnt of bucketEntry, which means if two HFileBlock mapping to // the same BucketEntry, then all of the three will share the same refCnt. @@ -1750,8 +1782,15 @@ protected String getAlgorithm() { */ @Override public int evictBlocksByHfileName(String hfileName) { + return evictBlocksRangeByHfileName(hfileName, 0, Long.MAX_VALUE); + } + + @Override + public int evictBlocksRangeByHfileName(String hfileName, long initOffset, long endOffset) { fileNotFullyCached(hfileName); - Set keySet = getAllCacheKeysForFile(hfileName); + Set keySet = getAllCacheKeysForFile(hfileName, initOffset, endOffset); + LOG.debug("found {} blocks for file {}, starting offset: {}, end offset: {}", keySet.size(), + hfileName, initOffset, endOffset); int numEvicted = 0; for (BlockCacheKey key : keySet) { if (evictBlock(key)) { @@ -1761,9 +1800,9 @@ public int evictBlocksByHfileName(String hfileName) { return numEvicted; } - private Set getAllCacheKeysForFile(String hfileName) { - return blocksByHFile.subSet(new BlockCacheKey(hfileName, Long.MIN_VALUE), true, - new BlockCacheKey(hfileName, Long.MAX_VALUE), true); + private Set getAllCacheKeysForFile(String hfileName, long init, long end) { + return blocksByHFile.subSet(new BlockCacheKey(hfileName, init), true, + new BlockCacheKey(hfileName, end), true); } /** @@ -2173,25 +2212,20 @@ public void notifyFileCachingCompleted(Path fileName, int totalBlockCount, int d try { final MutableInt count = new MutableInt(); LOG.debug("iterating over {} entries in the backing map", backingMap.size()); - backingMap.entrySet().stream().forEach(entry -> { - if ( - entry.getKey().getHfileName().equals(fileName.getName()) - && entry.getKey().getBlockType().equals(BlockType.DATA) - ) { - long offsetToLock = entry.getValue().offset(); - LOG.debug("found block {} in the backing map. 
Acquiring read lock for offset {}", - entry.getKey(), offsetToLock); - ReentrantReadWriteLock lock = offsetLock.getLock(offsetToLock); - lock.readLock().lock(); - locks.add(lock); - // rechecks the given key is still there (no eviction happened before the lock acquired) - if (backingMap.containsKey(entry.getKey())) { - count.increment(); - } else { - lock.readLock().unlock(); - locks.remove(lock); - LOG.debug("found block {}, but when locked and tried to count, it was gone."); - } + Set result = getAllCacheKeysForFile(fileName.getName(), 0, Long.MAX_VALUE); + if (result.isEmpty() && StoreFileInfo.isReference(fileName)) { + result = getAllCacheKeysForFile( + StoreFileInfo.getReferredToRegionAndFile(fileName.getName()).getSecond(), 0, + Long.MAX_VALUE); + } + result.stream().forEach(entry -> { + LOG.debug("found block for file {} in the backing map. Acquiring read lock for offset {}", + fileName.getName(), entry.getOffset()); + ReentrantReadWriteLock lock = offsetLock.getLock(entry.getOffset()); + lock.readLock().lock(); + locks.add(lock); + if (backingMap.containsKey(entry) && entry.getBlockType() == BlockType.DATA) { + count.increment(); } }); int metaCount = totalBlockCount - dataBlockCount; @@ -2214,17 +2248,19 @@ public void notifyFileCachingCompleted(Path fileName, int totalBlockCount, int d + "and try the verification again.", fileName.getName()); Thread.sleep(100); notifyFileCachingCompleted(fileName, totalBlockCount, dataBlockCount, size); - } else - if ((getAllCacheKeysForFile(fileName.getName()).size() - metaCount) == dataBlockCount) { - LOG.debug("We counted {} data blocks, expected was {}, there was no more pending in " - + "the cache write queue but we now found that total cached blocks for file {} " - + "is equal to data block count.", count, dataBlockCount, fileName.getName()); - fileCacheCompleted(fileName, size); - } else { - LOG.info("We found only {} data blocks cached from a total of {} for file {}, " - + "but no blocks pending caching. Maybe cache is full or evictions " - + "happened concurrently to cache prefetch.", count, dataBlockCount, fileName); - } + } else if ( + (getAllCacheKeysForFile(fileName.getName(), 0, Long.MAX_VALUE).size() - metaCount) + == dataBlockCount + ) { + LOG.debug("We counted {} data blocks, expected was {}, there was no more pending in " + + "the cache write queue but we now found that total cached blocks for file {} " + + "is equal to data block count.", count, dataBlockCount, fileName.getName()); + fileCacheCompleted(fileName, size); + } else { + LOG.info("We found only {} data blocks cached from a total of {} for file {}, " + + "but no blocks pending caching. Maybe cache is full or evictions " + + "happened concurrently to cache prefetch.", count, dataBlockCount, fileName); + } } } catch (InterruptedException e) { throw new RuntimeException(e); @@ -2250,14 +2286,20 @@ public Optional shouldCacheFile(String fileName) { @Override public Optional isAlreadyCached(BlockCacheKey key) { - return Optional.of(getBackingMap().containsKey(key)); + boolean foundKey = backingMap.containsKey(key); + // if there's no entry for the key itself, we need to check if this key is for a reference, + // and if so, look for a block from the referenced file using this getBlockForReference method. + return Optional.of(foundKey ? 
true : getBlockForReference(key) != null); } @Override public Optional getBlockSize(BlockCacheKey key) { BucketEntry entry = backingMap.get(key); if (entry == null) { - return Optional.empty(); + // the key might be for a reference tha we had found the block from the referenced file in + // the cache when we first tried to cache it. + entry = getBlockForReference(key); + return entry == null ? Optional.empty() : Optional.of(entry.getOnDiskSizeWithHeader()); } else { return Optional.of(entry.getOnDiskSizeWithHeader()); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java index 81397915647d..b1d4483bd9e5 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java @@ -18,7 +18,9 @@ package org.apache.hadoop.hbase.master.assignment; import static org.apache.hadoop.hbase.io.hfile.CacheConfig.DEFAULT_EVICT_ON_CLOSE; +import static org.apache.hadoop.hbase.io.hfile.CacheConfig.DEFAULT_EVICT_ON_SPLIT; import static org.apache.hadoop.hbase.io.hfile.CacheConfig.EVICT_BLOCKS_ON_CLOSE_KEY; +import static org.apache.hadoop.hbase.io.hfile.CacheConfig.EVICT_BLOCKS_ON_SPLIT_KEY; import static org.apache.hadoop.hbase.master.LoadBalancer.BOGUS_SERVER_NAME; import static org.apache.hadoop.hbase.master.assignment.AssignmentManager.FORCE_REGION_RETAINMENT; @@ -335,7 +337,9 @@ private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode) thr env.getAssignmentManager().regionClosing(regionNode); CloseRegionProcedure closeProc = isSplit ? 
new CloseRegionProcedure(this, getRegion(), regionNode.getRegionLocation(), - assignCandidate, true) + assignCandidate, + env.getMasterConfiguration().getBoolean(EVICT_BLOCKS_ON_SPLIT_KEY, + DEFAULT_EVICT_ON_SPLIT)) : new CloseRegionProcedure(this, getRegion(), regionNode.getRegionLocation(), assignCandidate, evictCache); addChildProcedure(closeProc); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFileReader.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFileReader.java index 09c379227bda..e241bf0a5d34 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFileReader.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFileReader.java @@ -68,7 +68,7 @@ public class StoreFileReader { protected BloomFilter deleteFamilyBloomFilter = null; private BloomFilterMetrics bloomFilterMetrics = null; protected BloomType bloomFilterType; - private final HFile.Reader reader; + protected final HFile.Reader reader; protected long sequenceID = -1; protected TimeRange timeRange = null; private byte[] lastBloomKey; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java index 2419e709686a..8f8668aa87a8 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java @@ -126,11 +126,9 @@ public void process() throws IOException { region.getCoprocessorHost().preClose(abort); } // This should be true only in the case of splits/merges closing the parent regions, as - // there's no point on keep blocks for those region files. As hbase.rs.evictblocksonclose is - // false by default we don't bother overriding it if evictCache is false. - if (evictCache) { - region.getStores().forEach(s -> s.getCacheConfig().setEvictOnClose(true)); - } + // there's no point on keep blocks for those region files. + region.getStores().forEach(s -> s.getCacheConfig().setEvictOnClose(evictCache)); + if (region.close(abort) == null) { // XXX: Is this still possible? The old comment says about split, but now split is done at // master side, so... diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestSplitWithCache.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestSplitWithCache.java new file mode 100644 index 000000000000..91e65610f81c --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestSplitWithCache.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hbase; + +import static org.apache.hadoop.hbase.HConstants.BUCKET_CACHE_IOENGINE_KEY; +import static org.apache.hadoop.hbase.HConstants.BUCKET_CACHE_SIZE_KEY; +import static org.apache.hadoop.hbase.io.hfile.CacheConfig.CACHE_BLOCKS_ON_WRITE_KEY; +import static org.apache.hadoop.hbase.io.hfile.CacheConfig.EVICT_BLOCKS_ON_SPLIT_KEY; +import static org.apache.hadoop.hbase.io.hfile.CacheConfig.PREFETCH_BLOCKS_ON_OPEN_KEY; +import static org.junit.Assert.assertTrue; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.function.BiConsumer; +import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.client.TableDescriptor; +import org.apache.hadoop.hbase.client.TableDescriptorBuilder; +import org.apache.hadoop.hbase.regionserver.HStoreFile; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.testclassification.MiscTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.Pair; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Category({ MiscTests.class, MediumTests.class }) +public class TestSplitWithCache { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestSplitWithCache.class); + + private static final Logger LOG = LoggerFactory.getLogger(TestSplitWithCache.class); + + private static final HBaseTestingUtility UTIL = new HBaseTestingUtility(); + + @BeforeClass + public static void setUp() throws Exception { + UTIL.getConfiguration().setInt(HConstants.HBASE_CLIENT_META_OPERATION_TIMEOUT, 1000); + UTIL.getConfiguration().setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 2); + UTIL.getConfiguration().setBoolean(CACHE_BLOCKS_ON_WRITE_KEY, true); + UTIL.getConfiguration().setBoolean(PREFETCH_BLOCKS_ON_OPEN_KEY, true); + UTIL.getConfiguration().set(BUCKET_CACHE_IOENGINE_KEY, "offheap"); + UTIL.getConfiguration().setInt(BUCKET_CACHE_SIZE_KEY, 200); + } + + @Test + public void testEvictOnSplit() throws Exception { + doTest("testEvictOnSplit", true, + (f, m) -> Waiter.waitFor(UTIL.getConfiguration(), 1000, () -> m.get(f) != null), + (f, m) -> Waiter.waitFor(UTIL.getConfiguration(), 1000, () -> m.get(f) == null)); + } + + @Test + public void testDoesntEvictOnSplit() throws Exception { + doTest("testDoesntEvictOnSplit", false, + (f, m) -> Waiter.waitFor(UTIL.getConfiguration(), 1000, () -> m.get(f) != null), + (f, m) -> Waiter.waitFor(UTIL.getConfiguration(), 1000, () -> m.get(f) != null)); + } + + private void doTest(String table, boolean evictOnSplit, + BiConsumer>> predicateBeforeSplit, + BiConsumer>> predicateAfterSplit) throws Exception { + UTIL.getConfiguration().setBoolean(EVICT_BLOCKS_ON_SPLIT_KEY, evictOnSplit); + UTIL.startMiniCluster(1); + try { + TableName tableName = TableName.valueOf(table); + byte[] family = Bytes.toBytes("CF"); + TableDescriptor td = TableDescriptorBuilder.newBuilder(tableName) + .setColumnFamily(ColumnFamilyDescriptorBuilder.of(family)).build(); + UTIL.getAdmin().createTable(td); + UTIL.waitTableAvailable(tableName); + Table tbl = UTIL.getConnection().getTable(tableName); + List puts = new ArrayList<>(); + for (int i = 0; i < 1000; i++) { + Put p = new 
Put(Bytes.toBytes("row-" + i)); + p.addColumn(family, Bytes.toBytes(1), Bytes.toBytes("val-" + i)); + puts.add(p); + } + tbl.put(puts); + UTIL.getAdmin().flush(tableName); + Collection files = + UTIL.getMiniHBaseCluster().getRegions(tableName).get(0).getStores().get(0).getStorefiles(); + checkCacheForBlocks(tableName, files, predicateBeforeSplit); + UTIL.getAdmin().split(tableName, Bytes.toBytes("row-500")); + Waiter.waitFor(UTIL.getConfiguration(), 30000, + () -> UTIL.getMiniHBaseCluster().getRegions(tableName).size() == 2); + UTIL.waitUntilNoRegionsInTransition(); + checkCacheForBlocks(tableName, files, predicateAfterSplit); + } finally { + UTIL.shutdownMiniCluster(); + } + + } + + private void checkCacheForBlocks(TableName tableName, Collection files, + BiConsumer>> checker) { + files.forEach(f -> { + UTIL.getMiniHBaseCluster().getRegionServer(0).getBlockCache().ifPresent(cache -> { + cache.getFullyCachedFiles().ifPresent(m -> { + checker.accept(f.getPath().getName(), m); + }); + assertTrue(cache.getFullyCachedFiles().isPresent()); + }); + }); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/TestHalfStoreFileReader.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/TestHalfStoreFileReader.java index 13955ccebfec..0ac03b8d4136 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/TestHalfStoreFileReader.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/TestHalfStoreFileReader.java @@ -25,6 +25,7 @@ import java.util.ArrayList; import java.util.List; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.Cell; @@ -42,6 +43,7 @@ import org.apache.hadoop.hbase.io.hfile.ReaderContext; import org.apache.hadoop.hbase.io.hfile.ReaderContextBuilder; import org.apache.hadoop.hbase.regionserver.StoreFileInfo; +import org.apache.hadoop.hbase.regionserver.StoreFileWriter; import org.apache.hadoop.hbase.testclassification.IOTests; import org.apache.hadoop.hbase.testclassification.SmallTests; import org.apache.hadoop.hbase.util.Bytes; @@ -82,15 +84,19 @@ public static void tearDownAfterClass() throws Exception { */ @Test public void testHalfScanAndReseek() throws IOException { - String root_dir = TEST_UTIL.getDataTestDir().toString(); - Path p = new Path(root_dir, "test"); - Configuration conf = TEST_UTIL.getConfiguration(); FileSystem fs = FileSystem.get(conf); + String root_dir = TEST_UTIL.getDataTestDir().toString(); + Path parentPath = new Path(new Path(root_dir, "parent"), "CF"); + fs.mkdirs(parentPath); + Path splitAPath = new Path(new Path(root_dir, "splita"), "CF"); + Path splitBPath = new Path(new Path(root_dir, "splitb"), "CF"); + Path filePath = StoreFileWriter.getUniqueFile(fs, parentPath); + CacheConfig cacheConf = new CacheConfig(conf); HFileContext meta = new HFileContextBuilder().withBlockSize(1024).build(); HFile.Writer w = - HFile.getWriterFactory(conf, cacheConf).withPath(fs, p).withFileContext(meta).create(); + HFile.getWriterFactory(conf, cacheConf).withPath(fs, filePath).withFileContext(meta).create(); // write some things. 
List items = genSomeKeys(); @@ -99,26 +105,35 @@ public void testHalfScanAndReseek() throws IOException { } w.close(); - HFile.Reader r = HFile.createReader(fs, p, cacheConf, true, conf); + HFile.Reader r = HFile.createReader(fs, filePath, cacheConf, true, conf); Cell midKV = r.midKey().get(); byte[] midkey = CellUtil.cloneRow(midKV); - // System.out.println("midkey: " + midKV + " or: " + Bytes.toStringBinary(midkey)); + Path splitFileA = new Path(splitAPath, filePath.getName() + ".parent"); + Path splitFileB = new Path(splitBPath, filePath.getName() + ".parent"); Reference bottom = new Reference(midkey, Reference.Range.bottom); - doTestOfScanAndReseek(p, fs, bottom, cacheConf); + bottom.write(fs, splitFileA); + doTestOfScanAndReseek(splitFileA, fs, bottom, cacheConf); Reference top = new Reference(midkey, Reference.Range.top); - doTestOfScanAndReseek(p, fs, top, cacheConf); + top.write(fs, splitFileB); + doTestOfScanAndReseek(splitFileB, fs, top, cacheConf); r.close(); } private void doTestOfScanAndReseek(Path p, FileSystem fs, Reference bottom, CacheConfig cacheConf) throws IOException { - ReaderContext context = new ReaderContextBuilder().withFileSystemAndPath(fs, p).build(); - StoreFileInfo storeFileInfo = - new StoreFileInfo(TEST_UTIL.getConfiguration(), fs, fs.getFileStatus(p), bottom); + Path referencePath = StoreFileInfo.getReferredToFile(p); + FSDataInputStreamWrapper in = new FSDataInputStreamWrapper(fs, referencePath, false, 0); + FileStatus status = fs.getFileStatus(referencePath); + long length = status.getLen(); + ReaderContextBuilder contextBuilder = + new ReaderContextBuilder().withInputStreamWrapper(in).withFileSize(length) + .withReaderType(ReaderContext.ReaderType.PREAD).withFileSystem(fs).withFilePath(p); + ReaderContext context = contextBuilder.build(); + StoreFileInfo storeFileInfo = new StoreFileInfo(TEST_UTIL.getConfiguration(), fs, p, true); storeFileInfo.initHFileInfo(context); final HalfStoreFileReader halfreader = (HalfStoreFileReader) storeFileInfo.createReader(context, cacheConf); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestPrefetch.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestPrefetch.java index cd2793b8cea0..8e278e40336e 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestPrefetch.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestPrefetch.java @@ -285,14 +285,6 @@ public void testPrefetchCompressed() throws Exception { conf.setBoolean(CACHE_DATA_BLOCKS_COMPRESSED_KEY, false); } - @Test - public void testPrefetchSkipsRefs() throws Exception { - testPrefetchWhenRefs(true, c -> { - boolean isCached = c != null; - assertFalse(isCached); - }); - } - @Test public void testPrefetchDoesntSkipRefs() throws Exception { testPrefetchWhenRefs(false, c -> { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestPrefetchWithBucketCache.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestPrefetchWithBucketCache.java index db8f2213d0c0..c3954d3cf901 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestPrefetchWithBucketCache.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestPrefetchWithBucketCache.java @@ -22,6 +22,7 @@ import static org.apache.hadoop.hbase.io.hfile.BlockCacheFactory.BUCKET_CACHE_BUCKETS_KEY; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; import static 
org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -39,13 +40,20 @@ import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.Waiter; import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.RegionInfoBuilder; import org.apache.hadoop.hbase.fs.HFileSystem; import org.apache.hadoop.hbase.io.ByteBuffAllocator; import org.apache.hadoop.hbase.io.hfile.bucket.BucketCache; import org.apache.hadoop.hbase.io.hfile.bucket.BucketEntry; +import org.apache.hadoop.hbase.regionserver.BloomType; +import org.apache.hadoop.hbase.regionserver.ConstantSizeRegionSplitPolicy; +import org.apache.hadoop.hbase.regionserver.HRegionFileSystem; +import org.apache.hadoop.hbase.regionserver.HStoreFile; import org.apache.hadoop.hbase.regionserver.StoreFileWriter; import org.apache.hadoop.hbase.testclassification.IOTests; import org.apache.hadoop.hbase.testclassification.MediumTests; @@ -135,6 +143,55 @@ public void testPrefetchDoesntOverwork() throws Exception { assertTrue(snapshot.get(key).getCachedTime() < bc.getBackingMap().get(key).getCachedTime()); } + @Test + public void testPrefetchRefsAfterSplit() throws Exception { + conf.setLong(BUCKET_CACHE_SIZE_KEY, 200); + blockCache = BlockCacheFactory.createBlockCache(conf); + cacheConf = new CacheConfig(conf, blockCache); + + Path tableDir = new Path(TEST_UTIL.getDataTestDir(), "testPrefetchRefsAfterSplit"); + RegionInfo region = RegionInfoBuilder.newBuilder(TableName.valueOf(tableDir.getName())).build(); + Path regionDir = new Path(tableDir, region.getEncodedName()); + Path cfDir = new Path(regionDir, "cf"); + HRegionFileSystem regionFS = + HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, region); + Path storeFile = writeStoreFile(100, cfDir); + + // Prefetches the file blocks + LOG.debug("First read should prefetch the blocks."); + readStoreFile(storeFile); + BucketCache bc = BucketCache.getBucketCacheFromCacheConfig(cacheConf).get(); + // Our file should have 6 DATA blocks. We should wait for all of them to be cached + Waiter.waitFor(conf, 300, () -> bc.getBackingMap().size() == 6); + + // split the file and return references to the original file + Random rand = ThreadLocalRandom.current(); + byte[] splitPoint = RandomKeyValueUtil.randomOrderedKey(rand, 50); + HStoreFile file = new HStoreFile(fs, storeFile, conf, cacheConf, BloomType.NONE, true); + Path ref = regionFS.splitStoreFile(region, "cf", file, splitPoint, false, + new ConstantSizeRegionSplitPolicy()); + HStoreFile refHsf = new HStoreFile(this.fs, ref, conf, cacheConf, BloomType.NONE, true); + // starts reader for the ref. The ref should resolve to the original file blocks + // and not duplicate blocks in the cache. + refHsf.initReader(); + HFile.Reader reader = refHsf.getReader().getHFileReader(); + while (!reader.prefetchComplete()) { + // Sleep for a bit + Thread.sleep(1000); + } + // the ref file blocks keys should actually resolve to the referred file blocks, + // so we should not see additional blocks in the cache. 
+ Waiter.waitFor(conf, 300, () -> bc.getBackingMap().size() == 6); + + BlockCacheKey refCacheKey = new BlockCacheKey(ref.getName(), 0); + Cacheable result = bc.getBlock(refCacheKey, true, false, true); + assertNotNull(result); + BlockCacheKey fileCacheKey = new BlockCacheKey(file.getPath().getName(), 0); + assertEquals(result, bc.getBlock(fileCacheKey, true, false, true)); + assertNull(bc.getBackingMap().get(refCacheKey)); + assertNotNull(bc.getBlockForReference(refCacheKey)); + } + @Test public void testPrefetchInterruptOnCapacity() throws Exception { conf.setLong(BUCKET_CACHE_SIZE_KEY, 1); @@ -270,10 +327,19 @@ private Path writeStoreFile(String fname, int numKVs) throws IOException { return writeStoreFile(fname, meta, numKVs); } + private Path writeStoreFile(int numKVs, Path regionCFDir) throws IOException { + HFileContext meta = new HFileContextBuilder().withBlockSize(DATA_BLOCK_SIZE).build(); + return writeStoreFile(meta, numKVs, regionCFDir); + } + private Path writeStoreFile(String fname, HFileContext context, int numKVs) throws IOException { - Path storeFileParentDir = new Path(TEST_UTIL.getDataTestDir(), fname); + return writeStoreFile(context, numKVs, new Path(TEST_UTIL.getDataTestDir(), fname)); + } + + private Path writeStoreFile(HFileContext context, int numKVs, Path regionCFDir) + throws IOException { StoreFileWriter sfw = new StoreFileWriter.Builder(conf, cacheConf, fs) - .withOutputDir(storeFileParentDir).withFileContext(context).build(); + .withOutputDir(regionCFDir).withFileContext(context).build(); Random rand = ThreadLocalRandom.current(); final int rowLen = 32; for (int i = 0; i < numKVs; ++i) { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/bucket/TestBucketCachePersister.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/bucket/TestBucketCachePersister.java index 7be959dfad4b..35a60ec93125 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/bucket/TestBucketCachePersister.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/bucket/TestBucketCachePersister.java @@ -49,6 +49,8 @@ import org.junit.Test; import org.junit.experimental.categories.Category; import org.junit.rules.TestName; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; @Category({ IOTests.class, MediumTests.class }) public class TestBucketCachePersister { @@ -61,6 +63,8 @@ public class TestBucketCachePersister { public int constructedBlockSize = 16 * 1024; + private static final Logger LOG = LoggerFactory.getLogger(TestBucketCachePersister.class); + public int[] constructedBlockSizes = new int[] { 2 * 1024 + 1024, 4 * 1024 + 1024, 8 * 1024 + 1024, 16 * 1024 + 1024, 28 * 1024 + 1024, 32 * 1024 + 1024, 64 * 1024 + 1024, 96 * 1024 + 1024, 128 * 1024 + 1024 }; @@ -164,6 +168,7 @@ public void testPrefetchBlockEvictionWhilePrefetchRunning() throws Exception { HFile.createReader(fs, storeFile, cacheConf, true, conf); boolean evicted = false; while (!PrefetchExecutor.isCompleted(storeFile)) { + LOG.debug("Entered loop as prefetch for {} is still running.", storeFile); if (bucketCache.backingMap.size() > 0 && !evicted) { Iterator> it = bucketCache.backingMap.entrySet().iterator(); @@ -172,6 +177,7 @@ public void testPrefetchBlockEvictionWhilePrefetchRunning() throws Exception { while (it.hasNext() && !evicted) { if (entry.getKey().getBlockType().equals(BlockType.DATA)) { evicted = bucketCache.evictBlock(it.next().getKey()); + LOG.debug("Attempted eviction for {}. Succeeded? 
{}", storeFile, evicted); } } } From 9d04d575502fa587950d9a97c3f2beb1c182d54f Mon Sep 17 00:00:00 2001 From: Charles Connell Date: Fri, 27 Dec 2024 13:35:13 -0500 Subject: [PATCH 09/37] HubSpot Edit: Upgrade zstd-jni to latest version --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index e4be54a0adf9..ce0a86eb8ec7 100644 --- a/pom.xml +++ b/pom.xml @@ -662,7 +662,7 @@ 1.11.0 1.8.0 1.1.10.4 - 1.5.5-2 + 1.5.7-1 region read lock + * region split --> region close first --> region write lock + * region close --> region write lock + * region bulk load --> region write lock + * + *
+ * read lock is compatible with read lock. ---> no problem with user scan/read region bulk load
+ * does not cause problem for compaction (no consistency problem, store lock will help the store
+ * file accounting). They can run almost concurrently at the region level.
+ * <p>
+ * The only remaining race condition is between the region close and compaction. So we will
+ * evaluate, below, how region close intervenes with compaction if compaction does not acquire
+ * region read lock.
+ * <p>
+ * Here are the steps for compaction:
+ * <ol>
+ * <li>obtain list of StoreFile's</li>
+ * <li>create StoreFileScanner's based on list from #1</li>
+ * <li>perform compaction and save resulting files under tmp dir</li>
+ * <li>swap in compacted files</li>
+ * </ol>
+ * <p>
+ * #1 is guarded by store lock. This patch does not change this --> no worse or better For #2, we
+ * obtain smallest read point (for region) across all the Scanners (for both default compactor and
+ * stripe compactor). The read points are for user scans. Region keeps the read points for all
+ * currently open user scanners. Compaction needs to know the smallest read point so that during
+ * re-write of the hfiles, it can remove the mvcc points for the cells if their mvccs are older
+ * than the smallest since they are not needed anymore. This will not conflict with compaction.
+ * <p>
+ * For #3, it can be performed in parallel to other operations.
+ * <p>
+ * For #4 bulk load and compaction don't conflict with each other on the region level (for
+ * multi-family atomicy).
+ * <p>
+ * Region close and compaction are guarded pretty well by the 'writestate'. In HRegion#doClose(),
+ * we have :
+ *
+ * <pre>
+   * synchronized (writestate) {
+   *   // Disable compacting and flushing by background threads for this
+   *   // region.
+   *   canFlush = !writestate.readOnly;
+   *   writestate.writesEnabled = false;
+   *   LOG.debug("Closing " + this + ": disabling compactions & flushes");
+   *   waitForFlushesAndCompactions();
+   * }
+ * </pre>
+ *
+ * {@code waitForFlushesAndCompactions()} would wait for {@code writestate.compacting} to come
+ * down to 0. and in {@code HRegion.compact()}
+ *
+ * <pre>
+   *   try {
+   *     synchronized (writestate) {
+   *       if (writestate.writesEnabled) {
+   *         wasStateSet = true;
+   *         ++writestate.compacting;
+   *       } else {
+   *         String msg = "NOT compacting region " + this + ". Writes disabled.";
+   *         LOG.info(msg);
+   *         status.abort(msg);
+   *         return false;
+   *       }
+   *     }
+   *   }
+ * </pre>
+ *
+ * Also in {@code compactor.performCompaction()}: check periodically to see if a system stop is
+ * requested
+ *
+ * <pre>
+   * if (closeChecker != null && closeChecker.isTimeLimit(store, now)) {
+   *   progress.cancel();
+   *   return false;
+   * }
+   * if (closeChecker != null && closeChecker.isSizeLimit(store, len)) {
+   *   progress.cancel();
+   *   return false;
+   * }
+ * </pre>
+ *
+ */ public boolean compact(CompactionContext compaction, HStore store, ThroughputController throughputController, User user) throws IOException { assert compaction != null && compaction.hasSelection(); @@ -2280,40 +2376,6 @@ public boolean compact(CompactionContext compaction, HStore store, } MonitoredTask status = null; boolean requestNeedsCancellation = true; - /* - * We are trying to remove / relax the region read lock for compaction. Let's see what are the - * potential race conditions among the operations (user scan, region split, region close and - * region bulk load). user scan ---> region read lock region split --> region close first --> - * region write lock region close --> region write lock region bulk load --> region write lock - * read lock is compatible with read lock. ---> no problem with user scan/read region bulk load - * does not cause problem for compaction (no consistency problem, store lock will help the store - * file accounting). They can run almost concurrently at the region level. The only remaining - * race condition is between the region close and compaction. So we will evaluate, below, how - * region close intervenes with compaction if compaction does not acquire region read lock. Here - * are the steps for compaction: 1. obtain list of StoreFile's 2. create StoreFileScanner's - * based on list from #1 3. perform compaction and save resulting files under tmp dir 4. swap in - * compacted files #1 is guarded by store lock. This patch does not change this --> no worse or - * better For #2, we obtain smallest read point (for region) across all the Scanners (for both - * default compactor and stripe compactor). The read points are for user scans. Region keeps the - * read points for all currently open user scanners. Compaction needs to know the smallest read - * point so that during re-write of the hfiles, it can remove the mvcc points for the cells if - * their mvccs are older than the smallest since they are not needed anymore. This will not - * conflict with compaction. For #3, it can be performed in parallel to other operations. For #4 - * bulk load and compaction don't conflict with each other on the region level (for multi-family - * atomicy). Region close and compaction are guarded pretty well by the 'writestate'. In - * HRegion#doClose(), we have : synchronized (writestate) { // Disable compacting and flushing - * by background threads for this // region. canFlush = !writestate.readOnly; - * writestate.writesEnabled = false; LOG.debug("Closing " + this + - * ": disabling compactions & flushes"); waitForFlushesAndCompactions(); } - * waitForFlushesAndCompactions() would wait for writestate.compacting to come down to 0. and in - * HRegion.compact() try { synchronized (writestate) { if (writestate.writesEnabled) { - * wasStateSet = true; ++writestate.compacting; } else { String msg = "NOT compacting region " + - * this + ". 
Writes disabled."; LOG.info(msg); status.abort(msg); return false; } } Also in - * compactor.performCompaction(): check periodically to see if a system stop is requested if - * (closeChecker != null && closeChecker.isTimeLimit(store, now)) { progress.cancel(); return - * false; } if (closeChecker != null && closeChecker.isSizeLimit(store, len)) { - * progress.cancel(); return false; } - */ try { byte[] cf = Bytes.toBytes(store.getColumnFamilyName()); if (stores.get(cf) != store) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HStore.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HStore.java index 1df8d0b95807..710c94753093 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HStore.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HStore.java @@ -832,7 +832,7 @@ protected List flushCache(final long logCacheFlushId, MemStoreSnapshot sna try { for (Path pathName : pathNames) { lastPathName = pathName; - storeEngine.validateStoreFile(pathName); + storeEngine.validateStoreFile(pathName, false); } return pathNames; } catch (Exception e) { @@ -1118,7 +1118,7 @@ public void deleteChangedReaderObserver(ChangedReadersObserver o) { * block for long periods. *
* During this time, the Store can work as usual, getting values from StoreFiles and writing new - * StoreFiles from the memstore. Existing StoreFiles are not destroyed until the new compacted + * StoreFiles from the MemStore. Existing StoreFiles are not destroyed until the new compacted * StoreFile is completely written-out to disk. *
* The compactLock prevents multiple simultaneous compactions. The structureLock prevents us from @@ -1129,21 +1129,29 @@ public void deleteChangedReaderObserver(ChangedReadersObserver o) { *
* Compaction event should be idempotent, since there is no IO Fencing for the region directory in * hdfs. A region server might still try to complete the compaction after it lost the region. That - * is why the following events are carefully ordered for a compaction: 1. Compaction writes new - * files under region/.tmp directory (compaction output) 2. Compaction atomically moves the - * temporary file under region directory 3. Compaction appends a WAL edit containing the - * compaction input and output files. Forces sync on WAL. 4. Compaction deletes the input files - * from the region directory. Failure conditions are handled like this: - If RS fails before 2, - * compaction wont complete. Even if RS lives on and finishes the compaction later, it will only - * write the new data file to the region directory. Since we already have this data, this will be - * idempotent but we will have a redundant copy of the data. - If RS fails between 2 and 3, the - * region will have a redundant copy of the data. The RS that failed won't be able to finish - * sync() for WAL because of lease recovery in WAL. - If RS fails after 3, the region region - * server who opens the region will pick up the the compaction marker from the WAL and replay it - * by removing the compaction input files. Failed RS can also attempt to delete those files, but - * the operation will be idempotent See HBASE-2231 for details. + * is why the following events are carefully ordered for a compaction: + *
+ * <ol>
+ * <li>Compaction writes new files under region/.tmp directory (compaction output)</li>
+ * <li>Compaction atomically moves the temporary file under region directory</li>
+ * <li>Compaction appends a WAL edit containing the compaction input and output files. Forces sync
+ * on WAL.</li>
+ * <li>Compaction deletes the input files from the region directory.</li>
+ * </ol>
+ * Failure conditions are handled like this:
+ * <ul>
+ * <li>If RS fails before 2, compaction won't complete. Even if RS lives on and finishes the
+ * compaction later, it will only write the new data file to the region directory. Since we
+ * already have this data, this will be idempotent, but we will have a redundant copy of the
+ * data.</li>
+ * <li>If RS fails between 2 and 3, the region will have a redundant copy of the data. The RS that
+ * failed won't be able to finish sync() for WAL because of lease recovery in WAL.</li>
+ * <li>If RS fails after 3, the region server who opens the region will pick up the compaction
+ * marker from the WAL and replay it by removing the compaction input files. Failed RS can also
+ * attempt to delete those files, but the operation will be idempotent</li>
+ * </ul>
+ * See HBASE-2231 for details. * @param compaction compaction details obtained from requestCompaction() - * @return Storefile we compacted into or null if we failed or opted out early. + * @return The storefiles that we compacted into or null if we failed or opted out early. */ public List compact(CompactionContext compaction, ThroughputController throughputController, User user) throws IOException { @@ -1186,7 +1194,7 @@ protected List doCompaction(CompactionRequestImpl cr, throws IOException { // Do the steps necessary to complete the compaction. setStoragePolicyFromFileName(newFiles); - List sfs = storeEngine.commitStoreFiles(newFiles, true); + List sfs = storeEngine.commitStoreFiles(newFiles, true, true); if (this.getCoprocessorHost() != null) { for (HStoreFile sf : sfs) { getCoprocessorHost().postCompact(this, sf, cr.getTracker(), cr, user); @@ -1978,7 +1986,7 @@ public boolean commit(MonitoredTask status) throws IOException { return false; } status.setStatus("Flushing " + this + ": reopening flushed file"); - List storeFiles = storeEngine.commitStoreFiles(tempFiles, false); + List storeFiles = storeEngine.commitStoreFiles(tempFiles, false, false); for (HStoreFile sf : storeFiles) { StoreFileReader r = sf.getReader(); if (LOG.isInfoEnabled()) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreEngine.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreEngine.java index 5923befbc9de..8d81c90144ff 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreEngine.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreEngine.java @@ -36,7 +36,9 @@ import java.util.function.Function; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.CellComparator; +import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.io.hfile.BloomFilterMetrics; import org.apache.hadoop.hbase.log.HBaseMarkers; import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext; @@ -95,6 +97,9 @@ public abstract class StoreEngine openStoreFiles(Collection files, boolean } if (ioe != null) { // close StoreFile readers - boolean evictOnClose = - ctx.getCacheConf() != null ? ctx.getCacheConf().shouldEvictOnClose() : true; + boolean evictOnClose = ctx.getCacheConf() == null || ctx.getCacheConf().shouldEvictOnClose(); for (HStoreFile file : results) { try { if (file != null) { @@ -315,10 +341,8 @@ private List openStoreFiles(Collection files, boolean for (HStoreFile storeFile : results) { if (compactedStoreFiles.contains(storeFile.getPath().getName())) { LOG.warn("Clearing the compacted storefile {} from {}", storeFile, this); - storeFile.getReader() - .close(storeFile.getCacheConf() != null - ? 
storeFile.getCacheConf().shouldEvictOnClose() - : true); + storeFile.getReader().close( + storeFile.getCacheConf() == null || storeFile.getCacheConf().shouldEvictOnClose()); filesToRemove.add(storeFile); } } @@ -380,7 +404,7 @@ private void refreshStoreFilesInternal(Collection newFiles) throw compactedFilesSet.put(sf.getFileInfo(), sf); } - Set newFilesSet = new HashSet(newFiles); + Set newFilesSet = new HashSet<>(newFiles); // Exclude the files that have already been compacted newFilesSet = Sets.difference(newFilesSet, compactedFilesSet.keySet()); Set toBeAddedFiles = Sets.difference(newFilesSet, currentFilesSet.keySet()); @@ -390,8 +414,8 @@ private void refreshStoreFilesInternal(Collection newFiles) throw return; } - LOG.info("Refreshing store files for " + this + " files to add: " + toBeAddedFiles - + " files to remove: " + toBeRemovedFiles); + LOG.info("Refreshing store files for {} files to add: {} files to remove: {}", this, + toBeAddedFiles, toBeRemovedFiles); Set toBeRemovedStoreFiles = new HashSet<>(toBeRemovedFiles.size()); for (StoreFileInfo sfi : toBeRemovedFiles) { @@ -401,7 +425,7 @@ private void refreshStoreFilesInternal(Collection newFiles) throw // try to open the files List openedFiles = openStoreFiles(toBeAddedFiles, false); - // propogate the file changes to the underlying store file manager + // propagate the file changes to the underlying store file manager replaceStoreFiles(toBeRemovedStoreFiles, openedFiles, () -> { }, () -> { }); // won't throw an exception @@ -411,11 +435,13 @@ private void refreshStoreFilesInternal(Collection newFiles) throw * Commit the given {@code files}. *
* We will move the file into data directory, and open it. - * @param files the files want to commit - * @param validate whether to validate the store files + * @param files the files want to commit + * @param isCompaction whether this is called from the context of a compaction + * @param validate whether to validate the store files * @return the committed store files */ - public List commitStoreFiles(List files, boolean validate) throws IOException { + public List commitStoreFiles(List files, boolean isCompaction, boolean validate) + throws IOException { List committedFiles = new ArrayList<>(files.size()); HRegionFileSystem hfs = ctx.getRegionFileSystem(); String familyName = ctx.getFamily().getNameAsString(); @@ -423,13 +449,13 @@ public List commitStoreFiles(List files, boolean validate) thr for (Path file : files) { try { if (validate) { - validateStoreFile(file); + validateStoreFile(file, isCompaction); } Path committedPath; // As we want to support writing to data directory directly, here we need to check whether // the store file is already in the right place if (file.getParent() != null && file.getParent().equals(storeDir)) { - // already in the right place, skip renmaing + // already in the right place, skip renaming committedPath = file; } else { // Write-out finished successfully, move into the right spot diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompaction.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompaction.java index 332ecd8a95a0..86fe54c98151 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompaction.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompaction.java @@ -23,6 +23,12 @@ import static org.apache.hadoop.hbase.regionserver.Store.PRIORITY_USER; import static org.apache.hadoop.hbase.regionserver.compactions.CloseChecker.SIZE_LIMIT_KEY; import static org.apache.hadoop.hbase.regionserver.compactions.CloseChecker.TIME_LIMIT_KEY; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.allOf; +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.hasProperty; +import static org.hamcrest.Matchers.instanceOf; +import static org.hamcrest.Matchers.notNullValue; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertThrows; @@ -35,13 +41,17 @@ import static org.mockito.Mockito.when; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.List; +import java.util.Objects; import java.util.Optional; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; +import java.util.zip.GZIPInputStream; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; @@ -60,6 +70,7 @@ import org.apache.hadoop.hbase.client.Durability; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.io.compress.Compression; import org.apache.hadoop.hbase.io.hfile.HFileScanner; import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext; import org.apache.hadoop.hbase.regionserver.compactions.CompactionLifeCycleTracker; @@ -75,6 +86,7 @@ import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import 
org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.wal.WAL; +import org.apache.hadoop.io.IOUtils; import org.junit.After; import org.junit.Assume; import org.junit.Before; @@ -86,6 +98,7 @@ import org.mockito.Mockito; import org.mockito.invocation.InvocationOnMock; import org.mockito.stubbing.Answer; +import org.slf4j.LoggerFactory; /** * Test compaction framework and common functions @@ -114,8 +127,6 @@ public class TestCompaction { /** constructor */ public TestCompaction() { - super(); - // Set cache flush size to 1MB conf.setInt(HConstants.HREGION_MEMSTORE_FLUSH_SIZE, 1024 * 1024); conf.setInt(HConstants.HREGION_MEMSTORE_BLOCK_MULTIPLIER, 100); @@ -143,6 +154,12 @@ public void setUp() throws Exception { hcd.setMaxVersions(65536); this.htd.addFamily(hcd); } + if (name.getMethodName().equals("testCompactionWithCorruptBlock")) { + UTIL.getConfiguration().setBoolean("hbase.hstore.validate.read_fully", true); + HColumnDescriptor hcd = new HColumnDescriptor(FAMILY); + hcd.setCompressionType(Compression.Algorithm.GZ); + this.htd.addFamily(hcd); + } this.r = UTIL.createLocalHRegion(htd, null, null); } @@ -354,6 +371,7 @@ public void testCompactionWithCorruptResult() throws Exception { try (FSDataOutputStream stream = fs.create(tmpPath, null, true, 512, (short) 3, 1024L, null)) { stream.writeChars("CORRUPT FILE!!!!"); } + // The complete compaction should fail and the corrupt file should remain // in the 'tmp' directory; assertThrows(IOException.class, () -> store.doCompaction(null, null, null, @@ -361,6 +379,59 @@ public void testCompactionWithCorruptResult() throws Exception { assertTrue(fs.exists(tmpPath)); } + /** + * This test uses a hand-modified HFile, which is loaded in from the resources' path. That file + * was generated from the test support code in this class and then edited to corrupt the + * GZ-encoded block by zeroing-out the first two bytes of the GZip header, the "standard + * declaration" of {@code 1f 8b}, found at offset 33 in the file. I'm not sure why, but it seems + * that in this test context we do not enforce CRC checksums. Thus, this corruption manifests in + * the Decompressor rather than in the reader when it loads the block bytes and compares vs. the + * header. + */ + @Test + public void testCompactionWithCorruptBlock() throws Exception { + createStoreFile(r, Bytes.toString(FAMILY)); + createStoreFile(r, Bytes.toString(FAMILY)); + HStore store = r.getStore(FAMILY); + + Collection storeFiles = store.getStorefiles(); + DefaultCompactor tool = (DefaultCompactor) store.storeEngine.getCompactor(); + CompactionRequestImpl request = new CompactionRequestImpl(storeFiles); + tool.compact(request, NoLimitThroughputController.INSTANCE, null); + + // insert the hfile with a corrupted data block into the region's tmp directory, where + // compaction output is collected. 
+ FileSystem fs = store.getFileSystem(); + Path tmpPath = store.getRegionFileSystem().createTempName(); + try ( + InputStream inputStream = + getClass().getResourceAsStream("TestCompaction_HFileWithCorruptBlock.gz"); + GZIPInputStream gzipInputStream = new GZIPInputStream(Objects.requireNonNull(inputStream)); + OutputStream outputStream = fs.create(tmpPath, null, true, 512, (short) 3, 1024L, null)) { + assertThat(gzipInputStream, notNullValue()); + assertThat(outputStream, notNullValue()); + IOUtils.copyBytes(gzipInputStream, outputStream, 512); + } + LoggerFactory.getLogger(TestCompaction.class).info("Wrote corrupted HFile to {}", tmpPath); + + // The complete compaction should fail and the corrupt file should remain + // in the 'tmp' directory; + try { + store.doCompaction(request, storeFiles, null, EnvironmentEdgeManager.currentTime(), + Collections.singletonList(tmpPath)); + } catch (IOException e) { + Throwable rootCause = e; + while (rootCause.getCause() != null) { + rootCause = rootCause.getCause(); + } + assertThat(rootCause, allOf(instanceOf(IOException.class), + hasProperty("message", containsString("not a gzip file")))); + assertTrue(fs.exists(tmpPath)); + return; + } + fail("Compaction should have failed due to corrupt block"); + } + /** * Create a custom compaction request and be sure that we can track it through the queue, knowing * when the compaction is completed. diff --git a/hbase-server/src/test/resources/org/apache/hadoop/hbase/regionserver/TestCompaction_HFileWithCorruptBlock.gz b/hbase-server/src/test/resources/org/apache/hadoop/hbase/regionserver/TestCompaction_HFileWithCorruptBlock.gz new file mode 100644 index 0000000000000000000000000000000000000000..c93407b455c8c00c2d7745ae4c0faeaa1834a017 GIT binary patch literal 952 zcmV;p14sNHiwFo{hP-D215{;mbVF}#aA9L~X>V>{NJeRFWmjo*XhUyua&>TYLTqni zYXEa`3~_Yw@z!EsVA#sQz;yLL6fiL`H~_^ME-^50FhD@)$>XAoj3TTT&iBZ;FB4jN z!|&Y1BTj2fT-_ro^DEDb3i`T;1eV0UpWWy2zHqN?{HuPZ%mOK{WhXON%@A8|dSHv? 
zTg7FEU-lTp`jz**EUXgtnSXhH65E`EJjPFWp3e|W-*TYG^08{#p_yw8g3p!o&MZum zN;@|*-+=pEQg`L^J?{;?&lUAY7KX8(`8EH2{x$bAZJR9Fx|S!m2O57AkN-ljC3&OB^cqFSzKx=j6r&h%_^ zLCt>)cx4@`F1A{l9eute;HT(|u>7fQxg5?fs+=4BRnqFW2*w=j6y@SQAC!{L`|;iV ziVNGX{yzS?C4A+4l}^))7gFE92i;vFXl>B8LM*xYPVCwD7JE*zsTr)0wF^}@5zTzn zFTQw+PR`Ctrx#BTWfnQWJLg(mz3lzO*OBjSXEWr@_F?~|zxk!jKURBQ#>?EmfL&dG za0)1K`MQE5%mj!n(Gx%*aso(5NNZqVRY+q`a?^+x^72;fG>l<>B=PLX5f_HG14R** zKvj~fAN>po0%b!b6>pB+%mSO3B;*wCX=^YVFjz=BFnTZqFhm*T7p3bZ79=KTr0QiP zrsU@r=w&1&7N_cY=H;ap1()O(r8=kPRB!U=)mkQ7{Td a!6+C7qhJ(_f Date: Fri, 21 Mar 2025 08:05:03 -0400 Subject: [PATCH 24/37] HubSpot Backport: HBASE-29202 Balancer conditionals make balancer actions more likely to be approved (#6821) (will be in 2.7) Co-authored-by: Ray Mattingly Signed-off-by: Nick Dimiduk --- .../master/balancer/BalancerConditionals.java | 2 +- .../balancer/CandidateGeneratorTestUtil.java | 32 ++++-- ...gTableIsolationAndReplicaDistribution.java | 8 +- .../TestUnattainableBalancerCostGoal.java | 108 ++++++++++++++++++ 4 files changed, 136 insertions(+), 14 deletions(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestUnattainableBalancerCostGoal.java diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BalancerConditionals.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BalancerConditionals.java index 021a34bce6b9..b82c68b37da3 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BalancerConditionals.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BalancerConditionals.java @@ -146,7 +146,7 @@ int getViolationCountChange(BalancerClusterState cluster, BalanceAction action) // Reset cluster cluster.doAction(undoAction); - if (isViolatingPre && isViolatingPost) { + if (isViolatingPre == isViolatingPost) { return 0; } else if (!isViolatingPre && isViolatingPost) { return 1; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/CandidateGeneratorTestUtil.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/CandidateGeneratorTestUtil.java index 03bfcce8e150..d2a9d17cdba0 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/CandidateGeneratorTestUtil.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/CandidateGeneratorTestUtil.java @@ -51,16 +51,22 @@ public final class CandidateGeneratorTestUtil { private CandidateGeneratorTestUtil() { } + enum ExhaustionType { + COST_GOAL_ACHIEVED, + NO_MORE_MOVES; + } + static void runBalancerToExhaustion(Configuration conf, Map> serverToRegions, Set> expectations, float targetMaxBalancerCost) { - runBalancerToExhaustion(conf, serverToRegions, expectations, targetMaxBalancerCost, 15000); + runBalancerToExhaustion(conf, serverToRegions, expectations, targetMaxBalancerCost, 15000, + ExhaustionType.COST_GOAL_ACHIEVED); } static void runBalancerToExhaustion(Configuration conf, Map> serverToRegions, Set> expectations, float targetMaxBalancerCost, - long maxRunningTime) { + long maxRunningTime, ExhaustionType exhaustionType) { // Do the full plan. 
We're testing with a lot of regions conf.setBoolean("hbase.master.balancer.stochastic.runMaxSteps", true); conf.setLong(MAX_RUNNING_TIME_KEY, maxRunningTime); @@ -76,7 +82,7 @@ static void runBalancerToExhaustion(Configuration conf, boolean isBalanced = false; while (!isBalanced) { balancerRuns++; - if (balancerRuns > 1000) { + if (balancerRuns > 10) { throw new RuntimeException("Balancer failed to find balance & meet expectations"); } long start = System.currentTimeMillis(); @@ -111,16 +117,24 @@ static void runBalancerToExhaustion(Configuration conf, } } if (isBalanced) { // Check if the balancer thinks we're done too - LOG.info("All balancer conditions passed. Checking if balancer thinks it's done."); - if (stochasticLoadBalancer.needsBalance(HConstants.ENSEMBLE_TABLE_NAME, cluster)) { - LOG.info("Balancer would still like to run"); - isBalanced = false; + if (exhaustionType == ExhaustionType.COST_GOAL_ACHIEVED) { + // If we expect to achieve the cost goal, then needsBalance should be false + if (stochasticLoadBalancer.needsBalance(HConstants.ENSEMBLE_TABLE_NAME, cluster)) { + LOG.info("Balancer cost goal is not achieved. needsBalance=true"); + isBalanced = false; + } } else { - LOG.info("Balancer is done"); + // If we anticipate running out of moves, then our last balance run should have produced + // nothing + if (regionPlans != null && !regionPlans.isEmpty()) { + LOG.info("Balancer is not out of moves. regionPlans.size()={}", regionPlans.size()); + isBalanced = false; + } } } } - LOG.info("Balancing took {}sec", Duration.ofMillis(balancingMillis).toMinutes()); + LOG.info("Balancer is done. Balancing took {}sec", + Duration.ofMillis(balancingMillis).toMinutes()); } /** diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestLargeClusterBalancingTableIsolationAndReplicaDistribution.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestLargeClusterBalancingTableIsolationAndReplicaDistribution.java index 3a28ae801e4e..bc31530f4921 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestLargeClusterBalancingTableIsolationAndReplicaDistribution.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestLargeClusterBalancingTableIsolationAndReplicaDistribution.java @@ -104,10 +104,10 @@ public void testTableIsolationAndReplicaDistribution() { conf.setBoolean(BalancerConditionals.ISOLATE_META_TABLE_KEY, true); conf.setBoolean(BalancerConditionals.ISOLATE_SYSTEM_TABLES_KEY, true); DistributeReplicasTestConditional.enableConditionalReplicaDistributionForTest(conf); - - runBalancerToExhaustion(conf, serverToRegions, ImmutableSet.of(this::isMetaTableIsolated, - this::isSystemTableIsolated, CandidateGeneratorTestUtil::areAllReplicasDistributed), 10.0f, - 60_000); + runBalancerToExhaustion(conf, serverToRegions, + ImmutableSet.of(this::isMetaTableIsolated, this::isSystemTableIsolated, + CandidateGeneratorTestUtil::areAllReplicasDistributed), + 10.0f, 60_000, CandidateGeneratorTestUtil.ExhaustionType.COST_GOAL_ACHIEVED); LOG.info("Meta table regions are successfully isolated, " + "and region replicas are appropriately distributed."); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestUnattainableBalancerCostGoal.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestUnattainableBalancerCostGoal.java new file mode 100644 index 000000000000..ffa2b4a78212 --- /dev/null +++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestUnattainableBalancerCostGoal.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.balancer; + +import static org.apache.hadoop.hbase.master.balancer.CandidateGeneratorTestUtil.isTableIsolated; +import static org.apache.hadoop.hbase.master.balancer.CandidateGeneratorTestUtil.runBalancerToExhaustion; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.RegionInfoBuilder; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * If your minCostNeedsBalance is set too low, then the balancer should still eventually stop making + * moves as further cost improvements become impossible, and balancer plan calculation becomes + * wasteful. This test ensures that the balancer will not get stuck in a loop of continuously moving + * regions. + */ +@Category({ MasterTests.class, MediumTests.class }) +public class TestUnattainableBalancerCostGoal { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestUnattainableBalancerCostGoal.class); + + private static final Logger LOG = LoggerFactory.getLogger(TestUnattainableBalancerCostGoal.class); + + private static final TableName SYSTEM_TABLE_NAME = TableName.valueOf("hbase:system"); + private static final TableName NON_SYSTEM_TABLE_NAME = TableName.valueOf("userTable"); + + private static final int NUM_SERVERS = 10; + private static final int NUM_REGIONS = 1000; + private static final float UNACHIEVABLE_COST_GOAL = 0.01f; + + private static final ServerName[] servers = new ServerName[NUM_SERVERS]; + private static final Map> serverToRegions = new HashMap<>(); + + @BeforeClass + public static void setup() { + // Initialize servers + for (int i = 0; i < NUM_SERVERS; i++) { + servers[i] = ServerName.valueOf("server" + i, i, System.currentTimeMillis()); + } + + // Create regions + List allRegions = new ArrayList<>(); + for (int i = 0; i < NUM_REGIONS; i++) { + TableName tableName = i < 3 ? 
SYSTEM_TABLE_NAME : NON_SYSTEM_TABLE_NAME; + byte[] startKey = new byte[1]; + startKey[0] = (byte) i; + byte[] endKey = new byte[1]; + endKey[0] = (byte) (i + 1); + + RegionInfo regionInfo = + RegionInfoBuilder.newBuilder(tableName).setStartKey(startKey).setEndKey(endKey).build(); + allRegions.add(regionInfo); + } + + // Assign all regions to the first server + serverToRegions.put(servers[0], new ArrayList<>(allRegions)); + for (int i = 1; i < NUM_SERVERS; i++) { + serverToRegions.put(servers[i], new ArrayList<>()); + } + } + + @Test + public void testSystemTableIsolation() { + Configuration conf = new Configuration(false); + conf.setBoolean(BalancerConditionals.ISOLATE_SYSTEM_TABLES_KEY, true); + runBalancerToExhaustion(conf, serverToRegions, Set.of(this::isSystemTableIsolated), + UNACHIEVABLE_COST_GOAL, 10_000, CandidateGeneratorTestUtil.ExhaustionType.NO_MORE_MOVES); + LOG.info("Meta table regions are successfully isolated."); + } + + private boolean isSystemTableIsolated(BalancerClusterState cluster) { + return isTableIsolated(cluster, SYSTEM_TABLE_NAME, "System"); + } +} From 3a29041b186fd3b8974cfcb6b72497c6e2574eab Mon Sep 17 00:00:00 2001 From: Ray Mattingly Date: Fri, 21 Mar 2025 08:10:01 -0400 Subject: [PATCH 25/37] HubSpot Backport: HBASE-29203 There should be a StorefileSize equivalent to the TableSkewCost (#6825) (will be in 2.7) Co-authored-by: Ray Mattingly Signed-off-by: Nick Dimiduk --- .../master/balancer/BalancerClusterState.java | 4 + .../balancer/CostFromRegionLoadFunction.java | 2 +- .../balancer/StochasticLoadBalancer.java | 1 + .../StoreFileTableSkewCostFunction.java | 127 ++++++++++ .../balancer/TestStochasticLoadBalancer.java | 1 + .../TestStoreFileTableSkewCostFunction.java | 239 ++++++++++++++++++ 6 files changed, 373 insertions(+), 1 deletion(-) create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StoreFileTableSkewCostFunction.java create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStoreFileTableSkewCostFunction.java diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BalancerClusterState.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BalancerClusterState.java index b07287c1ed19..efba0aee733b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BalancerClusterState.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BalancerClusterState.java @@ -1085,6 +1085,10 @@ boolean isStopRequested() { return EnvironmentEdgeManager.currentTime() > stopRequestedAt; } + Deque[] getRegionLoads() { + return regionLoads; + } + @Override public String toString() { StringBuilder desc = new StringBuilder("Cluster={servers=["); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/CostFromRegionLoadFunction.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/CostFromRegionLoadFunction.java index 199aa10a75fa..bc61ead8da86 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/CostFromRegionLoadFunction.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/CostFromRegionLoadFunction.java @@ -66,7 +66,7 @@ protected void regionMoved(int region, int oldServer, int newServer) { } @Override - protected final double cost() { + protected double cost() { return cost.cost(); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java index d184cf52e80f..44e5aad3a6b8 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java @@ -274,6 +274,7 @@ protected List createCostFunctions(Configuration conf) { addCostFunction(costFunctions, localityCost); addCostFunction(costFunctions, rackLocalityCost); addCostFunction(costFunctions, new TableSkewCostFunction(conf)); + addCostFunction(costFunctions, new StoreFileTableSkewCostFunction(conf)); addCostFunction(costFunctions, regionReplicaHostCostFunction); addCostFunction(costFunctions, regionReplicaRackCostFunction); addCostFunction(costFunctions, new ReadRequestCostFunction(conf)); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StoreFileTableSkewCostFunction.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StoreFileTableSkewCostFunction.java new file mode 100644 index 000000000000..d37f8caa72e1 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StoreFileTableSkewCostFunction.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.balancer; + +import java.util.Collection; +import org.apache.hadoop.conf.Configuration; +import org.apache.yetus.audience.InterfaceAudience; + +/** + * Lightweight cost function that mirrors TableSkewCostFunction but aggregates storefile sizes (in + * MB) per table using the CostFromRegionLoadFunction framework. For each table, it computes a + * per-server aggregated storefile size by summing the average storefile size for each region (if + * there are multiple load metrics, it averages them). The imbalance cost (as computed by + * DoubleArrayCost) is then used to drive the balancer to reduce differences between servers. + */ +@InterfaceAudience.Private +public class StoreFileTableSkewCostFunction extends CostFromRegionLoadFunction { + + private static final String STOREFILE_TABLE_SKEW_COST_KEY = + "hbase.master.balancer.stochastic.storefileTableSkewCost"; + private static final float DEFAULT_STOREFILE_TABLE_SKEW_COST = 35; + + // One DoubleArrayCost instance per table. + private DoubleArrayCost[] costsPerTable; + + public StoreFileTableSkewCostFunction(Configuration conf) { + this.setMultiplier( + conf.getFloat(STOREFILE_TABLE_SKEW_COST_KEY, DEFAULT_STOREFILE_TABLE_SKEW_COST)); + } + + @Override + public void prepare(BalancerClusterState cluster) { + // First, set the cluster state and allocate one DoubleArrayCost per table. 
+ this.cluster = cluster; + costsPerTable = new DoubleArrayCost[cluster.numTables]; + for (int tableIdx = 0; tableIdx < cluster.numTables; tableIdx++) { + costsPerTable[tableIdx] = new DoubleArrayCost(); + costsPerTable[tableIdx].prepare(cluster.numServers); + final int tableIndex = tableIdx; + costsPerTable[tableIdx].applyCostsChange(costs -> { + // For each server, compute the aggregated storefile size for this table. + for (int server = 0; server < cluster.numServers; server++) { + double totalStorefileMB = 0; + // Sum over all regions on this server that belong to the given table. + for (int region : cluster.regionsPerServer[server]) { + if (cluster.regionIndexToTableIndex[region] == tableIndex) { + Collection loads = cluster.getRegionLoads()[region]; + double regionCost = 0; + if (loads != null && !loads.isEmpty()) { + // Average the storefile sizes if there are multiple measurements. + for (BalancerRegionLoad rl : loads) { + regionCost += getCostFromRl(rl); + } + regionCost /= loads.size(); + } + totalStorefileMB += regionCost; + } + } + costs[server] = totalStorefileMB; + } + }); + } + } + + @Override + protected void regionMoved(int region, int oldServer, int newServer) { + // Determine the affected table. + int tableIdx = cluster.regionIndexToTableIndex[region]; + costsPerTable[tableIdx].applyCostsChange(costs -> { + // Recompute for the old server if applicable. + updateStoreFilePerServerPerTableCosts(oldServer, tableIdx, costs); + // Recompute for the new server. + updateStoreFilePerServerPerTableCosts(newServer, tableIdx, costs); + }); + } + + private void updateStoreFilePerServerPerTableCosts(int newServer, int tableIdx, double[] costs) { + if (newServer >= 0) { + double totalStorefileMB = 0; + for (int r : cluster.regionsPerServer[newServer]) { + if (cluster.regionIndexToTableIndex[r] == tableIdx) { + Collection loads = cluster.getRegionLoads()[r]; + double regionCost = 0; + if (loads != null && !loads.isEmpty()) { + for (BalancerRegionLoad rl : loads) { + regionCost += getCostFromRl(rl); + } + regionCost /= loads.size(); + } + totalStorefileMB += regionCost; + } + } + costs[newServer] = totalStorefileMB; + } + } + + @Override + protected double cost() { + double totalCost = 0; + // Sum the imbalance cost over all tables. + for (DoubleArrayCost dac : costsPerTable) { + totalCost += dac.cost(); + } + return totalCost; + } + + @Override + protected double getCostFromRl(BalancerRegionLoad rl) { + // Use storefile size in MB as the metric. 
+ return rl.getStorefileSizeMB(); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancer.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancer.java index 9dc7dab65621..661380814ad9 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancer.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancer.java @@ -531,6 +531,7 @@ public void testDefaultCostFunctionList() { PrimaryRegionCountSkewCostFunction.class.getSimpleName(), MoveCostFunction.class.getSimpleName(), RackLocalityCostFunction.class.getSimpleName(), TableSkewCostFunction.class.getSimpleName(), + StoreFileTableSkewCostFunction.class.getSimpleName(), RegionReplicaHostCostFunction.class.getSimpleName(), RegionReplicaRackCostFunction.class.getSimpleName(), ReadRequestCostFunction.class.getSimpleName(), WriteRequestCostFunction.class.getSimpleName(), diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStoreFileTableSkewCostFunction.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStoreFileTableSkewCostFunction.java new file mode 100644 index 000000000000..619a055c6502 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStoreFileTableSkewCostFunction.java @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hbase.master.balancer; + +import static org.apache.hadoop.hbase.master.balancer.CandidateGeneratorTestUtil.createMockBalancerClusterState; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.when; + +import java.util.ArrayDeque; +import java.util.Arrays; +import java.util.Deque; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.RegionMetrics; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.Size; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.RegionInfoBuilder; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.SmallTests; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.mockito.Mockito; + +@Category({ MasterTests.class, SmallTests.class }) +public class TestStoreFileTableSkewCostFunction { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestStoreFileTableSkewCostFunction.class); + + private static final TableName DEFAULT_TABLE = TableName.valueOf("testTable"); + private static final Map REGION_TO_STORE_FILE_SIZE_MB = new HashMap<>(); + + /** + * Tests that a uniform store file distribution (single table) across servers results in zero + * cost. + */ + @Test + public void testUniformDistribution() { + ServerName server1 = ServerName.valueOf("server1.example.org", 1234, 1L); + ServerName server2 = ServerName.valueOf("server2.example.org", 1234, 1L); + ServerName server3 = ServerName.valueOf("server3.example.org", 1234, 1L); + ServerName server4 = ServerName.valueOf("server4.example.org", 1234, 1L); + + Map> serverToRegions = new HashMap<>(); + serverToRegions.put(server1, Arrays.asList(createMockRegionInfo(10), createMockRegionInfo(10))); + serverToRegions.put(server2, Arrays.asList(createMockRegionInfo(10), createMockRegionInfo(10))); + serverToRegions.put(server3, Arrays.asList(createMockRegionInfo(10), createMockRegionInfo(10))); + serverToRegions.put(server4, Arrays.asList(createMockRegionInfo(10), createMockRegionInfo(10))); + + BalancerClusterState clusterState = createMockBalancerClusterState(serverToRegions); + DummyBalancerClusterState state = new DummyBalancerClusterState(clusterState); + + StoreFileTableSkewCostFunction costFunction = + new StoreFileTableSkewCostFunction(new Configuration()); + costFunction.prepare(state); + double cost = costFunction.cost(); + + // Expect zero cost since all regions (from the same table) are balanced. + assertEquals("Uniform distribution should yield zero cost", 0.0, cost, 1e-6); + } + + /** + * Tests that a skewed store file distribution (single table) results in a positive cost. 
+   */
+  @Test
+  public void testSkewedDistribution() {
+    ServerName server1 = ServerName.valueOf("server1.example.org", 1234, 1L);
+    ServerName server2 = ServerName.valueOf("server2.example.org", 1234, 1L);
+    ServerName server3 = ServerName.valueOf("server3.example.org", 1234, 1L);
+    ServerName server4 = ServerName.valueOf("server4.example.org", 1234, 1L);
+
+    Map<ServerName, List<RegionInfo>> serverToRegions = new HashMap<>();
+    // Three servers get regions with 10 store files each,
+    // while one server gets regions with 30 store files each.
+    serverToRegions.put(server1, Arrays.asList(createMockRegionInfo(10), createMockRegionInfo(10)));
+    serverToRegions.put(server2, Arrays.asList(createMockRegionInfo(10), createMockRegionInfo(10)));
+    serverToRegions.put(server3, Arrays.asList(createMockRegionInfo(10), createMockRegionInfo(10)));
+    serverToRegions.put(server4, Arrays.asList(createMockRegionInfo(30), createMockRegionInfo(30)));
+
+    BalancerClusterState clusterState = createMockBalancerClusterState(serverToRegions);
+    DummyBalancerClusterState state = new DummyBalancerClusterState(clusterState);
+
+    StoreFileTableSkewCostFunction costFunction =
+      new StoreFileTableSkewCostFunction(new Configuration());
+    costFunction.prepare(state);
+    double cost = costFunction.cost();
+
+    // Expect a positive cost because the distribution is skewed.
+    assertTrue("Skewed distribution should yield a positive cost", cost > 0.0);
+  }
+
+  /**
+   * Tests that an empty cluster (no servers/regions) is handled gracefully.
+   */
+  @Test
+  public void testEmptyDistribution() {
+    Map<ServerName, List<RegionInfo>> serverToRegions = new HashMap<>();
+
+    BalancerClusterState clusterState = createMockBalancerClusterState(serverToRegions);
+    DummyBalancerClusterState state = new DummyBalancerClusterState(clusterState);
+
+    StoreFileTableSkewCostFunction costFunction =
+      new StoreFileTableSkewCostFunction(new Configuration());
+    costFunction.prepare(state);
+    double cost = costFunction.cost();
+
+    // Expect zero cost when there is no load.
+    assertEquals("Empty distribution should yield zero cost", 0.0, cost, 1e-6);
+  }
+
+  /**
+   * Tests that having multiple tables results in a positive cost when each table's regions are not
+   * balanced across servers – even if the overall load per server is balanced.
+   */
+  @Test
+  public void testMultipleTablesDistribution() {
+    // Two servers.
+    ServerName server1 = ServerName.valueOf("server1.example.org", 1234, 1L);
+    ServerName server2 = ServerName.valueOf("server2.example.org", 1234, 1L);
+
+    // Define two tables.
+    TableName table1 = TableName.valueOf("testTable1");
+    TableName table2 = TableName.valueOf("testTable2");
+
+    // For table1, all regions are on server1.
+    // For table2, all regions are on server2.
+    Map<ServerName, List<RegionInfo>> serverToRegions = new HashMap<>();
+    serverToRegions.put(server1,
+      Arrays.asList(createMockRegionInfo(table1, 10), createMockRegionInfo(table1, 10)));
+    serverToRegions.put(server2,
+      Arrays.asList(createMockRegionInfo(table2, 10), createMockRegionInfo(table2, 10)));
+
+    // Although each server gets 20 MB overall, table1 and table2 are not balanced across servers.
+    BalancerClusterState clusterState = createMockBalancerClusterState(serverToRegions);
+    DummyBalancerClusterState state = new DummyBalancerClusterState(clusterState);
+
+    StoreFileTableSkewCostFunction costFunction =
+      new StoreFileTableSkewCostFunction(new Configuration());
+    costFunction.prepare(state);
+    double cost = costFunction.cost();
+
+    // Expect a positive cost because the skew is computed per table.
+ assertTrue("Multiple table distribution should yield a positive cost", cost > 0.0); + } + + /** + * Helper method to create a RegionInfo for the default table with the given store file size. + */ + private static RegionInfo createMockRegionInfo(int storeFileSizeMb) { + return createMockRegionInfo(DEFAULT_TABLE, storeFileSizeMb); + } + + /** + * Helper method to create a RegionInfo for a specified table with the given store file size. + */ + private static RegionInfo createMockRegionInfo(TableName table, int storeFileSizeMb) { + long regionId = new Random().nextLong(); + REGION_TO_STORE_FILE_SIZE_MB.put(regionId, storeFileSizeMb); + return RegionInfoBuilder.newBuilder(table).setStartKey(generateRandomByteArray(4)) + .setEndKey(generateRandomByteArray(4)).setReplicaId(0).setRegionId(regionId).build(); + } + + private static byte[] generateRandomByteArray(int n) { + byte[] byteArray = new byte[n]; + new Random().nextBytes(byteArray); + return byteArray; + } + + /** + * A simplified BalancerClusterState which ensures we provide the intended test RegionMetrics data + * when balancing this cluster + */ + private static class DummyBalancerClusterState extends BalancerClusterState { + private final RegionInfo[] testRegions; + + DummyBalancerClusterState(BalancerClusterState bcs) { + super(bcs.clusterState, null, null, null, null); + this.testRegions = bcs.regions; + } + + @Override + Deque[] getRegionLoads() { + @SuppressWarnings("unchecked") + Deque[] loads = new Deque[testRegions.length]; + for (int i = 0; i < testRegions.length; i++) { + Deque dq = new ArrayDeque<>(); + dq.add(new BalancerRegionLoad(createMockRegionMetrics(testRegions[i])) { + }); + loads[i] = dq; + } + return loads; + } + } + + /** + * Creates a mocked RegionMetrics for the given region. + */ + private static RegionMetrics createMockRegionMetrics(RegionInfo regionInfo) { + RegionMetrics regionMetrics = Mockito.mock(RegionMetrics.class); + + // Important + int storeFileSizeMb = REGION_TO_STORE_FILE_SIZE_MB.get(regionInfo.getRegionId()); + when(regionMetrics.getRegionSizeMB()).thenReturn(new Size(storeFileSizeMb, Size.Unit.MEGABYTE)); + when(regionMetrics.getStoreFileSize()) + .thenReturn(new Size(storeFileSizeMb, Size.Unit.MEGABYTE)); + + // Not important + when(regionMetrics.getReadRequestCount()).thenReturn(0L); + when(regionMetrics.getCpRequestCount()).thenReturn(0L); + when(regionMetrics.getWriteRequestCount()).thenReturn(0L); + when(regionMetrics.getMemStoreSize()).thenReturn(new Size(0, Size.Unit.MEGABYTE)); + when(regionMetrics.getCurrentRegionCachedRatio()).thenReturn(0.0f); + return regionMetrics; + } +} From 8e5892c99029217d7a566d24de394c989a58695d Mon Sep 17 00:00:00 2001 From: Ray Mattingly Date: Fri, 21 Mar 2025 08:54:04 -0400 Subject: [PATCH 26/37] HubSpot Edit: I messed up 29202, 29203 backports with incompatibilities. 
Can squash this, or delete in 2.7 (#167) Co-authored-by: Ray Mattingly --- .../master/balancer/TestStoreFileTableSkewCostFunction.java | 1 - .../master/balancer/TestUnattainableBalancerCostGoal.java | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStoreFileTableSkewCostFunction.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStoreFileTableSkewCostFunction.java index 619a055c6502..3977ad96dd9a 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStoreFileTableSkewCostFunction.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStoreFileTableSkewCostFunction.java @@ -230,7 +230,6 @@ private static RegionMetrics createMockRegionMetrics(RegionInfo regionInfo) { // Not important when(regionMetrics.getReadRequestCount()).thenReturn(0L); - when(regionMetrics.getCpRequestCount()).thenReturn(0L); when(regionMetrics.getWriteRequestCount()).thenReturn(0L); when(regionMetrics.getMemStoreSize()).thenReturn(new Size(0, Size.Unit.MEGABYTE)); when(regionMetrics.getCurrentRegionCachedRatio()).thenReturn(0.0f); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestUnattainableBalancerCostGoal.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestUnattainableBalancerCostGoal.java index ffa2b4a78212..5e95564b6fee 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestUnattainableBalancerCostGoal.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestUnattainableBalancerCostGoal.java @@ -33,6 +33,7 @@ import org.apache.hadoop.hbase.client.RegionInfoBuilder; import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableSet; import org.junit.BeforeClass; import org.junit.ClassRule; import org.junit.Test; @@ -97,7 +98,7 @@ public static void setup() { public void testSystemTableIsolation() { Configuration conf = new Configuration(false); conf.setBoolean(BalancerConditionals.ISOLATE_SYSTEM_TABLES_KEY, true); - runBalancerToExhaustion(conf, serverToRegions, Set.of(this::isSystemTableIsolated), + runBalancerToExhaustion(conf, serverToRegions, ImmutableSet.of(this::isSystemTableIsolated), UNACHIEVABLE_COST_GOAL, 10_000, CandidateGeneratorTestUtil.ExhaustionType.NO_MORE_MOVES); LOG.info("Meta table regions are successfully isolated."); } From 4996ad7ada9b5ae7ef0c00e7cae55fb07340c3e3 Mon Sep 17 00:00:00 2001 From: Charles Connell Date: Mon, 17 Mar 2025 08:34:08 -0400 Subject: [PATCH 27/37] HubSpot Backport: HBASE-29193: Allow ZstdByteBuffDecompressor to take direct ByteBuffer as input and heap ByteBuffer as output, or vice versa (not yet merged upstream) --- .../zstd/ZstdByteBuffDecompressor.java | 67 +++++++------------ .../zstd/TestZstdByteBuffDecompressor.java | 32 +++++++-- pom.xml | 2 +- 3 files changed, 55 insertions(+), 46 deletions(-) diff --git a/hbase-compression/hbase-compression-zstd/src/main/java/org/apache/hadoop/hbase/io/compress/zstd/ZstdByteBuffDecompressor.java b/hbase-compression/hbase-compression-zstd/src/main/java/org/apache/hadoop/hbase/io/compress/zstd/ZstdByteBuffDecompressor.java index ec5315aa4c02..d71d46e2946e 100644 --- a/hbase-compression/hbase-compression-zstd/src/main/java/org/apache/hadoop/hbase/io/compress/zstd/ZstdByteBuffDecompressor.java +++ 
b/hbase-compression/hbase-compression-zstd/src/main/java/org/apache/hadoop/hbase/io/compress/zstd/ZstdByteBuffDecompressor.java @@ -55,20 +55,8 @@ public class ZstdByteBuffDecompressor implements ByteBuffDecompressor, CanReinit @Override public boolean canDecompress(ByteBuff output, ByteBuff input) { - if (!allowByteBuffDecompression) { - return false; - } - if (output instanceof SingleByteBuff && input instanceof SingleByteBuff) { - ByteBuffer nioOutput = output.nioByteBuffers()[0]; - ByteBuffer nioInput = input.nioByteBuffers()[0]; - if (nioOutput.isDirect() && nioInput.isDirect()) { - return true; - } else if (!nioOutput.isDirect() && !nioInput.isDirect()) { - return true; - } - } - - return false; + return allowByteBuffDecompression && output instanceof SingleByteBuff + && input instanceof SingleByteBuff; } @Override @@ -80,38 +68,35 @@ private int decompressRaw(ByteBuff output, ByteBuff input, int inputLen) throws if (output instanceof SingleByteBuff && input instanceof SingleByteBuff) { ByteBuffer nioOutput = output.nioByteBuffers()[0]; ByteBuffer nioInput = input.nioByteBuffers()[0]; + int origOutputPos = nioOutput.position(); + int n; if (nioOutput.isDirect() && nioInput.isDirect()) { - return decompressDirectByteBuffers(nioOutput, nioInput, inputLen); + n = ctx.decompressDirectByteBuffer(nioOutput, nioOutput.position(), + nioOutput.limit() - nioOutput.position(), nioInput, nioInput.position(), inputLen); } else if (!nioOutput.isDirect() && !nioInput.isDirect()) { - return decompressHeapByteBuffers(nioOutput, nioInput, inputLen); + n = ctx.decompressByteArray(nioOutput.array(), + nioOutput.arrayOffset() + nioOutput.position(), nioOutput.limit() - nioOutput.position(), + nioInput.array(), nioInput.arrayOffset() + nioInput.position(), inputLen); + } else if (nioOutput.isDirect() && !nioInput.isDirect()) { + n = ctx.decompressByteArrayToDirectByteBuffer(nioOutput, nioOutput.position(), + nioOutput.limit() - nioOutput.position(), nioInput.array(), + nioInput.arrayOffset() + nioInput.position(), inputLen); + } else if (!nioOutput.isDirect() && nioInput.isDirect()) { + n = ctx.decompressDirectByteBufferToByteArray(nioOutput.array(), + nioOutput.arrayOffset() + nioOutput.position(), nioOutput.limit() - nioOutput.position(), + nioInput, nioInput.position(), inputLen); + } else { + throw new IllegalStateException("Unreachable line"); } - } - - throw new IllegalStateException("One buffer is direct and the other is not, " - + "or one or more not SingleByteBuffs. 
This is not supported"); - } - private int decompressDirectByteBuffers(ByteBuffer output, ByteBuffer input, int inputLen) { - int origOutputPos = output.position(); + nioOutput.position(origOutputPos + n); + nioInput.position(input.position() + inputLen); - int n = ctx.decompressDirectByteBuffer(output, output.position(), - output.limit() - output.position(), input, input.position(), inputLen); - - output.position(origOutputPos + n); - input.position(input.position() + inputLen); - return n; - } - - private int decompressHeapByteBuffers(ByteBuffer output, ByteBuffer input, int inputLen) { - int origOutputPos = output.position(); - - int n = ctx.decompressByteArray(output.array(), output.arrayOffset() + output.position(), - output.limit() - output.position(), input.array(), input.arrayOffset() + input.position(), - inputLen); - - output.position(origOutputPos + n); - input.position(input.position() + inputLen); - return n; + return n; + } else { + throw new IllegalStateException( + "At least one buffer is not a SingleByteBuff, this is not supported"); + } } @Override diff --git a/hbase-compression/hbase-compression-zstd/src/test/java/org/apache/hadoop/hbase/io/compress/zstd/TestZstdByteBuffDecompressor.java b/hbase-compression/hbase-compression-zstd/src/test/java/org/apache/hadoop/hbase/io/compress/zstd/TestZstdByteBuffDecompressor.java index 86ba921afdbb..94e95e1ae02b 100644 --- a/hbase-compression/hbase-compression-zstd/src/test/java/org/apache/hadoop/hbase/io/compress/zstd/TestZstdByteBuffDecompressor.java +++ b/hbase-compression/hbase-compression-zstd/src/test/java/org/apache/hadoop/hbase/io/compress/zstd/TestZstdByteBuffDecompressor.java @@ -60,8 +60,8 @@ public void testCapabilities() { try (ZstdByteBuffDecompressor decompressor = new ZstdByteBuffDecompressor(null)) { assertTrue(decompressor.canDecompress(emptySingleHeapBuff, emptySingleHeapBuff)); assertTrue(decompressor.canDecompress(emptySingleDirectBuff, emptySingleDirectBuff)); - assertFalse(decompressor.canDecompress(emptySingleHeapBuff, emptySingleDirectBuff)); - assertFalse(decompressor.canDecompress(emptySingleDirectBuff, emptySingleHeapBuff)); + assertTrue(decompressor.canDecompress(emptySingleHeapBuff, emptySingleDirectBuff)); + assertTrue(decompressor.canDecompress(emptySingleDirectBuff, emptySingleHeapBuff)); assertFalse(decompressor.canDecompress(emptyMultiHeapBuff, emptyMultiHeapBuff)); assertFalse(decompressor.canDecompress(emptyMultiDirectBuff, emptyMultiDirectBuff)); assertFalse(decompressor.canDecompress(emptySingleHeapBuff, emptyMultiHeapBuff)); @@ -70,7 +70,7 @@ public void testCapabilities() { } @Test - public void testDecompressHeap() throws IOException { + public void testDecompressHeapToHeap() throws IOException { try (ZstdByteBuffDecompressor decompressor = new ZstdByteBuffDecompressor(null)) { ByteBuff output = new SingleByteBuff(ByteBuffer.allocate(64)); ByteBuff input = new SingleByteBuff(ByteBuffer.wrap(COMPRESSED_PAYLOAD)); @@ -81,7 +81,7 @@ public void testDecompressHeap() throws IOException { } @Test - public void testDecompressDirect() throws IOException { + public void testDecompressDirectToDirect() throws IOException { try (ZstdByteBuffDecompressor decompressor = new ZstdByteBuffDecompressor(null)) { ByteBuff output = new SingleByteBuff(ByteBuffer.allocateDirect(64)); ByteBuff input = new SingleByteBuff(ByteBuffer.allocateDirect(COMPRESSED_PAYLOAD.length)); @@ -93,4 +93,28 @@ public void testDecompressDirect() throws IOException { } } + @Test + public void testDecompressDirectToHeap() throws 
IOException { + try (ZstdByteBuffDecompressor decompressor = new ZstdByteBuffDecompressor(null)) { + ByteBuff output = new SingleByteBuff(ByteBuffer.allocate(64)); + ByteBuff input = new SingleByteBuff(ByteBuffer.allocateDirect(COMPRESSED_PAYLOAD.length)); + input.put(COMPRESSED_PAYLOAD); + input.rewind(); + int decompressedSize = decompressor.decompress(output, input, COMPRESSED_PAYLOAD.length); + assertEquals("HBase is fun to use and very fast", + Bytes.toString(output.toBytes(0, decompressedSize))); + } + } + + @Test + public void testDecompressHeapToDirect() throws IOException { + try (ZstdByteBuffDecompressor decompressor = new ZstdByteBuffDecompressor(null)) { + ByteBuff output = new SingleByteBuff(ByteBuffer.allocateDirect(64)); + ByteBuff input = new SingleByteBuff(ByteBuffer.wrap(COMPRESSED_PAYLOAD)); + int decompressedSize = decompressor.decompress(output, input, COMPRESSED_PAYLOAD.length); + assertEquals("HBase is fun to use and very fast", + Bytes.toString(output.toBytes(0, decompressedSize))); + } + } + } diff --git a/pom.xml b/pom.xml index ce0a86eb8ec7..0cff942e8dad 100644 --- a/pom.xml +++ b/pom.xml @@ -662,7 +662,7 @@ 1.11.0 1.8.0 1.1.10.4 - 1.5.7-1 + 1.5.7-2