Skip to content

Commit

Permalink
[improve](cloud-mow)Add delete bitmap metrics (#47028)
Browse files Browse the repository at this point in the history
Count the max delete bitmap score of tablets and base rowsets, and expose them
as metrics.
  • Loading branch information
hust-hhb authored Feb 6, 2025
1 parent df9eda8 commit 8373a2e
Show file tree
Hide file tree
Showing 17 changed files with 476 additions and 0 deletions.
28 changes: 28 additions & 0 deletions be/src/cloud/cloud_storage_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,12 @@ Status CloudStorageEngine::start_bg_threads(std::shared_ptr<WorkloadGroup> wg_sp

LOG(INFO) << "lease compaction thread started";

RETURN_IF_ERROR(Thread::create(
"StorageEngine", "check_tablet_delete_bitmap_score_thread",
[this]() { this->_check_tablet_delete_bitmap_score_callback(); },
&_bg_threads.emplace_back()));
LOG(INFO) << "check tablet delete bitmap score thread started";

return Status::OK();
}

Expand Down Expand Up @@ -796,6 +802,28 @@ void CloudStorageEngine::_lease_compaction_thread_callback() {
}
}

// Body of the background thread that refreshes the delete bitmap score metrics.
//
// Each iteration waits config::check_tablet_delete_bitmap_interval_seconds on
// _stop_background_threads_latch; the loop ends once that wait returns true. Otherwise the
// tablet manager is asked for the largest tablet-level and base-rowset-level delete bitmap
// scores, and every non-zero result is written to the matching metric.
void CloudStorageEngine::_check_tablet_delete_bitmap_score_callback() {
    LOG(INFO) << "try to start check tablet delete bitmap score!";
    while (!_stop_background_threads_latch.wait_for(
            std::chrono::seconds(config::check_tablet_delete_bitmap_interval_seconds))) {
        if (!config::enable_check_tablet_delete_bitmap_score) {
            // NOTE(review): returning here ends this thread for good, so turning the flag
            // back on later has no effect until the thread is started again -- confirm
            // this is intended.
            return;
        }
        // The scores are passed as in/out values and must start from 0 on every round.
        uint64_t max_delete_bitmap_score = 0;
        uint64_t max_base_rowset_delete_bitmap_score = 0;
        tablet_mgr().get_topn_tablet_delete_bitmap_score(&max_delete_bitmap_score,
                                                         &max_base_rowset_delete_bitmap_score);
        // A zero score is not published, so the metric keeps its previous value.
        if (max_delete_bitmap_score > 0) {
            _tablet_max_delete_bitmap_score_metrics->set_value(max_delete_bitmap_score);
        }
        if (max_base_rowset_delete_bitmap_score > 0) {
            _tablet_max_base_rowset_delete_bitmap_score_metrics->set_value(
                    max_base_rowset_delete_bitmap_score);
        }
    }
}

Status CloudStorageEngine::get_compaction_status_json(std::string* result) {
rapidjson::Document root;
root.SetObject();
Expand Down
1 change: 1 addition & 0 deletions be/src/cloud/cloud_storage_engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ class CloudStorageEngine final : public BaseStorageEngine {
Status _submit_cumulative_compaction_task(const CloudTabletSPtr& tablet);
Status _submit_full_compaction_task(const CloudTabletSPtr& tablet);
void _lease_compaction_thread_callback();
void _check_tablet_delete_bitmap_score_callback();

std::atomic_bool _stopped {false};

Expand Down
46 changes: 46 additions & 0 deletions be/src/cloud/cloud_tablet_mgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -421,4 +421,50 @@ void CloudTabletMgr::get_tablet_info(int64_t num_tablets, std::vector<TabletInfo
}
}

void CloudTabletMgr::get_topn_tablet_delete_bitmap_score(
uint64_t* max_delete_bitmap_score, uint64_t* max_base_rowset_delete_bitmap_score) {
int64_t max_delete_bitmap_score_tablet_id = 0;
OlapStopWatch watch;
uint64_t total_delete_map_count = 0;
int64_t max_base_rowset_delete_bitmap_score_tablet_id = 0;
int n = config::check_tablet_delete_bitmap_score_top_n;
std::vector<std::pair<std::shared_ptr<CloudTablet>, int64_t>> buf;
buf.reserve(n + 1);
auto handler = [&](const std::weak_ptr<CloudTablet>& tablet_wk) {
auto t = tablet_wk.lock();
if (!t) return;
uint64_t delete_bitmap_count =
t.get()->tablet_meta()->delete_bitmap().get_delete_bitmap_count();
total_delete_map_count += delete_bitmap_count;
if (delete_bitmap_count > *max_delete_bitmap_score) {
max_delete_bitmap_score_tablet_id = t->tablet_id();
*max_delete_bitmap_score = delete_bitmap_count;
}
buf.emplace_back(std::move(t), delete_bitmap_count);
std::sort(buf.begin(), buf.end(), [](auto& a, auto& b) { return a.second > b.second; });
if (buf.size() > n) {
buf.pop_back();
}
};
auto weak_tablets = get_weak_tablets();
std::for_each(weak_tablets.begin(), weak_tablets.end(), handler);
for (auto& [t, _] : buf) {
t->get_base_rowset_delete_bitmap_count(max_base_rowset_delete_bitmap_score,
&max_base_rowset_delete_bitmap_score_tablet_id);
}
std::stringstream ss;
for (auto& i : buf) {
ss << i.first->tablet_id() << ":" << i.second << ",";
}
LOG(INFO) << "get_topn_tablet_delete_bitmap_score, n=" << n
<< ",tablet size=" << weak_tablets.size()
<< ",total_delete_map_count=" << total_delete_map_count
<< ",cost(us)=" << watch.get_elapse_time_us()
<< ",max_delete_bitmap_score=" << *max_delete_bitmap_score
<< ",max_delete_bitmap_score_tablet_id=" << max_delete_bitmap_score_tablet_id
<< ",max_base_rowset_delete_bitmap_score=" << *max_base_rowset_delete_bitmap_score
<< ",max_base_rowset_delete_bitmap_score_tablet_id="
<< max_base_rowset_delete_bitmap_score_tablet_id << ",tablets=[" << ss.str() << "]";
}

} // namespace doris
3 changes: 3 additions & 0 deletions be/src/cloud/cloud_tablet_mgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ class CloudTabletMgr {

void get_tablet_info(int64_t num_tablets, std::vector<TabletInfo>* tablets_info);

void get_topn_tablet_delete_bitmap_score(uint64_t* max_delete_bitmap_score,
uint64_t* max_base_rowset_delete_bitmap_score);

private:
CloudStorageEngine& _engine;

Expand Down
4 changes: 4 additions & 0 deletions be/src/common/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1412,6 +1412,10 @@ DEFINE_mBool(enable_sleep_between_delete_cumu_compaction, "false");

DEFINE_mInt32(compaction_num_per_round, "1");

// Wait time, in seconds, between two runs of the background delete bitmap score check.
DEFINE_mInt32(check_tablet_delete_bitmap_interval_seconds, "300");
// Number of tablets with the highest delete bitmap count that are examined for their base
// rowset delete bitmap count and listed in the log.
DEFINE_mInt32(check_tablet_delete_bitmap_score_top_n, "10");
// Switch for the delete bitmap score check; the background thread returns when it reads false.
DEFINE_mBool(enable_check_tablet_delete_bitmap_score, "true");

// clang-format off
#ifdef BE_TEST
// test s3
Expand Down
4 changes: 4 additions & 0 deletions be/src/common/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -1498,6 +1498,10 @@ DECLARE_mBool(enable_sleep_between_delete_cumu_compaction);

DECLARE_mInt32(compaction_num_per_round);

// Wait time, in seconds, between two runs of the background delete bitmap score check.
DECLARE_mInt32(check_tablet_delete_bitmap_interval_seconds);
// Number of tablets with the highest delete bitmap count that are examined for their base
// rowset delete bitmap count and listed in the log.
DECLARE_mInt32(check_tablet_delete_bitmap_score_top_n);
// Switch for the delete bitmap score check; the background thread returns when it reads false.
DECLARE_mBool(enable_check_tablet_delete_bitmap_score);

#ifdef BE_TEST
// test s3
DECLARE_String(test_s3_resource);
Expand Down
34 changes: 34 additions & 0 deletions be/src/olap/base_tablet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1876,4 +1876,38 @@ Status BaseTablet::show_nested_index_file(std::string* json_meta) {
return Status::OK();
}

void BaseTablet::get_base_rowset_delete_bitmap_count(
uint64_t* max_base_rowset_delete_bitmap_score,
int64_t* max_base_rowset_delete_bitmap_score_tablet_id) {
std::vector<RowsetSharedPtr> rowsets_;
std::string base_rowset_id_str;
{
std::shared_lock rowset_ldlock(this->get_header_lock());
for (const auto& it : _rs_version_map) {
rowsets_.emplace_back(it.second);
}
}
std::sort(rowsets_.begin(), rowsets_.end(), Rowset::comparator);
if (!rowsets_.empty()) {
bool base_found = false;
for (auto& rowset : rowsets_) {
if (rowset->start_version() > 2) {
break;
}
base_found = true;
uint64_t base_rowset_delete_bitmap_count =
this->tablet_meta()->delete_bitmap().get_count_with_range(
{rowset->rowset_id(), 0, 0},
{rowset->rowset_id(), UINT32_MAX, UINT64_MAX});
if (base_rowset_delete_bitmap_count > *max_base_rowset_delete_bitmap_score) {
*max_base_rowset_delete_bitmap_score = base_rowset_delete_bitmap_count;
*max_base_rowset_delete_bitmap_score_tablet_id = this->tablet_id();
}
}
if (!base_found) {
LOG(WARNING) << "can not found base rowset for tablet " << tablet_id();
}
}
}

} // namespace doris
4 changes: 4 additions & 0 deletions be/src/olap/base_tablet.h
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,10 @@ class BaseTablet {
TabletUid tablet_uid() const { return _tablet_meta->tablet_uid(); }
TabletInfo get_tablet_info() const { return TabletInfo(tablet_id(), tablet_uid()); }

void get_base_rowset_delete_bitmap_count(
uint64_t* max_base_rowset_delete_bitmap_score,
int64_t* max_base_rowset_delete_bitmap_score_tablet_id);

protected:
// Find the missed versions until the spec_version.
//
Expand Down
28 changes: 28 additions & 0 deletions be/src/olap/olap_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,12 @@ Status StorageEngine::start_bg_threads(std::shared_ptr<WorkloadGroup> wg_sptr) {
[this]() { this->_async_publish_callback(); }, &_async_publish_thread));
LOG(INFO) << "async publish thread started";

RETURN_IF_ERROR(Thread::create(
"StorageEngine", "check_tablet_delete_bitmap_score_thread",
[this]() { this->_check_tablet_delete_bitmap_score_callback(); },
&_check_delete_bitmap_score_thread));
LOG(INFO) << "check tablet delete bitmap score thread started";

LOG(INFO) << "all storage engine's background threads are started.";
return Status::OK();
}
Expand Down Expand Up @@ -1642,4 +1648,26 @@ void StorageEngine::_async_publish_callback() {
}
}

// Body of the background thread that refreshes the delete bitmap score metrics.
//
// Each iteration waits config::check_tablet_delete_bitmap_interval_seconds on
// _stop_background_threads_latch; the loop ends once that wait returns true. Otherwise the
// tablet manager is asked for the largest tablet-level and base-rowset-level delete bitmap
// scores, and every non-zero result is written to the matching metric.
void StorageEngine::_check_tablet_delete_bitmap_score_callback() {
    LOG(INFO) << "try to start check tablet delete bitmap score!";
    while (!_stop_background_threads_latch.wait_for(
            std::chrono::seconds(config::check_tablet_delete_bitmap_interval_seconds))) {
        if (!config::enable_check_tablet_delete_bitmap_score) {
            // NOTE(review): returning here ends this thread for good, so turning the flag
            // back on later has no effect until the thread is started again -- confirm
            // this is intended.
            return;
        }
        // The scores are passed as in/out values and must start from 0 on every round.
        uint64_t max_delete_bitmap_score = 0;
        uint64_t max_base_rowset_delete_bitmap_score = 0;
        _tablet_manager->get_topn_tablet_delete_bitmap_score(
                &max_delete_bitmap_score, &max_base_rowset_delete_bitmap_score);
        // A zero score is not published, so the metric keeps its previous value.
        if (max_delete_bitmap_score > 0) {
            _tablet_max_delete_bitmap_score_metrics->set_value(max_delete_bitmap_score);
        }
        if (max_base_rowset_delete_bitmap_score > 0) {
            _tablet_max_base_rowset_delete_bitmap_score_metrics->set_value(
                    max_base_rowset_delete_bitmap_score);
        }
    }
}

} // namespace doris
5 changes: 5 additions & 0 deletions be/src/olap/storage_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@ BaseStorageEngine::BaseStorageEngine(Type type, const UniqueId& backend_uid)
_stop_background_threads_latch(1) {
_memory_limitation_bytes_for_schema_change =
static_cast<int64_t>(MemInfo::soft_mem_limit() * config::schema_change_mem_limit_frac);
_tablet_max_delete_bitmap_score_metrics =
std::make_shared<bvar::Status<size_t>>("tablet_max", "delete_bitmap_score", 0);
_tablet_max_base_rowset_delete_bitmap_score_metrics = std::make_shared<bvar::Status<size_t>>(
"tablet_max_base_rowset", "delete_bitmap_score", 0);
}

BaseStorageEngine::~BaseStorageEngine() = default;
Expand Down Expand Up @@ -706,6 +710,7 @@ void StorageEngine::stop() {
THREAD_JOIN(_async_publish_thread);
THREAD_JOIN(_cold_data_compaction_producer_thread);
THREAD_JOIN(_cooldown_tasks_producer_thread);
THREAD_JOIN(_check_delete_bitmap_score_thread);
#undef THREAD_JOIN

#define THREADS_JOIN(threads) \
Expand Down
9 changes: 9 additions & 0 deletions be/src/olap/storage_engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#pragma once

#include <butil/macros.h>
#include <bvar/bvar.h>
#include <gen_cpp/Types_types.h>
#include <gen_cpp/internal_service.pb.h>
#include <gen_cpp/olap_file.pb.h>
Expand Down Expand Up @@ -168,6 +169,9 @@ class BaseStorageEngine {
int _disk_num {-1};

std::shared_ptr<StreamLoadRecorder> _stream_load_recorder;

std::shared_ptr<bvar::Status<size_t>> _tablet_max_delete_bitmap_score_metrics;
std::shared_ptr<bvar::Status<size_t>> _tablet_max_base_rowset_delete_bitmap_score_metrics;
};

class CompactionSubmitRegistry {
Expand Down Expand Up @@ -430,6 +434,8 @@ class StorageEngine final : public BaseStorageEngine {

int32_t _auto_get_interval_by_disk_capacity(DataDir* data_dir);

void _check_tablet_delete_bitmap_score_callback();

private:
EngineOptions _options;
std::mutex _store_lock;
Expand Down Expand Up @@ -536,6 +542,9 @@ class StorageEngine final : public BaseStorageEngine {
std::unique_ptr<CreateTabletRRIdxCache> _create_tablet_idx_lru_cache;

std::unique_ptr<SnapshotManager> _snapshot_mgr;

// thread to check tablet delete bitmap count tasks
scoped_refptr<Thread> _check_delete_bitmap_score_thread;
};

// lru cache for create tablet round robin in disks
Expand Down
43 changes: 43 additions & 0 deletions be/src/olap/tablet_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1764,4 +1764,47 @@ bool TabletManager::update_tablet_partition_id(::doris::TPartitionId partition_i
return true;
}

// Visits every tablet via for_each_tablet(..., filter_all_tablets) and reports delete bitmap
// scores.
//
// @param max_delete_bitmap_score in/out. Raised to the largest per-tablet delete bitmap
//        count seen; it is never lowered, so callers are expected to pass in 0.
// @param max_base_rowset_delete_bitmap_score in/out. Handed to
//        get_base_rowset_delete_bitmap_count() of each of the top-n tablets.
//
// The top-n tablets (n = config::check_tablet_delete_bitmap_score_top_n, treated as 0 when
// negative) are the ones with the highest delete bitmap count. A summary line with the
// totals, the elapsed time and the top-n list is written to the INFO log.
void TabletManager::get_topn_tablet_delete_bitmap_score(
        uint64_t* max_delete_bitmap_score, uint64_t* max_base_rowset_delete_bitmap_score) {
    int64_t max_delete_bitmap_score_tablet_id = 0;
    int64_t max_base_rowset_delete_bitmap_score_tablet_id = 0;
    OlapStopWatch watch;
    uint64_t total_delete_map_count = 0;
    // Number of tablets the handler was called for; reported as "tablet size" below.
    // _tablets_shards.size() is, going by its name, the number of shards rather than tablets.
    size_t tablet_count = 0;
    const int n = config::check_tablet_delete_bitmap_score_top_n;
    // Work with an unsigned, non-negative limit so that a negative config value can neither
    // wrap around in reserve() nor disable the trimming of the buffer.
    const size_t top_n = n > 0 ? static_cast<size_t>(n) : 0;
    // Kept sorted by delete bitmap count in descending order, at most top_n entries.
    std::vector<std::pair<std::shared_ptr<Tablet>, uint64_t>> buf;
    buf.reserve(top_n + 1);
    auto handler = [&](const TabletSharedPtr& tablet) {
        ++tablet_count;
        const uint64_t delete_bitmap_count =
                tablet->tablet_meta()->delete_bitmap().get_delete_bitmap_count();
        total_delete_map_count += delete_bitmap_count;
        if (delete_bitmap_count > *max_delete_bitmap_score) {
            max_delete_bitmap_score_tablet_id = tablet->tablet_id();
            *max_delete_bitmap_score = delete_bitmap_count;
        }
        if (top_n == 0) {
            return;
        }
        if (buf.size() == top_n && delete_bitmap_count <= buf.back().second) {
            // Buffer is full and this tablet does not beat the current smallest entry.
            return;
        }
        // Insert behind all entries with an equal or larger count to keep the order.
        auto pos = std::upper_bound(
                buf.begin(), buf.end(), delete_bitmap_count,
                [](uint64_t score, const auto& entry) { return score > entry.second; });
        // The tablet is received by const reference, so the shared pointer is copied here.
        buf.emplace(pos, tablet, delete_bitmap_count);
        if (buf.size() > top_n) {
            buf.pop_back();
        }
    };
    for_each_tablet(handler, filter_all_tablets);
    for (auto& [t, _] : buf) {
        t->get_base_rowset_delete_bitmap_count(max_base_rowset_delete_bitmap_score,
                                               &max_base_rowset_delete_bitmap_score_tablet_id);
    }
    std::stringstream ss;
    for (const auto& i : buf) {
        ss << i.first->tablet_id() << ":" << i.second << ",";
    }
    LOG(INFO) << "get_topn_tablet_delete_bitmap_score, n=" << n
              << ",tablet size=" << tablet_count
              << ",total_delete_map_count=" << total_delete_map_count
              << ",cost(us)=" << watch.get_elapse_time_us()
              << ",max_delete_bitmap_score=" << *max_delete_bitmap_score
              << ",max_delete_bitmap_score_tablet_id=" << max_delete_bitmap_score_tablet_id
              << ",max_base_rowset_delete_bitmap_score=" << *max_base_rowset_delete_bitmap_score
              << ",max_base_rowset_delete_bitmap_score_tablet_id="
              << max_base_rowset_delete_bitmap_score_tablet_id << ",tablets=[" << ss.str() << "]";
}

} // end namespace doris
3 changes: 3 additions & 0 deletions be/src/olap/tablet_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,9 @@ class TabletManager {
bool update_tablet_partition_id(::doris::TPartitionId partition_id,
::doris::TTabletId tablet_id);

void get_topn_tablet_delete_bitmap_score(uint64_t* max_delete_bitmap_score,
uint64_t* max_base_rowset_delete_bitmap_score);

private:
// Add a tablet pointer to StorageEngine
// If force, drop the existing tablet add this new one
Expand Down
14 changes: 14 additions & 0 deletions be/src/olap/tablet_meta.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1186,6 +1186,20 @@ void DeleteBitmap::subset(const BitmapKey& start, const BitmapKey& end,
}
}

// Returns how many entries of delete_bitmap have a key in the half-open range [start, end).
// The map is read while holding `lock` in shared mode.
size_t DeleteBitmap::get_count_with_range(const BitmapKey& start, const BitmapKey& end) const {
    DCHECK(start < end);
    size_t matched = 0;
    std::shared_lock l(lock);
    // Walk forward from the first key that is not less than start until a key reaches end.
    auto cursor = delete_bitmap.lower_bound(start);
    while (cursor != delete_bitmap.end()) {
        if (cursor->first >= end) {
            break;
        }
        ++matched;
        ++cursor;
    }
    return matched;
}

void DeleteBitmap::merge(const BitmapKey& bmk, const roaring::Roaring& segment_delete_bitmap) {
std::lock_guard l(lock);
auto [iter, succ] = delete_bitmap.emplace(bmk, segment_delete_bitmap);
Expand Down
8 changes: 8 additions & 0 deletions be/src/olap/tablet_meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,14 @@ class DeleteBitmap {
void subset(const BitmapKey& start, const BitmapKey& end,
DeleteBitmap* subset_delete_map) const;

/**
* Gets count of delete_bitmap with given range [start, end)
*
* @param start inclusive lower bound of the key range
* @param end exclusive upper bound of the key range
*/
size_t get_count_with_range(const BitmapKey& start, const BitmapKey& end) const;

/**
* Merges the given segment delete bitmap into *this
*
Expand Down
12 changes: 12 additions & 0 deletions regression-test/data/metrics_p0/test_delete_bitmap_metrics.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
0 0 8
1 1 1
2 2 2
3 3 3
4 4 4
5 5 5
6 6 6
7 7 7
8 8 8

Loading

0 comments on commit 8373a2e

Please sign in to comment.