Skip to content

Commit 8e65f9c

Browse files
committedMar 28, 2025
[#26455] docdb: Block-based ANALYZE should handle dynamic tablet splits
Summary: It can happen that tablets are split during execution of ANALYZE query. Added `PgAnalyzeTest.AnalyzeSamplingNonColocatedWithConcurrentSplits` for testing this scenario. Updated `SampleBlocksFeed::FetchTo` to support splitting sample blocks and assign to child tablets in case split happened after we've determined sample blocks but before we've created requests. Without that change sample block on split boundary could be assigned only to one child tablet and we won't take data from another child tablet into account that would affect pg_stats accuracy and estimated number of rows. Jira: DB-15817 Test Plan: PgAnalyzeTest.AnalyzeSamplingNonColocatedWithConcurrentSplits Reviewers: amartsinchyk Reviewed By: amartsinchyk Subscribers: yql, ybase Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D42723
1 parent f95a77f commit 8e65f9c

File tree

4 files changed

+547
-210
lines changed

4 files changed

+547
-210
lines changed
 

‎src/yb/client/async_rpc.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,6 @@ void AsyncRpc::Finished(const Status& status) {
243243
DEBUG_ONLY_TEST_SYNC_POINT("AsyncRpc::Finished:SetTimedOut:1");
244244
DEBUG_ONLY_TEST_SYNC_POINT("AsyncRpc::Finished:SetTimedOut:2");
245245
}
246-
247246
}
248247
if (tablet_invoker_.Done(&new_status)) {
249248
if (tablet().is_split() ||
@@ -841,6 +840,7 @@ void ReadRpc::CallRemoteMethod() {
841840
TRACE_TO(trace, "SendRpcToTserver");
842841
ADOPT_TRACE(trace.get());
843842

843+
DEBUG_ONLY_TEST_SYNC_POINT_CALLBACK("ReadRpc::CallRemoteMethod", &req_);
844844
tablet_invoker_.proxy()->ReadAsync(
845845
req_, &resp_, PrepareController(), std::bind(&ReadRpc::Finished, this, Status::OK()));
846846
TRACE_TO(trace, "RpcDispatched Asynchronously");
@@ -918,6 +918,7 @@ Status ReadRpc::SwapResponses() {
918918
}
919919

920920
void ReadRpc::NotifyBatcher(const Status& status) {
921+
DEBUG_ONLY_TEST_SYNC_POINT_CALLBACK("ReadRpc::NotifyBatcher", &resp_);
921922
batcher_->ProcessReadResponse(*this, status);
922923
}
923924

‎src/yb/docdb/pgsql_operation.cc

+8-7
Original file line numberDiff line numberDiff line change
@@ -2319,7 +2319,7 @@ Status SampleRowsFromBlocks(
23192319
bool fetch_next_needed = false;
23202320

23212321
size_t sample_block_idx = 0;
2322-
size_t num_blocks_present = 0;
2322+
size_t num_blocks_with_rows = 0;
23232323
for (const auto& sample_block : sample_blocks) {
23242324
const auto[lower_bound_key_inclusive, upper_bound_key_exclusive] =
23252325
GetSampleBlockBounds(sample_block);
@@ -2385,20 +2385,21 @@ Status SampleRowsFromBlocks(
23852385
}
23862386

23872387
if (found_row) {
2388-
++num_blocks_present;
2388+
++num_blocks_with_rows;
23892389
}
23902390

2391-
VLOG(3) << "num_sample_rows: " << size_t(num_sample_rows)
2392-
<< ", ybctid after the sample block #" << sample_block_idx << ": "
2393-
<< DebugKeySliceToString(table_iter->GetTupleId())
2394-
<< " row_key: " << DebugKeySliceToString(row_key);
2391+
VLOG(3) << "num_sample_rows: " << size_t(num_sample_rows) << ", ybctid after the sample block #"
2392+
<< sample_block_idx << ": " << DebugKeySliceToString(table_iter->GetTupleId())
2393+
<< " row_key: " << DebugKeySliceToString(row_key)
2394+
<< " found_row: " << found_row
2395+
<< " num_blocks_with_rows: " << num_blocks_with_rows;
23952396
if (reached_end_of_tablet) {
23962397
break;
23972398
}
23982399
++sample_block_idx;
23992400
}
24002401

2401-
VLOG(2) << "num_blocks_present: " << num_blocks_present;
2402+
VLOG(2) << "num_blocks_with_rows: " << num_blocks_with_rows;
24022403
sampling_state->set_samplerows(num_sample_rows);
24032404
sampling_state->set_rowstoskip(rows_to_skip);
24042405
sampling_state->set_numrows(num_rows_collected);

‎src/yb/yql/pggate/pg_sample.cc

+129-43
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@
3333

3434
#include "yb/yql/pggate/util/ybc_guc.h"
3535

36+
DEFINE_test_flag(bool, refresh_partitions_after_fetched_sample_blocks, false,
37+
"Force table partitions refresh after sample blocks are fetched.");
3638
DEFINE_test_flag(int64, delay_after_table_analyze_ms, 0,
3739
"Add this delay after each table is analyzed.");
3840

@@ -56,6 +58,11 @@ class SampleRowsPickerIf {
5658

5759
namespace {
5860

61+
std::string AsDebugHexString(const LWPgsqlSampleBlockPB& sample_block_pb) {
62+
return AsDebugHexString(
63+
std::make_pair(sample_block_pb.lower_bound(), sample_block_pb.upper_bound()));
64+
}
65+
5966
class PgDocSampleOp : public PgDocReadOp {
6067
public:
6168
struct SamplingStats {
@@ -80,13 +87,14 @@ class PgDocSampleOp : public PgDocReadOp {
8087
template_read_req.has_sampling_state(), IllegalState,
8188
"PgDocSampleOp is expected to have sampling state");
8289
SCHECK(
83-
sample_blocks_.empty() || !template_read_req.sampling_state().is_blocks_sampling_stage(),
90+
sorted_sample_blocks_.empty() ||
91+
!template_read_req.sampling_state().is_blocks_sampling_stage(),
8492
IllegalState, "Sample blocks are not expected to be set for blocks sampling stage");
8593

8694
// Sample blocks will be distributed across tablets/table partitions below.
8795
std::optional<SampleBlocksFeed> sample_blocks_feed;
88-
if (!sample_blocks_.empty()) {
89-
sample_blocks_feed.emplace(sample_blocks_);
96+
if (!sorted_sample_blocks_.empty()) {
97+
sample_blocks_feed.emplace(sorted_sample_blocks_);
9098
}
9199

92100
// Create one PgsqlOp per partition
@@ -114,6 +122,9 @@ class PgDocSampleOp : public PgDocReadOp {
114122
VERIFY_RESULT(AssignSampleBlocks(
115123
&read_req, partition_keys, partition, &sample_blocks_feed.value()))) {
116124
pgsql_ops_[partition]->set_active(true);
125+
VLOG_WITH_PREFIX_AND_FUNC(3)
126+
<< "Request #" << partition << " of " << partition_keys.size()
127+
<< " for partition: " << Slice(partition_keys[partition]).ToDebugHexString();
117128
++active_op_count_;
118129
}
119130
}
@@ -174,8 +185,41 @@ class PgDocSampleOp : public PgDocReadOp {
174185
const SamplingStats& GetSamplingStats() const { return sampling_stats_; }
175186

176187
Status SetSampleBlocksBounds(std::vector<std::pair<KeyBuffer, KeyBuffer>>&& sample_blocks) {
177-
sample_blocks_ = std::move(sample_blocks);
178-
SCHECK(!sample_blocks_.empty(), IllegalState, "Sample blocks list should not be empty.");
188+
sorted_sample_blocks_ = std::move(sample_blocks);
189+
SCHECK(!sorted_sample_blocks_.empty(), IllegalState, "Sample blocks list should not be empty.");
190+
191+
std::sort(
192+
sorted_sample_blocks_.begin(), sorted_sample_blocks_.end(),
193+
[](const std::pair<KeyBuffer, KeyBuffer>& b1, const std::pair<KeyBuffer, KeyBuffer>& b2) {
194+
return b1.first < b2.first;
195+
});
196+
197+
if (VLOG_IS_ON(3)) {
198+
size_t idx = 0;
199+
for (const auto& sample_block : sorted_sample_blocks_) {
200+
VLOG_WITH_FUNC(3) << "Sorted sample block #" << idx << ": "
201+
<< AsDebugHexString(sample_block);
202+
++idx;
203+
}
204+
}
205+
206+
Slice prev_upper_bound;
207+
size_t idx = 0;
208+
for (const auto& sample_block : sorted_sample_blocks_) {
209+
if (sample_block.first.AsSlice() < prev_upper_bound) {
210+
return STATUS_FORMAT(
211+
InternalError, "Sorted sample block #$0: $1 starts before prev_upper_bound: $2", idx,
212+
AsDebugHexString(sample_block), AsDebugHexString(prev_upper_bound));
213+
}
214+
prev_upper_bound = sample_block.second.AsSlice();
215+
++idx;
216+
}
217+
218+
if (FLAGS_TEST_refresh_partitions_after_fetched_sample_blocks) {
219+
const auto pg_table_id = table_->pg_table_id();
220+
pg_session_->InvalidateTableCache(pg_table_id, InvalidateOnPgClient::kTrue);
221+
table_ = PgTable(CHECK_RESULT(pg_session_->LoadTable(pg_table_id)));
222+
}
179223
return Status::OK();
180224
}
181225

@@ -184,18 +228,19 @@ class PgDocSampleOp : public PgDocReadOp {
184228
class SampleBlocksFeed {
185229
public:
186230
// Transfers all sample blocks from `other` list into internal storage.
187-
explicit SampleBlocksFeed(const std::vector<std::pair<KeyBuffer, KeyBuffer>>& sample_blocks)
188-
: sample_blocks_(sample_blocks) {
189-
sample_block_iter_ = sample_blocks_.cbegin();
190-
is_single_unbounded_block_ = sample_block_iter_ != sample_blocks_.cend() &&
231+
explicit SampleBlocksFeed(
232+
const std::vector<std::pair<KeyBuffer, KeyBuffer>>& sorted_sample_blocks)
233+
: sorted_sample_blocks_(sorted_sample_blocks) {
234+
sample_block_iter_ = sorted_sample_blocks_.cbegin();
235+
is_single_unbounded_block_ = sample_block_iter_ != sorted_sample_blocks_.cend() &&
191236
sample_block_iter_->first.empty() &&
192237
sample_block_iter_->second.empty();
193238
}
194239

195240
// Fetches sample block boundaries from internal storage until `exclusive_upper_bound` and
196241
// assigns them to `dst`.
197242
Status FetchTo(
198-
::yb::ArenaList<::yb::LWPgsqlSampleBlockPB>* dst, std::string exclusive_upper_bound) {
243+
::yb::ArenaList<LWPgsqlSampleBlockPB>* dst, const std::string& exclusive_upper_bound) {
199244
if (is_single_unbounded_block_) {
200245
// We should fully sample all tablets.
201246
auto& sample_block_pb = *dst->Add();
@@ -204,37 +249,92 @@ class PgDocSampleOp : public PgDocReadOp {
204249
return Status::OK();
205250
}
206251

207-
for (; sample_block_iter_ != sample_blocks_.cend() &&
252+
for (; sample_block_iter_ != sorted_sample_blocks_.cend() &&
208253
(exclusive_upper_bound.empty() ||
209-
sample_block_iter_->first.AsSlice() < exclusive_upper_bound);
254+
(!sample_block_iter_->second.empty() &&
255+
sample_block_iter_->second.AsSlice() <= exclusive_upper_bound));
210256
sample_block_iter_++) {
211-
212-
const auto cmp = sample_block_iter_->first.AsSlice().compare(prev_upper_bound);
213-
if (cmp < 0) {
214-
return STATUS_FORMAT(
215-
InternalError, "Sample block: $0 starts before prev_upper_bound: $1",
216-
AsDebugHexString(*sample_block_iter_), AsDebugHexString(prev_upper_bound));
217-
}
218-
if (cmp == 0 && !dst->empty()) {
219-
// Combine with the previous block.
220-
*dst->back().mutable_upper_bound() = sample_block_iter_->second.AsSlice();
257+
LWPgsqlSampleBlockPB* sample_block_pb;
258+
259+
if (!override_next_block_lower_bound_.empty()) {
260+
SCHECK(
261+
dst->empty(), InternalError,
262+
Format(
263+
"Expected dst (has $0 blocks) to be empty when override_next_block_lower_bound_ "
264+
"is set "
265+
"($1)",
266+
dst->size(), AsDebugHexString(override_next_block_lower_bound_)));
267+
sample_block_pb = dst->Add();
268+
sample_block_pb->dup_lower_bound(override_next_block_lower_bound_.AsSlice());
269+
override_next_block_lower_bound_.clear();
221270
} else {
222-
auto& sample_block_pb = *dst->Add();
223-
*sample_block_pb.mutable_lower_bound() = sample_block_iter_->first.AsSlice();
224-
*sample_block_pb.mutable_upper_bound() = sample_block_iter_->second.AsSlice();
271+
sample_block_pb = AddOrUpdateBlock(dst, sample_block_iter_->first.AsSlice());
225272
}
226273

227-
prev_upper_bound = sample_block_iter_->second.AsSlice();
274+
*sample_block_pb->mutable_upper_bound() = sample_block_iter_->second.AsSlice();
275+
RETURN_NOT_OK(OnLatestBlockBoundsSet(dst));
276+
}
277+
278+
SCHECK(
279+
!exclusive_upper_bound.empty() || sample_block_iter_ == sorted_sample_blocks_.cend(),
280+
InternalError,
281+
Format(
282+
"Unexpected stop at sorted sample block $0 while exclusive_upper_bound is empty",
283+
AsDebugHexString(*sample_block_iter_)));
284+
285+
if (sample_block_iter_ == sorted_sample_blocks_.cend() ||
286+
sample_block_iter_->first.AsSlice() >= exclusive_upper_bound) {
287+
return Status::OK();
228288
}
229289

290+
// Sample block might cross exclusive_upper_bound due to tablet has been split since
291+
// block boundaries were calculated - split the block in this case.
292+
VLOG_WITH_FUNC(1)
293+
<< "Splitting the sample block: " << AsDebugHexString(*sample_block_iter_)
294+
<< " exclusive_upper_bound: " << AsDebugHexString(Slice(exclusive_upper_bound));
295+
296+
auto* sample_block_pb = AddOrUpdateBlock(dst, sample_block_iter_->first.AsSlice());
297+
sample_block_pb->dup_upper_bound(exclusive_upper_bound);
298+
RETURN_NOT_OK(OnLatestBlockBoundsSet(dst));
299+
300+
override_next_block_lower_bound_ = exclusive_upper_bound;
301+
230302
return Status::OK();
231303
}
232304

233305
private:
234-
const std::vector<std::pair<KeyBuffer, KeyBuffer>>& sample_blocks_;
306+
LWPgsqlSampleBlockPB* AddOrUpdateBlock(
307+
::yb::ArenaList<LWPgsqlSampleBlockPB>* dst, Slice lower_bound) {
308+
if (!dst->empty() && dst->back().upper_bound() == sample_block_iter_->first.AsSlice()) {
309+
// Update previous block to combine with the current one.
310+
return &dst->back();
311+
}
312+
auto* block = dst->Add();
313+
*block->mutable_lower_bound() = lower_bound;
314+
return block;
315+
}
316+
317+
Status OnLatestBlockBoundsSet(::yb::ArenaList<LWPgsqlSampleBlockPB>* dst) {
318+
auto* sample_block_pb = &dst->back();
319+
VLOG_WITH_FUNC(2) << "Sample block at dst[" << dst->size() - 1 << "]: "
320+
<< AsDebugHexString(*sample_block_pb);
321+
SCHECK(
322+
sample_block_pb->upper_bound().empty() ||
323+
sample_block_pb->lower_bound() < sample_block_pb->upper_bound(),
324+
InternalError,
325+
Format(
326+
"Wrong bounds order for sample block: $0",
327+
AsDebugHexString(*sample_block_pb)));
328+
return Status::OK();
329+
}
330+
331+
const std::vector<std::pair<KeyBuffer, KeyBuffer>>& sorted_sample_blocks_;
235332
std::vector<std::pair<KeyBuffer, KeyBuffer>>::const_iterator sample_block_iter_;
236-
Slice prev_upper_bound;
237333
bool is_single_unbounded_block_;
334+
// Not empty iff we've split sample block at sample_block_iter_ during previous FetchTo call.
335+
// In this case we use the rest of this block starting with override_next_block_lower_bound_
336+
// for the next FetchTo call.
337+
KeyBuffer override_next_block_lower_bound_;
238338
};
239339

240340
Result<bool> AssignSampleBlocks(
@@ -262,7 +362,7 @@ class PgDocSampleOp : public PgDocReadOp {
262362

263363
const std::string LogPrefix() const { return log_prefix_; }
264364

265-
std::vector<std::pair<KeyBuffer, KeyBuffer>> sample_blocks_;
365+
std::vector<std::pair<KeyBuffer, KeyBuffer>> sorted_sample_blocks_;
266366
SamplingStats sampling_stats_;
267367
std::string log_prefix_;
268368
};
@@ -387,20 +487,6 @@ class SampleBlocksPicker : public SamplePickerBase {
387487
return Status::OK();
388488
}
389489

390-
std::sort(
391-
blocks_reservoir_.begin(), blocks_reservoir_.end(),
392-
[](const std::pair<KeyBuffer, KeyBuffer>& b1, const std::pair<KeyBuffer, KeyBuffer>& b2) {
393-
return b1.first < b2.first;
394-
});
395-
396-
if (VLOG_IS_ON(3)) {
397-
size_t idx = 0;
398-
for (const auto& sample_block : blocks_reservoir_) {
399-
VLOG_WITH_FUNC(3) << "Sorted sample block #" << idx << ": "
400-
<< AsDebugHexString(sample_block);
401-
++idx;
402-
}
403-
}
404490
return Status::OK();
405491
}
406492

0 commit comments

Comments
 (0)