Skip to content

Commit 30ebe42

Browse files
authored
[opt](bloomfilter index) optimize memory usage for bloom filter index writer (#45833)
### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Optimize memory usage when adding string values to the bloom filter index writer. The writer now stores a uint64 hash of each string value instead of the string value itself, which is expected to save a lot of memory, especially for long text.
1 parent 6b51e9d commit 30ebe42

File tree

3 files changed

+56
-12
lines changed

3 files changed

+56
-12
lines changed

be/src/olap/rowset/segment_v2/bloom_filter.h

+10
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,16 @@ class BloomFilter {
167167
return hash_code;
168168
}
169169

170+
// Computes the 64-bit hash of the `size` bytes starting at `buf`.
//
// Only HASH_MURMUR3_X64_64 is accepted: the bytes are hashed with
// murmur_hash3_x64_64 using DEFAULT_SEED and the resulting code is returned.
// Any other strategy yields an InvalidArgument status naming the strategy.
static Result<uint64_t> hash(const char* buf, uint32_t size, HashStrategyPB strategy) {
    // Reject unsupported strategies up front so the hashing path stays flat.
    if (strategy != HASH_MURMUR3_X64_64) {
        return Status::InvalidArgument("invalid strategy:{}", strategy);
    }
    uint64_t digest;
    murmur_hash3_x64_64(buf, size, DEFAULT_SEED, &digest);
    return digest;
}
179+
170180
virtual void add_bytes(const char* buf, uint32_t size) {
171181
if (buf == nullptr) {
172182
*_has_null = true;

be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp

+16-11
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,10 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter {
7878
for (int i = 0; i < count; ++i) {
7979
if (_values.find(*v) == _values.end()) {
8080
if constexpr (_is_slice_type()) {
81-
CppType new_value;
82-
RETURN_IF_CATCH_EXCEPTION(_type_info->deep_copy(&new_value, v, &_arena));
83-
_values.insert(new_value);
81+
const auto* s = reinterpret_cast<const Slice*>(v);
82+
auto hash =
83+
DORIS_TRY(BloomFilter::hash(s->data, s->size, _bf_options.strategy));
84+
_hash_values.insert(hash);
8485
} else if constexpr (_is_int128()) {
8586
int128_t new_value;
8687
memcpy(&new_value, v, sizeof(PackedInt128));
@@ -99,25 +100,28 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter {
99100
// Builds one BLOCK_BLOOM_FILTER from the values accumulated so far, appends it
// to _bfs, adds its size to _bf_buffer_size, and resets the accumulators
// (_values, _hash_values, _has_null). Returns early with the failing status if
// creating or initialising the filter fails.
//
// NOTE: this is a diff view. Lines following a "NNN-" marker are the body
// before the change; lines following a "NNN+" marker are the body after it.
Status flush() override {
100101
std::unique_ptr<BloomFilter> bf;
101102
RETURN_IF_ERROR(BloomFilter::create(BLOCK_BLOOM_FILTER, &bf));
102-
// Removed version: the filter was always sized from _values, and a single loop
// over _values branched on the slice type for every element.
RETURN_IF_ERROR(bf->init(_values.size(), _bf_options.fpp, _bf_options.strategy));
103-
bf->set_has_null(_has_null);
104-
for (auto& v : _values) {
105-
if constexpr (_is_slice_type()) {
106-
auto* s = (Slice*)&v;
107-
bf->add_bytes(s->data, s->size);
108-
} else {
103+
// Added version: the slice-type decision is made once, outside the loops.
// Slice-typed writers size and populate the filter from _hash_values through
// add_hash; every other type keeps adding the raw bytes of each entry in _values.
if constexpr (_is_slice_type()) {
104+
RETURN_IF_ERROR(bf->init(_hash_values.size(), _bf_options.fpp, _bf_options.strategy));
105+
for (const auto& h : _hash_values) {
106+
bf->add_hash(h);
107+
}
108+
} else {
109+
RETURN_IF_ERROR(bf->init(_values.size(), _bf_options.fpp, _bf_options.strategy));
110+
for (auto& v : _values) {
109111
bf->add_bytes((char*)&v, sizeof(CppType));
110112
}
111113
}
114+
// In the added version the null flag is set after the filter has been
// populated, once for both branches.
bf->set_has_null(_has_null);
112115
_bf_buffer_size += bf->size();
113116
_bfs.push_back(std::move(bf));
114117
_values.clear();
118+
// The hash accumulator is reset together with _values.
_hash_values.clear();
115119
_has_null = false;
116120
return Status::OK();
117121
}
118122

119123
Status finish(io::FileWriter* file_writer, ColumnIndexMetaPB* index_meta) override {
120-
if (_values.size() > 0) {
124+
if (_values.size() > 0 || !_hash_values.empty()) {
121125
RETURN_IF_ERROR(flush());
122126
}
123127
index_meta->set_type(BLOOM_FILTER_INDEX);
@@ -166,6 +170,7 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter {
166170
// distinct values
167171
ValueDict _values;
168172
std::vector<std::unique_ptr<BloomFilter>> _bfs;
173+
std::set<uint64_t> _hash_values;
169174
};
170175

171176
} // namespace

be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp

+30-1
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,12 @@ Status test_bloom_filter_index_reader_writer_template(
180180
}
181181
// test nullptr
182182
EXPECT_TRUE(bf->test_bytes(nullptr, 1));
183-
183+
if (is_slice_type) {
184+
Slice* value = (Slice*)(not_exist_value);
185+
EXPECT_FALSE(bf->test_bytes(value->data, value->size));
186+
} else {
187+
EXPECT_FALSE(bf->test_bytes((char*)not_exist_value, sizeof(CppType)));
188+
}
184189
delete reader;
185190
}
186191
return Status::OK();
@@ -803,5 +808,29 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_bloom_filter_fpp_multiple) {
803808
test_bloom_filter_fpp(fpp);
804809
}
805810
}
811+
812+
// Runs the bloom filter index reader/writer round trip on VARCHAR slices:
// 3072 slices of 256 bytes each, backed by one contiguous buffer. Slice i is
// filled with the single letter 'a' + (i % 26), so the contents cycle through
// 26 distinct values. "not_exist_val" is passed as the value that must be absent.
TEST_F(BloomFilterIndexReaderWriterTest, test_slice_memory_usage) {
    const size_t slice_bytes = 256;
    size_t slice_count = 1024 * 3;

    // One backing buffer holds the payload of every slice.
    std::vector<char> backing(slice_count * slice_bytes);
    std::vector<Slice> slices(slice_count);

    char* cursor = backing.data();
    for (size_t idx = 0; idx < slice_count; ++idx, cursor += slice_bytes) {
        memset(cursor, 'a' + (idx % 26), slice_bytes);
        slices[idx].data = cursor;
        slices[idx].size = slice_bytes;
    }

    std::string missing_text = "not_exist_val";
    Slice missing_slice(missing_text);

    auto status = test_bloom_filter_index_reader_writer_template<FieldType::OLAP_FIELD_TYPE_VARCHAR>(
            "bloom_filter_large_slices", slices.data(), slice_count, 1, &missing_slice, true,
            false);
    EXPECT_TRUE(status.ok());
}
806835
} // namespace segment_v2
807836
} // namespace doris

0 commit comments

Comments
 (0)