-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathBloomFilter.hpp
406 lines (371 loc) · 14.7 KB
/
BloomFilter.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
**/
#ifndef QUICKSTEP_UTILITY_BLOOM_FILTER_HPP
#define QUICKSTEP_UTILITY_BLOOM_FILTER_HPP
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>
#include "storage/StorageConstants.hpp"
#include "threading/Mutex.hpp"
#include "threading/SharedMutex.hpp"
#include "threading/SpinSharedMutex.hpp"
#include "utility/BloomFilter.pb.h"
#include "utility/Macros.hpp"
#include "glog/logging.h"
namespace quickstep {
/** \addtogroup Utility
* @{
*/
/**
* @brief A simple Bloom Filter implementation with basic primitives
* based on Partow's Bloom Filter implementation.
**/
class BloomFilter {
public:
static const uint32_t kNumBitsPerByte = 8;
/**
* @brief Constructor.
* @note When no bit_array is being passed to the constructor,
* then the bit_array is owned and managed by this class.
*
* @param random_seed A random_seed that generates unique hash functions.
* @param hash_fn_count The number of hash functions used by this bloom filter.
* @param bit_array_size_in_bytes Size of the bit array.
**/
BloomFilter(const std::uint64_t random_seed,
const std::size_t hash_fn_count,
const std::uint64_t bit_array_size_in_bytes)
: random_seed_(random_seed),
hash_fn_count_(hash_fn_count),
array_size_in_bytes_(bit_array_size_in_bytes),
array_size_(array_size_in_bytes_ * kNumBitsPerByte),
bit_array_(new std::uint8_t[array_size_in_bytes_]),
is_bit_array_owner_(true) {
reset();
generate_unique_hash_fn();
}
/**
* @brief Constructor.
* @note When a bit_array is passed as an argument to the constructor,
* then the ownership of the bit array lies with the caller.
*
* @param random_seed A random_seed that generates unique hash functions.
* @param hash_fn_count The number of hash functions used by this bloom filter.
* @param bit_array_size_in_bytes Size of the bit array.
* @param bit_array A pointer to the memory region that is used to store bit array.
* @param is_initialized A boolean that indicates whether to zero-out the region
* before use or not.
**/
BloomFilter(const std::uint64_t random_seed,
const std::size_t hash_fn_count,
const std::uint64_t bit_array_size_in_bytes,
std::uint8_t *bit_array,
const bool is_initialized)
: random_seed_(random_seed),
hash_fn_count_(hash_fn_count),
array_size_in_bytes_(bit_array_size_in_bytes),
array_size_(bit_array_size_in_bytes * kNumBitsPerByte),
bit_array_(bit_array), // Owned by the calling method.
is_bit_array_owner_(false) {
if (!is_initialized) {
reset();
}
generate_unique_hash_fn();
}
/**
* @brief Constructor.
* @note When a bloom filter proto is passed as an initializer,
* then the bit_array is owned and managed by this class.
*
* @param bloom_filter_proto The protobuf representation of a
* bloom filter configuration.
**/
explicit BloomFilter(const serialization::BloomFilter &bloom_filter_proto)
: random_seed_(bloom_filter_proto.bloom_filter_seed()),
hash_fn_count_(bloom_filter_proto.number_of_hashes()),
array_size_in_bytes_(bloom_filter_proto.bloom_filter_size()),
array_size_(array_size_in_bytes_ * kNumBitsPerByte),
bit_array_(new std::uint8_t[array_size_in_bytes_]),
is_bit_array_owner_(true) {
reset();
generate_unique_hash_fn();
}
/**
* @brief Destructor.
**/
~BloomFilter() {
if (is_bit_array_owner_) {
bit_array_.reset();
} else {
bit_array_.release();
}
}
static bool ProtoIsValid(const serialization::BloomFilter &bloom_filter_proto) {
return bloom_filter_proto.IsInitialized();
}
/**
* @brief Zeros out the contents of the bit array.
**/
inline void reset() {
// Initialize the bit_array with all zeros.
std::fill_n(bit_array_.get(), array_size_in_bytes_, 0x00);
inserted_element_count_ = 0;
}
/**
* @brief Get the random seed that was used to initialize this bloom filter.
*
* @return Returns the random seed.
**/
inline std::uint64_t getRandomSeed() const {
return random_seed_;
}
/**
* @brief Get the number of hash functions used in this bloom filter.
*
* @return Returns the number of hash functions.
**/
inline std::uint32_t getNumberOfHashes() const {
return hash_fn_count_;
}
/**
* @brief Get the size of the bit array in bytes for this bloom filter.
*
* @return Returns the bit array size (in bytes).
**/
inline std::uint64_t getBitArraySize() const {
return array_size_in_bytes_;
}
/**
* @brief Get the constant pointer to the bit array for this bloom filter
*
* @return Returns constant pointer to the bit array.
**/
inline const std::uint8_t* getBitArray() const {
return bit_array_.get();
}
/**
* @brief Inserts a given value into the bloom filter in a thread-safe manner.
*
* @param key_begin A pointer to the value being inserted.
* @param length Size of the value being inserted in bytes.
*/
inline void insert(const std::uint8_t *key_begin, const std::size_t length) {
// Locks are needed during insertion, when multiple workers may be modifying the
// bloom filter concurrently. However, locks are not required during membership test.
std::size_t bit_index = 0;
std::size_t bit = 0;
std::vector<std::pair<std::size_t, std::size_t>> modified_bit_positions;
std::vector<bool> is_bit_position_correct;
// Determine all the bit positions that are required to be set.
for (std::size_t i = 0; i < hash_fn_count_; ++i) {
compute_indices(hash_ap(key_begin, length, hash_fn_[i]), &bit_index, &bit);
modified_bit_positions.push_back(std::make_pair(bit_index, bit));
}
// Acquire a reader lock and check which of the bit positions are already set.
{
SpinSharedMutexSharedLock<false> shared_reader_lock(bloom_filter_insert_mutex_);
for (std::size_t i = 0; i < hash_fn_count_; ++i) {
bit_index = modified_bit_positions[i].first;
bit = modified_bit_positions[i].second;
if (((bit_array_.get())[bit_index / kNumBitsPerByte] & (1 << bit)) != (1 << bit)) {
is_bit_position_correct.push_back(false);
} else {
is_bit_position_correct.push_back(true);
}
}
}
// Acquire a writer lock and set the bit positions are which are not set.
{
SpinSharedMutexExclusiveLock<false> exclusive_writer_lock(bloom_filter_insert_mutex_);
for (std::size_t i = 0; i < hash_fn_count_; ++i) {
if (!is_bit_position_correct[i]) {
bit_index = modified_bit_positions[i].first;
bit = modified_bit_positions[i].second;
(bit_array_.get())[bit_index / kNumBitsPerByte] |= (1 << bit);
}
}
}
++inserted_element_count_;
}
/**
* @brief Inserts a given value into the bloom filter.
* @Warning This is a faster thread-unsafe version of the insert() function.
* The caller needs to ensure the thread safety.
*
* @param key_begin A pointer to the value being inserted.
* @param length Size of the value being inserted in bytes.
*/
inline void insertUnSafe(const std::uint8_t *key_begin, const std::size_t length) {
std::size_t bit_index = 0;
std::size_t bit = 0;
for (std::size_t i = 0; i < hash_fn_count_; ++i) {
compute_indices(hash_ap(key_begin, length, hash_fn_[i]), &bit_index, &bit);
(bit_array_.get())[bit_index / kNumBitsPerByte] |= (1 << bit);
}
++inserted_element_count_;
}
/**
* @brief Test membership of a given value in the bloom filter.
* If true is returned, then a value may or may not be present in the bloom filter.
* If false is returned, a value is certainly not present in the bloom filter.
*
* @note The membersip test does not require any locks, because the assumption is that
* the bloom filter will only be used after it has been built.
*
* @param key_begin A pointer to the value being tested for membership.
* @param length Size of the value being inserted in bytes.
*/
inline bool contains(const std::uint8_t *key_begin, const std::size_t length) const {
std::size_t bit_index = 0;
std::size_t bit = 0;
for (std::size_t i = 0; i < hash_fn_count_; ++i) {
compute_indices(hash_ap(key_begin, length, hash_fn_[i]), &bit_index, &bit);
if (((bit_array_.get())[bit_index / kNumBitsPerByte] & (1 << bit)) != (1 << bit)) {
return false;
}
}
return true;
}
/**
* @brief Perform a bitwise-OR of the given Bloom filter with this bloom filter.
* Essentially, it does a union of this bloom filter with the passed bloom filter.
*
* @param bloom_filter A const pointer to the bloom filter object to do bitwise-OR with.
*/
inline void bitwiseOr(const BloomFilter *bloom_filter) {
SpinSharedMutexExclusiveLock<false> exclusive_writer_lock(bloom_filter_insert_mutex_);
for (std::size_t byte_index = 0; byte_index < bloom_filter->getBitArraySize(); ++byte_index) {
(bit_array_.get())[byte_index] |= bloom_filter->getBitArray()[byte_index];
}
}
/**
* @brief Return the number of elements currently inserted into bloom filter.
*
* @return The number of elements inserted into bloom filter.
**/
inline std::size_t element_count() const {
return inserted_element_count_;
}
protected:
inline void compute_indices(const std::uint32_t &hash, std::size_t *bit_index, std::size_t *bit) const {
*bit_index = hash % array_size_;
*bit = *bit_index % kNumBitsPerByte;
}
void generate_unique_hash_fn() {
hash_fn_.reserve(hash_fn_count_);
const std::uint32_t predef_hash_fn_count = 128;
static const std::uint32_t predef_hash_fn[predef_hash_fn_count] = {
0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
};
if (hash_fn_count_ <= predef_hash_fn_count) {
std::copy(predef_hash_fn, predef_hash_fn + hash_fn_count_, hash_fn_.begin());
for (std::uint32_t i = 0; i < hash_fn_.size(); ++i) {
hash_fn_[i] = hash_fn_[i] * hash_fn_[(i + 3) % hash_fn_count_] + static_cast<std::uint32_t>(random_seed_);
}
} else {
LOG(FATAL) << "Requested number of hash functions is too large.";
}
}
inline std::uint32_t hash_ap(const std::uint8_t *begin, std::size_t remaining_length, std::uint32_t hash) const {
const std::uint8_t *itr = begin;
std::uint32_t loop = 0;
while (remaining_length >= 8) {
const std::uint32_t &i1 = *(reinterpret_cast<const std::uint32_t*>(itr)); itr += sizeof(std::uint32_t);
const std::uint32_t &i2 = *(reinterpret_cast<const std::uint32_t*>(itr)); itr += sizeof(std::uint32_t);
hash ^= (hash << 7) ^ i1 * (hash >> 3) ^ (~((hash << 11) + (i2 ^ (hash >> 5))));
remaining_length -= 8;
}
if (remaining_length) {
if (remaining_length >= 4) {
const std::uint32_t &i = *(reinterpret_cast<const std::uint32_t*>(itr));
if (loop & 0x01) {
hash ^= (hash << 7) ^ i * (hash >> 3);
} else {
hash ^= (~((hash << 11) + (i ^ (hash >> 5))));
}
++loop;
remaining_length -= 4;
itr += sizeof(std::uint32_t);
}
if (remaining_length >= 2) {
const std::uint16_t &i = *(reinterpret_cast<const std::uint16_t*>(itr));
if (loop & 0x01) {
hash ^= (hash << 7) ^ i * (hash >> 3);
} else {
hash ^= (~((hash << 11) + (i ^ (hash >> 5))));
}
++loop;
remaining_length -= 2;
itr += sizeof(std::uint16_t);
}
if (remaining_length) {
hash += ((*itr) ^ (hash * 0xA5A5A5A5)) + loop;
}
}
return hash;
}
private:
const std::uint64_t random_seed_;
std::vector<std::uint32_t> hash_fn_;
const std::uint32_t hash_fn_count_;
std::uint64_t array_size_in_bytes_;
std::uint64_t array_size_;
std::unique_ptr<std::uint8_t> bit_array_;
std::uint32_t inserted_element_count_;
const bool is_bit_array_owner_;
alignas(kCacheLineBytes) mutable SpinSharedMutex<false> bloom_filter_insert_mutex_;
DISALLOW_COPY_AND_ASSIGN(BloomFilter);
};
/** @} */
} // namespace quickstep
#endif // QUICKSTEP_UTILITY_BLOOM_FILTER_HPP