12
12
13
13
log = logging .getLogger (__name__ )
14
14
15
+
15
16
class HashAlgorithm(IntEnum):
    """Identifiers for the hash functions a filter layer may use."""

    # Only 32-bit murmur3 (version 3) is supported, for compatibility
    # with multi-level-bloom-filter-js.
    MURMUR3 = 1
17
18
19
+
18
20
# A simple-as-possible bloom filter implementation making use of version 3 of the 32-bit murmur
# hash function (for compat with multi-level-bloom-filter-js).
# mgoodwin 2018
class Bloomer:
    # Little-endian layer header: hashAlg (u8), size (u32), nHashFuncs (u32),
    # level (u8) — 10 bytes total; written by tofile() and read by from_buf().
    LAYER_FMT = b"<BIIB"
23
25
24
26
def __init__(self, *, size, nHashFuncs, level, hashAlg=HashAlgorithm.MURMUR3):
    """Create an empty filter layer.

    All arguments are keyword-only: `size` is the bit length of the
    layer, `nHashFuncs` the number of hash probes per element, `level`
    this layer's depth in a cascade, and `hashAlg` the HashAlgorithm
    identifier (murmur3 by default).
    """
    self.size = size
    self.nHashFuncs = nHashFuncs
    self.level = level
    self.hashAlg = hashAlg

    # Backing storage: a little-endian bit array with every bit cleared.
    self.bitarray = bitarray.bitarray(self.size, endian="little")
    self.bitarray.setall(False)
32
34
33
35
def hash (self , seed , key ):
34
36
if not isinstance (key , bytes ):
35
37
to_bytes_op = getattr (key , "to_bytes" , None )
36
38
if isinstance (key , str ):
37
- key = key .encode (' utf-8' )
39
+ key = key .encode (" utf-8" )
38
40
elif callable (to_bytes_op ):
39
41
key = to_bytes_op ()
40
42
else :
41
- key = str (key ).encode (' utf-8' )
43
+ key = str (key ).encode (" utf-8" )
42
44
43
45
if self .hashAlg != HashAlgorithm .MURMUR3 :
44
46
raise Exception (f"Unknown hash algorithm: { self .hashAlg } " )
def tofile(self, f):
    """Write the bloom filter to file object `f'. Underlying bits
    are written as machine values. This is much more space
    efficient than pickling the object."""
    # 10-byte layer header first, then the raw bits.
    header = pack(self.LAYER_FMT, self.hashAlg, self.size, self.nHashFuncs, self.level)
    f.write(header)
    f.flush()
    self.bitarray.tofile(f)
73
77
@classmethod
def from_buf(cls, buf):
    """Deserialize one filter layer from the front of `buf`.

    Reads the 10-byte LAYER_FMT header, then ceil(size / 8) bytes of
    little-endian bit data, and returns a tuple
    (remaining_buf, bloomer) so callers can keep consuming subsequent
    layers from the same buffer.
    """
    hashAlgInt, size, nHashFuncs, level = unpack(Bloomer.LAYER_FMT, buf[0:10])
    byte_count = math.ceil(size / 8)
    ba = bitarray.bitarray(endian="little")
    ba.frombytes(buf[10 : 10 + byte_count])
    # Construct with a dummy size=1 so we don't allocate a large
    # bitarray that is immediately replaced; the real size and bits are
    # patched in just below.
    bloomer = Bloomer(
        size=1,
        nHashFuncs=nHashFuncs,
        level=level,
        hashAlg=HashAlgorithm(hashAlgInt),
    )
    bloomer.size = size
    # Lazy %-args: the message is only formatted when DEBUG is enabled.
    # (Also fixes the stray comma in the old "nHashFuncs, {}" message.)
    log.debug("Size is %s, level %s, nHashFuncs %s", size, level, nHashFuncs)
    bloomer.bitarray = ba
    return (buf[10 + byte_count :], bloomer)
103
113
104
114
105
115
class FilterCascade:
    """A multi-layer cascade of Bloomer filters."""

    # Per-layer metadata record: size (u32), nHashFuncs (u32), level (u32),
    # little-endian — produced by saveDiffMeta() and parsed by loadDiffMeta().
    DIFF_FMT = b"<III"
    # Serialized-cascade version header: a single little-endian u16.
    VERSION_FMT = b"<H"
119
+ def __init__ (
120
+ self ,
121
+ filters ,
122
+ error_rates = [0.02 , 0.5 ],
123
+ growth_factor = 1.1 ,
124
+ min_filter_length = 10000 ,
125
+ version = 1 ,
126
+ ):
111
127
self .filters = filters
112
128
self .error_rates = error_rates
113
129
self .growth_factor = growth_factor
@@ -149,22 +165,29 @@ def initialize(self, *, include, exclude):
149
165
Bloomer .filter_with_characteristics (
150
166
max (
151
167
int (include_len * self .growth_factor ),
152
- self .min_filter_length ), er , depth ))
168
+ self .min_filter_length ,
169
+ ),
170
+ er ,
171
+ depth ,
172
+ )
173
+ )
153
174
else :
154
175
# Filter already created for this layer. Check size and resize if needed.
155
176
required_size = Bloomer .calc_size (
156
- self .filters [depth - 1 ].nHashFuncs , include_len , er )
177
+ self .filters [depth - 1 ].nHashFuncs , include_len , er
178
+ )
157
179
if self .filters [depth - 1 ].size < required_size :
158
180
# Resize filter
159
- self .filters [depth -
160
- 1 ] = Bloomer .filter_with_characteristics (
161
- int (include_len * self .growth_factor ),
162
- er , depth )
181
+ self .filters [depth - 1 ] = Bloomer .filter_with_characteristics (
182
+ int (include_len * self .growth_factor ), er , depth
183
+ )
163
184
log .info ("Resized filter at {}-depth layer" .format (depth ))
164
185
filter = self .filters [depth - 1 ]
165
186
log .debug (
166
- "Initializing the {}-depth layer. err={} include_len={} size={} hashes={}"
167
- .format (depth , er , include_len , filter .size , filter .nHashFuncs ))
187
+ "Initializing the {}-depth layer. err={} include_len={} size={} hashes={}" .format (
188
+ depth , er , include_len , filter .size , filter .nHashFuncs
189
+ )
190
+ )
168
191
# loop over the elements that *should* be there. Add them to the filter.
169
192
for elem in include :
170
193
filter .add (elem )
@@ -180,22 +203,32 @@ def initialize(self, *, include, exclude):
180
203
endtime = datetime .datetime .utcnow ()
181
204
log .debug (
182
205
"Took {} ms to process layer {} with bit count {}" .format (
183
- (endtime - starttime ).seconds * 1000 +
184
- (endtime - starttime ).microseconds / 1000 , depth ,
185
- len (filter .bitarray )))
206
+ (endtime - starttime ).seconds * 1000
207
+ + (endtime - starttime ).microseconds / 1000 ,
208
+ depth ,
209
+ len (filter .bitarray ),
210
+ )
211
+ )
186
212
# Sanity check layer growth. Bit count should be going down
187
213
# as false positive rate decreases.
188
214
if depth > 2 :
189
215
if len (filter .bitarray ) > len (self .filters [depth - 3 ].bitarray ):
190
216
sequentialGrowthLayers += 1
191
217
log .warning (
192
218
"Increase in false positive rate detected. Depth {} has {}"
193
- " bits and depth {} has {} bits. {}/{} allowed warnings."
194
- .format (depth , len (filter .bitarray ), depth - 3 + 1 ,
195
- len (self .filters [depth - 3 ].bitarray ),
196
- sequentialGrowthLayers , maxSequentialGrowthLayers ))
219
+ " bits and depth {} has {} bits. {}/{} allowed warnings." .format (
220
+ depth ,
221
+ len (filter .bitarray ),
222
+ depth - 3 + 1 ,
223
+ len (self .filters [depth - 3 ].bitarray ),
224
+ sequentialGrowthLayers ,
225
+ maxSequentialGrowthLayers ,
226
+ )
227
+ )
197
228
if sequentialGrowthLayers >= maxSequentialGrowthLayers :
198
- log .error ("Too many sequential false positive increases detected. Aborting." )
229
+ log .error (
230
+ "Too many sequential false positive increases detected. Aborting."
231
+ )
199
232
self .filters .clear ()
200
233
return
201
234
else :
@@ -211,8 +244,9 @@ def initialize(self, *, include, exclude):
211
244
del self .filters [depth :]
212
245
213
246
def __contains__ (self , elem ):
214
- for layer , filter in [(idx + 1 , self .filters [idx ])
215
- for idx in range (len (self .filters ))]:
247
+ for layer , filter in [
248
+ (idx + 1 , self .filters [idx ]) for idx in range (len (self .filters ))
249
+ ]:
216
250
even = layer % 2 == 0
217
251
if elem in filter :
218
252
if layer == len (self .filters ):
@@ -238,8 +272,10 @@ def layerCount(self):
238
272
def saveDiffMeta(self, f):
    """Write each layer's (size, nHashFuncs, level) DIFF_FMT record to
    file object `f`, in layer order."""
    # `flt` rather than `filter` — avoids shadowing the builtin.
    for flt in self.filters:
        record = pack(FilterCascade.DIFF_FMT, flt.size, flt.nHashFuncs, flt.level)
        f.write(record)
243
279
244
280
# Follows the bitarray.tofile parameter convention.
245
281
def tofile (self , f ):
@@ -255,7 +291,7 @@ def tofile(self, f):
255
291
@classmethod
256
292
def from_buf (cls , buf ):
257
293
log .debug (len (buf ))
258
- (version , ) = unpack (FilterCascade .VERSION_FMT , buf [0 :2 ])
294
+ (version ,) = unpack (FilterCascade .VERSION_FMT , buf [0 :2 ])
259
295
if version != 1 :
260
296
raise Exception (f"Unknown version: { version } " )
261
297
buf = buf [2 :]
@@ -273,15 +309,14 @@ def loadDiffMeta(cls, f):
273
309
size = calcsize (FilterCascade .DIFF_FMT )
274
310
data = f .read ()
275
311
while len (data ) >= size :
276
- filtersize , nHashFuncs , level = unpack (FilterCascade .DIFF_FMT ,
277
- data [:size ])
278
- filters .append (
279
- Bloomer (size = filtersize , nHashFuncs = nHashFuncs , level = level ))
312
+ filtersize , nHashFuncs , level = unpack (FilterCascade .DIFF_FMT , data [:size ])
313
+ filters .append (Bloomer (size = filtersize , nHashFuncs = nHashFuncs , level = level ))
280
314
data = data [size :]
281
315
return FilterCascade (filters )
282
316
283
317
@classmethod
def cascade_with_characteristics(cls, capacity, error_rates, layer=0):
    """Build a new cascade whose first layer is sized for `capacity`
    elements at error_rates[0].  The full error_rates list is retained
    on the cascade for any deeper layers grown later.  `layer` is
    accepted for interface compatibility but is not used here.
    """
    first_layer = Bloomer.filter_with_characteristics(capacity, error_rates[0])
    return FilterCascade([first_layer], error_rates=error_rates)
0 commit comments