7
7
import logging
8
8
import math
9
9
import mmh3
10
+ import hashlib
11
+ from deprecated import deprecated
10
12
from struct import pack , unpack , calcsize
11
- from enum import IntEnum
13
+ from enum import IntEnum , unique
12
14
13
15
log = logging .getLogger (__name__ )
14
16
15
17
18
@unique
class HashAlgorithm(IntEnum):
    """Integer identifiers for the hash functions a filter can use.

    The ``@unique`` decorator makes the enum definition fail if two
    members were ever given the same integer value.
    """

    # MurmurHash3 (computed through the ``mmh3`` module).
    MURMUR3 = 1
    # SHA-256 (computed through ``hashlib``).
    SHA256 = 2
18
22
19
23
20
24
# A simple-as-possible bloom filter implementation making use of version 3 of the 32-bit murmur
@@ -23,15 +27,23 @@ class HashAlgorithm(IntEnum):
23
27
class Bloomer :
24
28
LAYER_FMT = b"<BIIB"
25
29
26
def __init__(
    self, *, size, nHashFuncs, level, hashAlg=HashAlgorithm.MURMUR3, salt=None
):
    """Create an empty filter of ``size`` bits.

    Args:
        size: Number of bits in the backing bit array.
        nHashFuncs: Number of hash functions; stored as given.
        level: Level of this filter; ``hash`` mixes it into the seed.
        hashAlg: A ``HashAlgorithm`` member or its integer value.
        salt: Optional ``bytes`` salt. Only accepted when ``hashAlg``
            is not MurmurHash3.

    Raises:
        ValueError: If ``hashAlg`` is not a valid ``HashAlgorithm``
            value, if a salt is given that is not ``bytes``, or if a
            salt is combined with MurmurHash3.
    """
    self.nHashFuncs = nHashFuncs
    self.size = size
    self.level = level
    # Going through the enum constructor accepts plain integers and
    # rejects values that are not members.
    self.hashAlg = HashAlgorithm(hashAlg)
    self.salt = salt

    # Little-endian bit array with every bit cleared.
    bits = bitarray.bitarray(self.size, endian="little")
    bits.setall(False)
    self.bitarray = bits

    if self.salt:
        if not isinstance(self.salt, bytes):
            raise ValueError("salts must be passed as bytes")
        if self.hashAlg == HashAlgorithm.MURMUR3:
            raise ValueError("salts not permitted for MurmurHash3")
46
+
35
47
def hash (self , seed , key ):
36
48
if not isinstance (key , bytes ):
37
49
to_bytes_op = getattr (key , "to_bytes" , None )
@@ -42,12 +54,27 @@ def hash(self, seed, key):
42
54
else :
43
55
key = str (key ).encode ("utf-8" )
44
56
45
- if self .hashAlg != HashAlgorithm .MURMUR3 :
46
- raise Exception (f"Unknown hash algorithm: { self .hashAlg } " )
47
-
48
57
hash_seed = ((seed << 16 ) + self .level ) & 0xFFFFFFFF
49
- h = (mmh3 .hash (key , hash_seed ) & 0xFFFFFFFF ) % self .size
50
- return h
58
+
59
+ if self .hashAlg == HashAlgorithm .MURMUR3 :
60
+ if self .salt :
61
+ raise ValueError ("salts not permitted for MurmurHash3" )
62
+ h = (mmh3 .hash (key , hash_seed ) & 0xFFFFFFFF ) % self .size
63
+ return h
64
+
65
+ if self .hashAlg == HashAlgorithm .SHA256 :
66
+ m = hashlib .sha256 ()
67
+ if self .salt :
68
+ m .update (salt )
69
+ m .update (hash_seed )
70
+ m .update (key )
71
+ h = (
72
+ int .from_bytes (m .digest ()[:4 ], byteorder = "little" , signed = False )
73
+ % self .size
74
+ )
75
+ return h
76
+
77
+ raise Exception (f"Unknown hash algorithm: { self .hashAlg } " )
51
78
52
79
def add (self , key ):
53
80
for i in range (self .nHashFuncs ):
@@ -76,10 +103,20 @@ def tofile(self, f):
76
103
self .bitarray .tofile (f )
77
104
78
105
@classmethod
def filter_with_characteristics(
    cls,
    *,
    elements,
    falsePositiveRate,
    hashAlg=HashAlgorithm.MURMUR3,
    salt=None,
    level=1,
):
    """Build an empty ``Bloomer`` dimensioned for the given characteristics.

    The number of hash functions comes from ``calc_n_hashes`` and the
    bit count from ``calc_size``; ``level``, ``hashAlg`` and ``salt`` are
    forwarded to the ``Bloomer`` constructor unchanged.

    Args:
        elements: Element count passed to ``calc_size``.
        falsePositiveRate: Rate passed to ``calc_n_hashes`` and ``calc_size``.
        hashAlg: Hash algorithm for the new filter.
        salt: Optional salt for the new filter.
        level: Level of the new filter.

    Returns:
        A newly constructed ``Bloomer``.
    """
    hash_count = Bloomer.calc_n_hashes(falsePositiveRate)
    bit_count = Bloomer.calc_size(hash_count, elements, falsePositiveRate)
    settings = {
        "size": bit_count,
        "nHashFuncs": hash_count,
        "level": level,
        "hashAlg": hashAlg,
        "salt": salt,
    }
    return Bloomer(**settings)
83
120
84
121
@classmethod
85
122
def calc_n_hashes (cls , falsePositiveRate ):
@@ -91,7 +128,7 @@ def calc_size(cls, nHashFuncs, elements, falsePositiveRate):
91
128
return math .ceil (1.44 * elements * math .log (1 / falsePositiveRate , 2 ))
92
129
93
130
@classmethod
94
- def from_buf (cls , buf ):
131
+ def from_buf (cls , buf , salt = None ):
95
132
log .debug (len (buf ))
96
133
hashAlgInt , size , nHashFuncs , level = unpack (Bloomer .LAYER_FMT , buf [0 :10 ])
97
134
byte_count = math .ceil (size / 8 )
@@ -102,6 +139,7 @@ def from_buf(cls, buf):
102
139
nHashFuncs = nHashFuncs ,
103
140
level = level ,
104
141
hashAlg = HashAlgorithm (hashAlgInt ),
142
+ salt = salt ,
105
143
)
106
144
bloomer .size = size
107
145
log .debug (
@@ -123,12 +161,21 @@ def __init__(
123
161
growth_factor = 1.1 ,
124
162
min_filter_length = 10000 ,
125
163
version = 1 ,
164
+ hashAlg = HashAlgorithm .MURMUR3 ,
165
+ salt = None ,
126
166
):
127
167
self .filters = filters
128
168
self .error_rates = error_rates
129
169
self .growth_factor = growth_factor
130
170
self .min_filter_length = min_filter_length
131
171
self .version = version
172
# Hash configuration of the cascade, stored as given.
self.hashAlg = hashAlg
self.salt = salt

# A salt must be bytes and cannot be combined with MurmurHash3.
if self.salt and not isinstance(self.salt, bytes):
    raise ValueError("salts must be passed as bytes")
if self.salt and self.hashAlg == HashAlgorithm.MURMUR3:
    raise ValueError("salts not permitted for MurmurHash3")
132
179
133
180
def initialize (self , * , include , exclude ):
134
181
"""
@@ -163,12 +210,12 @@ def initialize(self, *, include, exclude):
163
210
# For growth-stability reasons, we force all layers to be at least
164
211
# min_filter_length large. This is important for the deep layers near the end.
165
212
Bloomer .filter_with_characteristics (
166
- max (
213
+ elements = max (
167
214
int (include_len * self .growth_factor ),
168
215
self .min_filter_length ,
169
216
),
170
- er ,
171
- depth ,
217
+ falsePositiveRate = er ,
218
+ level = depth ,
172
219
)
173
220
)
174
221
else :
@@ -254,10 +301,17 @@ def __contains__(self, elem):
254
301
else :
255
302
return False != even
256
303
304
@deprecated(
    version="0.2.3",
    reason="Use the verify function which has the same semantics as initialize",
)
def check(self, *, entries, exclusions):
    """Deprecated wrapper around ``verify``.

    ``entries`` is passed on as ``include`` and ``exclusions`` as
    ``exclude``. Nothing is returned.
    """
    self.verify(exclude=exclusions, include=entries)
310
+
311
def verify(self, *, include, exclude):
    """Check membership of every ``include`` and ``exclude`` entry.

    Each entry is tested with the ``in`` operator against ``self``.

    Args:
        include: Iterable of entries that must be reported as members.
        exclude: Iterable of entries that must not be reported as members.

    Raises:
        AssertionError: With "oops! false negative!" for the first
            ``include`` entry that is not a member, or with
            "oops! false positive!" for the first ``exclude`` entry that is.
    """
    # Raise explicitly instead of using ``assert`` statements: asserts are
    # compiled away under ``python -O``, which would silently turn this
    # verification into a no-op. The exception type is unchanged.
    for entry in include:
        if entry not in self:
            raise AssertionError("oops! false negative!")
    for entry in exclude:
        if entry in self:
            raise AssertionError("oops! false positive!")
262
316
263
317
def bitCount (self ):
@@ -315,8 +369,16 @@ def loadDiffMeta(cls, f):
315
369
return FilterCascade (filters )
316
370
317
371
@classmethod
def cascade_with_characteristics(
    cls, *, capacity, error_rates, hashAlg=HashAlgorithm.MURMUR3, salt=None, layer=0
):
    """Create a cascade holding one empty filter sized for ``capacity``.

    Args:
        capacity: Element count used to size the first filter.
        error_rates: Sequence of false-positive rates; the first one sizes
            the first filter and the whole sequence is stored on the cascade.
        hashAlg: Hash algorithm for both the cascade and its first filter.
        salt: Optional salt for both the cascade and its first filter.
        layer: Accepted for interface compatibility; not used by this
            method. NOTE(review): the first filter keeps the default
            ``level`` of ``filter_with_characteristics`` - confirm intent.

    Returns:
        A new ``FilterCascade``.
    """
    # The first filter must be built with the same algorithm and salt as
    # the cascade. Previously these were only given to the cascade, so a
    # SHA256 or salted cascade started with a MurmurHash3, unsalted filter.
    first_layer = Bloomer.filter_with_characteristics(
        elements=capacity,
        falsePositiveRate=error_rates[0],
        hashAlg=hashAlg,
        salt=salt,
    )
    return FilterCascade(
        [first_layer],
        error_rates=error_rates,
        hashAlg=hashAlg,
        salt=salt,
    )
0 commit comments