Skip to content

Commit 6499648

Browse files
committed
format with black and add a pre-commit config
1 parent 346c2d7 commit 6499648

File tree

7 files changed

+125
-71
lines changed

7 files changed

+125
-71
lines changed

.pre-commit-config.yaml

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
- repo: https://github.com/pre-commit/pre-commit-hooks
2+
rev: HEAD
3+
hooks:
4+
- id: check-ast
5+
- id: detect-private-key
6+
- id: detect-aws-credentials
7+
- id: check-merge-conflict
8+
- id: end-of-file-fixer
9+
- id: requirements-txt-fixer
10+
- id: trailing-whitespace
11+
- repo: https://github.com/psf/black
12+
rev: HEAD
13+
hooks:
14+
- id: black
15+
- repo: local
16+
hooks:
17+
- id: filter-cascade-tests
18+
name: Tests for filter-cascade
19+
language: system
20+
entry: python3 -m unittest filtercascade/test.py
21+
pass_filenames: false
22+
files: '.py$'

LICENSE

+1-2
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ Mozilla Public License Version 2.0
3535
means any form of the work other than Source Code Form.
3636

3737
1.7. "Larger Work"
38-
means a work that combines Covered Software with other material, in
38+
means a work that combines Covered Software with other material, in
3939
a separate file or files, that is not Covered Software.
4040

4141
1.8. "License"
@@ -371,4 +371,3 @@ Exhibit B - "Incompatible With Secondary Licenses" Notice
371371

372372
This Source Code Form is "Incompatible With Secondary Licenses", as
373373
defined by the Mozilla Public License, v. 2.0.
374-

README.md

-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
11
# filter-cascade
22
A python filter cascade implementation
3-

filtercascade/__init__.py

+76-41
Original file line numberDiff line numberDiff line change
@@ -12,33 +12,35 @@
1212

1313
log = logging.getLogger(__name__)
1414

15+
1516
class HashAlgorithm(IntEnum):
1617
MURMUR3 = 1
1718

19+
1820
# A simple-as-possible bloom filter implementation making use of version 3 of the 32-bit murmur
1921
# hash function (for compat with multi-level-bloom-filter-js).
2022
# mgoodwin 2018
2123
class Bloomer:
22-
LAYER_FMT = b'<BIIB'
24+
LAYER_FMT = b"<BIIB"
2325

2426
def __init__(self, *, size, nHashFuncs, level, hashAlg=HashAlgorithm.MURMUR3):
2527
self.nHashFuncs = nHashFuncs
2628
self.size = size
2729
self.level = level
2830
self.hashAlg = hashAlg
2931

30-
self.bitarray = bitarray.bitarray(self.size, endian='little')
32+
self.bitarray = bitarray.bitarray(self.size, endian="little")
3133
self.bitarray.setall(False)
3234

3335
def hash(self, seed, key):
3436
if not isinstance(key, bytes):
3537
to_bytes_op = getattr(key, "to_bytes", None)
3638
if isinstance(key, str):
37-
key = key.encode('utf-8')
39+
key = key.encode("utf-8")
3840
elif callable(to_bytes_op):
3941
key = to_bytes_op()
4042
else:
41-
key = str(key).encode('utf-8')
43+
key = str(key).encode("utf-8")
4244

4345
if self.hashAlg != HashAlgorithm.MURMUR3:
4446
raise Exception(f"Unknown hash algorithm: {self.hashAlg}")
@@ -67,7 +69,9 @@ def tofile(self, f):
6769
"""Write the bloom filter to file object `f'. Underlying bits
6870
are written as machine values. This is much more space
6971
efficient than pickling the object."""
70-
f.write(pack(self.LAYER_FMT, self.hashAlg, self.size, self.nHashFuncs, self.level))
72+
f.write(
73+
pack(self.LAYER_FMT, self.hashAlg, self.size, self.nHashFuncs, self.level)
74+
)
7175
f.flush()
7276
self.bitarray.tofile(f)
7377

@@ -92,22 +96,34 @@ def from_buf(cls, buf):
9296
hashAlgInt, size, nHashFuncs, level = unpack(Bloomer.LAYER_FMT, buf[0:10])
9397
byte_count = math.ceil(size / 8)
9498
ba = bitarray.bitarray(endian="little")
95-
ba.frombytes(buf[10:10 + byte_count])
96-
bloomer = Bloomer(size=1, nHashFuncs=nHashFuncs, level=level, hashAlg=HashAlgorithm(hashAlgInt))
99+
ba.frombytes(buf[10 : 10 + byte_count])
100+
bloomer = Bloomer(
101+
size=1,
102+
nHashFuncs=nHashFuncs,
103+
level=level,
104+
hashAlg=HashAlgorithm(hashAlgInt),
105+
)
97106
bloomer.size = size
98-
log.debug("Size is {}, level {}, nHashFuncs, {}".format(
99-
size, level, nHashFuncs))
107+
log.debug(
108+
"Size is {}, level {}, nHashFuncs, {}".format(size, level, nHashFuncs)
109+
)
100110
bloomer.bitarray = ba
101111

102-
return (buf[10 + byte_count:], bloomer)
112+
return (buf[10 + byte_count :], bloomer)
103113

104114

105115
class FilterCascade:
106-
DIFF_FMT = b'<III'
107-
VERSION_FMT = b'<H'
108-
109-
def __init__(self, filters, error_rates=[0.02, 0.5], growth_factor=1.1,
110-
min_filter_length=10000, version=1):
116+
DIFF_FMT = b"<III"
117+
VERSION_FMT = b"<H"
118+
119+
def __init__(
120+
self,
121+
filters,
122+
error_rates=[0.02, 0.5],
123+
growth_factor=1.1,
124+
min_filter_length=10000,
125+
version=1,
126+
):
111127
self.filters = filters
112128
self.error_rates = error_rates
113129
self.growth_factor = growth_factor
@@ -149,22 +165,29 @@ def initialize(self, *, include, exclude):
149165
Bloomer.filter_with_characteristics(
150166
max(
151167
int(include_len * self.growth_factor),
152-
self.min_filter_length), er, depth))
168+
self.min_filter_length,
169+
),
170+
er,
171+
depth,
172+
)
173+
)
153174
else:
154175
# Filter already created for this layer. Check size and resize if needed.
155176
required_size = Bloomer.calc_size(
156-
self.filters[depth - 1].nHashFuncs, include_len, er)
177+
self.filters[depth - 1].nHashFuncs, include_len, er
178+
)
157179
if self.filters[depth - 1].size < required_size:
158180
# Resize filter
159-
self.filters[depth -
160-
1] = Bloomer.filter_with_characteristics(
161-
int(include_len * self.growth_factor),
162-
er, depth)
181+
self.filters[depth - 1] = Bloomer.filter_with_characteristics(
182+
int(include_len * self.growth_factor), er, depth
183+
)
163184
log.info("Resized filter at {}-depth layer".format(depth))
164185
filter = self.filters[depth - 1]
165186
log.debug(
166-
"Initializing the {}-depth layer. err={} include_len={} size={} hashes={}"
167-
.format(depth, er, include_len, filter.size, filter.nHashFuncs))
187+
"Initializing the {}-depth layer. err={} include_len={} size={} hashes={}".format(
188+
depth, er, include_len, filter.size, filter.nHashFuncs
189+
)
190+
)
168191
# loop over the elements that *should* be there. Add them to the filter.
169192
for elem in include:
170193
filter.add(elem)
@@ -180,22 +203,32 @@ def initialize(self, *, include, exclude):
180203
endtime = datetime.datetime.utcnow()
181204
log.debug(
182205
"Took {} ms to process layer {} with bit count {}".format(
183-
(endtime - starttime).seconds * 1000 +
184-
(endtime - starttime).microseconds / 1000, depth,
185-
len(filter.bitarray)))
206+
(endtime - starttime).seconds * 1000
207+
+ (endtime - starttime).microseconds / 1000,
208+
depth,
209+
len(filter.bitarray),
210+
)
211+
)
186212
# Sanity check layer growth. Bit count should be going down
187213
# as false positive rate decreases.
188214
if depth > 2:
189215
if len(filter.bitarray) > len(self.filters[depth - 3].bitarray):
190216
sequentialGrowthLayers += 1
191217
log.warning(
192218
"Increase in false positive rate detected. Depth {} has {}"
193-
" bits and depth {} has {} bits. {}/{} allowed warnings."
194-
.format(depth, len(filter.bitarray), depth - 3 + 1,
195-
len(self.filters[depth - 3].bitarray),
196-
sequentialGrowthLayers, maxSequentialGrowthLayers))
219+
" bits and depth {} has {} bits. {}/{} allowed warnings.".format(
220+
depth,
221+
len(filter.bitarray),
222+
depth - 3 + 1,
223+
len(self.filters[depth - 3].bitarray),
224+
sequentialGrowthLayers,
225+
maxSequentialGrowthLayers,
226+
)
227+
)
197228
if sequentialGrowthLayers >= maxSequentialGrowthLayers:
198-
log.error("Too many sequential false positive increases detected. Aborting.")
229+
log.error(
230+
"Too many sequential false positive increases detected. Aborting."
231+
)
199232
self.filters.clear()
200233
return
201234
else:
@@ -211,8 +244,9 @@ def initialize(self, *, include, exclude):
211244
del self.filters[depth:]
212245

213246
def __contains__(self, elem):
214-
for layer, filter in [(idx + 1, self.filters[idx])
215-
for idx in range(len(self.filters))]:
247+
for layer, filter in [
248+
(idx + 1, self.filters[idx]) for idx in range(len(self.filters))
249+
]:
216250
even = layer % 2 == 0
217251
if elem in filter:
218252
if layer == len(self.filters):
@@ -238,8 +272,10 @@ def layerCount(self):
238272
def saveDiffMeta(self, f):
239273
for filter in self.filters:
240274
f.write(
241-
pack(FilterCascade.DIFF_FMT, filter.size, filter.nHashFuncs,
242-
filter.level))
275+
pack(
276+
FilterCascade.DIFF_FMT, filter.size, filter.nHashFuncs, filter.level
277+
)
278+
)
243279

244280
# Follows the bitarray.tofile parameter convention.
245281
def tofile(self, f):
@@ -255,7 +291,7 @@ def tofile(self, f):
255291
@classmethod
256292
def from_buf(cls, buf):
257293
log.debug(len(buf))
258-
(version, ) = unpack(FilterCascade.VERSION_FMT, buf[0:2])
294+
(version,) = unpack(FilterCascade.VERSION_FMT, buf[0:2])
259295
if version != 1:
260296
raise Exception(f"Unknown version: {version}")
261297
buf = buf[2:]
@@ -273,15 +309,14 @@ def loadDiffMeta(cls, f):
273309
size = calcsize(FilterCascade.DIFF_FMT)
274310
data = f.read()
275311
while len(data) >= size:
276-
filtersize, nHashFuncs, level = unpack(FilterCascade.DIFF_FMT,
277-
data[:size])
278-
filters.append(
279-
Bloomer(size=filtersize, nHashFuncs=nHashFuncs, level=level))
312+
filtersize, nHashFuncs, level = unpack(FilterCascade.DIFF_FMT, data[:size])
313+
filters.append(Bloomer(size=filtersize, nHashFuncs=nHashFuncs, level=level))
280314
data = data[size:]
281315
return FilterCascade(filters)
282316

283317
@classmethod
284318
def cascade_with_characteristics(cls, capacity, error_rates, layer=0):
285319
return FilterCascade(
286320
[Bloomer.filter_with_characteristics(capacity, error_rates[0])],
287-
error_rates=error_rates)
321+
error_rates=error_rates,
322+
)

filtercascade/test.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,9 @@ def test_fc_input_formats(self):
9898
def test_fc_include_not_list(self):
9999
f = filtercascade.FilterCascade([])
100100
with self.assertRaises(TypeError):
101-
f.initialize(include=predictable_serial_gen(1),
102-
exclude=predictable_serial_gen(1))
101+
f.initialize(
102+
include=predictable_serial_gen(1), exclude=predictable_serial_gen(1)
103+
)
103104

104105
def test_fc_exclude_must_be_iterable(self):
105106
f = filtercascade.FilterCascade([])
@@ -114,14 +115,13 @@ def test_fc_iterable(self):
114115
# slice off a set and re-use the remainder
115116
revocations = set(islice(serials, 3_000))
116117

117-
f.initialize(include=revocations,
118-
exclude=serials)
118+
f.initialize(include=revocations, exclude=serials)
119119

120120
self.assertEqual(len(f.filters), 3)
121121
self.assertEqual(f.filters[0].size, 81272)
122122
self.assertEqual(f.filters[1].size, 14400)
123123
self.assertEqual(f.filters[2].size, 14400)
124124

125125

126-
if __name__ == '__main__':
126+
if __name__ == "__main__":
127127
unittest.main()

requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
bitarray >= 0.9.2
2-
mmh3 >= 2.5.1
2+
mmh3 >= 2.5.1

setup.py

+20-21
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,22 @@
11
from setuptools import setup, find_packages
22

3-
setup(name='filtercascade',
4-
version='0.2.2',
5-
description='A simple bloom filter cascade implementation in Python',
6-
long_description='A bloom filter cascade implementation in Python using the 32-bit variant of murmurhash3.',
7-
classifiers=[
8-
'Development Status :: 5 - Production/Stable',
9-
'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)',
10-
'Programming Language :: Python :: 3',
11-
],
12-
keywords='bloom filter cascade multi level mlbf',
13-
url='http://github.com/mozmark/filter-cascade',
14-
author='Mark Goodwin',
15-
author_email='[email protected]',
16-
license='Mozilla Public License 2.0 (MPL 2.0)',
17-
packages=['filtercascade'],
18-
install_requires=[
19-
'bitarray>=0.9.2',
20-
'mmh3>=2.5.1',
21-
],
22-
include_package_data=True,
23-
zip_safe=False)
3+
setup(
4+
name="filtercascade",
5+
version="0.2.2",
6+
description="A simple bloom filter cascade implementation in Python",
7+
long_description="A bloom filter cascade implementation in Python using the 32-bit variant of murmurhash3.",
8+
classifiers=[
9+
"Development Status :: 5 - Production/Stable",
10+
"License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)",
11+
"Programming Language :: Python :: 3",
12+
],
13+
keywords="bloom filter cascade multi level mlbf",
14+
url="http://github.com/mozmark/filter-cascade",
15+
author="Mark Goodwin",
16+
author_email="[email protected]",
17+
license="Mozilla Public License 2.0 (MPL 2.0)",
18+
packages=["filtercascade"],
19+
install_requires=["bitarray>=0.9.2", "mmh3>=2.5.1",],
20+
include_package_data=True,
21+
zip_safe=False,
22+
)

0 commit comments

Comments
 (0)