From a2c1b710fb9b8397a9034649a6c40b1b424ff0b6 Mon Sep 17 00:00:00 2001 From: Senad Ibraimoski Date: Fri, 19 Aug 2022 16:41:39 +0100 Subject: [PATCH] Fix name construction for all values of b (#190) * Update storage.py https://github.com/ekzhu/datasketch/blob/master/datasketch/lsh.py#L118 In lsh.py when we are iterating over values of b and b reaches 95 the packed value of 95 contains an underscore ```python In [7]: for i in range(100): ...: print(i, struct.pack('>H', i)) ...: 0 b'\x00\x00' 1 b'\x00\x01' 2 b'\x00\x02' 3 b'\x00\x03' 4 b'\x00\x04' 5 b'\x00\x05' 6 b'\x00\x06' 7 b'\x00\x07' 8 b'\x00\x08' 9 b'\x00\t' 10 b'\x00\n' 11 b'\x00\x0b' 12 b'\x00\x0c' 13 b'\x00\r' 14 b'\x00\x0e' 15 b'\x00\x0f' 16 b'\x00\x10' 17 b'\x00\x11' 18 b'\x00\x12' 19 b'\x00\x13' 20 b'\x00\x14' 21 b'\x00\x15' 22 b'\x00\x16' 23 b'\x00\x17' 24 b'\x00\x18' 25 b'\x00\x19' 26 b'\x00\x1a' 27 b'\x00\x1b' 28 b'\x00\x1c' 29 b'\x00\x1d' 30 b'\x00\x1e' 31 b'\x00\x1f' 32 b'\x00 ' 33 b'\x00!' 34 b'\x00"' 35 b'\x00#' 36 b'\x00$' 37 b'\x00%' 38 b'\x00&' 39 b"\x00'" 40 b'\x00(' 41 b'\x00)' 42 b'\x00*' 43 b'\x00+' 44 b'\x00,' 45 b'\x00-' 46 b'\x00.' 47 b'\x00/' 48 b'\x000' 49 b'\x001' 50 b'\x002' 51 b'\x003' 52 b'\x004' 53 b'\x005' 54 b'\x006' 55 b'\x007' 56 b'\x008' 57 b'\x009' 58 b'\x00:' 59 b'\x00;' 60 b'\x00<' 61 b'\x00=' 62 b'\x00>' 63 b'\x00?' 64 b'\x00@' 65 b'\x00A' 66 b'\x00B' 67 b'\x00C' 68 b'\x00D' 69 b'\x00E' 70 b'\x00F' 71 b'\x00G' 72 b'\x00H' 73 b'\x00I' 74 b'\x00J' 75 b'\x00K' 76 b'\x00L' 77 b'\x00M' 78 b'\x00N' 79 b'\x00O' 80 b'\x00P' 81 b'\x00Q' 82 b'\x00R' 83 b'\x00S' 84 b'\x00T' 85 b'\x00U' 86 b'\x00V' 87 b'\x00W' 88 b'\x00X' 89 b'\x00Y' 90 b'\x00Z' 91 b'\x00[' 92 b'\x00\\' 93 b'\x00]' 94 b'\x00^' 95 b'\x00_' # This is an issue for unpacking after split 96 b'\x00`' 97 b'\x00a' 98 b'\x00b' 99 b'\x00c' ``` this completely breaks this naming here due to split and you get unpacking error. ```python In [32]: name = b''.join([_random_name(11), b'_bucket_', struct.pack('>H', 95)]) In [33]: basename, _, ret = name.split(b'_') --------------------------------------------------------------------------- ValueError Traceback (most recent call last) Input In [33], in () ----> 1 basename, _, ret = name.split(b'_') ValueError: too many values to unpack (expected 3) ``` while the fix ```python In [41]: name.split(b'_', 2) Out[41]: [b'kulxcuapyqa', b'bucket', b'\x00_'] ``` Gives us expected behaviour * Add unit test. Co-authored-by: Eric Zhu --- datasketch/storage.py | 2 +- test/test_lsh.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/datasketch/storage.py b/datasketch/storage.py index eecdb5ba..7a3db1f8 100644 --- a/datasketch/storage.py +++ b/datasketch/storage.py @@ -388,7 +388,7 @@ def __init__(self, cassandra_params, name, buffer_size): # only one Cassandra table for both table types (so we can keep one single storage) and # we specify different encoders/decoders based on the table type. if b'bucket' in name: - basename, _, ret = name.split(b'_') + basename, _, ret = name.split(b'_', 2) name = basename + b'_bucket_' + binascii.hexlify(ret) self._key_decoder = lambda x: x self._key_encoder = lambda x: x diff --git a/test/test_lsh.py b/test/test_lsh.py index a9bdb0b8..a376c218 100644 --- a/test/test_lsh.py +++ b/test/test_lsh.py @@ -39,6 +39,16 @@ def test__H(self): lsh.insert("m", m) sizes = [len(H) for ht in lsh.hashtables for H in ht] self.assertTrue(all(sizes[0] == s for s in sizes)) + + def test_unpacking(self): + for b in range(1, 1024 + 1): + lsh = MinHashLSH(num_perm=b * 4, params=(b, 4)) + m = MinHash(num_perm=b * 4) + m.update("abcdefg".encode("utf8")) + m.update("1234567".encode("utf8")) + lsh.insert("m", m) + sizes = [len(H) for ht in lsh.hashtables for H in ht] + self.assertTrue(all(sizes[0] == s for s in sizes)) def test_insert(self): lsh = MinHashLSH(threshold=0.5, num_perm=16)