Fix name construction for all values of b (#190)

* Update storage.py https://github.com/ekzhu/datasketch/blob/master/datasketch/lsh.py#L118 In lsh.py when we are iterating over values of b and b reaches 95 the packed value of 95 contains an underscore ```python In [7]: for i in range(100): ...: print(i, struct.pack('>H', i)) ...: 0 b'\x00\x00' 1 b'\x00\x01' 2 b'\x00\x02' 3 b'\x00\x03' 4 b'\x00\x04' 5 b'\x00\x05' 6 b'\x00\x06' 7 b'\x00\x07' 8 b'\x00\x08' 9 b'\x00\t' 10 b'\x00\n' 11 b'\x00\x0b' 12 b'\x00\x0c' 13 b'\x00\r' 14 b'\x00\x0e' 15 b'\x00\x0f' 16 b'\x00\x10' 17 b'\x00\x11' 18 b'\x00\x12' 19 b'\x00\x13' 20 b'\x00\x14' 21 b'\x00\x15' 22 b'\x00\x16' 23 b'\x00\x17' 24 b'\x00\x18' 25 b'\x00\x19' 26 b'\x00\x1a' 27 b'\x00\x1b' 28 b'\x00\x1c' 29 b'\x00\x1d' 30 b'\x00\x1e' 31 b'\x00\x1f' 32 b'\x00 ' 33 b'\x00!' 34 b'\x00"' 35 b'\x00#' 36 b'\x00$' 37 b'\x00%' 38 b'\x00&' 39 b"\x00'" 40 b'\x00(' 41 b'\x00)' 42 b'\x00*' 43 b'\x00+' 44 b'\x00,' 45 b'\x00-' 46 b'\x00.' 47 b'\x00/' 48 b'\x000' 49 b'\x001' 50 b'\x002' 51 b'\x003' 52 b'\x004' 53 b'\x005' 54 b'\x006' 55 b'\x007' 56 b'\x008' 57 b'\x009' 58 b'\x00:' 59 b'\x00;' 60 b'\x00<' 61 b'\x00=' 62 b'\x00>' 63 b'\x00?' 64 b'\x00@' 65 b'\x00A' 66 b'\x00B' 67 b'\x00C' 68 b'\x00D' 69 b'\x00E' 70 b'\x00F' 71 b'\x00G' 72 b'\x00H' 73 b'\x00I' 74 b'\x00J' 75 b'\x00K' 76 b'\x00L' 77 b'\x00M' 78 b'\x00N' 79 b'\x00O' 80 b'\x00P' 81 b'\x00Q' 82 b'\x00R' 83 b'\x00S' 84 b'\x00T' 85 b'\x00U' 86 b'\x00V' 87 b'\x00W' 88 b'\x00X' 89 b'\x00Y' 90 b'\x00Z' 91 b'\x00[' 92 b'\x00\\' 93 b'\x00]' 94 b'\x00^' 95 b'\x00_' # This is an issue for unpacking after split 96 b'\x00`' 97 b'\x00a' 98 b'\x00b' 99 b'\x00c' ``` this completely breaks this naming here due to split and you get unpacking error. ```python In [32]: name = b''.join([_random_name(11), b'_bucket_', struct.pack('>H', 95)]) In [33]: basename, _, ret = name.split(b'_') --------------------------------------------------------------------------- ValueError Traceback (most recent call last) Input In [33], in <cell line: 1>() ----> 1 basename, _, ret = name.split(b'_') ValueError: too many values to unpack (expected 3) ``` while the fix ```python In [41]: name.split(b'_', 2) Out[41]: [b'kulxcuapyqa', b'bucket', b'\x00_'] ``` Gives us expected behaviour * Add unit test. Co-authored-by: Eric Zhu <[email protected]>
ekzhu · Aug 19, 2022 · a2c1b71 · a2c1b71
1 parent 6e7d228
commit a2c1b71
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 1 deletion.
diff --git a/datasketch/storage.py b/datasketch/storage.py
@@ -388,7 +388,7 @@ def __init__(self, cassandra_params, name, buffer_size):
             # only one Cassandra table for both table types (so we can keep one single storage) and
             # we specify different encoders/decoders based on the table type.
             if b'bucket' in name:
-                basename, _, ret = name.split(b'_')
+                basename, _, ret = name.split(b'_', 2)
                 name = basename + b'_bucket_' + binascii.hexlify(ret)
                 self._key_decoder = lambda x: x
                 self._key_encoder = lambda x: x

diff --git a/test/test_lsh.py b/test/test_lsh.py
@@ -39,6 +39,16 @@ def test__H(self):
             lsh.insert("m", m)
             sizes = [len(H) for ht in lsh.hashtables for H in ht]
             self.assertTrue(all(sizes[0] == s for s in sizes))
+
+    def test_unpacking(self):
+        for b in range(1, 1024 + 1):
+            lsh = MinHashLSH(num_perm=b * 4, params=(b, 4))
+            m = MinHash(num_perm=b * 4)
+            m.update("abcdefg".encode("utf8"))
+            m.update("1234567".encode("utf8"))
+            lsh.insert("m", m)
+            sizes = [len(H) for ht in lsh.hashtables for H in ht]
+            self.assertTrue(all(sizes[0] == s for s in sizes))
 
     def test_insert(self):
         lsh = MinHashLSH(threshold=0.5, num_perm=16)