From a2c1b710fb9b8397a9034649a6c40b1b424ff0b6 Mon Sep 17 00:00:00 2001
From: Senad Ibraimoski <senad.ibraimoski@gmail.com>
Date: Fri, 19 Aug 2022 16:41:39 +0100
Subject: [PATCH] Fix name construction for all values of b (#190)

* Update storage.py

https://github.com/ekzhu/datasketch/blob/master/datasketch/lsh.py#L118

In lsh.py, when we iterate over the values of b and reach 95, the packed value of 95 (`b'\x00_'`) contains an underscore:

```python
In [7]: for i in range(100):
   ...:     print(i, struct.pack('>H', i))
   ...:
0 b'\x00\x00'
1 b'\x00\x01'
2 b'\x00\x02'
3 b'\x00\x03'
4 b'\x00\x04'
5 b'\x00\x05'
6 b'\x00\x06'
7 b'\x00\x07'
8 b'\x00\x08'
9 b'\x00\t'
10 b'\x00\n'
11 b'\x00\x0b'
12 b'\x00\x0c'
13 b'\x00\r'
14 b'\x00\x0e'
15 b'\x00\x0f'
16 b'\x00\x10'
17 b'\x00\x11'
18 b'\x00\x12'
19 b'\x00\x13'
20 b'\x00\x14'
21 b'\x00\x15'
22 b'\x00\x16'
23 b'\x00\x17'
24 b'\x00\x18'
25 b'\x00\x19'
26 b'\x00\x1a'
27 b'\x00\x1b'
28 b'\x00\x1c'
29 b'\x00\x1d'
30 b'\x00\x1e'
31 b'\x00\x1f'
32 b'\x00 '
33 b'\x00!'
34 b'\x00"'
35 b'\x00#'
36 b'\x00$'
37 b'\x00%'
38 b'\x00&'
39 b"\x00'"
40 b'\x00('
41 b'\x00)'
42 b'\x00*'
43 b'\x00+'
44 b'\x00,'
45 b'\x00-'
46 b'\x00.'
47 b'\x00/'
48 b'\x000'
49 b'\x001'
50 b'\x002'
51 b'\x003'
52 b'\x004'
53 b'\x005'
54 b'\x006'
55 b'\x007'
56 b'\x008'
57 b'\x009'
58 b'\x00:'
59 b'\x00;'
60 b'\x00<'
61 b'\x00='
62 b'\x00>'
63 b'\x00?'
64 b'\x00@'
65 b'\x00A'
66 b'\x00B'
67 b'\x00C'
68 b'\x00D'
69 b'\x00E'
70 b'\x00F'
71 b'\x00G'
72 b'\x00H'
73 b'\x00I'
74 b'\x00J'
75 b'\x00K'
76 b'\x00L'
77 b'\x00M'
78 b'\x00N'
79 b'\x00O'
80 b'\x00P'
81 b'\x00Q'
82 b'\x00R'
83 b'\x00S'
84 b'\x00T'
85 b'\x00U'
86 b'\x00V'
87 b'\x00W'
88 b'\x00X'
89 b'\x00Y'
90 b'\x00Z'
91 b'\x00['
92 b'\x00\\'
93 b'\x00]'
94 b'\x00^'
95 b'\x00_' # This is an issue for unpacking after split
96 b'\x00`'
97 b'\x00a'
98 b'\x00b'
99 b'\x00c'
```

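This breaks the name parsing in storage.py: `name.split(b'_')` returns more than three parts when the packed value itself contains an underscore, so the tuple unpacking raises a ValueError.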

```python

In [32]: name = b''.join([_random_name(11), b'_bucket_', struct.pack('>H', 95)])

In [33]: basename, _, ret = name.split(b'_')
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [33], in <cell line: 1>()
----> 1 basename, _, ret = name.split(b'_')

ValueError: too many values to unpack (expected 3)
```

The fix is to split at most twice, so the packed suffix is never split:

```python
In [41]: name.split(b'_', 2)
Out[41]: [b'kulxcuapyqa', b'bucket', b'\x00_']
```

This gives us the expected behaviour.

* Add unit test.

Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com>
---
 datasketch/storage.py |  2 +-
 test/test_lsh.py      | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/datasketch/storage.py b/datasketch/storage.py
index eecdb5ba..7a3db1f8 100644
--- a/datasketch/storage.py
+++ b/datasketch/storage.py
@@ -388,7 +388,7 @@ def __init__(self, cassandra_params, name, buffer_size):
             # only one Cassandra table for both table types (so we can keep one single storage) and
             # we specify different encoders/decoders based on the table type.
             if b'bucket' in name:
-                basename, _, ret = name.split(b'_')
+                basename, _, ret = name.split(b'_', 2)
                 name = basename + b'_bucket_' + binascii.hexlify(ret)
                 self._key_decoder = lambda x: x
                 self._key_encoder = lambda x: x
diff --git a/test/test_lsh.py b/test/test_lsh.py
index a9bdb0b8..a376c218 100644
--- a/test/test_lsh.py
+++ b/test/test_lsh.py
@@ -39,6 +39,16 @@ def test__H(self):
             lsh.insert("m", m)
             sizes = [len(H) for ht in lsh.hashtables for H in ht]
             self.assertTrue(all(sizes[0] == s for s in sizes))
+    
+    def test_unpacking(self):
+        for b in range(1, 1024 + 1):
+            lsh = MinHashLSH(num_perm=b * 4, params=(b, 4))
+            m = MinHash(num_perm=b * 4)
+            m.update("abcdefg".encode("utf8"))
+            m.update("1234567".encode("utf8"))
+            lsh.insert("m", m)
+            sizes = [len(H) for ht in lsh.hashtables for H in ht]
+            self.assertTrue(all(sizes[0] == s for s in sizes))
 
     def test_insert(self):
         lsh = MinHashLSH(threshold=0.5, num_perm=16)