From b06b585e61b8709b17a4fc5cbf7a436b5db0cc49 Mon Sep 17 00:00:00 2001 From: "rupesh.kumar" <57129475+rupeshkumaar@users.noreply.github.com> Date: Sat, 20 Jan 2024 14:59:36 +0530 Subject: [PATCH 1/5] -Issue: #205: Merging (Identically Specified) MinHashLSH objects --- datasketch/lsh.py | 49 ++++++++++++++++++++++++++++++++++++++++ docs/lsh.rst | 8 +++++++ examples/lsh_examples.py | 13 +++++++++++ test/test_lsh.py | 39 ++++++++++++++++++++++++++++++++ 4 files changed, 109 insertions(+) diff --git a/datasketch/lsh.py b/datasketch/lsh.py index f77e36e3..a25f5891 100644 --- a/datasketch/lsh.py +++ b/datasketch/lsh.py @@ -226,6 +226,25 @@ def insert( """ self._insert(key, minhash, check_duplication=check_duplication, buffer=False) + def merge( + self, + other: MinHashLSH, + check_disjointness: bool = False + ): + """Merge the other MinHashLSH with this one, making this one the union + of both the MinHashLSH. + + Args: + other (MinHashLSH): The other MinHashLSH. + check_duplication (bool): To avoid duplicate keys in the storage + (`default=True`) + + Raises: + ValueError: If the two MinHashLSH have different initialization + parameters. + """ + self._merge(other, check_disjointness=check_disjointness, buffer=False) + def insertion_session(self, buffer_size: int = 50000) -> MinHashLSHInsertionSession: """ Create a context manager for fast insertion into this index. @@ -282,6 +301,36 @@ def _insert( for H, hashtable in zip(Hs, self.hashtables): hashtable.insert(H, key, buffer=buffer) + def __eq__(self, other:MinHashLSH) -> bool: + """ + Returns: + bool: If the two MinHashLSH has equal num_perm then two are equivalent. + """ + return ( + type(self) is type(other) and + self.h == other.h + ) + + def _merge( + self, + other: MinHashLSH, + check_disjointness: bool = False, + buffer: bool = False + ) -> MinHashLSH: + if self == other: + if check_disjointness and set(self.keys).intersection(set(other.keys)): + raise ValueError("The keys are not disjoint, duplicate key exists.") + for key in other.keys: + Hs = other.keys.get(key) + self.keys.insert(key, *Hs, buffer=buffer) + for H, hashtable in zip(Hs, self.hashtables): + hashtable.insert(H, key, buffer=buffer) + else: + if type(self) is not type(other): + raise ValueError(f"Cannot merge type MinHashLSH and type {type(other).__name__}.") + raise ValueError( + "Cannot merge MinHashLSH with different initialization parameters.") + def query(self, minhash) -> List[Hashable]: """ Giving the MinHash of the query set, retrieve diff --git a/docs/lsh.rst b/docs/lsh.rst index 9df92e82..39097c7b 100644 --- a/docs/lsh.rst +++ b/docs/lsh.rst @@ -77,6 +77,14 @@ plotting code. .. figure:: /_static/lsh_benchmark.png :alt: MinHashLSH Benchmark +You can merge two MinHashLSH object using the ``merge`` function. This +makes MinHashLSH useful in parallel processing. + +.. code:: python + + # The merges the lsh1 with lsh2. + lsh1.merge(lsh2) + There are other optional parameters that can be used to tune the index. See the documentation of :class:`datasketch.MinHashLSH` for details. diff --git a/examples/lsh_examples.py b/examples/lsh_examples.py index b16edf4f..6d50563c 100644 --- a/examples/lsh_examples.py +++ b/examples/lsh_examples.py @@ -37,6 +37,19 @@ def eg1(): result = lsh.query(m1) print("Approximate neighbours with Jaccard similarity > 0.5", result) + # Merge two LSH index + lsh1 = MinHashLSH(threshold=0.5, num_perm=128) + lsh1.insert("m2", m2) + lsh1.insert("m3", m3) + + lsh2 = MinHashLSH(threshold=0.5, num_perm=128) + lsh2.insert("m1", m1) + + lsh1.merge(lsh2) + print("Does m1 exist in the lsh1...", "m1" in lsh1.keys) + # if check_disjointness flag is set to True then it will check the disjointness of the keys in the two MinHashLSH + lsh1.merge(lsh2,check_disjointness=True) + def eg2(): mg = WeightedMinHashGenerator(10, 5) m1 = mg.minhash(v1) diff --git a/test/test_lsh.py b/test/test_lsh.py index 38f8844f..a15be323 100644 --- a/test/test_lsh.py +++ b/test/test_lsh.py @@ -240,6 +240,45 @@ def test_get_counts(self): for table in counts: self.assertEqual(sum(table.values()), 2) + def test_merge(self): + lsh1 = MinHashLSH(threshold=0.5, num_perm=16) + m1 = MinHash(16) + m1.update("a".encode("utf-8")) + m2 = MinHash(16) + m2.update("b".encode("utf-8")) + lsh1.insert("a",m1) + lsh1.insert("b",m2) + + lsh2 = MinHashLSH(threshold=0.5, num_perm=16) + m3 = MinHash(16) + m3.update("c".encode("utf-8")) + m4 = MinHash(16) + m4.update("d".encode("utf-8")) + lsh2.insert("c",m1) + lsh2.insert("d",m2) + + lsh1.merge(lsh2) + for t in lsh1.hashtables: + self.assertTrue(len(t) >= 1) + items = [] + for H in t: + items.extend(t[H]) + self.assertTrue("c" in items) + self.assertTrue("d" in items) + self.assertTrue("a" in lsh1) + self.assertTrue("b" in lsh1) + for i, H in enumerate(lsh1.keys["c"]): + self.assertTrue("c" in lsh1.hashtables[i][H]) + + self.assertTrue(lsh1.merge, lsh2) + self.assertRaises(ValueError, lsh1.merge, lsh2, check_disjointness=True) + + m5 = MinHash(32) + m5.update("e".encode("utf-8")) + lsh3 = MinHashLSH(threshold=0.5, num_perm=32) + lsh3.insert("a",m5) + + self.assertRaises(ValueError, lsh1.merge, lsh3, check_disjointness=True) class TestWeightedMinHashLSH(unittest.TestCase): From d96b6e705d2a6de5b8ac928317472e29a84d6c22 Mon Sep 17 00:00:00 2001 From: "rupesh.kumar" <57129475+rupeshkumaar@users.noreply.github.com> Date: Sun, 21 Jan 2024 12:04:24 +0530 Subject: [PATCH 2/5] Merging (Identically Specified) MinHashLSH objects Fixes #205 --- datasketch/lsh.py | 10 +++++---- test/test_lsh.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/datasketch/lsh.py b/datasketch/lsh.py index a25f5891..069e23be 100644 --- a/datasketch/lsh.py +++ b/datasketch/lsh.py @@ -301,14 +301,16 @@ def _insert( for H, hashtable in zip(Hs, self.hashtables): hashtable.insert(H, key, buffer=buffer) - def __eq__(self, other:MinHashLSH) -> bool: + def __equivalent(self, other:MinHashLSH) -> bool: """ Returns: - bool: If the two MinHashLSH has equal num_perm then two are equivalent. + bool: If the two MinHashLSH has equal num_perm, band size and size of each bands then two are equivalent. """ return ( type(self) is type(other) and - self.h == other.h + self.h == other.h and + self.b == other.b and + self.r == other.r ) def _merge( @@ -317,7 +319,7 @@ def _merge( check_disjointness: bool = False, buffer: bool = False ) -> MinHashLSH: - if self == other: + if self.__equivalent(other): if check_disjointness and set(self.keys).intersection(set(other.keys)): raise ValueError("The keys are not disjoint, duplicate key exists.") for key in other.keys: diff --git a/test/test_lsh.py b/test/test_lsh.py index a15be323..c2b080f8 100644 --- a/test/test_lsh.py +++ b/test/test_lsh.py @@ -267,6 +267,8 @@ def test_merge(self): self.assertTrue("d" in items) self.assertTrue("a" in lsh1) self.assertTrue("b" in lsh1) + self.assertTrue("c" in lsh1) + self.assertTrue("d" in lsh1) for i, H in enumerate(lsh1.keys["c"]): self.assertTrue("c" in lsh1.hashtables[i][H]) @@ -280,6 +282,56 @@ def test_merge(self): self.assertRaises(ValueError, lsh1.merge, lsh3, check_disjointness=True) + def test_merge_redis(self): + with patch('redis.Redis', fake_redis) as mock_redis: + lsh1 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={ + 'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379} + }) + lsh2 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={ + 'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379} + }) + + m1 = MinHash(16) + m1.update("a".encode("utf8")) + m2 = MinHash(16) + m2.update("b".encode("utf8")) + lsh1.insert("a", m1) + lsh1.insert("b", m2) + + m3 = MinHash(16) + m3.update("c".encode("utf8")) + m4 = MinHash(16) + m4.update("d".encode("utf8")) + lsh2.insert("c", m3) + lsh2.insert("d", m4) + + lsh1.merge(lsh2) + for t in lsh1.hashtables: + self.assertTrue(len(t) >= 1) + items = [] + for H in t: + items.extend(t[H]) + self.assertTrue(pickle.dumps("c") in items) + self.assertTrue(pickle.dumps("d") in items) + self.assertTrue("a" in lsh1) + self.assertTrue("b" in lsh1) + self.assertTrue("c" in lsh1) + self.assertTrue("d" in lsh1) + for i, H in enumerate(lsh1.keys[pickle.dumps("c")]): + self.assertTrue(pickle.dumps("c") in lsh1.hashtables[i][H]) + + self.assertTrue(lsh1.merge, lsh2) + self.assertRaises(ValueError, lsh1.merge, lsh2, check_disjointness=True) + + m5 = MinHash(32) + m5.update("e".encode("utf-8")) + lsh3 = MinHashLSH(threshold=0.5, num_perm=32, storage_config={ + 'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379} + }) + lsh3.insert("a",m5) + + self.assertRaises(ValueError, lsh1.merge, lsh3, check_disjointness=True) + class TestWeightedMinHashLSH(unittest.TestCase): def test_init(self): From 6628db819271080b47d595640688f3e3ac725551 Mon Sep 17 00:00:00 2001 From: "rupesh.kumar" <57129475+rupeshkumaar@users.noreply.github.com> Date: Mon, 22 Jan 2024 12:17:09 +0530 Subject: [PATCH 3/5] Merging (Identically Specified) MinHashLSH objects Fixes #205 --- datasketch/lsh.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datasketch/lsh.py b/datasketch/lsh.py index 069e23be..2f1c9b40 100644 --- a/datasketch/lsh.py +++ b/datasketch/lsh.py @@ -304,13 +304,14 @@ def _insert( def __equivalent(self, other:MinHashLSH) -> bool: """ Returns: - bool: If the two MinHashLSH has equal num_perm, band size and size of each bands then two are equivalent. + bool: If the two MinHashLSH have equal num_perm, number of bands, size of each band and hashfunc (if provided) then two are equivalent. """ return ( type(self) is type(other) and self.h == other.h and self.b == other.b and - self.r == other.r + self.r == other.r and + type(self.hashfunc) == type(other.hashfunc) ) def _merge( From ce29b01a1867712d2a7ba7c06a2ad4deadd80453 Mon Sep 17 00:00:00 2001 From: "rupesh.kumar" <57129475+rupeshkumaar@users.noreply.github.com> Date: Mon, 11 Mar 2024 11:41:19 +0530 Subject: [PATCH 4/5] Merging (Identically Specified) MinHashLSH objects --- .github/workflows/test.yml | 2 +- datasketch/lsh.py | 19 +++++++++---------- examples/lsh_examples.py | 4 ++-- test/test_lsh.py | 36 ++++++++++++++++++++++++++++-------- 4 files changed, 40 insertions(+), 21 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6259a6b3..495104a3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -8,7 +8,7 @@ jobs: runs-on: "ubuntu-latest" strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} diff --git a/datasketch/lsh.py b/datasketch/lsh.py index 2f1c9b40..4a682f84 100644 --- a/datasketch/lsh.py +++ b/datasketch/lsh.py @@ -229,21 +229,21 @@ def insert( def merge( self, other: MinHashLSH, - check_disjointness: bool = False + check_overlap: bool = False ): """Merge the other MinHashLSH with this one, making this one the union of both the MinHashLSH. Args: other (MinHashLSH): The other MinHashLSH. - check_duplication (bool): To avoid duplicate keys in the storage + check_overlap (bool): Check if there are any overlapping keys before merging and raise if there are any. (`default=True`) Raises: ValueError: If the two MinHashLSH have different initialization - parameters. + parameters, or if `check_overlap` is `True` and there are overlapping keys. """ - self._merge(other, check_disjointness=check_disjointness, buffer=False) + self._merge(other, check_overlap=check_overlap, buffer=False) def insertion_session(self, buffer_size: int = 50000) -> MinHashLSHInsertionSession: """ @@ -304,25 +304,24 @@ def _insert( def __equivalent(self, other:MinHashLSH) -> bool: """ Returns: - bool: If the two MinHashLSH have equal num_perm, number of bands, size of each band and hashfunc (if provided) then two are equivalent. + bool: If the two MinHashLSH have equal num_perm, number of bands, size of each band then two are equivalent. """ return ( type(self) is type(other) and self.h == other.h and self.b == other.b and - self.r == other.r and - type(self.hashfunc) == type(other.hashfunc) + self.r == other.r ) def _merge( self, other: MinHashLSH, - check_disjointness: bool = False, + check_overlap: bool = False, buffer: bool = False ) -> MinHashLSH: if self.__equivalent(other): - if check_disjointness and set(self.keys).intersection(set(other.keys)): - raise ValueError("The keys are not disjoint, duplicate key exists.") + if check_overlap and set(self.keys).intersection(set(other.keys)): + raise ValueError("The keys are overlapping, duplicate key exists.") for key in other.keys: Hs = other.keys.get(key) self.keys.insert(key, *Hs, buffer=buffer) diff --git a/examples/lsh_examples.py b/examples/lsh_examples.py index 6d50563c..007e1399 100644 --- a/examples/lsh_examples.py +++ b/examples/lsh_examples.py @@ -47,8 +47,8 @@ def eg1(): lsh1.merge(lsh2) print("Does m1 exist in the lsh1...", "m1" in lsh1.keys) - # if check_disjointness flag is set to True then it will check the disjointness of the keys in the two MinHashLSH - lsh1.merge(lsh2,check_disjointness=True) + # if check_overlap flag is set to True then it will check the overlapping of the keys in the two MinHashLSH + lsh1.merge(lsh2,check_overlap=True) def eg2(): mg = WeightedMinHashGenerator(10, 5) diff --git a/test/test_lsh.py b/test/test_lsh.py index c2b080f8..a2893753 100644 --- a/test/test_lsh.py +++ b/test/test_lsh.py @@ -273,14 +273,24 @@ def test_merge(self): self.assertTrue("c" in lsh1.hashtables[i][H]) self.assertTrue(lsh1.merge, lsh2) - self.assertRaises(ValueError, lsh1.merge, lsh2, check_disjointness=True) + self.assertRaises(ValueError, lsh1.merge, lsh2, check_overlap=True) - m5 = MinHash(32) + m5 = MinHash(16) m5.update("e".encode("utf-8")) - lsh3 = MinHashLSH(threshold=0.5, num_perm=32) + lsh3 = MinHashLSH(threshold=0.5, num_perm=16) lsh3.insert("a",m5) - self.assertRaises(ValueError, lsh1.merge, lsh3, check_disjointness=True) + self.assertRaises(ValueError, lsh1.merge, lsh3, check_overlap=True) + + lsh1.merge(lsh3) + + m6 = MinHash(16) + m6.update("e".encode("utf-8")) + lsh4 = MinHashLSH(threshold=0.5, num_perm=16) + lsh4.insert("a",m6) + + lsh1.merge(lsh4, check_overlap=False) + def test_merge_redis(self): with patch('redis.Redis', fake_redis) as mock_redis: @@ -321,16 +331,26 @@ def test_merge_redis(self): self.assertTrue(pickle.dumps("c") in lsh1.hashtables[i][H]) self.assertTrue(lsh1.merge, lsh2) - self.assertRaises(ValueError, lsh1.merge, lsh2, check_disjointness=True) + self.assertRaises(ValueError, lsh1.merge, lsh2, check_overlap=True) - m5 = MinHash(32) + m5 = MinHash(16) m5.update("e".encode("utf-8")) - lsh3 = MinHashLSH(threshold=0.5, num_perm=32, storage_config={ + lsh3 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={ 'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379} }) lsh3.insert("a",m5) - self.assertRaises(ValueError, lsh1.merge, lsh3, check_disjointness=True) + self.assertRaises(ValueError, lsh1.merge, lsh3, check_overlap=True) + + m6 = MinHash(16) + m6.update("e".encode("utf-8")) + lsh4 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={ + 'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379} + }) + lsh4.insert("a",m6) + + lsh1.merge(lsh4, check_overlap=False) + class TestWeightedMinHashLSH(unittest.TestCase): From 39e60f3eb819dc34b31e8bd8fce51221a3b5811f Mon Sep 17 00:00:00 2001 From: "rupesh.kumar" <57129475+rupeshkumaar@users.noreply.github.com> Date: Tue, 12 Mar 2024 15:36:45 +0530 Subject: [PATCH 5/5] Merging (Identically Specified) MinHashLSH objects Fixes #205 --- .github/workflows/test.yml | 2 +- datasketch/lsh.py | 8 ++++++-- docs/lsh.rst | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 495104a3..6259a6b3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -8,7 +8,7 @@ jobs: runs-on: "ubuntu-latest" strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} diff --git a/datasketch/lsh.py b/datasketch/lsh.py index 4a682f84..dbaa3ec9 100644 --- a/datasketch/lsh.py +++ b/datasketch/lsh.py @@ -232,12 +232,16 @@ def merge( check_overlap: bool = False ): """Merge the other MinHashLSH with this one, making this one the union - of both the MinHashLSH. + of both. + + Note: + Only num_perm, number of bands and sizes of each band is checked for equivalency of two MinHashLSH indexes. + Other initialization parameters threshold, weights, storage_config, prepickle and hash_func are not checked. Args: other (MinHashLSH): The other MinHashLSH. check_overlap (bool): Check if there are any overlapping keys before merging and raise if there are any. - (`default=True`) + (`default=False`) Raises: ValueError: If the two MinHashLSH have different initialization diff --git a/docs/lsh.rst b/docs/lsh.rst index 39097c7b..dcd0d47a 100644 --- a/docs/lsh.rst +++ b/docs/lsh.rst @@ -77,12 +77,12 @@ plotting code. .. figure:: /_static/lsh_benchmark.png :alt: MinHashLSH Benchmark -You can merge two MinHashLSH object using the ``merge`` function. This +You can merge two MinHashLSH indexes to create a union index using the ``merge`` method. This makes MinHashLSH useful in parallel processing. .. code:: python - # The merges the lsh1 with lsh2. + # This merges the lsh1 with lsh2. lsh1.merge(lsh2) There are other optional parameters that can be used to tune the index.