From 134b5fae63d78e3614c5138b807ab9b5ce042ebf Mon Sep 17 00:00:00 2001 From: tjacovich Date: Tue, 5 Aug 2025 12:23:35 -0400 Subject: [PATCH 01/11] Remove additional bib_data fields that are responsible for environment mismatches. --- SciXPipelineUtils/scix_id.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py index c5ede42..f034b85 100644 --- a/SciXPipelineUtils/scix_id.py +++ b/SciXPipelineUtils/scix_id.py @@ -193,6 +193,20 @@ def generate_bib_data_hash(hash_data): "first_author", "first_author_norm", "identifier", + "orcid_pub", + "links_data", + "alternate_bibcode", + "doctype", + "doctype_facet_hier", + "entry_date", + "keyword_norm", + "keyword_facet", + "citation", + "citation_count", + "citation_count_norm", + "read_count", + "date", + "copyright", ] for field in unique_fields: try: From 034801f25f4b74376d2f409b6a3fb9e79621640b Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 6 Aug 2025 12:06:46 -0400 Subject: [PATCH 02/11] Add regex to remove special characters from abstracts. --- SciXPipelineUtils/scix_id.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py index f034b85..144bc94 100644 --- a/SciXPipelineUtils/scix_id.py +++ b/SciXPipelineUtils/scix_id.py @@ -213,6 +213,8 @@ def generate_bib_data_hash(hash_data): hash_data.pop(field) except Exception: continue + if hash_data.get("abstract"): + hash_data["abstract"][0] = re.sub(r"\W+", "", hash_data.get("abstract")[0]) encoded_hash_data = json.dumps(hash_data).encode("utf-8") return hashlib.md5(encoded_hash_data).hexdigest() From 48188b5e1efc763dec752223951b82f59c7fde0c Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 6 Aug 2025 12:27:40 -0400 Subject: [PATCH 03/11] Strip html tags as well. --- SciXPipelineUtils/scix_id.py | 5 +++-- tests/test_scix_id.py | 40 ++++++++++++++++++++++++------------ 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py index 144bc94..f380dc4 100644 --- a/SciXPipelineUtils/scix_id.py +++ b/SciXPipelineUtils/scix_id.py @@ -213,8 +213,9 @@ def generate_bib_data_hash(hash_data): hash_data.pop(field) except Exception: continue - if hash_data.get("abstract"): - hash_data["abstract"][0] = re.sub(r"\W+", "", hash_data.get("abstract")[0]) + if hash_data.get("abs"): + hash_data["abs"][0] = re.sub("<[^<]+?>", "", hash_data.get("abs")[0]) + hash_data["abs"][0] = re.sub(r"\W+", "", hash_data.get("abs")[0]) encoded_hash_data = json.dumps(hash_data).encode("utf-8") return hashlib.md5(encoded_hash_data).hexdigest() diff --git a/tests/test_scix_id.py b/tests/test_scix_id.py index 38f55e6..a067880 100644 --- a/tests/test_scix_id.py +++ b/tests/test_scix_id.py @@ -22,18 +22,18 @@ def test_generate_bib_data_hash(self): "id": 1, "author": ["Lias, Alberta", "Smith, J."], "title": "Test", - "abs": "words", + "abs": ["words"], "bibcode": "Test", } hash = scixid.generate_bib_data_hash(test_bib_data) - self.assertEqual(hash, "ca77650a961fe043bf18e60618f43b49") + self.assertEqual(hash, "703f2f82ef742c10101840e4fc85bc53") test_bib_data = { "title": "Test", - "abs": "words", + "abs": ["words"], } hash2 = scixid.generate_bib_data_hash(test_bib_data) - self.assertEqual(hash2, "ca77650a961fe043bf18e60618f43b49") + self.assertEqual(hash2, "703f2f82ef742c10101840e4fc85bc53") self.assertEqual(hash, hash2) @@ -42,33 +42,47 @@ def test_get_rand_from_hash(self): "id": 1, "author": ["Lias, Alberta", "Smith, J."], "title": "Test", - "abs": "words", + "abs": ["words"], } hash = scixid.generate_bib_data_hash(test_bib_data) rand_num = scixid.get_rand_from_hash(hash) - self.assertEqual(rand_num, 12446194448305896) + self.assertEqual(rand_num, 8784826954018605) def test_scix_id_from_hash(self): test_bib_data = { "id": 1, "author": ["Lias, Alberta", "Smith, J."], "title": "Test", - "abs": "words", + "abs": ["words"], } hash = scixid.generate_bib_data_hash(test_bib_data) scix_id = scixid.scix_id_from_hash(hash) - self.assertEqual(scix_id, "B1QQ-XVEB-3Q83") + self.assertEqual(scix_id, "7SNR-3N03-VSD6") def test_generate_scix_id(self): test_bib_data = { "id": 1, "author": ["Lias, Alberta", "Smith, J."], "title": "Test", - "abs": "words", + "abs": ["words"], } scix_id = scixid.generate_scix_id(test_bib_data) scix_id_2 = scixid.generate_scix_id(json.dumps(test_bib_data)) - self.assertEqual(scix_id, "B1QQ-XVEB-3Q83") + self.assertEqual(scix_id, "7SNR-3N03-VSD6") + self.assertEqual(scix_id, scix_id_2) + + def test_generate_scix_id_special_characters(self): + test_bib_data = { + "id": 1, + "author": ["Lias, Alberta", "Smith, J."], + "title": "Test", + "abs": ["words<"], + } + # import pudb + # pudb.set_trace() + scix_id = scixid.generate_scix_id(test_bib_data) + scix_id_2 = scixid.generate_scix_id(json.dumps(test_bib_data)) + self.assertEqual(scix_id, "7SNR-3N03-VSD6") self.assertEqual(scix_id, scix_id_2) def test_generate_scix_id_other(self): @@ -76,8 +90,8 @@ def test_generate_scix_id_other(self): "id": 1, "author": ["Lias, Alberta", "Smith, J."], "title": "Test", - "abs": "words", + "abs": ["words"], } scix_id = scixid.generate_scix_id(json.dumps(test_bib_data), hash_data_type="other") - self.assertNotEqual(scix_id, "B1QQ-XVEB-3Q83") - self.assertEqual(scix_id, "9G4K-K9BH-SA63") + self.assertNotEqual(scix_id, "880N-W2DE-VHDV") + self.assertEqual(scix_id, "6N22-EN04-7GHF") From eeaa225b3596d1a1adf6252d47e082fbe62bf4ec Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 6 Aug 2025 12:28:11 -0400 Subject: [PATCH 04/11] Strip html tags as well. --- tests/test_scix_id.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_scix_id.py b/tests/test_scix_id.py index a067880..009ef02 100644 --- a/tests/test_scix_id.py +++ b/tests/test_scix_id.py @@ -76,7 +76,7 @@ def test_generate_scix_id_special_characters(self): "id": 1, "author": ["Lias, Alberta", "Smith, J."], "title": "Test", - "abs": ["words<"], + "abs": ["words < "], } # import pudb # pudb.set_trace() From d6e2eb48663fbbf8f2a2e10410fa7854bd1b360d Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 6 Aug 2025 12:29:19 -0400 Subject: [PATCH 05/11] Strip html tags as well. --- tests/test_scix_id.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_scix_id.py b/tests/test_scix_id.py index 009ef02..9ddb707 100644 --- a/tests/test_scix_id.py +++ b/tests/test_scix_id.py @@ -93,5 +93,5 @@ def test_generate_scix_id_other(self): "abs": ["words"], } scix_id = scixid.generate_scix_id(json.dumps(test_bib_data), hash_data_type="other") - self.assertNotEqual(scix_id, "880N-W2DE-VHDV") + self.assertNotEqual(scix_id, "7SNR-3N03-VSD6") self.assertEqual(scix_id, "6N22-EN04-7GHF") From fe8a9ecb169e9c2d18b46795931ded1aeb3dcc82 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 6 Aug 2025 13:44:07 -0400 Subject: [PATCH 06/11] Allow character stripping to be configurable. --- SciXPipelineUtils/scix_id.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py index f380dc4..3f287ea 100644 --- a/SciXPipelineUtils/scix_id.py +++ b/SciXPipelineUtils/scix_id.py @@ -179,7 +179,7 @@ def scix_id_from_hash(hash, checksum=True, split=4, string_length=12): return encode(rand_int) -def generate_bib_data_hash(hash_data): +def generate_bib_data_hash(hash_data, strip_characters=True): unique_fields = [ "id", "aff", @@ -213,7 +213,8 @@ def generate_bib_data_hash(hash_data): hash_data.pop(field) except Exception: continue - if hash_data.get("abs"): + + if strip_characters and hash_data.get("abs"): hash_data["abs"][0] = re.sub("<[^<]+?>", "", hash_data.get("abs")[0]) hash_data["abs"][0] = re.sub(r"\W+", "", hash_data.get("abs")[0]) encoded_hash_data = json.dumps(hash_data).encode("utf-8") @@ -221,7 +222,12 @@ def generate_bib_data_hash(hash_data): def generate_scix_id( - hash_data, hash_data_type="bib_data", checksum=True, split=4, string_length=12 + hash_data, + hash_data_type="bib_data", + checksum=True, + split=4, + string_length=12, + stripped_characters=True, ): if hash_data_type == "bib_data": if type(hash_data) != dict: @@ -229,7 +235,7 @@ def generate_scix_id( hash_data = json.loads(hash_data) except ValueError as e: raise e - hashed_data = generate_bib_data_hash(hash_data) + hashed_data = generate_bib_data_hash(hash_data, stripped_characters=stripped_characters) elif hash_data_type == "other": encoded_hash_data = str(hash_data).encode("utf-8") hashed_data = hashlib.md5(encoded_hash_data).hexdigest() From 30152de5b767047efd124cbcbb6efb8cb72a09f6 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 6 Aug 2025 13:44:56 -0400 Subject: [PATCH 07/11] Fix typo in variable name. --- SciXPipelineUtils/scix_id.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py index 3f287ea..c3a7056 100644 --- a/SciXPipelineUtils/scix_id.py +++ b/SciXPipelineUtils/scix_id.py @@ -227,7 +227,7 @@ def generate_scix_id( checksum=True, split=4, string_length=12, - stripped_characters=True, + strip_characters=True, ): if hash_data_type == "bib_data": if type(hash_data) != dict: @@ -235,7 +235,7 @@ def generate_scix_id( hash_data = json.loads(hash_data) except ValueError as e: raise e - hashed_data = generate_bib_data_hash(hash_data, stripped_characters=stripped_characters) + hashed_data = generate_bib_data_hash(hash_data, strip_characters=strip_characters) elif hash_data_type == "other": encoded_hash_data = str(hash_data).encode("utf-8") hashed_data = hashlib.md5(encoded_hash_data).hexdigest() From 8e4deedf0a0750db082a3dc749e4a6fc758e79aa Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 6 Aug 2025 14:05:34 -0400 Subject: [PATCH 08/11] Add additional tests. --- tests/test_scix_id.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/test_scix_id.py b/tests/test_scix_id.py index 9ddb707..011e8ea 100644 --- a/tests/test_scix_id.py +++ b/tests/test_scix_id.py @@ -71,20 +71,30 @@ def test_generate_scix_id(self): self.assertEqual(scix_id, "7SNR-3N03-VSD6") self.assertEqual(scix_id, scix_id_2) - def test_generate_scix_id_special_characters(self): + def test_generate_scix_id_special_characters_true(self): test_bib_data = { "id": 1, "author": ["Lias, Alberta", "Smith, J."], "title": "Test", "abs": ["words < "], } - # import pudb - # pudb.set_trace() scix_id = scixid.generate_scix_id(test_bib_data) scix_id_2 = scixid.generate_scix_id(json.dumps(test_bib_data)) self.assertEqual(scix_id, "7SNR-3N03-VSD6") self.assertEqual(scix_id, scix_id_2) + def test_generate_scix_id_special_characters_false(self): + test_bib_data = { + "id": 1, + "author": ["Lias, Alberta", "Smith, J."], + "title": "Test", + "abs": ["words < "], + } + scix_id = scixid.generate_scix_id(test_bib_data, strip_characters=False) + scix_id_2 = scixid.generate_scix_id(test_bib_data) + self.assertEqual(scix_id, "APGB-1BCS-SAG1") + self.assertNotEqual(scix_id, scix_id_2) + def test_generate_scix_id_other(self): test_bib_data = { "id": 1, From e9435fae02c11b40f551107aa0666ff0c6cdd6b7 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 6 Aug 2025 14:59:21 -0400 Subject: [PATCH 09/11] Add additional special character stripping. Add additional test for special characters. --- SciXPipelineUtils/scix_id.py | 6 ++++++ tests/test_scix_id.py | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py index c3a7056..fa1db58 100644 --- a/SciXPipelineUtils/scix_id.py +++ b/SciXPipelineUtils/scix_id.py @@ -217,6 +217,12 @@ def generate_bib_data_hash(hash_data, strip_characters=True): if strip_characters and hash_data.get("abs"): hash_data["abs"][0] = re.sub("<[^<]+?>", "", hash_data.get("abs")[0]) hash_data["abs"][0] = re.sub(r"\W+", "", hash_data.get("abs")[0]) + hash_data["abs"][0] = re.sub( + r"&[a-zA-Z]+;", "", hash_data.get("abs")[0] + ) # Remove HTML entities + hash_data["abs"][0] = re.sub( + r"[^\x00-\x7F]", "", hash_data.get("abs")[0] + ) # Remove special Unicode characters like Greek and math encoded_hash_data = json.dumps(hash_data).encode("utf-8") return hashlib.md5(encoded_hash_data).hexdigest() diff --git a/tests/test_scix_id.py b/tests/test_scix_id.py index 011e8ea..2afee13 100644 --- a/tests/test_scix_id.py +++ b/tests/test_scix_id.py @@ -83,6 +83,26 @@ def test_generate_scix_id_special_characters_true(self): self.assertEqual(scix_id, "7SNR-3N03-VSD6") self.assertEqual(scix_id, scix_id_2) + def test_generate_scix_id_special_characters_true_comparison(self): + test_bib_data = { + "id": 1, + "author": ["Lias, Alberta", "Smith, J."], + "title": "Test", + "abs": ["words < "], + } + + test_bib_data_2 = { + "id": 1, + "author": ["Lias, Alberta", "Smith, J."], + "title": "Test", + "abs": ["words <"], + } + + scix_id = scixid.generate_scix_id(test_bib_data) + scix_id_2 = scixid.generate_scix_id(test_bib_data_2) + self.assertEqual(scix_id, "7SNR-3N03-VSD6") + self.assertEqual(scix_id, scix_id_2) + def test_generate_scix_id_special_characters_false(self): test_bib_data = { "id": 1, From ecf03e9c10bccc1b7effd12cf946e5f012115968 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 6 Aug 2025 16:46:38 -0400 Subject: [PATCH 10/11] Allow users to pass custom fields to hash generation. --- SciXPipelineUtils/scix_id.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/SciXPipelineUtils/scix_id.py b/SciXPipelineUtils/scix_id.py index fa1db58..fcc7dba 100644 --- a/SciXPipelineUtils/scix_id.py +++ b/SciXPipelineUtils/scix_id.py @@ -179,7 +179,7 @@ def scix_id_from_hash(hash, checksum=True, split=4, string_length=12): return encode(rand_int) -def generate_bib_data_hash(hash_data, strip_characters=True): +def generate_bib_data_hash(hash_data, strip_characters=True, user_fields=None): unique_fields = [ "id", "aff", @@ -208,6 +208,10 @@ def generate_bib_data_hash(hash_data, strip_characters=True): "date", "copyright", ] + + if user_fields: + unique_fields = user_fields + for field in unique_fields: try: hash_data.pop(field) @@ -234,6 +238,7 @@ def generate_scix_id( split=4, string_length=12, strip_characters=True, + user_fields=None, ): if hash_data_type == "bib_data": if type(hash_data) != dict: @@ -241,7 +246,9 @@ def generate_scix_id( hash_data = json.loads(hash_data) except ValueError as e: raise e - hashed_data = generate_bib_data_hash(hash_data, strip_characters=strip_characters) + hashed_data = generate_bib_data_hash( + hash_data, strip_characters=strip_characters, user_fields=user_fields + ) elif hash_data_type == "other": encoded_hash_data = str(hash_data).encode("utf-8") hashed_data = hashlib.md5(encoded_hash_data).hexdigest() From 01554c43c2deb3e45bdde0b747873f54fe66dc47 Mon Sep 17 00:00:00 2001 From: tjacovich Date: Wed, 6 Aug 2025 16:49:02 -0400 Subject: [PATCH 11/11] Add test for custom user fields. --- tests/test_scix_id.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_scix_id.py b/tests/test_scix_id.py index 2afee13..35593d2 100644 --- a/tests/test_scix_id.py +++ b/tests/test_scix_id.py @@ -71,6 +71,19 @@ def test_generate_scix_id(self): self.assertEqual(scix_id, "7SNR-3N03-VSD6") self.assertEqual(scix_id, scix_id_2) + def test_generate_scix_id_user_fields(self): + test_bib_data = { + "id": 1, + "author": ["Lias, Alberta", "Smith, J."], + "title": "Test", + "abs": ["words"], + } + user_fields = ["id"] + scix_id = scixid.generate_scix_id(test_bib_data, user_fields=user_fields) + scix_id_2 = scixid.generate_scix_id(test_bib_data) + self.assertEqual(scix_id, "1NMC-KCFG-RVH8") + self.assertNotEqual(scix_id, scix_id_2) + def test_generate_scix_id_special_characters_true(self): test_bib_data = { "id": 1,