Skip to content
29 changes: 26 additions & 3 deletions SciXPipelineUtils/scix_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def scix_id_from_hash(hash, checksum=True, split=4, string_length=12):
return encode(rand_int)


def generate_bib_data_hash(hash_data):
def generate_bib_data_hash(hash_data, strip_characters=True):
# Return the MD5 hex digest of a bibliographic record after dropping the
# fields named in ``unique_fields`` and, optionally, normalising the first
# abstract entry.
#
# Parameters:
#   hash_data: mapping of record fields (accessed with ``.pop``/``.get``).
#       NOTE: it is modified in place -- the listed fields are popped and
#       ``hash_data["abs"][0]`` may be overwritten -- so the caller's object
#       is changed by this call.
#   strip_characters: when true and ``hash_data["abs"]`` is truthy, tag-like
#       ``<...>`` spans and then every run of non-word characters are removed
#       from ``abs[0]`` before hashing.
#
# Returns:
#   The hexadecimal MD5 digest (str) of the UTF-8 encoded JSON dump of the
#   remaining data.
#
# Fields removed before hashing, so they do not influence the digest.
unique_fields = [
Comment thread
tjacovich marked this conversation as resolved.
"id",
"aff",
Expand All @@ -193,26 +193,49 @@ def generate_bib_data_hash(hash_data):
"first_author",
"first_author_norm",
"identifier",
"orcid_pub",
"links_data",
"alternate_bibcode",
"doctype",
"doctype_facet_hier",
"entry_date",
"keyword_norm",
"keyword_facet",
"citation",
"citation_count",
"citation_count_norm",
"read_count",
"date",
"copyright",
]
# Best-effort removal: a field that is absent (or any other failure of
# ``pop``) is ignored and the loop moves on to the next field.
for field in unique_fields:
try:
hash_data.pop(field)
except Exception:
continue

# Only the first element of "abs" is normalised; other elements are hashed
# unchanged.
# NOTE(review): the item assignment requires "abs" to be a mutable sequence
# such as a list; a plain string value would raise TypeError here -- confirm
# that all callers pass a list.
if strip_characters and hash_data.get("abs"):
hash_data["abs"][0] = re.sub("<[^<]+?>", "", hash_data.get("abs")[0])
hash_data["abs"][0] = re.sub(r"\W+", "", hash_data.get("abs")[0])
Comment thread
tjacovich marked this conversation as resolved.
# NOTE(review): json.dumps is called without sort_keys, so the digest
# presumably depends on the key order of hash_data -- verify this is intended.
encoded_hash_data = json.dumps(hash_data).encode("utf-8")
return hashlib.md5(encoded_hash_data).hexdigest()


def generate_scix_id(
hash_data, hash_data_type="bib_data", checksum=True, split=4, string_length=12
hash_data,
hash_data_type="bib_data",
checksum=True,
split=4,
string_length=12,
strip_characters=True,
):
if hash_data_type == "bib_data":
if type(hash_data) != dict:
try:
hash_data = json.loads(hash_data)
except ValueError as e:
raise e
hashed_data = generate_bib_data_hash(hash_data)
hashed_data = generate_bib_data_hash(hash_data, strip_characters=strip_characters)
elif hash_data_type == "other":
encoded_hash_data = str(hash_data).encode("utf-8")
hashed_data = hashlib.md5(encoded_hash_data).hexdigest()
Expand Down
50 changes: 37 additions & 13 deletions tests/test_scix_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,18 @@ def test_generate_bib_data_hash(self):
"id": 1,
"author": ["Lias, Alberta", "Smith, J."],
"title": "Test",
"abs": "words",
"abs": ["words"],
"bibcode": "Test",
}
hash = scixid.generate_bib_data_hash(test_bib_data)
self.assertEqual(hash, "ca77650a961fe043bf18e60618f43b49")
self.assertEqual(hash, "703f2f82ef742c10101840e4fc85bc53")

test_bib_data = {
"title": "Test",
"abs": "words",
"abs": ["words"],
}
hash2 = scixid.generate_bib_data_hash(test_bib_data)
self.assertEqual(hash2, "ca77650a961fe043bf18e60618f43b49")
self.assertEqual(hash2, "703f2f82ef742c10101840e4fc85bc53")

self.assertEqual(hash, hash2)

Expand All @@ -42,42 +42,66 @@ def test_get_rand_from_hash(self):
"id": 1,
"author": ["Lias, Alberta", "Smith, J."],
"title": "Test",
"abs": "words",
"abs": ["words"],
}
hash = scixid.generate_bib_data_hash(test_bib_data)
rand_num = scixid.get_rand_from_hash(hash)
self.assertEqual(rand_num, 12446194448305896)
self.assertEqual(rand_num, 8784826954018605)

def test_scix_id_from_hash(self):
    """Check that the hash of a bib record encodes to the expected SciX ID."""
    # "abs" is a one-element list because the hashing helper normalises
    # abs[0] in place.
    test_bib_data = {
        "id": 1,
        "author": ["Lias, Alberta", "Smith, J."],
        "title": "Test",
        "abs": ["words"],
    }
    bib_hash = scixid.generate_bib_data_hash(test_bib_data)
    scix_id = scixid.scix_id_from_hash(bib_hash)
    self.assertEqual(scix_id, "7SNR-3N03-VSD6")

def test_generate_scix_id(self):
    """Check that a dict and its JSON-string form produce the same SciX ID."""
    test_bib_data = {
        "id": 1,
        "author": ["Lias, Alberta", "Smith, J."],
        "title": "Test",
        "abs": ["words"],
    }
    # Serialise before the first call: the hashing helper pops fields from
    # the dict it receives, so dumping afterwards would only exercise the
    # JSON path on an already-reduced record.
    json_payload = json.dumps(test_bib_data)

    scix_id = scixid.generate_scix_id(test_bib_data)
    scix_id_2 = scixid.generate_scix_id(json_payload)

    self.assertEqual(scix_id, "7SNR-3N03-VSD6")
    self.assertEqual(scix_id, scix_id_2)

def test_generate_scix_id_special_characters_true(self):
    """Check that markup and punctuation in "abs" are ignored by default.

    With character stripping enabled (the default), an abstract containing
    tag-like text and non-word characters must yield the same SciX ID as the
    plain abstract "words", for both dict and JSON-string input.
    """
    test_bib_data = {
        "id": 1,
        "author": ["Lias, Alberta", "Smith, J."],
        "title": "Test",
        "abs": ["words < <lt\\>"],
    }
    # Serialise before the first call: the hashing helper rewrites abs[0] in
    # place, so dumping afterwards would feed the JSON path an abstract that
    # has already been stripped.
    json_payload = json.dumps(test_bib_data)

    scix_id = scixid.generate_scix_id(test_bib_data)
    scix_id_2 = scixid.generate_scix_id(json_payload)

    self.assertEqual(scix_id, "7SNR-3N03-VSD6")
    self.assertEqual(scix_id, scix_id_2)

def test_generate_scix_id_special_characters_false(self):
    """Check that strip_characters=False keeps special characters in the hash.

    The ID computed without stripping must match its pinned value and must
    differ from the ID computed with the default stripping.
    """

    def make_bib_data():
        # A fresh record per call: the hashing helper mutates its input, so
        # sharing one dict would make the outcome depend on call order.
        return {
            "id": 1,
            "author": ["Lias, Alberta", "Smith, J."],
            "title": "Test",
            "abs": ["words < <lt\\>"],
        }

    scix_id = scixid.generate_scix_id(make_bib_data(), strip_characters=False)
    scix_id_2 = scixid.generate_scix_id(make_bib_data())

    self.assertEqual(scix_id, "APGB-1BCS-SAG1")
    self.assertNotEqual(scix_id, scix_id_2)

def test_generate_scix_id_other(self):
    """Check that hash_data_type="other" hashes the input string as given.

    The resulting ID must differ from the "bib_data" ID of the same record
    and must match its own pinned value.
    """
    test_bib_data = {
        "id": 1,
        "author": ["Lias, Alberta", "Smith, J."],
        "title": "Test",
        "abs": ["words"],
    }
    scix_id = scixid.generate_scix_id(json.dumps(test_bib_data), hash_data_type="other")
    self.assertNotEqual(scix_id, "7SNR-3N03-VSD6")
    self.assertEqual(scix_id, "6N22-EN04-7GHF")