diff --git a/adsrefpipe/refparsers/handler.py b/adsrefpipe/refparsers/handler.py index 25075b5..28fa716 100644 --- a/adsrefpipe/refparsers/handler.py +++ b/adsrefpipe/refparsers/handler.py @@ -22,6 +22,7 @@ from adsrefpipe.refparsers.NatureXML import NATUREtoREFs from adsrefpipe.refparsers.NLM3xml import NLMtoREFs from adsrefpipe.refparsers.ONCPxml import ONCPtoREFs +from adsrefpipe.refparsers.OUPFTxml import OUPFTtoREFs from adsrefpipe.refparsers.OUPxml import OUPtoREFs from adsrefpipe.refparsers.PASAxml import PASAtoREFs from adsrefpipe.refparsers.RSCxml import RSCtoREFs @@ -77,6 +78,7 @@ 'CUP': CUPtoREFs, 'EDP': EDPtoREFs, 'EGU': EGUtoREFs, + 'EGUE2': EGUtoREFs, 'ELSEVIER': ELSEVIERtoREFs, 'ELSEVIERE2': ELSEVIERtoREFs, # with multiple extensions 'ICARUS': ICARUStoREFs, @@ -86,6 +88,7 @@ 'IOPFT': IOPFTtoREFs, 'IPAP': IPAPtoREFs, 'JATS': JATStoREFs, + 'JATSE2': JATStoREFs, 'JLVEnHTML': JLVEnHTMLtoREFs, 'JSTAGE': JSTAGEtoREFs, 'LivingReviews': LivingReviewsToREFs, @@ -97,6 +100,7 @@ 'ObsOCR': ObsOCRtoREFs, 'ONCP': ONCPtoREFs, 'OUP': OUPtoREFs, + 'OUPFT': OUPFTtoREFs, 'PairsTXT': PairsTXTtoREFs, 'PairsTXTE2': PairsTXTtoREFs, 'PairsTXTE3': PairsTXTtoREFs, diff --git a/adsrefpipe/refparsers/unicode.py b/adsrefpipe/refparsers/unicode.py index 39453d2..c8169ac 100755 --- a/adsrefpipe/refparsers/unicode.py +++ b/adsrefpipe/refparsers/unicode.py @@ -228,7 +228,8 @@ def __sub_numasc_entity(self, match: re.Match) -> str: try: return unicodedata.normalize('NFKD', chr(entno)) except OverflowError: - raise UnicodeHandlerError('Unknown numeric entity: %s' % match.group(0)) + logger.error(UnicodeHandlerError('Unknown numeric entity: %s, replacing by ""' % match.group(0))) + return "" def __sub_hexnumasc_entity(self, match: re.Match) -> str: """ @@ -244,8 +245,11 @@ def __sub_hexnumasc_entity(self, match: re.Match) -> str: elif entno < 255: return self.u2asc(chr(entno)) except IndexError: - logger.error(UnicodeHandlerError('Unknown hexadecimal entity: %s' % match.group(0))) - return "" + try: + return unicodedata.normalize('NFKD', chr(entno)) + except (OverflowError, ValueError): + logger.error(UnicodeHandlerError('Unknown hexadecimal entity: %s, replacing by ""' % match.group(0))) + return "" def __sub_hexnum_toent(self, match: re.Match) -> str: """ @@ -262,7 +266,8 @@ def __sub_hexnum_toent(self, match: re.Match) -> str: if self.unicode[entno]: return '&%s;' % self.unicode[entno].entity else: - raise UnicodeHandlerError('Unknown hexadecimal entity: %s' % entno) + logger.error(UnicodeHandlerError('Unknown hexadecimal entity: %s, replacing by ""' % entno)) + return "" def __sub_asc_entity(self, match: re.Match) -> str: """ @@ -276,8 +281,8 @@ def __sub_asc_entity(self, match: re.Match) -> str: ret = self[ent].ascii return ret else: - logger.error(UnicodeHandlerError('Unknown named entity: %s, replacing by WHITE SQUARE' % match.group(0))) - return self.unicode[9633].ascii + logger.error(UnicodeHandlerError('Unknown named entity: %s, replacing by ""' % match.group(0))) + return "" def __toascii(self, char: str) -> str: """ @@ -294,8 +299,8 @@ def __toascii(self, char: str) -> str: if self.unicode[ascii_value]: return self.unicode[ascii_value].ascii else: - logger.error(UnicodeHandlerError('Unknown character code: %d, replacing by WHITE SQUARE' % ascii_value)) - return self.unicode[9633].ascii + logger.error(UnicodeHandlerError('Unknown character code: %d, replacing by ""' % ascii_value)) + return "" def __toentity(self, char: str) -> str: """ diff --git a/adsrefpipe/tests/unittests/test_app.py b/adsrefpipe/tests/unittests/test_app.py index 5cc678e..e3d72bf 100644 --- a/adsrefpipe/tests/unittests/test_app.py +++ b/adsrefpipe/tests/unittests/test_app.py @@ -26,6 +26,8 @@ from adsrefpipe.refparsers.WileyXML import WILEYtoREFs from adsrefpipe.refparsers.NLM3xml import NLMtoREFs from adsrefpipe.refparsers.AGUxml import AGUtoREFs, AGUreference +from adsrefpipe.refparsers.EGUxml import EGUtoREFs +from adsrefpipe.refparsers.OUPFTxml import OUPFTtoREFs from adsrefpipe.refparsers.arXivTXT import ARXIVtoREFs from adsrefpipe.refparsers.handler import verify from adsrefpipe.tests.unittests.stubdata.dbdata import actions_records, parsers_records @@ -390,6 +392,7 @@ def test_parser_name(self): 'CrossRef': ['/PLoSO/0007/10.1371_journal.pone.0048146.xref.xml', CrossRefToREFs], 'ELSEVIER': ['/AtmEn/0230/iss.elsevier.xml', ELSEVIERtoREFs], 'JATS': ['/NatSR/0009/iss36.jats.xml', JATStoREFs], + 'JATSE2': ['/IEEE/0001/iss1.ieee.xml', JATStoREFs], 'IOP': ['/JPhCS/1085/iss4.iop.xml', IOPtoREFs], 'SPRINGER': ['/JHEP/2019/iss06.springer.xml', SPRINGERtoREFs], 'APS': ['/PhRvB/0081/2010PhRvB..81r4520P.ref.xml', APStoREFs], @@ -398,6 +401,8 @@ def test_parser_name(self): 'WILEY': ['/JGR/0101/issD14.wiley2.xml', WILEYtoREFs], 'NLM': ['/PNAS/0109/iss17.nlm3.xml', NLMtoREFs], 'AGU': ['/JGR/0101/issD14.agu.xml', AGUtoREFs], + 'EGUE2': ['/EGUSp/0001/iss1.copernicus.xml', EGUtoREFs], + 'OUPFT': ['/MNRAS/0001/iss1.oupft.xml', OUPFTtoREFs], 'arXiv': ['/arXiv/2011/00324.raw', ARXIVtoREFs], } diff --git a/adsrefpipe/tests/unittests/test_ref_parsers.py b/adsrefpipe/tests/unittests/test_ref_parsers.py index d0a3434..f98729f 100755 --- a/adsrefpipe/tests/unittests/test_ref_parsers.py +++ b/adsrefpipe/tests/unittests/test_ref_parsers.py @@ -273,13 +273,11 @@ def test_sub_numasc_entity_exception(self): with patch("unicodedata.normalize", return_value="normalized_value"): self.assertEqual(handler._UnicodeHandler__sub_numasc_entity(match), "normalized_value") - # test OverflowError handling (raises UnicodeHandlerError) + # test OverflowError handling (logs and replaces with empty string) match = re.match(r'&#(?P\d+);', "�") if match: with patch("unicodedata.normalize", side_effect=OverflowError): - with self.assertRaises(UnicodeHandlerError) as context: - handler._UnicodeHandler__sub_numasc_entity(match) - self.assertEqual(str(context.exception), "Unknown numeric entity: �") + self.assertEqual(handler._UnicodeHandler__sub_numasc_entity(match), "") def test_sub_hexnumasc_entity(self): """ test __sub_hexnumasc_entity method """ @@ -304,11 +302,18 @@ def test_sub_hexnumasc_entity(self): handler.unicode = MagicMock() handler.unicode.__getitem__.side_effect = IndexError - # large invalid hex value to trigger returning and empty string "" - match = re.match(r'&#x(?P[0-9A-Fa-f]+);', "򙦙") + # supplementary-plane hex value should normalize instead of being dropped + match = re.match(r'&#x(?P[0-9A-Fa-f]+);', "𝑣") if match: - result = handler._UnicodeHandler__sub_hexnumasc_entity(match) - self.assertEqual(result, "") + with patch("unicodedata.normalize", return_value="v") as mock_normalize: + result = handler._UnicodeHandler__sub_hexnumasc_entity(match) + self.assertEqual(result, "v") + mock_normalize.assert_called_once_with("NFKD", "𝑣") + + # oversized hex value should log and replace with empty string + match = re.match(r'&#x(?P[0-9A-Fa-f]+);', "�") + if match: + self.assertEqual(handler._UnicodeHandler__sub_hexnumasc_entity(match), "") def test_sub_hexnum_toent(self): """ test __sub_hexnum_toent method """ @@ -330,15 +335,12 @@ def test_sub_hexnum_toent(self): if match: self.assertEqual(handler._UnicodeHandler__sub_hexnum_toent(match), "£") - # test UnicodeHandlerError for unknown entity by ensuring index is in range but has no entity + # test unknown entity handling by ensuring index is in range but has no entity handler.unicode = MagicMock() handler.unicode.__getitem__.return_value = None match = re.match(r'&#x(?P[0-9A-Fa-f]+);', "򙦙") if match: - with self.assertRaises(UnicodeHandlerError) as context: - handler._UnicodeHandler__sub_hexnum_toent(match) - # ensure the exception message is correct - self.assertEqual(str(context.exception), "Unknown hexadecimal entity: 629145") + self.assertEqual(handler._UnicodeHandler__sub_hexnum_toent(match), "") def test_toentity(self): """ test __toentity method """ diff --git a/adsrefpipe/tests/unittests/test_ref_parsers_xml.py b/adsrefpipe/tests/unittests/test_ref_parsers_xml.py index ab4c4b2..3346381 100644 --- a/adsrefpipe/tests/unittests/test_ref_parsers_xml.py +++ b/adsrefpipe/tests/unittests/test_ref_parsers_xml.py @@ -3,6 +3,7 @@ if project_home not in sys.path: sys.path.insert(0, project_home) +import copy import unittest from unittest.mock import patch, MagicMock, mock_open import xml.dom.minidom as dom @@ -1919,9 +1920,19 @@ class TestWileytoREFs(unittest.TestCase): def test_init(self): """ test init """ + def _normalize_unicode_placeholders(value): + if isinstance(value, str): + return value.replace('□□', '').replace('□', '') + if isinstance(value, list): + return [_normalize_unicode_placeholders(item) for item in value] + if isinstance(value, dict): + return {key: _normalize_unicode_placeholders(item) for key, item in value.items()} + return value + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.wiley2.xml') references = WILEYtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_wiley) + expected = _normalize_unicode_placeholders(copy.deepcopy(parsed_references.parsed_wiley)) + self.assertEqual(references, expected) def test_process_and_dispatch_exception(self): """ test exception in process_and_dispatch """ @@ -1944,4 +1955,4 @@ def test_process_and_dispatch_exception(self): if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main()