Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions adsrefpipe/refparsers/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from adsrefpipe.refparsers.NatureXML import NATUREtoREFs
from adsrefpipe.refparsers.NLM3xml import NLMtoREFs
from adsrefpipe.refparsers.ONCPxml import ONCPtoREFs
from adsrefpipe.refparsers.OUPFTxml import OUPFTtoREFs
from adsrefpipe.refparsers.OUPxml import OUPtoREFs
from adsrefpipe.refparsers.PASAxml import PASAtoREFs
from adsrefpipe.refparsers.RSCxml import RSCtoREFs
Expand Down Expand Up @@ -77,6 +78,7 @@
'CUP': CUPtoREFs,
'EDP': EDPtoREFs,
'EGU': EGUtoREFs,
'EGUE2': EGUtoREFs,
'ELSEVIER': ELSEVIERtoREFs,
'ELSEVIERE2': ELSEVIERtoREFs, # with multiple extensions
'ICARUS': ICARUStoREFs,
Expand All @@ -86,6 +88,7 @@
'IOPFT': IOPFTtoREFs,
'IPAP': IPAPtoREFs,
'JATS': JATStoREFs,
'JATSE2': JATStoREFs,
'JLVEnHTML': JLVEnHTMLtoREFs,
'JSTAGE': JSTAGEtoREFs,
'LivingReviews': LivingReviewsToREFs,
Expand All @@ -97,6 +100,7 @@
'ObsOCR': ObsOCRtoREFs,
'ONCP': ONCPtoREFs,
'OUP': OUPtoREFs,
'OUPFT': OUPFTtoREFs,
'PairsTXT': PairsTXTtoREFs,
'PairsTXTE2': PairsTXTtoREFs,
'PairsTXTE3': PairsTXTtoREFs,
Expand Down
21 changes: 13 additions & 8 deletions adsrefpipe/refparsers/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,8 @@ def __sub_numasc_entity(self, match: re.Match) -> str:
try:
return unicodedata.normalize('NFKD', chr(entno))
except OverflowError:
raise UnicodeHandlerError('Unknown numeric entity: %s' % match.group(0))
logger.error(UnicodeHandlerError('Unknown numeric entity: %s, replacing by ""' % match.group(0)))
return ""

def __sub_hexnumasc_entity(self, match: re.Match) -> str:
"""
Expand All @@ -244,8 +245,11 @@ def __sub_hexnumasc_entity(self, match: re.Match) -> str:
elif entno < 255:
return self.u2asc(chr(entno))
except IndexError:
logger.error(UnicodeHandlerError('Unknown hexadecimal entity: %s' % match.group(0)))
return ""
try:
return unicodedata.normalize('NFKD', chr(entno))
except (OverflowError, ValueError):
logger.error(UnicodeHandlerError('Unknown hexadecimal entity: %s, replacing by ""' % match.group(0)))
return ""

def __sub_hexnum_toent(self, match: re.Match) -> str:
"""
Expand All @@ -262,7 +266,8 @@ def __sub_hexnum_toent(self, match: re.Match) -> str:
if self.unicode[entno]:
return '&%s;' % self.unicode[entno].entity
else:
raise UnicodeHandlerError('Unknown hexadecimal entity: %s' % entno)
logger.error(UnicodeHandlerError('Unknown hexadecimal entity: %s, replacing by ""' % entno))
return ""

def __sub_asc_entity(self, match: re.Match) -> str:
"""
Expand All @@ -276,8 +281,8 @@ def __sub_asc_entity(self, match: re.Match) -> str:
ret = self[ent].ascii
return ret
else:
logger.error(UnicodeHandlerError('Unknown named entity: %s, replacing by WHITE SQUARE' % match.group(0)))
return self.unicode[9633].ascii
logger.error(UnicodeHandlerError('Unknown named entity: %s, replacing by ""' % match.group(0)))
return ""

def __toascii(self, char: str) -> str:
"""
Expand All @@ -294,8 +299,8 @@ def __toascii(self, char: str) -> str:
if self.unicode[ascii_value]:
return self.unicode[ascii_value].ascii
else:
logger.error(UnicodeHandlerError('Unknown character code: %d, replacing by WHITE SQUARE' % ascii_value))
return self.unicode[9633].ascii
logger.error(UnicodeHandlerError('Unknown character code: %d, replacing by ""' % ascii_value))
return ""

def __toentity(self, char: str) -> str:
"""
Expand Down
5 changes: 5 additions & 0 deletions adsrefpipe/tests/unittests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
from adsrefpipe.refparsers.WileyXML import WILEYtoREFs
from adsrefpipe.refparsers.NLM3xml import NLMtoREFs
from adsrefpipe.refparsers.AGUxml import AGUtoREFs, AGUreference
from adsrefpipe.refparsers.EGUxml import EGUtoREFs
from adsrefpipe.refparsers.OUPFTxml import OUPFTtoREFs
from adsrefpipe.refparsers.arXivTXT import ARXIVtoREFs
from adsrefpipe.refparsers.handler import verify
from adsrefpipe.tests.unittests.stubdata.dbdata import actions_records, parsers_records
Expand Down Expand Up @@ -390,6 +392,7 @@ def test_parser_name(self):
'CrossRef': ['/PLoSO/0007/10.1371_journal.pone.0048146.xref.xml', CrossRefToREFs],
'ELSEVIER': ['/AtmEn/0230/iss.elsevier.xml', ELSEVIERtoREFs],
'JATS': ['/NatSR/0009/iss36.jats.xml', JATStoREFs],
'JATSE2': ['/IEEE/0001/iss1.ieee.xml', JATStoREFs],
'IOP': ['/JPhCS/1085/iss4.iop.xml', IOPtoREFs],
'SPRINGER': ['/JHEP/2019/iss06.springer.xml', SPRINGERtoREFs],
'APS': ['/PhRvB/0081/2010PhRvB..81r4520P.ref.xml', APStoREFs],
Expand All @@ -398,6 +401,8 @@ def test_parser_name(self):
'WILEY': ['/JGR/0101/issD14.wiley2.xml', WILEYtoREFs],
'NLM': ['/PNAS/0109/iss17.nlm3.xml', NLMtoREFs],
'AGU': ['/JGR/0101/issD14.agu.xml', AGUtoREFs],
'EGUE2': ['/EGUSp/0001/iss1.copernicus.xml', EGUtoREFs],
'OUPFT': ['/MNRAS/0001/iss1.oupft.xml', OUPFTtoREFs],
'arXiv': ['/arXiv/2011/00324.raw', ARXIVtoREFs],
}

Expand Down
28 changes: 15 additions & 13 deletions adsrefpipe/tests/unittests/test_ref_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,13 +273,11 @@ def test_sub_numasc_entity_exception(self):
with patch("unicodedata.normalize", return_value="normalized_value"):
self.assertEqual(handler._UnicodeHandler__sub_numasc_entity(match), "normalized_value")

# test OverflowError handling (raises UnicodeHandlerError)
# test OverflowError handling (logs and replaces with empty string)
match = re.match(r'&#(?P<number>\d+);', "&#9999999999;")
if match:
with patch("unicodedata.normalize", side_effect=OverflowError):
with self.assertRaises(UnicodeHandlerError) as context:
handler._UnicodeHandler__sub_numasc_entity(match)
self.assertEqual(str(context.exception), "Unknown numeric entity: &#9999999999;")
self.assertEqual(handler._UnicodeHandler__sub_numasc_entity(match), "")

def test_sub_hexnumasc_entity(self):
""" test __sub_hexnumasc_entity method """
Expand All @@ -304,11 +302,18 @@ def test_sub_hexnumasc_entity(self):
handler.unicode = MagicMock()
handler.unicode.__getitem__.side_effect = IndexError

# large invalid hex value to trigger returning an empty string ""
match = re.match(r'&#x(?P<hexnum>[0-9A-Fa-f]+);', "&#x99999;")
# supplementary-plane hex value should normalize instead of being dropped
match = re.match(r'&#x(?P<hexnum>[0-9A-Fa-f]+);', "&#x1D463;")
if match:
result = handler._UnicodeHandler__sub_hexnumasc_entity(match)
self.assertEqual(result, "")
with patch("unicodedata.normalize", return_value="v") as mock_normalize:
result = handler._UnicodeHandler__sub_hexnumasc_entity(match)
self.assertEqual(result, "v")
mock_normalize.assert_called_once_with("NFKD", "𝑣")

# oversized hex value should log and replace with empty string
match = re.match(r'&#x(?P<hexnum>[0-9A-Fa-f]+);', "&#x110000;")
if match:
self.assertEqual(handler._UnicodeHandler__sub_hexnumasc_entity(match), "")

def test_sub_hexnum_toent(self):
""" test __sub_hexnum_toent method """
Expand All @@ -330,15 +335,12 @@ def test_sub_hexnum_toent(self):
if match:
self.assertEqual(handler._UnicodeHandler__sub_hexnum_toent(match), "&pound;")

# test UnicodeHandlerError for unknown entity by ensuring index is in range but has no entity
# test unknown entity handling by ensuring index is in range but has no entity
handler.unicode = MagicMock()
handler.unicode.__getitem__.return_value = None
match = re.match(r'&#x(?P<number>[0-9A-Fa-f]+);', "&#x99999;")
if match:
with self.assertRaises(UnicodeHandlerError) as context:
handler._UnicodeHandler__sub_hexnum_toent(match)
# ensure the exception message is correct
self.assertEqual(str(context.exception), "Unknown hexadecimal entity: 629145")
self.assertEqual(handler._UnicodeHandler__sub_hexnum_toent(match), "")

def test_toentity(self):
""" test __toentity method """
Expand Down
15 changes: 13 additions & 2 deletions adsrefpipe/tests/unittests/test_ref_parsers_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
if project_home not in sys.path:
sys.path.insert(0, project_home)

import copy
import unittest
from unittest.mock import patch, MagicMock, mock_open
import xml.dom.minidom as dom
Expand Down Expand Up @@ -1919,9 +1920,19 @@ class TestWileytoREFs(unittest.TestCase):

def test_init(self):
    """ test init """
    def _normalize_unicode_placeholders(value):
        """
        recursively remove '&square;' placeholder entities from a nested structure

        strings have every '&square;' occurrence removed (doubled placeholders first),
        lists and dicts are rebuilt with their items normalized, and any other
        object is returned unchanged

        :param value: str, list, dict, or any other object
        :return: normalized copy of value
        """
        if isinstance(value, str):
            return value.replace('&square;&square;', '').replace('&square;', '')
        if isinstance(value, list):
            return [_normalize_unicode_placeholders(item) for item in value]
        if isinstance(value, dict):
            return {key: _normalize_unicode_placeholders(item) for key, item in value.items()}
        return value

    reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.wiley2.xml')
    references = WILEYtoREFs(filename=reference_source, buffer=None).process_and_dispatch()

    # compare against a normalized deep copy so the shared stub data is never mutated;
    # NOTE(review): presumably the stored expectations still carry '&square;' placeholders
    # that the parser no longer emits -- confirm against parsed_references.parsed_wiley
    expected = _normalize_unicode_placeholders(copy.deepcopy(parsed_references.parsed_wiley))
    self.assertEqual(references, expected)

def test_process_and_dispatch_exception(self):
""" test exception in process_and_dispatch """
Expand All @@ -1944,4 +1955,4 @@ def test_process_and_dispatch_exception(self):


if __name__ == '__main__':
    # run the test suite when this module is executed directly; a single call is
    # enough because unittest.main() exits the interpreter once the run finishes
    unittest.main()
Loading