Skip to content

Commit de51e2f

Browse files
committed
RefExtract: Update eprint extraction in engine and regex
Signed-off-by: Melissa Clegg <[email protected]>
1 parent 889cb59 commit de51e2f

File tree

4 files changed

+77
-13
lines changed

4 files changed

+77
-13
lines changed

refextract/references/engine.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -586,11 +586,13 @@ def add_recid_elements(splitted_citations):
586586

587587

588588
def arxiv_urls_to_report_numbers(citation_elements):
589-
arxiv_url_prefix = 'http://arxiv.org/abs/'
589+
arxiv_url_prefix = re.compile('^https?:\/\/(?:(?:cn\.|de\.|in\.|lanl\.)?arxiv\.org|xxx\.lanl\.gov)\/(?:abs|pdf)\/(\S+\d{4})(?:v\d)?(?:\.pdf)?', re.UNICODE | re.IGNORECASE)
590590
for el in citation_elements:
591-
if el['type'] == 'URL' and el['url_string'].startswith(arxiv_url_prefix):
592-
el['type'] = 'REPORTNUMBER'
593-
el['report_num'] = el['url_string'].replace(arxiv_url_prefix, 'arXiv:')
591+
if el['type'] == 'URL' and el['url_string']:
592+
matchobj = arxiv_url_prefix.match(el['url_string'])
593+
if matchobj:
594+
el['type'] = 'REPORTNUMBER'
595+
el['report_num'] = matchobj.group(1)
594596

595597

596598
def look_for_hdl(citation_elements):

refextract/references/regexs.py

Lines changed: 15 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -72,20 +72,26 @@ def compute_pos_patterns(patterns):
7272

7373
# Pattern for arxiv numbers
7474
# arxiv 9910-1234v9 [physics.ins-det]
75-
re_arxiv = re.compile(ur"""
76-
ARXIV[\s:-]*(?P<year>\d{2})-?(?P<month>\d{2})
75+
re_arxiv = re.compile(ur"""(?:(?:https?://(?:www\.)?arxiv\.org/(?:abs|pdf)/)|
76+
(?:https?://(?:xxx\.)?lanl\.gov/(?:abs|pdf)/)|
77+
(?:ARXIV[\s:-]*))(?P<year>\d{2})-?(?P<month>\d{2})
7778
[\s.-]*(?P<num>\d{4})(?!\d)(?:[\s-]*V(?P<version>\d))?
78-
\s*(?P<suffix>\[[A-Z.-]+\])? """, re.VERBOSE | re.UNICODE | re.IGNORECASE)
79+
\s*(?P<suffix>\[[A-Z.-]+\])?
80+
(?:\.pdf)? """, re.VERBOSE | re.UNICODE | re.IGNORECASE)
7981

80-
re_arxiv_5digits = re.compile(ur"""
81-
ARXIV[\s:-]*(?P<year>(1[3-9]|[2-8][0-9]))-?(?P<month>(0[1-9]|1[0-2]))
82-
[\s.-]*(?P<num>\d{5})(?!\d)(?:[\s-]*V(?P<version>\d))?
83-
\s*(?P<suffix>\[[A-Z.-]+\])? """, re.VERBOSE | re.UNICODE | re.IGNORECASE)
82+
re_arxiv_5digits = re.compile(ur"""(?:(?:https?://(?:www\.)?arxiv\.org/(?:abs|pdf)/)|
83+
(?:https?://(?:xxx\.)?lanl\.gov/(?:abs|pdf)/)|
84+
(?:ARXIV[\s:-]*))(?P<year>(1[3-9]|[2-8][0-9]))-?(?P<month>(0[1-9]|1[0-2]))
85+
[\s.-]*(?P<num>\d{5})(?!\d)
86+
(?:[\s-]*V(?P<version>\d))?
87+
\s*(?P<suffix>\[[A-Z.-]+\])?
88+
(?:\.pdf)? """, re.VERBOSE | re.UNICODE | re.IGNORECASE)
8489

8590
# Pattern for arxiv numbers catchup
8691
# arxiv:9910-123 [physics.ins-det]
87-
RE_ARXIV_CATCHUP = re.compile(ur"""
88-
ARXIV[\s:-]*(?P<year>\d{2})-?(?P<month>\d{2})
92+
RE_ARXIV_CATCHUP = re.compile(ur"""(?:(?:https?://(?:www\.)?arxiv\.org/(?:abs|pdf)/)|
93+
(?:https?://(?:xxx\.)?lanl\.gov/(?:abs|pdf)/)|
94+
(?:ARXIV[\s:-]*))(?P<year>\d{2})-?(?P<month>\d{2})
8995
[\s.-]*(?P<num>\d{3})
9096
\s*\[(?P<suffix>[A-Z.-]+)\]""", re.VERBOSE | re.UNICODE | re.IGNORECASE)
9197

tests/test_engine.py

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -266,6 +266,38 @@ def test_doi_subdivisions():
266266
assert references[0]['linemarker'] == [u'10']
267267

268268

269+
def test_old_arxiv():
270+
ref_line = u'[20] B. Moore, T. R. Quinn, F. Governato, J. Stadel, and G. Lake, "Cold collapse and the corecatastrophe," Mon. Not. Roy. Astron. Soc.310(1999) 1147–1152, arXiv:astro-ph/9903164 [astro-ph].'
271+
res = get_references(ref_line)
272+
references = res[0]
273+
assert references[0]['reportnumber'] == [u'astro-ph/9903164']
274+
assert references[0]['linemarker'] == [u'20']
275+
276+
277+
def test_old_lanl_url_version():
278+
ref_line = u'[44] Navarro, J.F., Frenk, C.S., White, S.D.M. http://xxx.lanl.gov/pdf/astro-ph/9508025v1'
279+
res = get_references(ref_line)
280+
references = res[0]
281+
assert references[0]['reportnumber'] == [u'astro-ph/9508025']
282+
assert references[0]['linemarker'] == [u'44']
283+
284+
285+
def test_old_arxiv_url():
286+
ref_line = u'[298] V. Allori, D. Duerr, S. Goldstein, and N. Zanghi. 2002. Seven steps towards the classical world. Journal of Optics B : Quantum and semiclassical Optics, Volume 4, number 4. https://arxiv.org/abs/quant-ph/0112005'
287+
res = get_references(ref_line)
288+
references = res[0]
289+
assert references[0]['reportnumber'] == [u'quant-ph/0112005']
290+
assert references[0]['linemarker'] == [u'298']
291+
292+
293+
def test_old_arxiv_mirror_url():
294+
ref_line = u'[13] A. Zupanc, et al, Belle Collaboration, https://cn.arxiv.org/abs/hep-ex/0703040 2007'
295+
res = get_references(ref_line)
296+
references = res[0]
297+
assert references[0]['reportnumber'] == [u'hep-ex/0703040']
298+
assert references[0]['linemarker'] == [u'13']
299+
300+
269301
def test_get_plaintext_document_body(tmpdir):
270302
input = [u"Some text\n", u"on multiple lines\n"]
271303
f = tmpdir.join("plain.txt")

tests/test_tag.py

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -175,3 +175,27 @@ def test_5_digits_suffix_version_new_2012():
175175
ref_line = u"""{any prefix}1210.12345v9 [physics.ins-det]{any postfix}"""
176176
r = tag_arxiv(ref_line)
177177
assert r.strip(': ') == u"{any prefix}1210.12345v9 [physics.ins-det]{any postfix}"
178+
179+
180+
def test_4_digits_new_url():
181+
ref_line = u"""{any prefix}https://arxiv.org/abs/1311.2198{any postfix}"""
182+
r = tag_arxiv(ref_line)
183+
assert r.strip(': ') == u"{any prefix}<cds.REPORTNUMBER>arXiv:1311.2198</cds.REPORTNUMBER>{any postfix}"
184+
185+
186+
def test_5_digits_new_url():
187+
ref_line = u"""{any prefix}https://arxiv.org/abs/1602.03988{any postfix}"""
188+
r = tag_arxiv(ref_line)
189+
assert r.strip(': ') == u"{any prefix}<cds.REPORTNUMBER>arXiv:1602.03988</cds.REPORTNUMBER>{any postfix}"
190+
191+
192+
def test_4_digits_version_new_url():
193+
ref_line = u"""{any prefix}https://arxiv.org/abs/0708.0882v1{any postfix}"""
194+
r = tag_arxiv(ref_line)
195+
assert r.strip(': ') == u"{any prefix}<cds.REPORTNUMBER>arXiv:0708.0882</cds.REPORTNUMBER>{any postfix}"
196+
197+
198+
def test_5_digits_new_pdf_url():
199+
ref_line = u"""{any prefix}https://arxiv.org/pdf/1712.03976.pdf{any postfix}"""
200+
r = tag_arxiv(ref_line)
201+
assert r.strip(': ') == u"{any prefix}<cds.REPORTNUMBER>arXiv:1712.03976</cds.REPORTNUMBER>{any postfix}"

0 commit comments

Comments
 (0)