Skip to content

Commit

Permalink
adjust line width
Browse files Browse the repository at this point in the history
  • Loading branch information
PascalEgn committed Aug 6, 2024
1 parent cb36b4f commit 5359d99
Show file tree
Hide file tree
Showing 39 changed files with 1,089 additions and 896 deletions.
22 changes: 16 additions & 6 deletions inspire_dojson/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,12 @@
try:
unichr(0x100000)
RE_INVALID_CHARS_FOR_XML = re.compile(
u'[^\U00000009\U0000000A\U0000000D\U00000020-\U0000D7FF\U0000E000-\U0000FFFD\U00010000-\U0010FFFF]+')
u'[^\U00000009\U0000000A\U0000000D\U00000020-\U0000D7FF\U0000E000-\U0000FFFD\U00010000-\U0010FFFF]+'
)
except ValueError: # pragma: no cover
RE_INVALID_CHARS_FOR_XML = re.compile(
u'[^\U00000009\U0000000A\U0000000D\U00000020-\U0000D7FF\U0000E000-\U0000FFFD]+')
u'[^\U00000009\U0000000A\U0000000D\U00000020-\U0000D7FF\U0000E000-\U0000FFFD]+'
)

RECORD = E.record
CONTROLFIELD = E.controlfield
Expand Down Expand Up @@ -105,7 +107,9 @@ def record2marcxml_etree(record):
elif schema_name == 'authors':
marcjson = hepnames2marc.do(record)
else:
raise NotSupportedError(u'JSON -> MARC rules missing for "{}"'.format(schema_name))
raise NotSupportedError(
u'JSON -> MARC rules missing for "{}"'.format(schema_name)
)

record = RECORD()

Expand All @@ -115,15 +119,19 @@ def record2marcxml_etree(record):
value = force_single_element(values)
if not isinstance(value, text_type):
value = text_type(value)
record.append(CONTROLFIELD(_strip_invalid_chars_for_xml(value), {'tag': tag}))
record.append(
CONTROLFIELD(_strip_invalid_chars_for_xml(value), {'tag': tag})
)
else:
for value in force_list(values):
datafield = DATAFIELD({'tag': tag, 'ind1': ind1, 'ind2': ind2})
for code, els in sorted(iteritems(value)):
for el in force_list(els):
if not isinstance(el, text_type):
el = text_type(el)
datafield.append(SUBFIELD(_strip_invalid_chars_for_xml(el), {'code': code}))
datafield.append(
SUBFIELD(_strip_invalid_chars_for_xml(el), {'code': code})
)
record.append(datafield)

return record
Expand Down Expand Up @@ -153,7 +161,9 @@ def cds_marcxml2record(marcxml):


def _get_collections(marcjson):
collections = chain.from_iterable([force_list(el) for el in force_list(get_value(marcjson, '980__.a'))])
collections = chain.from_iterable(
[force_list(el) for el in force_list(get_value(marcjson, '980__.a'))]
)
normalized_collections = [el.lower() for el in collections]

return normalized_collections
Expand Down
26 changes: 16 additions & 10 deletions inspire_dojson/cds/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,18 +36,24 @@ def add_control_number(record, blob):
if '001' not in blob:
return record

collections = (value.lower() for value in chain(force_list(get_value(blob, '980__.a', default=[])),
force_list(get_value(blob, '980__.c', default=[]))))
collections = (
value.lower()
for value in chain(
force_list(get_value(blob, '980__.a', default=[])),
force_list(get_value(blob, '980__.c', default=[])),
)
)
if 'hidden' in collections:
record.setdefault('595__', []).append({
'9': 'CDS',
'a': u'CDS-{}'.format(blob['001'])
})
record.setdefault('595__', []).append(
{'9': 'CDS', 'a': u'CDS-{}'.format(blob['001'])}
)
else:
record.setdefault('035__', []).append({
'9': 'CDS',
'a': blob['001'],
})
record.setdefault('035__', []).append(
{
'9': 'CDS',
'a': blob['001'],
}
)

return record

Expand Down
113 changes: 78 additions & 35 deletions inspire_dojson/cds/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def escape_url(url):
else:
scheme = ''

url = quote_url(url[len(scheme):])
url = quote_url(url[len(scheme) :])
return scheme + url


Expand All @@ -135,8 +135,19 @@ def persistent_identifiers(self, key, value):
@cds2hep_marc.over('035__', '^035..')
@utils.for_each_value
def external_sytem_identifiers(self, key, value):
ignored = {'cercer', 'inspire', 'xx', 'cern annual report', 'cmscms', 'wai01', 'spires'}
if any(val.lower() in ignored for val in chain(force_list(value.get('9')), force_list(value.get('a')))):
ignored = {
'cercer',
'inspire',
'xx',
'cern annual report',
'cmscms',
'wai01',
'spires',
}
if any(
val.lower() in ignored
for val in chain(force_list(value.get('9')), force_list(value.get('a')))
):
return
if any(val.lower().endswith('cercer') for val in force_list(value.get('a'))):
return
Expand All @@ -151,7 +162,15 @@ def secondary_report_numbers(self, key, value):
Also populates the ``500``, ``595`` and ``980`` MARC field through side effects.
"""
preliminary_results_prefixes = ['ATLAS-CONF-', 'CMS-PAS-', 'CMS-DP-', 'LHCB-CONF-']
note_prefixes = ['ALICE-INT-', 'ATL-', 'ATLAS-CONF-', 'CMS-DP-', 'CMS-PAS-', 'LHCB-CONF-', 'LHCB-PUB-']
note_prefixes = [
'ALICE-INT-',
'ATL-',
'ATLAS-CONF-',
'CMS-DP-',
'CMS-PAS-',
'LHCB-CONF-',
'LHCB-PUB-',
]

result_037 = self.get('037__', [])
result_500 = self.get('500__', [])
Expand All @@ -165,17 +184,21 @@ def secondary_report_numbers(self, key, value):
if any(report.upper().startswith(prefix) for prefix in note_prefixes):
result_980.append({'a': 'NOTE'})

if any(report.upper().startswith(prefix) for prefix in preliminary_results_prefixes):
if any(
report.upper().startswith(prefix) for prefix in preliminary_results_prefixes
):
result_500.append({'9': 'CDS', 'a': 'Preliminary results'})

is_barcode = hidden_report.startswith('P0') or hidden_report.startswith('CM-P0')
if not report.startswith('SIS-') and not is_barcode:
result_037.append({
'9': source,
'a': report,
'c': value.get('c'),
'z': hidden_report if source == 'CDS' else None,
})
result_037.append(
{
'9': source,
'a': report,
'c': value.get('c'),
'z': hidden_report if source == 'CDS' else None,
}
)

self['500__'] = result_500
self['595__'] = result_595
Expand All @@ -196,7 +219,9 @@ def languages(self, key, value):
languages.append({'a': pycountry.languages.get(alpha_3=alpha_3).name})
except KeyError:
with contextlib.suppress(KeyError):
languages.append({'a': pycountry.languages.get(bibliographic=alpha_3).name})
languages.append(
{'a': pycountry.languages.get(bibliographic=alpha_3).name}
)

return languages

Expand Down Expand Up @@ -262,7 +287,9 @@ def nonfirst_authors(self, key, value):
field_700 = self.get('700__', [])
field_701 = self.get('701__', [])

is_supervisor = any(el.lower().startswith('dir') for el in force_list(value.get('e', '')))
is_supervisor = any(
el.lower().startswith('dir') for el in force_list(value.get('e', ''))
)
if is_supervisor:
field_701.append(_converted_author(value))
else:
Expand Down Expand Up @@ -346,7 +373,7 @@ def categories(self, key, value):
result = {
'2': 'INSPIRE',
# XXX: will fail validation and be logged if invalid category
'a': CATEGORIES.get(value.get('a'), value.get('a'))
'a': CATEGORIES.get(value.get('a'), value.get('a')),
}
else:
result = vanilla_dict(value)
Expand Down Expand Up @@ -405,20 +432,28 @@ def urls(self, key, value):
Also populate the ``FFT`` field through side effects.
"""

def _is_preprint(value):
return value.get('y', '').lower() == 'preprint'

def _is_fulltext(value):
return value['u'].endswith('.pdf') and value['u'].startswith('http://cds.cern.ch')
return value['u'].endswith('.pdf') and value['u'].startswith(
'http://cds.cern.ch'
)

def _is_local_copy(value):
return 'local copy' in value.get('y', '')

def _is_ignored_domain(value):
ignored_domains = ['http://cdsweb.cern.ch', 'http://cms.cern.ch',
'http://cmsdoc.cern.ch', 'http://documents.cern.ch',
'http://preprints.cern.ch', 'http://cds.cern.ch',
'http://arxiv.org']
ignored_domains = [
'http://cdsweb.cern.ch',
'http://cms.cern.ch',
'http://cmsdoc.cern.ch',
'http://documents.cern.ch',
'http://preprints.cern.ch',
'http://cds.cern.ch',
'http://arxiv.org',
]
return any(value['u'].startswith(domain) for domain in ignored_domains)

field_8564 = self.get('8564_', [])
Expand All @@ -431,26 +466,34 @@ def _is_ignored_domain(value):

if _is_fulltext(value) and not _is_preprint(value):
if _is_local_copy(value):
description = value.get('y', '').replace('local copy', 'on CERN Document Server')
field_8564.append({
'u': url,
'y': description,
})
description = value.get('y', '').replace(
'local copy', 'on CERN Document Server'
)
field_8564.append(
{
'u': url,
'y': description,
}
)
else:
_, file_name = os.path.split(urllib.parse.urlparse(value['u']).path)
_, extension = os.path.splitext(file_name)
field_FFT.append({
't': 'CDS',
'a': url,
'd': value.get('y', ''),
'n': file_name,
'f': extension,
})
field_FFT.append(
{
't': 'CDS',
'a': url,
'd': value.get('y', ''),
'n': file_name,
'f': extension,
}
)
elif not _is_ignored_domain(value):
field_8564.append({
'u': url,
'y': value.get('y'),
})
field_8564.append(
{
'u': url,
'y': value.get('y'),
}
)

self['FFT__'] = field_FFT
return field_8564
Expand Down
Loading

0 comments on commit 5359d99

Please sign in to comment.