Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
74 commits
Select commit Hold shift + click to select a range
328e605
add configuration option to use a postgres backend instead of datacit…
axfelix Jan 21, 2020
595ef3b
add lookup functions from frdr harvester for postgres export
axfelix Jan 21, 2020
fe861ee
cleanups, add other helper function stubs
axfelix Jan 21, 2020
2bb3759
refactor returning single records, clarify todos, implement cursor
axfelix Jan 24, 2020
dbc1637
add preliminary result formatting for datacite
axfelix Jan 24, 2020
9f08a03
fix outstanding TODOs, let's test this
axfelix Jan 29, 2020
e628330
change mentions of postgres to FRDR where appropriate
axfelix Feb 6, 2020
41f0428
fix up pipfile for postgres, clean up some syntax
axfelix Feb 14, 2020
8d9c829
lots of recasting, postgres cleanups
axfelix Feb 14, 2020
f1d876c
properly support resumption token
axfelix Feb 18, 2020
e7e6a82
fix calls to get sets and identifiers
axfelix Feb 19, 2020
6352d5b
Add new set for openaire_data
KellyStathis Apr 16, 2020
321abe0
Construct XML from FRDR metadata for oai_datacite
KellyStathis Apr 17, 2020
fbd7aae
Only include rightsList if there is a rights entry
KellyStathis Apr 17, 2020
b39e5f3
Hardcode "Dataset" for resourcetypes (for Dublin Core)
KellyStathis Apr 17, 2020
fde0bd8
Include subjects and tags in dc:subjects, deduplicate when needed
KellyStathis Apr 17, 2020
6a07425
Use dateutil to parse year for oai_datacite
KellyStathis Apr 17, 2020
d9de944
Revert "Use dateutil to parse year for oai_datacite"
KellyStathis Apr 17, 2020
229ae49
Remove contact info (email) from funding_references and remove print …
KellyStathis Apr 17, 2020
8963d4e
Merge remote-tracking branch 'upstream/master'
axfelix Apr 17, 2020
cf17a3a
Add FRDR as contributor with type HostingInstitution
KellyStathis Apr 17, 2020
1c91392
Include access in rightsList where applicable. Remove container eleme…
KellyStathis Apr 17, 2020
fa46e92
always use datacite unless frdr configuration provided
axfelix Apr 20, 2020
6a49965
support from and until parameters for frdr
axfelix Apr 23, 2020
6ab32f1
Use URL for identifier, remove dx from DOIs
KellyStathis Apr 23, 2020
14a16ab
update deletedRecord policy for frdr
axfelix Apr 23, 2020
369a478
Use database pk for identifiers and add URL to the identifiers list (…
KellyStathis May 7, 2020
ad6e097
Merge pull request #1 from axfelix/identifier_fix
axfelix May 8, 2020
e2610cf
Use item_url as OAI identifier with "oai:" prefix
KellyStathis May 26, 2020
ec3537a
Merge pull request #2 from axfelix/identifier_fix
axfelix May 26, 2020
36bb95b
Capitalize publicationYear
KellyStathis May 27, 2020
174b870
Move series into description with SeriesInformation type
KellyStathis May 27, 2020
fd91f59
Merge branch 'identifier_fix'
KellyStathis May 27, 2020
bc8143f
Put openaire_data set first in list
KellyStathis May 28, 2020
a3a8c79
merge upstream
axfelix May 29, 2020
355c351
Only include rights entries that have a URL
KellyStathis Jul 2, 2020
e28231f
Update contributor to have xml:lang attribute, separate en/fr
KellyStathis Jul 2, 2020
0e7e3fa
fix resetting cursor position after two pages
axfelix Jul 2, 2020
2eb82e6
Include rights entries that do not have a URL
KellyStathis Jul 13, 2020
daa5938
Add xml:lang for description and subject
KellyStathis Jul 13, 2020
66d8bfe
fix parsing single-element list values, make sure cursor is cast as i…
axfelix Jul 13, 2020
7699d72
Only include subjects block if not empty
KellyStathis Jul 20, 2020
f6efb24
Use OAI-compliant identifier and query database using local_identifier
KellyStathis Jul 20, 2020
f87f974
Use info:eu-repo-Access-Terms vocabulary for access metadata (openAcc…
KellyStathis Jul 20, 2020
c1dda75
Use repo_oai_name in identifier and for setSpec
KellyStathis Jul 24, 2020
b9750ab
Add French titles, descriptions, and categories; subject/tags renamed…
KellyStathis Jul 24, 2020
ec43f70
Add DOI as identifier when available
KellyStathis Jul 24, 2020
6a7b583
Only include one identifier (URL or DOI)
KellyStathis Jul 24, 2020
2666785
Fix issue where local_identifiers with colons weren't working
KellyStathis Jul 24, 2020
6504bce
Add contributors to XML metadata (type is unknown, use "Other")
KellyStathis Jul 24, 2020
ca5df37
Add openAccess statement for records without explicit access statemen…
KellyStathis Jul 24, 2020
35fab8f
Use rightsURI for eu-repo/semantics terms
KellyStathis Aug 4, 2020
1fee002
Only retrieve records with pub_date
KellyStathis Aug 10, 2020
b07edd2
fix null value handling and order by record ids to facilitate debugging
axfelix Sep 10, 2020
6523551
fix reporting totals and paging at end of listrecords
axfelix Sep 10, 2020
d786214
forgot to sideload ftfy dependencies into pipfile lock
axfelix Sep 11, 2020
d11a5e0
revert less than or equals test for batch sizes
axfelix Sep 11, 2020
92fe12c
Continue iteration when there are fewer than 50 records per page; onl…
KellyStathis Sep 16, 2020
e263657
Use ftfy.fix_text in XML for oai_datacite and datacite
KellyStathis Sep 21, 2020
34a2779
helper function for fixing xml to check if none/zero length
KellyStathis Sep 21, 2020
dafcbad
Add openAccess or restrictedAccess flag to oai_dc for Primo
KellyStathis Sep 23, 2020
ec2f12b
Replace form feed chars (\x0c) with space
KellyStathis Oct 6, 2020
edd4fe0
Use repository_name for publisher field in oai_dc (matches oai_datacite)
KellyStathis Oct 7, 2020
1404d23
Ensure that only strings are passed to ftfy.fix_text and catch except…
KellyStathis Oct 7, 2020
7e6ac12
Exclude deleted records and records without item_url from selection
KellyStathis Oct 7, 2020
2a84e4e
Only set one info:eu-repo/semantics access statement (still needs tes…
KellyStathis Nov 9, 2020
1cc56bd
Parse pub_date to YYYY-MM-DD format from datetime
KellyStathis Dec 15, 2020
75df506
Switch from pub_date to upstream_modified_timestamp
KellyStathis Dec 15, 2020
d7d9f8a
Merging upstream changes
KellyStathis Dec 16, 2020
30b9a1a
Add GeoLocation metadata to oai_datacite format
KellyStathis Dec 22, 2020
e47e355
Use the same dict cursor throughout; comments and spacing
KellyStathis Dec 22, 2020
3681d00
Merge pull request #4 from axfelix/geospatial
KellyStathis Jan 4, 2021
44b3035
Stop ListIdentifiers iteration when total records is exceeded
KellyStathis Mar 11, 2021
b9d5663
Change record_id to record_uuid
KellyStathis Jan 4, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -119,4 +119,6 @@ dmypy.json
.vscode

# Env configs
*.env
*.env

.DS_Store
4 changes: 3 additions & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ python-dotenv = "*"

[packages]
flask = "*"
ftfy = "*"
pyoai = "*"
psycopg2-binary = "*"
requests = "*"
python-dateutil = "*"
lxml = "*"
Expand All @@ -23,4 +25,4 @@ sentry-sdk = {extras = ["flask"],version = "*"}
python-dotenv = "*"

[requires]
python_version = "3.6"
python_version = "3.5"
314 changes: 194 additions & 120 deletions Pipfile.lock

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions vendor/docker/env.conf
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@ env OAIPMH_BASE_URL;
env DATACITE_API_URL;
env OAIPMH_REPOS_NAME;
env OAIPMH_ADMIN_EMAIL;
env OAIPMH_IDENTIFIER;
env SENTRY_DSN;
env DATACITE_API_ADMIN_USERNAME;
env DATACITE_API_ADMIN_PASSWORD;
env RESULT_SET_SIZE;
env CATALOG_SET;
env POSTGRES_SERVER;
env POSTGRES_DB;
env POSTGRES_USER;
env POSTGRES_PASSWORD;
263 changes: 262 additions & 1 deletion viringo/catalogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from viringo import config
from .services import datacite
from .services import frdr


class DataCiteOAIServer():
Expand Down Expand Up @@ -285,6 +286,267 @@ def build_metadata_map(self, result):
return metadata


class FRDROAIServer():
    """Build OAI-PMH responses from the FRDR Postgres server.

    The list verbs (``listRecords``, ``listIdentifiers``, ``listSets``) return
    a ``(records, total, paging_cursor)`` triple instead of the plain list
    pyoai expects. This differs from the pyoai implementation, but is okay
    because a custom server handles the cursor.
    """

    def identify(self):
        """Construct common identification for the OAI service"""

        identify = common.Identify(
            repositoryName=config.OAIPMH_REPOS_NAME,
            baseURL=config.OAIPMH_BASE_URL,
            protocolVersion="2.0",
            adminEmails=[config.OAIPMH_ADMIN_EMAIL],
            earliestDatestamp=datetime(2011, 1, 1),
            deletedRecord='no',
            granularity='YYYY-MM-DDThh:mm:ssZ',
            compression=['gzip', 'deflate'],
            toolkit_description=False)

        # Specify a custom description
        identify.add_description(
            xml_string=self._oai_identifier_description(config.OAIPMH_IDENTIFIER)
        )

        return identify

    @staticmethod
    def _oai_identifier_description(repository_identifier):
        """Return the oai-identifier description XML for a repository identifier.

        The sample identifier is ``oai:<repository_identifier>:1``: scheme,
        repository identifier and local id, each separated by the declared
        ``:`` delimiter.
        """
        return """
            <oai-identifier xmlns="http://www.openarchives.org/OAI/2.0/oai-identifier" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai-identifier http://www.openarchives.org/OAI/2.0/oai-identifier.xsd">
                <scheme>oai</scheme>
                <repositoryIdentifier>""" + repository_identifier + """</repositoryIdentifier>
                <delimiter>:</delimiter>
                <sampleIdentifier>oai:""" + repository_identifier + """:1</sampleIdentifier>
            </oai-identifier>
        """

    @staticmethod
    def _postgres_settings():
        """Return the Postgres connection keyword arguments read from config."""
        return {
            'server': config.POSTGRES_SERVER,
            'db': config.POSTGRES_DB,
            'user': config.POSTGRES_USER,
            'password': config.POSTGRES_PASSWORD,
            'port': config.POSTGRES_PORT,
        }

    @staticmethod
    def _next_cursor(paging_cursor, total_records):
        """Return the cursor for the next page, or None when paging is done.

        A missing cursor (None) is treated as "no further pages" instead of
        being compared against the total, which would raise TypeError.
        """
        if paging_cursor is None or paging_cursor >= total_records:
            return None
        return paging_cursor

    def _fetch_page(self, oai_set, from_, until, paging_cursor):
        """Query one page of FRDR results shared by the list verbs.

        Returns ``(results, total_records, next_cursor)`` where ``results`` is
        always a list (possibly empty) and ``next_cursor`` is None once the
        cursor has reached ``total_records``.
        """
        # If available get the search query from the set param
        search_query = set_to_search_query(oai_set)

        results, total_records, paging_cursor = frdr.get_metadata_list(
            query=search_query,
            set=oai_set,
            from_datetime=from_,
            until_datetime=until,
            cursor=paging_cursor,
            **self._postgres_settings()
        )

        next_cursor = self._next_cursor(paging_cursor, total_records)
        return results or [], total_records, next_cursor

    def listMetadataFormats(self, identifier=None):
        #pylint: disable=no-self-use,invalid-name
        """Returns metadata formats available for the repository

        Identifier does nothing as our repository responds in all formats for all dois
        """
        # PyOAI Expects result format (metadataPrefix, schema, metadataNamespace)

        format_oai_dc = (
            'oai_dc',
            'http://www.openarchives.org/OAI/2.0/oai_dc.xsd',
            'http://www.openarchives.org/OAI/2.0/oai_dc/'
        )

        format_oai_datacite = (
            'oai_datacite',
            'http://schema.datacite.org/oai/oai-1.1/oai.xsd',
            'http://schema.datacite.org/oai/oai-1.1/'
        )

        format_datacite = (
            'datacite',
            'http://schema.datacite.org/meta/nonexistant/nonexistant.xsd',
            'http://datacite.org/schema/nonexistant'
        )

        return [format_oai_dc, format_oai_datacite, format_datacite]

    def getRecord(self, metadataPrefix, identifier):
        #pylint: disable=no-self-use,invalid-name
        """Returns pyoai data tuple for specific record

        Raises IdDoesNotExistError when the FRDR lookup returns nothing for
        the identifier. metadataPrefix is accepted but not read here.
        """

        # Should we implement this based on source_url and local_identifier
        # the way we currently do for the harvester?

        result = frdr.get_metadata(identifier, **self._postgres_settings())
        if not result:
            raise error.IdDoesNotExistError(
                "\"%s\" is unknown or illegal in this repository" % identifier
            )

        # Build metadata based on requested format and result
        metadata = self.build_metadata_map(result)

        header = self.build_header(result)
        record = self.build_record(metadata)
        data = (
            header,
            record,
            None # About string - not used
        )

        return data

    def listRecords(
            self,
            metadataPrefix=None,
            from_=None,
            until=None,
            set=None,
            paging_cursor=None
    ):
        #pylint: disable=no-self-use,invalid-name,redefined-builtin
        """Returns pyoai data tuple for list of records

        Returns (records, total_records, paging_cursor); each record is a
        (header, metadata, about) tuple and paging_cursor is None on the
        last page.
        """

        results, total_records, paging_cursor = self._fetch_page(
            set, from_, until, paging_cursor
        )

        records = []
        for result in results:
            # Build metadata based on requested format and result
            metadata = self.build_metadata_map(result)

            header = self.build_header(result)
            record = self.build_record(metadata)

            data = (
                header,
                record,
                None # About string - not used
            )

            records.append(data)

        # This differs from the pyoai implementation in that we have to return a cursor here
        # But this is okay as we have a custom server to handle it.
        return records, total_records, paging_cursor

    def listIdentifiers(
            self,
            metadataPrefix=None,
            from_=None,
            until=None,
            set=None,
            paging_cursor=None
    ):
        #pylint: disable=no-self-use,invalid-name,redefined-builtin
        """Returns pyoai data tuple for list of identifiers

        Returns (headers, total_records, paging_cursor); paging_cursor is
        None on the last page.
        """

        results, total_records, paging_cursor = self._fetch_page(
            set, from_, until, paging_cursor
        )

        records = [self.build_header(result) for result in results]

        # This differs from the pyoai implementation in that we have to return a cursor here
        # But this is okay as we have a custom server to handle it.
        return records, total_records, paging_cursor

    def listSets(
            self,
            paging_cursor=0
    ):
        #pylint: disable=no-self-use,invalid-name
        """Returns pyoai data tuple for list of sets"""

        # Note this implementation is not super efficient as we request
        # the full set every time regardless of actual paging.
        # The paging is handled just by offsetting the records returned.
        # This is however acceptable given sets are a small subset of data.

        # We know we're always dealing with an integer value here
        paging_cursor = int(paging_cursor)

        batch_size = 50
        next_batch = paging_cursor + batch_size
        results, total_results = frdr.get_sets(**self._postgres_settings())
        results = results[paging_cursor: next_batch]

        # A short page means there is nothing left to page through
        if len(results) < batch_size:
            paging_cursor = None
        else:
            paging_cursor = next_batch

        # Format of a set is setSpec, setName, setDescription
        records = [(identifier, name, None) for identifier, name in results]

        # This differs from the pyoai implementation in that we have to return a cursor here
        # But this is okay as we have a custom server to handle it.
        return records, total_results, paging_cursor

    def build_header(self, result):
        """Construct a OAI-PMH record header"""

        return common.Header(
            None,
            str(result.identifier),
            result.updated_datetime,
            setspec=[result.client],
            deleted=not result.active
        )

    def build_record(self, metadata):
        """Construct a OAI-PMH payload for a record"""

        return common.Metadata(
            None,
            metadata
        )

    def build_metadata_map(self, result):
        """Construct a metadata map object for oai metadata writing"""
        identifiers = result.identifiers

        relations = [
            identifier_to_string(relation)
            for relation in result.relations
        ]

        metadata = {
            'title': result.titles,
            'creator': result.creators,
            'subject': result.subjects,
            'description': result.descriptions,
            'publisher': [result.publisher] if result.publisher else [],
            'contributor': result.contributors,
            'date': result.dates,
            'type': result.resource_types,
            'format': result.formats,
            'identifier': identifiers,
            'relation': relations,
            'language': [result.language] if result.language else [],
            'rights': result.rights,
            'xml': result.xml,
            'set': result.client,
            'metadata_version': result.metadata_version
        }

        return metadata


def set_to_search_query(unparsed_set):
"""Take a oai set and extract any base64url encoded search query"""

Expand All @@ -298,7 +560,6 @@ def set_to_search_query(unparsed_set):

return ""


def set_to_provider_client(unparsed_set):
"""Take a oai set and convert into provider_id and client_id"""

Expand Down
14 changes: 14 additions & 0 deletions viringo/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,19 @@
OAIPMH_BASE_URL = os.getenv('OAIPMH_BASE_URL', 'https://oai.datacite.org/oai')
# Admin e-mail for the OAI-PMH service
OAIPMH_ADMIN_EMAIL = os.getenv('OAIPMH_ADMIN_EMAIL', 'support@datacite.org')
# OAI repository identifier
OAIPMH_IDENTIFIER = os.getenv('OAIPMH_IDENTIFIER', 'oai.datacite.org')
# Page size of results shown for result listings
RESULT_SET_SIZE = int(os.getenv('RESULT_SET_SIZE', '50'))


def env_with_fallback(primary, fallback, default):
    """Return the value of env var *primary*, else *fallback*, else *default*.

    vendor/docker/env.conf declares the unprefixed names (CATALOG_SET,
    POSTGRES_SERVER, ...) while this module reads OAIPMH_-prefixed names.
    Accepting both keeps either deployment style working.
    """
    value = os.getenv(primary)
    if value is None:
        value = os.getenv(fallback, default)
    return value


# Source metadata catalog (DataCite or FRDR)
CATALOG_SET = env_with_fallback('OAIPMH_CATALOG', 'CATALOG_SET', 'DataCite')
# FRDR Postgres server
POSTGRES_SERVER = env_with_fallback('OAIPMH_POSTGRES_SERVER', 'POSTGRES_SERVER', '')
# FRDR Postgres db
POSTGRES_DB = env_with_fallback('OAIPMH_POSTGRES_DB', 'POSTGRES_DB', '')
# FRDR Postgres user
POSTGRES_USER = env_with_fallback('OAIPMH_POSTGRES_USER', 'POSTGRES_USER', '')
# FRDR Postgres password
POSTGRES_PASSWORD = env_with_fallback('OAIPMH_POSTGRES_PASSWORD', 'POSTGRES_PASSWORD', '')
# FRDR Postgres port
POSTGRES_PORT = env_with_fallback('OAIPMH_POSTGRES_PORT', 'POSTGRES_PORT', '5432')
22 changes: 18 additions & 4 deletions viringo/metadata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""This module deals with handling the representation of metadata formats for OAI"""

import re
import ftfy
from lxml import etree

NS_OAIPMH = 'http://www.openarchives.org/OAI/2.0/'
Expand Down Expand Up @@ -43,10 +44,20 @@ def nsdc(name):
]:
for value in _map.get(name, []):
if value:
if isinstance(value, list):
if len(value) == 1:
value = value[0]
else:
value = str(value)
new_element = etree.SubElement(e_dc, nsdc(name))
# The regular expression here is to filter only valid XML chars
# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
new_element.text = re.sub(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+', '', value)
if isinstance(value, str):
try:
value = value.replace('\x0c', " ")
new_element.text = ftfy.fix_text(value)
except:
new_element.text = ''
else:
new_element.text = ''

def datacite_writer(element: etree.Element, metadata):
"""Writer for writing data in a metadata object out into raw datacite format"""
Expand All @@ -62,7 +73,10 @@ def oai_datacite_writer(element: etree.Element, metadata):
_map = metadata.getMap()
raw_xml = _map.get('xml', '')

xml_resource_element = etree.fromstring(raw_xml)
try:
xml_resource_element = etree.fromstring(raw_xml)
except:
print(raw_xml)

e_oai_datacite = etree.SubElement(
element, "oai_datacite", {'xmlns': 'http://schema.datacite.org/oai/oai-1.1/'},
Expand Down
Loading