diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py
index 8b83fac69..11dd7a0f8 100644
--- a/metadata_fetcher/fetchers/Fetcher.py
+++ b/metadata_fetcher/fetchers/Fetcher.py
@@ -52,6 +52,9 @@ def fetch_page(self):
             f"[{self.collection_id}]: fetching page {self.write_page} "
             f"at {page.get('url')}"
         )
+        if 'url' not in page:
+            raise InvalidHarvestEndpoint(
+                f"[{self.collection_id}]: invalid harvest endpoint")
         try:
             response = requests.get(**page)
             response.raise_for_status()
@@ -91,7 +94,7 @@ def build_fetch_request(self):
             {'headers': {}, 'params': {}} or any other options accepted by
             https://docs.python-requests.org/en/latest/api/#requests.get
         """
-        pass
+        raise NotImplementedError
 
     def get_records(self, http_resp):
         """parses http_resp from institutional API into a list of records
@@ -100,7 +103,7 @@ def get_records(self, http_resp):
         by json.dumps into json line format; takes as an argument:
         https://docs.python-requests.org/en/latest/api/#requests.Response
         """
-        pass
+        raise NotImplementedError
 
     def increment(self, http_resp):
         """increment internal state for fetching the next page
diff --git a/metadata_fetcher/fetchers/oai_fetcher.py b/metadata_fetcher/fetchers/oai_fetcher.py
index 21efec36c..45b1c4d41 100644
--- a/metadata_fetcher/fetchers/oai_fetcher.py
+++ b/metadata_fetcher/fetchers/oai_fetcher.py
@@ -17,7 +17,7 @@ class OaiFetcher(Fetcher):
     def __init__(self, params):
         super(OaiFetcher, self).__init__(params)
 
-        self.oai = params.get('harvest_data')
+        self.oai = params.get('harvest_data', {})
 
         if self.oai.get('harvest_extra_data'):
             # see if we have a query string,
@@ -68,8 +68,13 @@ def build_fetch_request(self):
 
     def check_page(self, http_resp: requests.Response) -> int:
         xml_resp = ElementTree.fromstring(http_resp.content)
-        xml_hits = xml_resp.find(
-            'oai2:ListRecords', NAMESPACE).findall('oai2:record', NAMESPACE)
+
+        xml_list = xml_resp.find('oai2:ListRecords', NAMESPACE)
+        if xml_list is None:
+            raise ValueError(
+                f"No records found in response from {http_resp.request.url}")
+
+        xml_hits = xml_list.findall('oai2:record', NAMESPACE)
 
         if len(xml_hits) > 0:
             logging.debug(