Skip to content

Commit fc38059

Browse files
committed
[WIP] Rough out PRA fetcher
1 parent 5c13baa commit fc38059

3 files changed

Lines changed: 171 additions & 7 deletions

File tree

metadata_fetcher/fetchers/Fetcher.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -78,29 +78,34 @@ def fetch_page(self):
7878
raise FetchError(
7979
f"[{self.collection_id}]: unable to fetch page {page}")
8080

81+
8182
if self.check_page(response):
83+
text = self.get_text_from_response(response)
8284
if settings.DATA_DEST == 'local':
83-
self.fetchtolocal(response.text)
85+
self.fetchtolocal(text)
8486
else:
85-
self.fetchtos3(response.text)
87+
self.fetchtos3(text)
8688

8789
self.increment(response)
8890

8991
return self.json()
9092

93+
def get_text_from_response(self, response):
    """Hook: return the portion of *response* that should be written out.

    The default is the raw response body; subclasses that must
    post-process the payload before persisting it override this.
    """
    body = response.text
    return body
95+
9196
def build_fetch_request(self):
    """build parameters for the institution's requests.get()

    Abstract hook: each institutional fetcher subclass supplies the
    keyword arguments for its own API request.

    this should minimally return {'url': str} but may also include
    {'headers': {}, 'params': {}} or any other options accepted by
    https://docs.python-requests.org/en/latest/api/#requests.get
    """
    # Intentionally a no-op (returns None) in the base class; subclasses
    # such as PraFetcher.build_fetch_request provide the real mapping.
    pass
99104

100105
def get_records(self, http_resp):
101106
"""parses http_resp from institutional API into a list of records
102107
103-
should return a list of dictionaries which can easily be serialized
108+
should return a list of dictionaries which can easily be serialized
104109
by json.dumps into json line format; takes as an argument:
105110
https://docs.python-requests.org/en/latest/api/#requests.Response
106111
"""

metadata_fetcher/fetchers/oac_fetcher.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,9 @@ def __init__(self, params):
4343
# https://stackoverflow.com/questions/20129996/why-does-boolxml-etree-elementtree-element-evaluate-to-false
4444
counts = {
4545
'total': total.attrib['totalDocs'] if total else 0,
46-
'image': int(image_group.attrib['totalDocs'])
46+
'image': int(image_group.attrib['totalDocs'])
4747
if image_group is not None else 0,
48-
'text': int(text_group.attrib['totalDocs'])
48+
'text': int(text_group.attrib['totalDocs'])
4949
if text_group is not None else 0,
5050
'harvested': 0,
5151
'harvested_image': 0,
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
import json
2+
from xml.etree import ElementTree
3+
from .Fetcher import Fetcher
4+
from requests.auth import HTTPBasicAuth
5+
import requests
6+
import re
7+
8+
9+
class PraFetcher(Fetcher):
    """Fetch collection metadata from a Preservica (PRA) Entity API v6.0
    endpoint.

    Pagination state round-trips through the dict serialized by ``json()``:
    when ``next_url`` appears in ``params``, this is a resumed harvest and
    ``__init__`` simply restores the serialized attributes.
    """

    BASE_URL = "https://us.preservica.com/api/entity/v6.0"

    # XML namespace used by every find/findall against Entity API responses.
    NAMESPACES = {"pra": "http://preservica.com/EntityAPI/v6.0"}

    def __init__(self, params):
        """Initialize harvest state.

        params: dict carrying ``harvest_data`` with
        ``harvest_extra_data`` (comma-separated "user,password" pair) and
        ``url`` (a structural-object URL embedding an ``SO_<uuid>`` id);
        on resumed fetches it instead carries the full serialized state
        including ``next_url``.

        Raises ValueError if the harvest url contains no ``SO_<uuid>`` id.
        """
        super().__init__(params)

        # If `next_url` is a param, we know that this is not
        # the fetch of the first page, so restore the serialized
        # attributes verbatim and skip first-page discovery.
        if "next_url" in params:
            for key in params:
                setattr(self, key, params[key])
            return

        credentials = params.get("harvest_data").get("harvest_extra_data")
        self.basic_auth_credentials = [v.strip() for v in credentials.split(',')]

        # The internal collection id is the UUID following "SO_" in the
        # configured harvest URL. Fail loudly (rather than with an opaque
        # AttributeError on .group) if it is missing.
        harvest_url = params.get("harvest_data").get("url")
        match = re.search(r"(?<=SO_)[0-9a-f-]+", harvest_url)
        if match is None:
            raise ValueError(
                f"no SO_<uuid> identifier found in harvest url {harvest_url}")
        self.internal_collection_id = match.group(0)

        self.original_url = (f"{self.BASE_URL}/structural-objects/"
                             f"{self.internal_collection_id}/children")
        self.next_url = self.get_first_page_url()

    def get_first_page_url(self):
        """Return the URL of the first page of IO (Information Object) children.

        Two possibilities exist:

        1) The `original_url` contains a list of IO children and is itself
           the first page of results, or
        2) The `original_url` is a list of SO children, hopefully containing
           1 item, and we must do more to get to the list of IO children:
           the first request (original_url) returns a ChildrenResponse, from
           which the first SO Child's URL is extracted; the second request
           returns an EntityResponse, from which the page URL is taken from
           the AdditionalInformation/Children text node.
        """
        request = self.build_url_request(self.original_url)
        response = requests.get(**request)
        root = ElementTree.fromstring(response.text)

        # If we have IO (Information Object) children, then this is the
        # first page. Otherwise, we have to continue digging.
        io_children = root.findall(".//pra:Child[@type='IO']", self.NAMESPACES)
        if len(io_children) > 0:
            return self.original_url

        # NOTE(review): both find() calls below assume the nodes exist and
        # will raise AttributeError on malformed responses — TODO harden.
        child_url = root.find(".//pra:Child[@type='SO']", self.NAMESPACES).text

        request = self.build_url_request(child_url)
        response = requests.get(**request)
        root = ElementTree.fromstring(response.text)

        return root.find(".//pra:AdditionalInformation/pra:Children",
                         self.NAMESPACES).text

    def build_fetch_request(self):
        """Return requests.get kwargs for the next page (see Fetcher)."""
        request = self.build_url_request(self.next_url)

        print(
            f"[{self.collection_id}]: Fetching page {self.write_page} "
            f"at {request.get('url')}")

        return request

    def get_text_from_response(self, response):
        """Assemble the page document to be written out.

        The children listing only carries object URLs, so each information
        object's oai_dc metadata is fetched and spliced into the page by
        replacing the URL text with the metadata payload.
        NOTE(review): this costs 1-2 extra HTTP round-trips per child —
        acceptable while the fetcher is WIP, revisit for large collections.
        """
        # Starting with a list of `information-objects` URLs
        object_url_elements = ElementTree.fromstring(response.text).findall(
            "pra:Children/pra:Child", self.NAMESPACES)

        object_urls = [element.text for element in object_url_elements]

        # Getting an individual `information-object`, extracting the URL
        # for the oai_dc metadata fragment
        metadata_urls = {object_url: self.get_metadata_url_from_object(object_url)
                         for object_url in object_urls}

        # Getting the metadata (objects with no oai_dc fragment are skipped)
        items = {object_url: self.get_metadata_from_url(metadata_url)
                 for (object_url, metadata_url) in metadata_urls.items()
                 if metadata_url is not None}

        # Replace each object-URL text node with its metadata document
        output_document = response.text

        for search, replace in items.items():
            output_document = output_document.replace(search, replace)

        return output_document

    def get_metadata_url_from_object(self, url):
        """Return the oai_dc metadata-fragment URL for one information
        object, or None if the object carries no oai_dc fragment."""
        request = self.build_url_request(url)
        response = requests.get(**request)
        root = ElementTree.fromstring(response.text)
        fragment = root.find(
            ".//pra:Fragment"
            "[@schema='http://www.openarchives.org/OAI/2.0/oai_dc/']",
            self.NAMESPACES)

        return fragment.text if fragment is not None else None

    def get_metadata_from_url(self, url):
        """Fetch and return the raw metadata document at *url*.

        TODO: strip superfluous junk out of the metadata XML (extract the
        MetadataResponse node) before returning it.
        """
        request = self.build_url_request(url)
        response = requests.get(**request)
        return response.text

    def check_page(self, http_resp):
        """Log the hit count for a fetched page; always accepts the page.

        TODO: review other fetchers, do what they do (e.g. reject empty
        pages so they are not written out).
        """
        hits = len(ElementTree.fromstring(http_resp.content)
                   .findall(".//pra:Child", self.NAMESPACES))

        print(
            f"[{self.collection_id}]: Fetched page {self.write_page} "
            f"at {http_resp.url} with {hits} hits"
        )

        return True

    def increment(self, http_resp):
        """Advance paging state.

        Delegates counter bookkeeping to the base class, then pulls the
        next-page URL (if any) from the response's Paging/Next node;
        ``next_url`` becomes None on the last page, which marks the
        harvest finished in ``json()``.
        """
        super().increment(http_resp)

        next_element = ElementTree.fromstring(http_resp.content).find(
            "pra:Paging/pra:Next", self.NAMESPACES)
        self.next_url = next_element.text if next_element is not None else None

    def json(self):
        """Serialize fetcher state for the next invocation.

        The returned JSON round-trips through ``__init__`` via its
        ``next_url`` branch; ``finished`` is added once no next page
        remains.
        """
        current_state = {
            "harvest_type": self.harvest_type,
            "basic_auth_credentials": self.basic_auth_credentials,
            "collection_id": self.collection_id,
            "internal_collection_id": self.internal_collection_id,
            "original_url": self.original_url,
            "next_url": self.next_url,
            "write_page": self.write_page
        }

        if not self.next_url:
            current_state.update({"finished": True})

        return json.dumps(current_state)

    def build_url_request(self, url):
        """Return requests.get kwargs for *url* with HTTP Basic auth."""
        return {"url": url, "auth": HTTPBasicAuth(*self.basic_auth_credentials)}

0 commit comments

Comments
 (0)