Skip to content

Commit fc38059

Browse files
committed
[WIP] Rough out PRA fetcher
1 parent 5c13baa commit fc38059

3 files changed

Lines changed: 171 additions & 7 deletions

File tree

metadata_fetcher/fetchers/Fetcher.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -78,29 +78,34 @@ def fetch_page(self):
7878
raise FetchError(
7979
f"[{self.collection_id}]: unable to fetch page {page}")
8080

81+
8182
if self.check_page(response):
83+
text = self.get_text_from_response(response)
8284
if settings.DATA_DEST == 'local':
83-
self.fetchtolocal(response.text)
85+
self.fetchtolocal(text)
8486
else:
85-
self.fetchtos3(response.text)
87+
self.fetchtos3(text)
8688

8789
self.increment(response)
8890

8991
return self.json()
9092

93+
def get_text_from_response(self, response):
    """Hook: return the portion of *response* that should be written out.

    The default is the raw response body; subclasses that must
    post-process the payload before persisting it override this.
    """
    body = response.text
    return body
95+
9196
def build_fetch_request(self):
    """build parameters for the institution's requests.get()

    Abstract hook: each institutional fetcher subclass supplies the
    keyword arguments for its own API request.

    this should minimally return {'url': str} but may also include
    {'headers': {}, 'params': {}} or any other options accepted by
    https://docs.python-requests.org/en/latest/api/#requests.get
    """
    # Intentionally a no-op (returns None) in the base class; subclasses
    # such as PraFetcher.build_fetch_request provide the real mapping.
    pass
99104

100105
def get_records(self, http_resp):
101106
"""parses http_resp from institutional API into a list of records
102107
103-
should return a list of dictionaries which can easily be serialized
108+
should return a list of dictionaries which can easily be serialized
104109
by json.dumps into json line format; takes as an argument:
105110
https://docs.python-requests.org/en/latest/api/#requests.Response
106111
"""

metadata_fetcher/fetchers/oac_fetcher.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,9 @@ def __init__(self, params):
4343
# https://stackoverflow.com/questions/20129996/why-does-boolxml-etree-elementtree-element-evaluate-to-false
4444
counts = {
4545
'total': total.attrib['totalDocs'] if total else 0,
46-
'image': int(image_group.attrib['totalDocs'])
46+
'image': int(image_group.attrib['totalDocs'])
4747
if image_group is not None else 0,
48-
'text': int(text_group.attrib['totalDocs'])
48+
'text': int(text_group.attrib['totalDocs'])
4949
if text_group is not None else 0,
5050
'harvested': 0,
5151
'harvested_image': 0,
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
import json
2+
from xml.etree import ElementTree
3+
from .Fetcher import Fetcher
4+
from requests.auth import HTTPBasicAuth
5+
import requests
6+
import re
7+
8+
9+
class PraFetcher(Fetcher):
    """Fetch collection metadata from a Preservica (PRA) Entity API v6.0
    endpoint.

    Pagination state round-trips through the dict serialized by ``json()``:
    when ``next_url`` appears in ``params``, this is a resumed harvest and
    ``__init__`` simply restores the serialized attributes.
    """

    BASE_URL = "https://us.preservica.com/api/entity/v6.0"

    # XML namespace used by every find/findall against Entity API responses.
    NAMESPACES = {"pra": "http://preservica.com/EntityAPI/v6.0"}

    def __init__(self, params):
        """Initialize harvest state.

        params: dict carrying ``harvest_data`` with
        ``harvest_extra_data`` (comma-separated "user,password" pair) and
        ``url`` (a structural-object URL embedding an ``SO_<uuid>`` id);
        on resumed fetches it instead carries the full serialized state
        including ``next_url``.

        Raises ValueError if the harvest url contains no ``SO_<uuid>`` id.
        """
        super().__init__(params)

        # If `next_url` is a param, we know that this is not
        # the fetch of the first page, so restore the serialized
        # attributes verbatim and skip first-page discovery.
        if "next_url" in params:
            for key in params:
                setattr(self, key, params[key])
            return

        credentials = params.get("harvest_data").get("harvest_extra_data")
        self.basic_auth_credentials = [v.strip() for v in credentials.split(',')]

        # The internal collection id is the UUID following "SO_" in the
        # configured harvest URL. Fail loudly (rather than with an opaque
        # AttributeError on .group) if it is missing.
        harvest_url = params.get("harvest_data").get("url")
        match = re.search(r"(?<=SO_)[0-9a-f-]+", harvest_url)
        if match is None:
            raise ValueError(
                f"no SO_<uuid> identifier found in harvest url {harvest_url}")
        self.internal_collection_id = match.group(0)

        self.original_url = (f"{self.BASE_URL}/structural-objects/"
                             f"{self.internal_collection_id}/children")
        self.next_url = self.get_first_page_url()

    def get_first_page_url(self):
        """Return the URL of the first page of IO (Information Object) children.

        Two possibilities exist:

        1) The `original_url` contains a list of IO children and is itself
           the first page of results, or
        2) The `original_url` is a list of SO children, hopefully containing
           1 item, and we must do more to get to the list of IO children:
           the first request (original_url) returns a ChildrenResponse, from
           which the first SO Child's URL is extracted; the second request
           returns an EntityResponse, from which the page URL is taken from
           the AdditionalInformation/Children text node.
        """
        request = self.build_url_request(self.original_url)
        response = requests.get(**request)
        root = ElementTree.fromstring(response.text)

        # If we have IO (Information Object) children, then this is the
        # first page. Otherwise, we have to continue digging.
        io_children = root.findall(".//pra:Child[@type='IO']", self.NAMESPACES)
        if len(io_children) > 0:
            return self.original_url

        # NOTE(review): both find() calls below assume the nodes exist and
        # will raise AttributeError on malformed responses — TODO harden.
        child_url = root.find(".//pra:Child[@type='SO']", self.NAMESPACES).text

        request = self.build_url_request(child_url)
        response = requests.get(**request)
        root = ElementTree.fromstring(response.text)

        return root.find(".//pra:AdditionalInformation/pra:Children",
                         self.NAMESPACES).text

    def build_fetch_request(self):
        """Return requests.get kwargs for the next page (see Fetcher)."""
        request = self.build_url_request(self.next_url)

        print(
            f"[{self.collection_id}]: Fetching page {self.write_page} "
            f"at {request.get('url')}")

        return request

    def get_text_from_response(self, response):
        """Assemble the page document to be written out.

        The children listing only carries object URLs, so each information
        object's oai_dc metadata is fetched and spliced into the page by
        replacing the URL text with the metadata payload.
        NOTE(review): this costs 1-2 extra HTTP round-trips per child —
        acceptable while the fetcher is WIP, revisit for large collections.
        """
        # Starting with a list of `information-objects` URLs
        object_url_elements = ElementTree.fromstring(response.text).findall(
            "pra:Children/pra:Child", self.NAMESPACES)

        object_urls = [element.text for element in object_url_elements]

        # Getting an individual `information-object`, extracting the URL
        # for the oai_dc metadata fragment
        metadata_urls = {object_url: self.get_metadata_url_from_object(object_url)
                         for object_url in object_urls}

        # Getting the metadata (objects with no oai_dc fragment are skipped)
        items = {object_url: self.get_metadata_from_url(metadata_url)
                 for (object_url, metadata_url) in metadata_urls.items()
                 if metadata_url is not None}

        # Replace each object-URL text node with its metadata document
        output_document = response.text

        for search, replace in items.items():
            output_document = output_document.replace(search, replace)

        return output_document

    def get_metadata_url_from_object(self, url):
        """Return the oai_dc metadata-fragment URL for one information
        object, or None if the object carries no oai_dc fragment."""
        request = self.build_url_request(url)
        response = requests.get(**request)
        root = ElementTree.fromstring(response.text)
        fragment = root.find(
            ".//pra:Fragment"
            "[@schema='http://www.openarchives.org/OAI/2.0/oai_dc/']",
            self.NAMESPACES)

        return fragment.text if fragment is not None else None

    def get_metadata_from_url(self, url):
        """Fetch and return the raw metadata document at *url*.

        TODO: strip superfluous junk out of the metadata XML (extract the
        MetadataResponse node) before returning it.
        """
        request = self.build_url_request(url)
        response = requests.get(**request)
        return response.text

    def check_page(self, http_resp):
        """Log the hit count for a fetched page; always accepts the page.

        TODO: review other fetchers, do what they do (e.g. reject empty
        pages so they are not written out).
        """
        hits = len(ElementTree.fromstring(http_resp.content)
                   .findall(".//pra:Child", self.NAMESPACES))

        print(
            f"[{self.collection_id}]: Fetched page {self.write_page} "
            f"at {http_resp.url} with {hits} hits"
        )

        return True

    def increment(self, http_resp):
        """Advance paging state.

        Delegates counter bookkeeping to the base class, then pulls the
        next-page URL (if any) from the response's Paging/Next node;
        ``next_url`` becomes None on the last page, which marks the
        harvest finished in ``json()``.
        """
        super().increment(http_resp)

        next_element = ElementTree.fromstring(http_resp.content).find(
            "pra:Paging/pra:Next", self.NAMESPACES)
        self.next_url = next_element.text if next_element is not None else None

    def json(self):
        """Serialize fetcher state for the next invocation.

        The returned JSON round-trips through ``__init__`` via its
        ``next_url`` branch; ``finished`` is added once no next page
        remains.
        """
        current_state = {
            "harvest_type": self.harvest_type,
            "basic_auth_credentials": self.basic_auth_credentials,
            "collection_id": self.collection_id,
            "internal_collection_id": self.internal_collection_id,
            "original_url": self.original_url,
            "next_url": self.next_url,
            "write_page": self.write_page
        }

        if not self.next_url:
            current_state.update({"finished": True})

        return json.dumps(current_state)

    def build_url_request(self, url):
        """Return requests.get kwargs for *url* with HTTP Basic auth."""
        return {"url": url, "auth": HTTPBasicAuth(*self.basic_auth_credentials)}

0 commit comments

Comments
 (0)