1+ import json
2+ from xml .etree import ElementTree
3+ from .Fetcher import Fetcher
4+ from requests .auth import HTTPBasicAuth
5+ import requests
6+ import re
7+
8+
class PraFetcher(Fetcher):
    """
    Fetcher for a Preservica collection exposed through the Entity API v6.0.

    Pages through the Information Object (IO) children of a Structural
    Object (SO), inlining each item's oai_dc metadata fragment into the
    page document before it is written out.
    """

    BASE_URL = "https://us.preservica.com/api/entity/v6.0"

    NAMESPACES = {"pra": "http://preservica.com/EntityAPI/v6.0"}

    def __init__(self, params):
        """
        Initialize fetcher state from `params`.

        If `next_url` is present, this is a resumption of a paged fetch:
        restore all saved attributes verbatim and skip first-page discovery.
        Otherwise derive credentials, the internal collection id, and the
        first page URL from `harvest_data`.
        """
        super().__init__(params)

        # If `next_url` is a param, we know that this is not
        # the fetch of the first page, so skip setting those
        # attributes
        if "next_url" in params:
            for key, value in params.items():
                setattr(self, key, value)
            return

        credentials = params.get("harvest_data").get("harvest_extra_data")
        # Credentials arrive as "user, password" — split and strip for
        # HTTPBasicAuth(*self.basic_auth_credentials).
        self.basic_auth_credentials = [v.strip() for v in credentials.split(',')]
        # The SO reference is the hex/dash id following "SO_" in the
        # configured harvest URL.
        self.internal_collection_id = re.search(
            r"(?<=SO_)[0-9a-f-]+",
            params.get("harvest_data").get("url")).group(0)
        self.original_url = (
            f"{self.BASE_URL}/structural-objects/"
            f"{self.internal_collection_id}/children")
        self.next_url = self.get_first_page_url()

    def get_first_page_url(self):
        """
        Return the URL of the first page of IO children.

        Two possibilities exist:

        1) The `original_url` contains a list of IO children and is the first
           page of results, or
        2) The `original_url` is a list of SO children, hopefully containing
           1 item, and we must do more to get to the list of IO children:

        In case 2, two more requests are needed. The first (the original_url)
        returns a ChildrenResponse, from which the URL of the first SO Child
        is extracted. The second request returns an EntityResponse, from
        which the URL is extracted from AdditionalInformation/Children's
        text node.
        """
        request = self.build_url_request(self.original_url)
        response = requests.get(**request)
        root = ElementTree.fromstring(response.text)

        # If we have IO (Information Object) children, then this is the
        # first page. Otherwise, we have to continue digging.
        io_children = root.findall(".//pra:Child[@type='IO']", self.NAMESPACES)
        if io_children:
            return self.original_url

        child_url = root.find(".//pra:Child[@type='SO']", self.NAMESPACES).text

        request = self.build_url_request(child_url)
        response = requests.get(**request)
        root = ElementTree.fromstring(response.text)

        return root.find(".//pra:AdditionalInformation/pra:Children",
                         self.NAMESPACES).text

    def build_fetch_request(self):
        """Build the kwargs dict for the next page's `requests.get` call."""
        request = self.build_url_request(self.next_url)

        print(
            f"[{self.collection_id}]: Fetching page {self.write_page} "
            f"at {request.get('url')}")

        return request

    def get_text_from_response(self, response):
        """
        Return the page document with each object URL replaced by that
        object's oai_dc metadata XML.
        """
        # Starting with a list of `information-objects` URLs
        object_url_elements = ElementTree.fromstring(response.text).findall(
            "pra:Children/pra:Child", self.NAMESPACES)

        object_urls = [element.text for element in object_url_elements]

        # Getting an individual `information-object`, extracting the URL
        # for the oai_dc metadata fragment
        metadata_urls = {object_url: self.get_metadata_url_from_object(object_url)
                         for object_url in object_urls}

        # Getting the metadata; objects with no oai_dc fragment are left
        # untouched in the output document.
        items = {object_url: self.get_metadata_from_url(metadata_url)
                 for (object_url, metadata_url) in metadata_urls.items()
                 if metadata_url is not None}

        # Replace XML text node with the response
        output_document = response.text

        for search, replace in items.items():
            output_document = output_document.replace(search, replace)

        return output_document

    def get_metadata_url_from_object(self, url):
        """
        Fetch an information-object and return the URL of its oai_dc
        metadata Fragment, or None when no such fragment exists.
        """
        request = self.build_url_request(url)
        response = requests.get(**request)
        root = ElementTree.fromstring(response.text)
        fragment = root.find(
            ".//pra:Fragment[@schema='http://www.openarchives.org/OAI/2.0/oai_dc/']",
            self.NAMESPACES)

        return fragment.text if fragment is not None else None

    def get_metadata_from_url(self, url):
        """Fetch and return the raw metadata document at `url`."""
        request = self.build_url_request(url)
        response = requests.get(**request)
        return response.text
        # TODO: strip superfluous junk out of metadata XML by extracting
        # the .//pra:MetadataResponse text node before returning.

    def check_page(self, http_resp):
        """
        Log the number of Child hits on the fetched page.

        TODO: review other fetchers, do what they do
        """
        root = ElementTree.fromstring(http_resp.content)
        hits = len(root.findall(".//pra:Child", self.NAMESPACES))

        print(
            f"[{self.collection_id}]: Fetched page {self.write_page} "
            f"at {http_resp.url} with {hits} hits"
        )

        return True

    def increment(self, http_resp):
        """
        Advance paging state: delegate bookkeeping to the base class, then
        read the next-page URL (if any) from Paging/Next.
        """
        super().increment(http_resp)

        next_element = ElementTree.fromstring(http_resp.content).find(
            "pra:Paging/pra:Next", self.NAMESPACES)
        self.next_url = next_element.text if next_element is not None else None

    def json(self):
        """
        Serialize fetcher state for resumption; marks `finished` once there
        is no next page.
        """
        current_state = {
            "harvest_type": self.harvest_type,
            "basic_auth_credentials": self.basic_auth_credentials,
            "collection_id": self.collection_id,
            "internal_collection_id": self.internal_collection_id,
            "original_url": self.original_url,
            "next_url": self.next_url,
            "write_page": self.write_page
        }

        if not self.next_url:
            current_state.update({"finished": True})

        return json.dumps(current_state)

    def build_url_request(self, url):
        """Return `requests.get` kwargs carrying the basic-auth credentials."""
        return {"url": url, "auth": HTTPBasicAuth(*self.basic_auth_credentials)}