@@ -54,7 +54,7 @@ module OAI
5454 # <http://www.openarchives.org/OAI/openarchivesprotocol.html>.
5555
5656 class Client
57-
# Matches a bare "&" that does not start an XML reference. The five
# predefined entities (&amp; &lt; &gt; &quot; &apos;), decimal character
# references (&#38;) and hexadecimal character references (&#x26;) are
# excluded by the negative lookahead, so they are never escaped twice.
UNESCAPED_AMPERSAND = /&(?!(?:amp|lt|gt|quot|apos|\#\d+|\#x[0-9a-fA-F]+);)/
5858 # The constructor which must be passed a valid base url for an oai
5959 # service:
6060 #
@@ -198,20 +198,25 @@ def list_sets(opts={})
198198 do_resumable ( OAI ::ListSetsResponse , 'ListSets' , opts )
199199 end
200200
201- private
202-
203- def do_request ( verb , opts = nil )
204- # fire off the request and return appropriate DOM object
205- uri = build_uri ( verb , opts )
206- xml = strip_invalid_utf_8_chars ( get ( uri ) )
# Clean a raw response body before it is handed to the XML parser.
#
# The payload is passed through strip_invalid_utf_8_chars and then
# strip_invalid_xml_chars. When the configured parser is 'libxml', the
# default OAI-PMH namespace declaration is removed as well.
#
# xml - the response body as a String
#
# Returns the cleaned String.
def sanitize_xml(xml)
  xml = strip_invalid_utf_8_chars(xml)
  xml = strip_invalid_xml_chars(xml)
  if @parser == 'libxml'
    # remove default namespace for oai-pmh since libxml
    # isn't able to use our xpaths to get at them
    # if you know a way around this please let me know
    #
    # The host-name dots are escaped so they only match a literal ".";
    # the two "." wildcards stand for the protocol version digits.
    xml = xml.gsub(
      /xmlns="http:\/\/www\.openarchives\.org\/OAI\/.\..\/"/, '')
  end
  xml
end
213+
214+ private
215+
# Build the request URI for +verb+ and +opts+, fetch it, and hand the
# response body to load_document, whose result is returned.
def do_request(verb, opts = nil)
  load_document(get(build_uri(verb, opts)))
end
216221
217222 def do_resumable ( responseClass , verb , opts )
@@ -241,6 +246,7 @@ def encode(value)
241246 end
242247
243248 def load_document ( xml )
249+ xml = sanitize_xml ( xml )
244250 case @parser
245251 when 'libxml'
246252 begin
@@ -354,5 +360,9 @@ def strip_invalid_utf_8_chars(xml)
354360 xml
355361 end
356362
# Escape bare ampersands so the payload is well-formed XML.
#
# Every "&" matched by UNESCAPED_AMPERSAND is rewritten to "&amp;";
# ampersands the pattern does not match are left untouched.
#
# xml - the document text as a String
#
# Returns the input itself when nothing needs escaping, otherwise a new
# String with the bare ampersands escaped.
def strip_invalid_xml_chars(xml)
  # Skip the copy that gsub would make when there is nothing to escape.
  return xml unless xml =~ UNESCAPED_AMPERSAND
  xml.gsub(UNESCAPED_AMPERSAND, '&amp;')
end
357367 end
358368end
0 commit comments