@@ -54,7 +54,7 @@ module OAI
5454 # <http://www.openarchives.org/OAI/openarchivesprotocol.html>.
5555
5656 class Client
57-
57+ UNESCAPED_AMPERSAND = /&(?!(?:amp|lt|gt|quot|apos| \# \d +);)/
5858 # The constructor which must be passed a valid base url for an oai
5959 # service:
6060 #
@@ -198,20 +198,25 @@ def list_sets(opts={})
198198 do_resumable ( OAI ::ListSetsResponse , 'ListSets' , opts )
199199 end
200200
201- private
202-
203- def do_request ( verb , opts = nil )
204- # fire off the request and return appropriate DOM object
205- uri = build_uri ( verb , opts )
206- xml = strip_invalid_utf_8_chars ( get ( uri ) )
201+ def sanitize_xml ( xml )
202+ xml = strip_invalid_utf_8_chars ( xml )
203+ xml = strip_invalid_xml_chars ( xml )
207204 if @parser == 'libxml'
208205 # remove default namespace for oai-pmh since libxml
209206 # isn't able to use our xpaths to get at them
210207 # if you know a way around thins please let me know
211208 xml = xml . gsub (
212209 /xmlns=\" http:\/ \/ www.openarchives.org\/ OAI\/ .\. .\/ \" / , '' )
213210 end
214- return load_document ( xml )
211+ xml
212+ end
213+
214+ private
215+
216+ def do_request ( verb , opts = nil )
217+ # fire off the request and return appropriate DOM object
218+ uri = build_uri ( verb , opts )
219+ return load_document ( get ( uri ) )
215220 end
216221
217222 def do_resumable ( responseClass , verb , opts )
@@ -241,6 +246,7 @@ def encode(value)
241246 end
242247
243248 def load_document ( xml )
249+ xml = sanitize_xml ( xml )
244250 case @parser
245251 when 'libxml'
246252 begin
@@ -251,7 +257,6 @@ def load_document(xml)
251257 end
252258 when 'rexml'
253259 begin
254- xml = strip_invalid_xml_chars ( xml )
255260 return REXML ::Document . new ( xml )
256261 rescue REXML ::ParseException => e
257262 raise OAI ::Exception , 'response not well formed XML: ' +e . message , caller
@@ -356,17 +361,8 @@ def strip_invalid_utf_8_chars(xml)
356361 end
357362
358363 def strip_invalid_xml_chars ( xml )
359- invalid = false
360-
361- begin
362- REXML ::Document . new ( xml )
363- rescue REXML ::ParseException => e
364- invalid = true
365- end
366-
367- return xml . gsub! ( /&(?!(?:amp|lt|gt|quot|apos);)/ , '&' ) if invalid
368- return xml
364+ return xml unless xml =~ UNESCAPED_AMPERSAND
365+ xml . gsub ( UNESCAPED_AMPERSAND , '&' )
369366 end
370-
371367 end
372368end
0 commit comments