Skip to content

Commit 46903f2

Browse files
authored
Merge pull request #95 from code4lib/sanitize-ampersands
Sanitize unescaped ampersands in XML content
2 parents cd90bfd + 651bae4 commit 46903f2

File tree

2 files changed

+26
-9
lines changed

2 files changed

+26
-9
lines changed

lib/oai/client.rb

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ module OAI
5454
# <http://www.openarchives.org/OAI/openarchivesprotocol.html>.
5555

5656
class Client
57-
57+
UNESCAPED_AMPERSAND = /&(?!(?:amp|lt|gt|quot|apos|\#\d+);)/
5858
# The constructor which must be passed a valid base url for an oai
5959
# service:
6060
#
@@ -198,20 +198,25 @@ def list_sets(opts={})
198198
do_resumable(OAI::ListSetsResponse, 'ListSets', opts)
199199
end
200200

201-
private
202-
203-
def do_request(verb, opts = nil)
204-
# fire off the request and return appropriate DOM object
205-
uri = build_uri(verb, opts)
206-
xml = strip_invalid_utf_8_chars(get(uri))
201+
def sanitize_xml(xml)
202+
xml = strip_invalid_utf_8_chars(xml)
203+
xml = strip_invalid_xml_chars(xml)
207204
if @parser == 'libxml'
208205
# remove default namespace for oai-pmh since libxml
209206
# isn't able to use our xpaths to get at them
210207
# if you know a way around thins please let me know
211208
xml = xml.gsub(
212209
/xmlns=\"http:\/\/www.openarchives.org\/OAI\/.\..\/\"/, '')
213210
end
214-
return load_document(xml)
211+
xml
212+
end
213+
214+
private
215+
216+
def do_request(verb, opts = nil)
217+
# fire off the request and return appropriate DOM object
218+
uri = build_uri(verb, opts)
219+
return load_document(get(uri))
215220
end
216221

217222
def do_resumable(responseClass, verb, opts)
@@ -241,6 +246,7 @@ def encode(value)
241246
end
242247

243248
def load_document(xml)
249+
xml = sanitize_xml(xml)
244250
case @parser
245251
when 'libxml'
246252
begin
@@ -354,5 +360,9 @@ def strip_invalid_utf_8_chars(xml)
354360
xml
355361
end
356362

363+
def strip_invalid_xml_chars(xml)
364+
return xml unless xml =~ UNESCAPED_AMPERSAND
365+
xml.gsub(UNESCAPED_AMPERSAND, '&amp;')
366+
end
357367
end
358368
end

test/client/tc_utf8_escaping.rb

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,19 @@
11
require 'test_helper_client'
22

33
class UTF8Test < Test::Unit::TestCase
4+
def client
5+
@client ||= OAI::Client.new 'http://localhost:3333/oai'
6+
end
47

58
def test_escaping_invalid_utf_8_characters
6-
client = OAI::Client.new 'http://localhost:3333/oai' #, :parser => 'libxml'
79
invalid_utf_8 = [2, 3, 4, 104, 5, 101, 6, 108, 66897, 108, 66535, 111, 1114112, 33, 55234123, 33].pack("U*")
810
invalid_utf_8 = invalid_utf_8.force_encoding("binary") if invalid_utf_8.respond_to? :force_encoding
911
assert_equal("hello!!", client.send(:strip_invalid_utf_8_chars, invalid_utf_8).gsub(/\?/, ''))
1012
end
1113

14+
def test_unescaped_ampersand_content_correction
15+
src = '<test>Frankie & Johnny <character>&#9829;</character></test>'
16+
expected = '<test>Frankie &amp; Johnny <character>&#9829;</character></test>'
17+
assert_equal(expected, client.sanitize_xml(src))
18+
end
1219
end

0 commit comments

Comments
 (0)