Skip to content

Commit 48d2142

Browse files
committed
feat: add an option to preserve whitespace to FullSanitizer
1 parent 5a1006f commit 48d2142

File tree

3 files changed

+64
-25
lines changed

3 files changed

+64
-25
lines changed

README.md

+7
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,13 @@ All sanitizers respond to `sanitize`.
6060
full_sanitizer = Rails::Html::FullSanitizer.new
6161
full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
6262
# => Bold no more! See more here...
63+
64+
# Whitespace is swallowed by default. If whitespace is significant, pass preserve_whitespace: true to keep it:
65+
66+
full_sanitizer = Rails::Html::FullSanitizer.new
67+
full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
68+
# => \nParagraphs\n and \n newlines
69+
6370
```
6471

6572
#### LinkSanitizer

lib/rails/html/sanitizer.rb

+14-2
Original file line numberDiff line numberDiff line change
@@ -24,16 +24,28 @@ def properly_encode(fragment, options)
2424
# full_sanitizer = Rails::Html::FullSanitizer.new
2525
# full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
2626
# # => Bold no more! See more here...
27+
#
28+
# === Options
29+
#
30+
# If whitespace is significant, you can pass <tt>preserve_whitespace: true</tt>.
31+
#
32+
# full_sanitizer = Rails::Html::FullSanitizer.new
33+
# full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
34+
# # => \nParagraphs\n and \n newlines
2735
class FullSanitizer < Sanitizer
2836
def sanitize(html, options = {})
2937
return unless html
3038
return html if html.empty?
3139

3240
loofah_fragment = Loofah.fragment(html)
3341

34-
loofah_fragment.scrub!(TextOnlyScrubber.new)
42+
if options[:preserve_whitespace]
43+
loofah_fragment.to_text
44+
else
45+
loofah_fragment.scrub!(TextOnlyScrubber.new)
3546

36-
properly_encode(loofah_fragment, encoding: "UTF-8")
47+
properly_encode(loofah_fragment, encoding: "UTF-8")
48+
end
3749
end
3850
end
3951

test/sanitizer_test.rb

+43-23
Original file line numberDiff line numberDiff line change
@@ -78,50 +78,53 @@ def test_remove_xpaths_called_with_enumerable_xpaths
7878

7979
def test_strip_tags_with_quote
8080
input = '<" <img src="trollface.gif" onload="alert(1)"> hi'
81-
result = full_sanitize(input)
8281
acceptable_results = [
8382
# libxml2 >= 2.9.14 and xerces+neko
8483
%{&lt;" hi},
8584
# other libxml2
8685
%{ hi},
86+
# preserve_whitespace: true
87+
"&lt;&quot; hi",
8788
]
8889

89-
assert_includes(acceptable_results, result)
90+
assert_full_sanitized(acceptable_results, input)
9091
end
9192

9293
def test_strip_invalid_html
93-
assert_equal "&lt;&lt;", full_sanitize("<<<bad html")
94+
assert_full_sanitized "&lt;&lt;", "<<<bad html"
9495
end
9596

9697
def test_strip_nested_tags
9798
expected = "Wei&lt;a onclick='alert(document.cookie);'/&gt;rdos"
9899
input = "Wei<<a>a onclick='alert(document.cookie);'</a>/>rdos"
99-
assert_equal expected, full_sanitize(input)
100+
assert_full_sanitized expected, input
100101
end
101102

102103
def test_strip_tags_multiline
103-
expected = %{This is a test.\n\n\n\nIt no longer contains any HTML.\n}
104104
input = %{<h1>This is <b>a <a href="" target="_blank">test</a></b>.</h1>\n\n<!-- it has a comment -->\n\n<p>It no <b>longer <strong>contains <em>any <strike>HTML</strike></em>.</strong></b></p>\n}
105+
acceptable_results = [
106+
%{This is a test.\n\n\n\nIt no longer contains any HTML.\n},
107+
# preserve_whitespace: true
108+
%{\nThis is a test.\n\nIt no longer contains any HTML.\n\n}
109+
]
105110

106-
assert_equal expected, full_sanitize(input)
111+
assert_full_sanitized acceptable_results, input
107112
end
108113

109114
def test_remove_unclosed_tags
110115
input = "This is <-- not\n a comment here."
111-
result = full_sanitize(input)
112116
acceptable_results = [
113117
# libxml2 >= 2.9.14 and xerces+neko
114118
%{This is &lt;-- not\n a comment here.},
115119
# other libxml2
116120
%{This is },
117121
]
118122

119-
assert_includes(acceptable_results, result)
123+
assert_full_sanitized(acceptable_results, input)
120124
end
121125

122126
def test_strip_cdata
123127
input = "This has a <![CDATA[<section>]]> here."
124-
result = full_sanitize(input)
125128
acceptable_results = [
126129
# libxml2 = 2.9.14
127130
%{This has a &lt;![CDATA[]]&gt; here.},
@@ -131,7 +134,7 @@ def test_strip_cdata
131134
%{This has a here.},
132135
]
133136

134-
assert_includes(acceptable_results, result)
137+
assert_full_sanitized(acceptable_results, input)
135138
end
136139

137140
def test_strip_unclosed_cdata
@@ -153,40 +156,52 @@ def test_strip_unclosed_cdata
153156

154157
def test_strip_blank_string
155158
assert_nil full_sanitize(nil)
156-
assert_equal "", full_sanitize("")
157-
assert_equal " ", full_sanitize(" ")
159+
assert_nil full_sanitize(nil, preserve_whitespace: true)
160+
assert_full_sanitized "", ""
161+
assert_full_sanitized " ", " "
158162
end
159163

160164
def test_strip_tags_with_plaintext
161-
assert_equal "Don't touch me", full_sanitize("Don't touch me")
165+
assert_full_sanitized "Don't touch me", "Don't touch me"
162166
end
163167

164168
def test_strip_tags_with_tags
165-
assert_equal "This is a test.", full_sanitize("<p>This <u>is<u> a <a href='test.html'><strong>test</strong></a>.</p>")
169+
assert_full_sanitized "This is a test.", "<b>This <u>is<u> a <a href='test.html'><strong>test</strong></a>.</b>"
166170
end
167171

168172
def test_escape_tags_with_many_open_quotes
169-
assert_equal "&lt;&lt;", full_sanitize("<<<bad html>")
173+
assert_full_sanitized "&lt;&lt;", "<<<bad html>"
170174
end
171175

172176
def test_strip_tags_with_sentence
173-
assert_equal "This is a test.", full_sanitize("This is a test.")
177+
assert_full_sanitized "This is a test.", "This is a test."
174178
end
175179

176180
def test_strip_tags_with_comment
177-
assert_equal "This has a here.", full_sanitize("This has a <!-- comment --> here.")
181+
assert_full_sanitized "This has a here.", "This has a <!-- comment --> here."
178182
end
179183

180184
def test_strip_tags_with_frozen_string
181-
assert_equal "Frozen string with no tags", full_sanitize("Frozen string with no tags")
185+
assert_full_sanitized "Frozen string with no tags", "Frozen string with no tags"
182186
end
183187

184188
def test_full_sanitize_respect_html_escaping_of_the_given_string
185-
assert_equal 'test\r\nstring', full_sanitize('test\r\nstring')
186-
assert_equal "&amp;", full_sanitize("&")
187-
assert_equal "&amp;", full_sanitize("&amp;")
188-
assert_equal "&amp;amp;", full_sanitize("&amp;amp;")
189-
assert_equal "omg &lt;script&gt;BOM&lt;/script&gt;", full_sanitize("omg &lt;script&gt;BOM&lt;/script&gt;")
189+
assert_full_sanitized 'test\r\nstring', 'test\r\nstring'
190+
assert_full_sanitized "&amp;", "&"
191+
assert_full_sanitized "&amp;", "&amp;"
192+
assert_full_sanitized "&amp;amp;", "&amp;amp;"
193+
assert_full_sanitized "omg &lt;script&gt;BOM&lt;/script&gt;", "omg &lt;script&gt;BOM&lt;/script&gt;"
194+
end
195+
196+
def test_full_sanitize_preserve_whitespace
197+
assert_equal "\na\n\nb\n", full_sanitize("<p>a</p><p>b</p>", preserve_whitespace: true)
198+
end
199+
200+
def test_full_sanitize_preserve_whitespace_ascii_8bit_string
201+
full_sanitize("<a>hello</a>".encode("ASCII-8BIT")).tap do |sanitized|
202+
assert_equal "hello", sanitized
203+
assert_equal Encoding::UTF_8, sanitized.encoding
204+
end
190205
end
191206

192207
def test_strip_links_with_tags_in_tags
@@ -917,6 +932,11 @@ def assert_sanitized(input, expected = nil)
917932
assert_equal((expected || input), safe_list_sanitize(input))
918933
end
919934

935+
def assert_full_sanitized(acceptable_results, input)
936+
assert_includes(Array(acceptable_results), full_sanitize(input))
937+
assert_includes(Array(acceptable_results), full_sanitize(input, preserve_whitespace: true))
938+
end
939+
920940
def sanitize_css(input)
921941
Rails::Html::SafeListSanitizer.new.sanitize_css(input)
922942
end

0 commit comments

Comments
 (0)