Skip to content

Commit 9a336ad

Browse files
committed
feat: add an option to preserve whitespace to FullSanitizer
1 parent 50644ff commit 9a336ad

File tree

4 files changed

+82
-33
lines changed

4 files changed

+82
-33
lines changed

CHANGELOG.md

+3
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414

1515
*Mike Dalessio*
1616

17+
* `FullSanitizer` now supports the optional argument `preserve_whitespace` to keep whitespace around block elements and line break elements.
18+
19+
*Earlopain*
1720

1821
## 1.5.0 / 2023-01-20
1922

README.md

+5-10
Original file line numberDiff line numberDiff line change
@@ -62,20 +62,15 @@ All sanitizers respond to `sanitize`, and are available in variants that use eit
6262
full_sanitizer = Rails::HTML5::FullSanitizer.new
6363
full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
6464
# => Bold no more! See more here...
65-
```
6665

67-
or, if you insist on parsing the content as HTML4:
66+
# Whitespace is swallowed by default. If whitespace is significant, pass preserve_whitespace: true to preserve it.
67+
# This option is slower, but is clever about whitespace around block elements and line break elements.
6868

69-
```ruby
70-
full_sanitizer = Rails::HTML4::FullSanitizer.new
71-
full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
72-
# => Bold no more! See more here...
69+
full_sanitizer = Rails::HTML5::FullSanitizer.new
70+
full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
71+
# => "\nParagraphs\n and \n newlines"
7372
```
7473

75-
HTML5 version:
76-
77-
78-
7974
#### LinkSanitizer
8075

8176
```ruby

lib/rails/html/sanitizer.rb

+31
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,19 @@ def parse_fragment(html)
6666
end if Rails::HTML::Sanitizer.html5_support?
6767
end
6868

69+
module Sanitizer
70+
module PreserveWhitespace
71+
def sanitize(html, options = {})
72+
return unless html
73+
if options[:preserve_whitespace]
74+
parse_fragment(html).to_text
75+
else
76+
super
77+
end
78+
end
79+
end
80+
end
81+
6982
module Scrubber
7083
module Full
7184
def scrub(fragment, options = {})
@@ -217,11 +230,20 @@ module HTML4
217230
# full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
218231
# # => "Bold no more! See more here..."
219232
#
233+
# === Options
234+
#
235+
# If whitespace is significant you can pass preserve_whitespace: true.
236+
# This option is slower, but is clever about whitespace around block elements and line break elements.
237+
#
238+
# full_sanitizer = Rails::HTML4::FullSanitizer.new
239+
# full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
240+
# # => "\nParagraphs\n and \n newlines"
220241
class FullSanitizer < Rails::HTML::Sanitizer
221242
include HTML::Concern::ComposedSanitize
222243
include HTML::Concern::Parser::HTML4
223244
include HTML::Concern::Scrubber::Full
224245
include HTML::Concern::Serializer::UTF8Encode
246+
include HTML::Concern::Sanitizer::PreserveWhitespace
225247
end
226248

227249
# == Rails::HTML4::LinkSanitizer
@@ -307,11 +329,20 @@ module HTML5
307329
# full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
308330
# # => "Bold no more! See more here..."
309331
#
332+
# === Options
333+
#
334+
# If whitespace is significant you can pass preserve_whitespace: true.
335+
# This option is slower, but is clever about whitespace around block elements and line break elements.
336+
#
337+
# full_sanitizer = Rails::HTML5::FullSanitizer.new
338+
# full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
339+
# # => "\nParagraphs\n and \n newlines"
310340
class FullSanitizer < Rails::HTML::Sanitizer
311341
include HTML::Concern::ComposedSanitize
312342
include HTML::Concern::Parser::HTML5
313343
include HTML::Concern::Scrubber::Full
314344
include HTML::Concern::Serializer::UTF8Encode
345+
include HTML::Concern::Sanitizer::PreserveWhitespace
315346
end
316347

317348
# == Rails::HTML5::LinkSanitizer

test/sanitizer_test.rb

+43-23
Original file line numberDiff line numberDiff line change
@@ -80,50 +80,53 @@ module FullSanitizerTest
8080

8181
def test_strip_tags_with_quote
8282
input = '<" <img src="trollface.gif" onload="alert(1)"> hi'
83-
result = full_sanitize(input)
8483
acceptable_results = [
8584
# libxml2 >= 2.9.14 and xerces+neko
8685
%{&lt;" hi},
8786
# other libxml2
8887
%{ hi},
88+
# preserve_whitespace: true
89+
"&lt;&quot; hi",
8990
]
9091

91-
assert_includes(acceptable_results, result)
92+
assert_full_sanitized(acceptable_results, input)
9293
end
9394

9495
def test_strip_invalid_html
95-
assert_equal "&lt;&lt;", full_sanitize("<<<bad html")
96+
assert_full_sanitized "&lt;&lt;", "<<<bad html"
9697
end
9798

9899
def test_strip_nested_tags
99100
expected = "Wei&lt;a onclick='alert(document.cookie);'/&gt;rdos"
100101
input = "Wei<<a>a onclick='alert(document.cookie);'</a>/>rdos"
101-
assert_equal expected, full_sanitize(input)
102+
assert_full_sanitized expected, input
102103
end
103104

104105
def test_strip_tags_multiline
105-
expected = %{This is a test.\n\n\n\nIt no longer contains any HTML.\n}
106106
input = %{<h1>This is <b>a <a href="" target="_blank">test</a></b>.</h1>\n\n<!-- it has a comment -->\n\n<p>It no <b>longer <strong>contains <em>any <strike>HTML</strike></em>.</strong></b></p>\n}
107+
acceptable_results = [
108+
%{This is a test.\n\n\n\nIt no longer contains any HTML.\n},
109+
# preserve_whitespace: true
110+
%{\nThis is a test.\n\nIt no longer contains any HTML.\n\n}
111+
]
107112

108-
assert_equal expected, full_sanitize(input)
113+
assert_full_sanitized acceptable_results, input
109114
end
110115

111116
def test_remove_unclosed_tags
112117
input = "This is <-- not\n a comment here."
113-
result = full_sanitize(input)
114118
acceptable_results = [
115119
# libxml2 >= 2.9.14 and xerces+neko
116120
%{This is &lt;-- not\n a comment here.},
117121
# other libxml2
118122
%{This is },
119123
]
120124

121-
assert_includes(acceptable_results, result)
125+
assert_full_sanitized(acceptable_results, input)
122126
end
123127

124128
def test_strip_cdata
125129
input = "This has a <![CDATA[<section>]]> here."
126-
result = full_sanitize(input)
127130
acceptable_results = [
128131
# libxml2 = 2.9.14
129132
%{This has a &lt;![CDATA[]]&gt; here.},
@@ -133,51 +136,68 @@ def test_strip_cdata
133136
%{This has a here.},
134137
]
135138

136-
assert_includes(acceptable_results, result)
139+
assert_full_sanitized(acceptable_results, input)
137140
end
138141

139142
def test_strip_blank_string
140143
assert_nil full_sanitize(nil)
141-
assert_equal "", full_sanitize("")
142-
assert_equal " ", full_sanitize(" ")
144+
assert_nil full_sanitize(nil, preserve_whitespace: true)
145+
assert_full_sanitized "", ""
146+
assert_full_sanitized " ", " "
143147
end
144148

145149
def test_strip_tags_with_plaintext
146-
assert_equal "Don't touch me", full_sanitize("Don't touch me")
150+
assert_full_sanitized "Don't touch me", "Don't touch me"
147151
end
148152

149153
def test_strip_tags_with_tags
150-
assert_equal "This is a test.", full_sanitize("<p>This <u>is<u> a <a href='test.html'><strong>test</strong></a>.</p>")
154+
assert_full_sanitized "This is a test.", "<b>This <u>is<u> a <a href='test.html'><strong>test</strong></a>.</b>"
151155
end
152156

153157
def test_escape_tags_with_many_open_quotes
154-
assert_equal "&lt;&lt;", full_sanitize("<<<bad html>")
158+
assert_full_sanitized "&lt;&lt;", "<<<bad html>"
155159
end
156160

157161
def test_strip_tags_with_sentence
158-
assert_equal "This is a test.", full_sanitize("This is a test.")
162+
assert_full_sanitized "This is a test.", "This is a test."
159163
end
160164

161165
def test_strip_tags_with_comment
162-
assert_equal "This has a here.", full_sanitize("This has a <!-- comment --> here.")
166+
assert_full_sanitized "This has a here.", "This has a <!-- comment --> here."
163167
end
164168

165169
def test_strip_tags_with_frozen_string
166-
assert_equal "Frozen string with no tags", full_sanitize("Frozen string with no tags")
170+
assert_full_sanitized "Frozen string with no tags", "Frozen string with no tags"
167171
end
168172

169173
def test_full_sanitize_respect_html_escaping_of_the_given_string
170-
assert_equal 'test\r\nstring', full_sanitize('test\r\nstring')
171-
assert_equal "&amp;", full_sanitize("&")
172-
assert_equal "&amp;", full_sanitize("&amp;")
173-
assert_equal "&amp;amp;", full_sanitize("&amp;amp;")
174-
assert_equal "omg &lt;script&gt;BOM&lt;/script&gt;", full_sanitize("omg &lt;script&gt;BOM&lt;/script&gt;")
174+
assert_full_sanitized 'test\r\nstring', 'test\r\nstring'
175+
assert_full_sanitized "&amp;", "&"
176+
assert_full_sanitized "&amp;", "&amp;"
177+
assert_full_sanitized "&amp;amp;", "&amp;amp;"
178+
assert_full_sanitized "omg &lt;script&gt;BOM&lt;/script&gt;", "omg &lt;script&gt;BOM&lt;/script&gt;"
179+
end
180+
181+
def test_full_sanitize_preserve_whitespace
182+
assert_equal "\nParagraphs\n and \n newlines", full_sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
183+
end
184+
185+
def test_full_sanitize_preserve_whitespace_ascii_8bit_string
186+
full_sanitize("<a>hello</a>".encode("ASCII-8BIT")).tap do |sanitized|
187+
assert_equal "hello", sanitized
188+
assert_equal Encoding::UTF_8, sanitized.encoding
189+
end
175190
end
176191

177192
protected
178193
def full_sanitize(input, options = {})
179194
module_under_test::FullSanitizer.new.sanitize(input, options)
180195
end
196+
197+
def assert_full_sanitized(acceptable_results, input)
198+
assert_includes(Array(acceptable_results), full_sanitize(input))
199+
assert_includes(Array(acceptable_results), full_sanitize(input, preserve_whitespace: true))
200+
end
181201
end
182202

183203
class HTML4FullSanitizerTest < Minitest::Test

0 commit comments

Comments
 (0)