feat: add an option to preserve whitespace to FullSanitizer

Earlopain · Earlopain · commit 9a336ad18f0c · 2023-05-16T10:04:28.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,9 @@
 
   *Mike Dalessio*
 
+* `FullSanitizer` now supports the optional argument `preserve_whitespace` to keep whitespace around block elements and line break elements.
+
+  *Earlopain*
 
 ## 1.5.0 / 2023-01-20
 
diff --git a/README.md b/README.md
@@ -62,20 +62,15 @@ All sanitizers respond to `sanitize`, and are available in variants that use eit
 full_sanitizer = Rails::HTML5::FullSanitizer.new
 full_sanitizer.sanitize("<b>Bold</b> no more!  <a href='more.html'>See more here</a>...")
 # => Bold no more!  See more here...
-```
 
-or, if you insist on parsing the content as HTML4:
+# By default no whitespace is added where block or line break elements are stripped. If that whitespace is significant, pass preserve_whitespace: true.
+# This option is slower, but inserts newlines around block elements and line break elements.
 
-```ruby
-full_sanitizer = Rails::HTML4::FullSanitizer.new
-full_sanitizer.sanitize("<b>Bold</b> no more!  <a href='more.html'>See more here</a>...")
-# => Bold no more!  See more here...
+full_sanitizer = Rails::HTML5::FullSanitizer.new
+full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
+# => "\nParagraphs\n and \n newlines"
 ```
 
-HTML5 version:
-
-
-
 #### LinkSanitizer
 
 ```ruby
diff --git a/lib/rails/html/sanitizer.rb b/lib/rails/html/sanitizer.rb
@@ -66,6 +66,19 @@ def parse_fragment(html)
         end if Rails::HTML::Sanitizer.html5_support?
       end
 
+      module Sanitizer
+        module PreserveWhitespace
+          def sanitize(html, options = {})
+            return unless html
+            if options[:preserve_whitespace]
+              parse_fragment(html).to_text
+            else
+              super
+            end
+          end
+        end
+      end
+
       module Scrubber
         module Full
           def scrub(fragment, options = {})
@@ -217,11 +230,20 @@ module HTML4
     #   full_sanitizer.sanitize("<b>Bold</b> no more!  <a href='more.html'>See more here</a>...")
     #   # => "Bold no more!  See more here..."
     #
+    # === Options
+    #
+    # If whitespace is significant you can pass preserve_whitespace: true.
+    # This option is slower, but is clever about whitespace around block elements and line break elements.
+    #
+    #   full_sanitizer = Rails::HTML4::FullSanitizer.new
+    #   full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
+    #   # => "\nParagraphs\n and \n newlines"
     class FullSanitizer < Rails::HTML::Sanitizer
       include HTML::Concern::ComposedSanitize
       include HTML::Concern::Parser::HTML4
       include HTML::Concern::Scrubber::Full
       include HTML::Concern::Serializer::UTF8Encode
+      include HTML::Concern::Sanitizer::PreserveWhitespace
     end
 
     # == Rails::HTML4::LinkSanitizer
@@ -307,11 +329,20 @@ module HTML5
     #   full_sanitizer.sanitize("<b>Bold</b> no more!  <a href='more.html'>See more here</a>...")
     #   # => "Bold no more!  See more here..."
     #
+    # === Options
+    #
+    # If whitespace is significant you can pass preserve_whitespace: true.
+    # This option is slower, but is clever about whitespace around block elements and line break elements.
+    #
+    #   full_sanitizer = Rails::HTML5::FullSanitizer.new
+    #   full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
+    #   # => "\nParagraphs\n and \n newlines"
     class FullSanitizer < Rails::HTML::Sanitizer
       include HTML::Concern::ComposedSanitize
       include HTML::Concern::Parser::HTML5
       include HTML::Concern::Scrubber::Full
       include HTML::Concern::Serializer::UTF8Encode
+      include HTML::Concern::Sanitizer::PreserveWhitespace
     end
 
     # == Rails::HTML5::LinkSanitizer
diff --git a/test/sanitizer_test.rb b/test/sanitizer_test.rb
@@ -80,50 +80,53 @@ module FullSanitizerTest
 
     def test_strip_tags_with_quote
       input = '<" <img src="trollface.gif" onload="alert(1)"> hi'
-      result = full_sanitize(input)
       acceptable_results = [
         # libxml2 >= 2.9.14 and xerces+neko
         %{&lt;"  hi},
         # other libxml2
         %{ hi},
+        # preserve_whitespace: true
+        "&lt;&quot;  hi",
       ]
 
-      assert_includes(acceptable_results, result)
+      assert_full_sanitized(acceptable_results, input)
     end
 
     def test_strip_invalid_html
-      assert_equal "&lt;&lt;", full_sanitize("<<<bad html")
+      assert_full_sanitized "&lt;&lt;", "<<<bad html"
     end
 
     def test_strip_nested_tags
       expected = "Wei&lt;a onclick='alert(document.cookie);'/&gt;rdos"
       input = "Wei<<a>a onclick='alert(document.cookie);'</a>/>rdos"
-      assert_equal expected, full_sanitize(input)
+      assert_full_sanitized expected, input
     end
 
     def test_strip_tags_multiline
-      expected = %{This is a test.\n\n\n\nIt no longer contains any HTML.\n}
       input = %{<h1>This is <b>a <a href="" target="_blank">test</a></b>.</h1>\n\n<!-- it has a comment -->\n\n<p>It no <b>longer <strong>contains <em>any <strike>HTML</strike></em>.</strong></b></p>\n}
+      acceptable_results = [
+        %{This is a test.\n\n\n\nIt no longer contains any HTML.\n},
+        # preserve_whitespace: true
+        %{\nThis is a test.\n\nIt no longer contains any HTML.\n\n}
+      ]
 
-      assert_equal expected, full_sanitize(input)
+      assert_full_sanitized acceptable_results, input
     end
 
     def test_remove_unclosed_tags
       input = "This is <-- not\n a comment here."
-      result = full_sanitize(input)
       acceptable_results = [
         # libxml2 >= 2.9.14 and xerces+neko
         %{This is &lt;-- not\n a comment here.},
         # other libxml2
         %{This is },
       ]
 
-      assert_includes(acceptable_results, result)
+      assert_full_sanitized(acceptable_results, input)
     end
 
     def test_strip_cdata
       input = "This has a <![CDATA[<section>]]> here."
-      result = full_sanitize(input)
       acceptable_results = [
         # libxml2 = 2.9.14
         %{This has a &lt;![CDATA[]]&gt; here.},
@@ -133,51 +136,68 @@ def test_strip_cdata
         %{This has a  here.},
       ]
 
-      assert_includes(acceptable_results, result)
+      assert_full_sanitized(acceptable_results, input)
     end
 
     def test_strip_blank_string
       assert_nil full_sanitize(nil)
-      assert_equal "", full_sanitize("")
-      assert_equal "   ", full_sanitize("   ")
+      assert_nil full_sanitize(nil, preserve_whitespace: true)
+      assert_full_sanitized "", ""
+      assert_full_sanitized "   ", "   "
     end
 
     def test_strip_tags_with_plaintext
-      assert_equal "Don't touch me", full_sanitize("Don't touch me")
+      assert_full_sanitized "Don't touch me", "Don't touch me"
     end
 
     def test_strip_tags_with_tags
-      assert_equal "This is a test.", full_sanitize("<p>This <u>is<u> a <a href='test.html'><strong>test</strong></a>.</p>")
+      assert_full_sanitized "This is a test.", "<b>This <u>is<u> a <a href='test.html'><strong>test</strong></a>.</b>"
     end
 
     def test_escape_tags_with_many_open_quotes
-      assert_equal "&lt;&lt;", full_sanitize("<<<bad html>")
+      assert_full_sanitized "&lt;&lt;", "<<<bad html>"
     end
 
     def test_strip_tags_with_sentence
-      assert_equal "This is a test.", full_sanitize("This is a test.")
+      assert_full_sanitized "This is a test.", "This is a test."
     end
 
     def test_strip_tags_with_comment
-      assert_equal "This has a  here.", full_sanitize("This has a <!-- comment --> here.")
+      assert_full_sanitized "This has a  here.", "This has a <!-- comment --> here."
     end
 
     def test_strip_tags_with_frozen_string
-      assert_equal "Frozen string with no tags", full_sanitize("Frozen string with no tags")
+      assert_full_sanitized "Frozen string with no tags", "Frozen string with no tags"
     end
 
     def test_full_sanitize_respect_html_escaping_of_the_given_string
-      assert_equal 'test\r\nstring', full_sanitize('test\r\nstring')
-      assert_equal "&amp;", full_sanitize("&")
-      assert_equal "&amp;", full_sanitize("&amp;")
-      assert_equal "&amp;amp;", full_sanitize("&amp;amp;")
-      assert_equal "omg &lt;script&gt;BOM&lt;/script&gt;", full_sanitize("omg &lt;script&gt;BOM&lt;/script&gt;")
+      assert_full_sanitized 'test\r\nstring', 'test\r\nstring'
+      assert_full_sanitized "&amp;", "&"
+      assert_full_sanitized "&amp;", "&amp;"
+      assert_full_sanitized "&amp;amp;", "&amp;amp;"
+      assert_full_sanitized "omg &lt;script&gt;BOM&lt;/script&gt;", "omg &lt;script&gt;BOM&lt;/script&gt;"
+    end
+
+    def test_full_sanitize_preserve_whitespace
+      assert_equal "\nParagraphs\n and \n newlines", full_sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
+    end
+
+    def test_full_sanitize_preserve_whitespace_ascii_8bit_string
+      full_sanitize("<a>hello</a>".encode("ASCII-8BIT")).tap do |sanitized|
+        assert_equal "hello", sanitized
+        assert_equal Encoding::UTF_8, sanitized.encoding
+      end
     end
 
     protected
       def full_sanitize(input, options = {})
         module_under_test::FullSanitizer.new.sanitize(input, options)
       end
+
+      def assert_full_sanitized(acceptable_results, input)
+        assert_includes(Array(acceptable_results), full_sanitize(input))
+        assert_includes(Array(acceptable_results), full_sanitize(input, preserve_whitespace: true))
+      end
   end
 
   class HTML4FullSanitizerTest < Minitest::Test