Make override of document() method not use super.document

More tests. DAFFODIL-2527
apache · May 28, 2021 · 04187ae · 04187ae
1 parent 0cfbdd2
commit 04187ae
Show file tree

Hide file tree

Showing 2 changed files with 115 additions and 27 deletions.
diff --git a/daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilConstructingLoader.scala b/daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilConstructingLoader.scala
@@ -59,6 +59,9 @@ object Position {
  * properly, creating PCData nodes for the contents of these, and not otherwise
  * messing with the contents.
  *
+ * This code is effectively our fork of the Scala ConstructingParser. Forking it
+ * lets us work around some bugs in the original.
+ *
  * Xerces, unfortunately, messes with the contents of these CDATA regions,
  * normalizes whitespace inside them, and generally makes it impossible to do things
  * in XML that depend on the line-structure of element content being preserved.
@@ -304,29 +307,49 @@ class DaffodilConstructingLoader private[xml] (uri: URI,
   /**
    * Override of document to make it tolerant of the start of the file
    * being whitespace instead of a "<" character
+   *
+   * This does not handle DOCTYPEs (aka DTDs) at all. Hence, it is not
+   * a true replacement (bug fix) for the original ConstructingParser method
+   * that it overrides.
    */
   override def document(): Document = {
+    doc = new Document()
+    this.dtd = null
+    var children: NodeSeq = null
 
-    if ('<' == ch) super.document()
-    else {
-      // File did not start with "<".
-      // We want to tolerate whitespace and comments prior to the
-      // document element.
-
+    if ('<' == ch) {
       nextch()
+      if ('?' == ch) {
+        // It's an XML Prolog
+        nextch()
+        val info_prolog = prolog()
+        doc.version = info_prolog._1
+        doc.encoding = info_prolog._2
+        doc.standAlone = info_prolog._3
+        children = content(TopScope)
+      } else {
+        val ts = new NodeBuffer()
+        content1(TopScope, ts) // the 1 suffix means "without the first < character"
+        ts &+ content(TopScope)
+        children = NodeSeq.fromSeq(ts)
+      }
+    } else {
+      children = content(TopScope)
+    }
 
-      val children = content(TopScope)
-
-      var isErr = false
-      var elemCount = 0
-      var theNode: Node = null
-      children.foreach{
+    var isErr = false
+    var elemCount = 0
+    var theNode: Node = null
+    children.foreach { c =>
+      c match {
         case _: ProcInstr => // skip
         case _: Comment => // skip
+        // $COVERAGE-OFF$ // constructing parser never creates these - probably due to a bug
         case _: EntityRef => {
           reportSyntaxError("no entity references allowed here")
           isErr = true
         }
+        // $COVERAGE-ON$
         case s: SpecialNode => {
           val txt = s.toString.trim()
           if (txt.length > 0) {
@@ -338,23 +361,21 @@ class DaffodilConstructingLoader private[xml] (uri: URI,
           elemCount += 1
           theNode = m
       }
-      if (1 != elemCount) {
-        reportSyntaxError("document must contain exactly one element")
-        isErr = true
-      }
+    }
+    if (1 != elemCount) {
+      reportSyntaxError("document must contain exactly one element")
+      isErr = true
+    }
 
-      if (!isErr) {
-        val doc = new Document()
-        doc.children = children
-        doc.docElem = theNode
-        doc
-      } else {
-        null
-      }
+    if (!isErr) {
+      doc.children = children
+      doc.docElem = theNode
+      doc
+    } else {
+      null
     }
   }
 
-
   def load(): Node = {
     val res =
       try {

diff --git a/daffodil-lib/src/test/scala/org/apache/daffodil/xml/test/unit/TestXMLLoader.scala b/daffodil-lib/src/test/scala/org/apache/daffodil/xml/test/unit/TestXMLLoader.scala
@@ -22,10 +22,42 @@ import org.apache.daffodil.api.StringSchemaSource
 import org.apache.daffodil.xml.DaffodilXMLLoader
 import org.junit.Assert._
 import org.junit.Test
-import org.xml.sax.SAXParseException
+
+import scala.collection.mutable.ArrayBuffer
+import scala.xml.SAXParseException
 
 class TestXMLLoader {
 
+  @Test
+  def test_schemaLoad(): Unit = {
+    val data =
+      """<xs:schema targetNamespace="http://example.com"
+        |xmlns:ex="http://example.com"
+        |xmlns:xs="http://www.w3.org/2001/XMLSchema"
+        |xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+        |xmlns:dfdl="http://www.ogf.org/dfdl/dfdl-1.0/">
+        |  <xs:include schemaLocation="org/apache/daffodil/xsd/DFDLGeneralFormat.dfdl.xsd"/>
+        |  <xs:annotation>
+        |    <xs:appinfo source="http://www.ogf.org/dfdl/">
+        |      <dfdl:format lengthKind="delimited" ref="ex:GeneralFormat"/>
+        |    </xs:appinfo>
+        |  </xs:annotation>
+        |  <xs:element name="e1">
+        |    <xs:complexType>
+        |      <xs:sequence>
+        |        <xs:element name="s1" type="xs:int"/>
+        |      </xs:sequence>
+        |    </xs:complexType>
+        |  </xs:element>
+        |</xs:schema>
+        |""".stripMargin
+    val loader = new DaffodilXMLLoader()
+    val ss = StringSchemaSource(data)
+    val root =
+      loader.load(ss, None, false)
+    assertEquals("http://example.com", (root \ "@targetNamespace").text)
+  }
+
   /**
    * Characterize behavior of scala's xml loader w.r.t. CDATA preservation.
    *
@@ -83,7 +115,7 @@ b&"<>]]>"""))
       loader.load(ss, None, false, true)
     }
     val m = e.getMessage()
-    assertTrue(m.contains("DOCTYPE is disallowed"))
+    assertTrue(m.contains("DOCTYPE"))
   }
 
   /**
@@ -164,4 +196,39 @@ b&"<>]]>"""))
     val xml = loader.load(ss, None, addPositionAttributes = false)
     assertEquals("foo", xml.text)
   }
+
+  @Test def testLoaderCatchesVarousBadXML(): Unit = {
+    val xmlText = "    \n" + // no prolog some whitespace (tolerated)
+      "&AnEntityRef;\n" + // entity refs not allowed
+      "random text\n" + // just text not allowed
+      "<data>foo</data>\n" +
+    "<!-- comment afterwards --><another>element</another>\n&AnotherEntityRef;\nmore random text\n" // other bad stuff.
+    val teh = new TestErrorHandler()
+    val loader = new DaffodilXMLLoader(teh)
+    val ss = StringSchemaSource(xmlText)
+    val xml = loader.load(ss, None, addPositionAttributes = false)
+    val msgs = teh.exceptions.map{ _.getMessage() }.mkString("\n")
+    println(msgs)
+    assertTrue(msgs.contains("non-empty text nodes not allowed"))
+    assertTrue(msgs.contains("random text"))
+    assertTrue(msgs.contains("more random text"))
+    assertTrue(msgs.contains("exactly one element"))
+  }
 }
+
+class TestErrorHandler extends org.xml.sax.ErrorHandler {
+
+  val exceptions = new ArrayBuffer[SAXParseException]
+
+  def warning(exception: SAXParseException) = {
+    exceptions += exception
+  }
+
+  def error(exception: SAXParseException) = {
+    exceptions += exception
+  }
+
+  def fatalError(exception: SAXParseException) = {
+    exceptions += exception
+  }
+}