Enhance XML Loader to tolerate whitespace

ConstructingParser was intolerant of files that didn't start with "<". Modified to allow whitespace, processing instructions, and comments to precede the document element. Tests are testLoaderToleratesXMLWithLeadingWhitespace testLoaderToleratesXMLWithLeadingComments testLoaderToleratesXMLWithPI Further enhanced to handle PI that starts with <?xml-model ...?> This occurs in test infoset data for the iCalendar DFDL schema. Xerces was tolerating this before, so our constructing loader should also. DAFFODIL-2527
apache · Jun 1, 2021 · 6823c91 · 6823c91
1 parent 0dcdae9
commit 6823c91
Show file tree

Hide file tree

Showing 2 changed files with 272 additions and 2 deletions.
diff --git a/daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilConstructingLoader.scala b/daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilConstructingLoader.scala
@@ -59,6 +59,9 @@ object Position {
  * properly, creating PCData nodes for the contents of these, and not otherwise
  * messing with the contents.
  *
+ * This code is effectively our fork of the Scala ConstructingParser. This
+ * works around some bugs in it.
+ *
  * Xerces, unfortunately, messes with the contents of these CDATA regions,
  * normalizes whitespace inside them, and generally makes it impossible to do things
  * in XML that depend on line-structure of element content to being preserved.
@@ -288,10 +291,149 @@ class DaffodilConstructingLoader private[xml] (uri: URI,
     text(pos, s)
   }
 
+  /**
+   * Same CRLF/CR => LF processing as text gets.
+   */
   override def comment(pos: Int, s: String): Comment = {
     Comment(text(pos, s).text)
   }
 
+  /**
+   * Same CRLF/CR => LF processing as text gets.
+   */
+  override def procInstr(pos: Int, target: String, txt: String) =
+    ProcInstr(target, text(pos, txt).text)
+
+  private def parseXMLPrologAttributes(m: MetaData): (Option[String], Option[String], Option[Boolean]) = {
+
+    var info_ver: Option[String] = None
+    var info_enc: Option[String] = None
+    var info_stdl: Option[Boolean] = None
+
+    var n = 0
+    m("version") match {
+      case null =>
+      case Text("1.0") =>
+        info_ver = Some("1.0"); n += 1
+      case _ => reportSyntaxError("cannot deal with versions != 1.0")
+    }
+
+    m("encoding") match {
+      case null =>
+      case Text(enc) =>
+        if (!isValidIANAEncoding(enc))
+          reportSyntaxError("\"" + enc + "\" is not a valid encoding")
+        else {
+          info_enc = Some(enc)
+          n += 1
+        }
+    }
+
+    m("standalone") match {
+      case null =>
+      case Text("yes") =>
+        info_stdl = Some(true); n += 1
+      case Text("no") =>
+        info_stdl = Some(false); n += 1
+      case _ => reportSyntaxError("either 'yes' or 'no' expected")
+    }
+
+    if (m.length - n != 0) {
+      reportSyntaxError(
+        "only 'version', 'encoding', and 'standalone' attributes are expected in xml prolog. Found: " + m)
+    }
+
+    (info_ver, info_enc, info_stdl)
+  }
+
+  /**
+   * Override of document to make it tolerant of the start of the file
+   * being whitespace instead of a "<" character
+   *
+   * This does not handle DOCTYPEs (aka DTDs) at all. Hence, is not
+   * a true replacement (bug fix) on the original ConstructingParser method
+   * that it overrides.
+   */
+  override def document(): Document = {
+    doc = new Document()
+    this.dtd = null
+    var children: NodeSeq = null
+
+    if ('<' == ch) {
+      nextch()
+      if ('?' == ch) {
+        nextch()
+        // It's probably an XML prolog, but
+        // there are cases where there is no XML Prolog, but a starting
+        // PI of <?xml-model href="...."?>
+        // So we have to recognize as a general PI, then look and see if
+        // it is a prolog.
+        val name = xName
+        xSpace()
+        val (md, scp) = xAttributes(TopScope)
+        if (scp != TopScope)
+          reportSyntaxError("no xmlns definitions allowed.")
+        xToken('?')
+        xToken('>')
+        if (name == "xml") {
+          val info_prolog = parseXMLPrologAttributes(md)
+          doc.version = info_prolog._1
+          doc.encoding = info_prolog._2
+          doc.standAlone = info_prolog._3
+        } else {
+          // not an xml prolog. It's some other PI
+          // do nothing. We're just skipping those PIs
+        }
+        children = content(TopScope)
+      } else {
+        val ts = new NodeBuffer()
+        content1(TopScope, ts) // the 1 suffix means "without the first < character"
+        ts &+ content(TopScope)
+        children = NodeSeq.fromSeq(ts)
+      }
+    } else {
+      children = content(TopScope)
+    }
+
+    var isErr = false
+    var elemCount = 0
+    var theNode: Node = null
+    children.foreach { c =>
+      c match {
+        case _: ProcInstr => // skip
+        case _: Comment => // skip
+        // $COVERAGE-OFF$ // constructing parser never creates these - probably due to a bug
+        case _: EntityRef => {
+          reportSyntaxError("no entity references allowed here")
+          isErr = true
+        }
+        // $COVERAGE-ON$
+        case s: SpecialNode => {
+          val txt = s.toString.trim()
+          if (txt.length > 0) {
+            reportSyntaxError("non-empty text nodes not allowed: '" + txt + "'.")
+            isErr = true
+          }
+        }
+        case m: Elem =>
+          elemCount += 1
+          theNode = m
+      }
+    }
+    if (1 != elemCount) {
+      reportSyntaxError("document must contain exactly one element")
+      isErr = true
+    }
+
+    if (!isErr) {
+      doc.children = children
+      doc.docElem = theNode
+      doc
+    } else {
+      null
+    }
+  }
+
   def load(): Node = {
     val res =
       try {

diff --git a/daffodil-lib/src/test/scala/org/apache/daffodil/xml/test/unit/TestXMLLoader.scala b/daffodil-lib/src/test/scala/org/apache/daffodil/xml/test/unit/TestXMLLoader.scala
@@ -22,10 +22,76 @@ import org.apache.daffodil.api.StringSchemaSource
 import org.apache.daffodil.xml.DaffodilXMLLoader
 import org.junit.Assert._
 import org.junit.Test
-import org.xml.sax.SAXParseException
+
+import scala.collection.mutable.ArrayBuffer
+import scala.xml.SAXParseException
 
 class TestXMLLoader {
 
+  @Test
+  def test_schemaLoad(): Unit = {
+    val data =
+      """<?xml version="1.0"?>
+        |<xs:schema
+        |targetNamespace="http://example.com"
+        |xmlns:ex="http://example.com"
+        |xmlns:xs="http://www.w3.org/2001/XMLSchema"
+        |xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+        |xmlns:dfdl="http://www.ogf.org/dfdl/dfdl-1.0/">
+        |  <xs:include schemaLocation="org/apache/daffodil/xsd/DFDLGeneralFormat.dfdl.xsd"/>
+        |  <xs:annotation>
+        |    <xs:appinfo source="http://www.ogf.org/dfdl/">
+        |      <dfdl:format lengthKind="delimited" ref="ex:GeneralFormat"/>
+        |    </xs:appinfo>
+        |  </xs:annotation>
+        |  <xs:element name="e1">
+        |    <xs:complexType>
+        |      <xs:sequence>
+        |        <xs:element name="s1" type="xs:int"/>
+        |      </xs:sequence>
+        |    </xs:complexType>
+        |  </xs:element>
+        |</xs:schema>
+        |""".stripMargin
+    val loader = new DaffodilXMLLoader()
+    val ss = StringSchemaSource(data)
+    val root =
+      loader.load(ss, None, false)
+    assertEquals("http://example.com", (root \ "@targetNamespace").text)
+  }
+
+  @Test
+  def test_startsWithPINotProlog(): Unit = {
+    val data =
+      """<?xml-model href="../Schematron/iCalendar.sch" type="application/xml" schematypens="http://purl.oclc.org/dsdl/schematron"?>
+        |<xs:schema
+        |targetNamespace="http://example.com"
+        |xmlns:ex="http://example.com"
+        |xmlns:xs="http://www.w3.org/2001/XMLSchema"
+        |xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+        |xmlns:dfdl="http://www.ogf.org/dfdl/dfdl-1.0/">
+        |  <xs:include schemaLocation="org/apache/daffodil/xsd/DFDLGeneralFormat.dfdl.xsd"/>
+        |  <xs:annotation>
+        |    <xs:appinfo source="http://www.ogf.org/dfdl/">
+        |      <dfdl:format lengthKind="delimited" ref="ex:GeneralFormat"/>
+        |    </xs:appinfo>
+        |  </xs:annotation>
+        |  <xs:element name="e1">
+        |    <xs:complexType>
+        |      <xs:sequence>
+        |        <xs:element name="s1" type="xs:int"/>
+        |      </xs:sequence>
+        |    </xs:complexType>
+        |  </xs:element>
+        |</xs:schema>
+        |""".stripMargin
+    val loader = new DaffodilXMLLoader()
+    val ss = StringSchemaSource(data)
+    val root =
+      loader.load(ss, None, false)
+    assertEquals("http://example.com", (root \ "@targetNamespace").text)
+  }
+
   /**
    * Characterize behavior of scala's xml loader w.r.t. CDATA preservation.
    *
@@ -83,7 +149,7 @@ b&"<>]]>"""))
       loader.load(ss, None, false, true)
     }
     val m = e.getMessage()
-    assertTrue(m.contains("DOCTYPE is disallowed"))
+    assertTrue(m.contains("DOCTYPE"))
   }
 
   /**
@@ -137,4 +203,66 @@ b&"<>]]>"""))
       xmlFromDafLoaderNormalized.text)
 
   }
+
+  @Test def testLoaderToleratesXMLWithLeadingWhitespace(): Unit = {
+    val xmlTextWithLeadingWhitespace = "    \n" + "<data>foo</data>\n"
+    val loader = new DaffodilXMLLoader()
+    val ss = StringSchemaSource(xmlTextWithLeadingWhitespace)
+    val xml = loader.load(ss, None, addPositionAttributes = false)
+    assertEquals("foo", xml.text)
+  }
+
+  @Test def testLoaderToleratesXMLWithLeadingComments(): Unit = {
+    val xmlText = "    \n" +
+      "  <!-- a comment -->  \n  <data>foo</data>\n"
+    val loader = new DaffodilXMLLoader()
+    val ss = StringSchemaSource(xmlText)
+    val xml = loader.load(ss, None, addPositionAttributes = false)
+    assertEquals("foo", xml.text)
+  }
+
+  @Test def testLoaderToleratesXMLWithPI(): Unit = {
+    val xmlText = "    \n" +
+    "<?aProcInstr yadda yadda ?>\n" +
+      "  <!-- a comment -->  \n  <data>foo</data>\n"
+    val loader = new DaffodilXMLLoader()
+    val ss = StringSchemaSource(xmlText)
+    val xml = loader.load(ss, None, addPositionAttributes = false)
+    assertEquals("foo", xml.text)
+  }
+
+  @Test def testLoaderCatchesVarousBadXML(): Unit = {
+    val xmlText = "    \n" + // no prolog some whitespace (tolerated)
+      "&AnEntityRef;\n" + // entity refs not allowed
+      "random text\n" + // just text not allowed
+      "<data>foo</data>\n" +
+    "<!-- comment afterwards --><another>element</another>\n&AnotherEntityRef;\nmore random text\n" // other bad stuff.
+    val teh = new TestErrorHandler()
+    val loader = new DaffodilXMLLoader(teh)
+    val ss = StringSchemaSource(xmlText)
+    val xml = loader.load(ss, None, addPositionAttributes = false)
+    val msgs = teh.exceptions.map{ _.getMessage() }.mkString("\n")
+    println(msgs)
+    assertTrue(msgs.contains("non-empty text nodes not allowed"))
+    assertTrue(msgs.contains("random text"))
+    assertTrue(msgs.contains("more random text"))
+    assertTrue(msgs.contains("exactly one element"))
+  }
 }
+
+class TestErrorHandler extends org.xml.sax.ErrorHandler {
+
+  val exceptions = new ArrayBuffer[SAXParseException]
+
+  def warning(exception: SAXParseException) = {
+    exceptions += exception
+  }
+
+  def error(exception: SAXParseException) = {
+    exceptions += exception
+  }
+
+  def fatalError(exception: SAXParseException) = {
+    exceptions += exception
+  }
+}