Skip to content

Commit

Permalink
Enhance XML Loader to tolerate whitespace
Browse files Browse the repository at this point in the history
ConstructingParser was intolerant of files that didn't
start with "<".

Modified to allow whitespace, processing instructions, and comments
to precede the document element.

Tests are

testLoaderToleratesXMLWithLeadingWhitespace
testLoaderToleratesXMLWithLeadingComments
testLoaderToleratesXMLWithPI

Further enhanced to handle PI that starts with <?xml-model ...?>

This occurs in test infoset data for the iCalendar DFDL schema.
Xerces was tolerating this before, so our constructing loader
should also.

DAFFODIL-2527
  • Loading branch information
mbeckerle committed Jun 1, 2021
1 parent 0dcdae9 commit 6823c91
Show file tree
Hide file tree
Showing 2 changed files with 272 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ object Position {
* properly, creating PCData nodes for the contents of these, and not otherwise
* messing with the contents.
*
* This code is effectively our fork of the Scala ConstructingParser. This
* works around some bugs in it.
*
* Xerces, unfortunately, messes with the contents of these CDATA regions,
* normalizes whitespace inside them, and generally makes it impossible to do things
* in XML that depend on line-structure of element content to being preserved.
Expand Down Expand Up @@ -288,10 +291,149 @@ class DaffodilConstructingLoader private[xml] (uri: URI,
text(pos, s)
}

/**
* Same CRLF/CR => LF processing as text gets.
*/
override def comment(pos: Int, s: String): Comment = {
Comment(text(pos, s).text)
}

/**
* Same CRLF/CR => LF processing as text gets.
*/
override def procInstr(pos: Int, target: String, txt: String) =
ProcInstr(target, text(pos, txt).text)

private def parseXMLPrologAttributes(m: MetaData): (Option[String], Option[String], Option[Boolean]) = {

var info_ver: Option[String] = None
var info_enc: Option[String] = None
var info_stdl: Option[Boolean] = None

var n = 0
m("version") match {
case null =>
case Text("1.0") =>
info_ver = Some("1.0"); n += 1
case _ => reportSyntaxError("cannot deal with versions != 1.0")
}

m("encoding") match {
case null =>
case Text(enc) =>
if (!isValidIANAEncoding(enc))
reportSyntaxError("\"" + enc + "\" is not a valid encoding")
else {
info_enc = Some(enc)
n += 1
}
}

m("standalone") match {
case null =>
case Text("yes") =>
info_stdl = Some(true); n += 1
case Text("no") =>
info_stdl = Some(false); n += 1
case _ => reportSyntaxError("either 'yes' or 'no' expected")
}

if (m.length - n != 0) {
reportSyntaxError(
"only 'version', 'encoding', and 'standalone' attributes are expected in xml prolog. Found: " + m)
}

(info_ver, info_enc, info_stdl)
}

/**
* Override of document to make it tolerant of the start of the file
* being whitespace instead of a "<" character
*
* This does not handle DOCTYPEs (aka DTDs) at all. Hence, is not
* a true replacement (bug fix) on the original ConstructingParser method
* that it overrides.
*/
override def document(): Document = {
doc = new Document()
this.dtd = null
var children: NodeSeq = null

if ('<' == ch) {
nextch()
if ('?' == ch) {
nextch()
// It's probably an XML prolog, but
// there are cases where there is no XML Prolog, but a starting
// PI of <?xml-model href="...."?>
// So we have to recognize as a general PI, then look and see if
// it is a prolog.
val name = xName
xSpace()
val (md, scp) = xAttributes(TopScope)
if (scp != TopScope)
reportSyntaxError("no xmlns definitions allowed.")
xToken('?')
xToken('>')
if (name == "xml") {
val info_prolog = parseXMLPrologAttributes(md)
doc.version = info_prolog._1
doc.encoding = info_prolog._2
doc.standAlone = info_prolog._3
} else {
// not an xml prolog. It's some other PI
// do nothing. We're just skipping those PIs
}
children = content(TopScope)
} else {
val ts = new NodeBuffer()
content1(TopScope, ts) // the 1 suffix means "without the first < character"
ts &+ content(TopScope)
children = NodeSeq.fromSeq(ts)
}
} else {
children = content(TopScope)
}

var isErr = false
var elemCount = 0
var theNode: Node = null
children.foreach { c =>
c match {
case _: ProcInstr => // skip
case _: Comment => // skip
// $COVERAGE-OFF$ // constructing parser never creates these - probably due to a bug
case _: EntityRef => {
reportSyntaxError("no entity references allowed here")
isErr = true
}
// $COVERAGE-ON$
case s: SpecialNode => {
val txt = s.toString.trim()
if (txt.length > 0) {
reportSyntaxError("non-empty text nodes not allowed: '" + txt + "'.")
isErr = true
}
}
case m: Elem =>
elemCount += 1
theNode = m
}
}
if (1 != elemCount) {
reportSyntaxError("document must contain exactly one element")
isErr = true
}

if (!isErr) {
doc.children = children
doc.docElem = theNode
doc
} else {
null
}
}

def load(): Node = {
val res =
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,76 @@ import org.apache.daffodil.api.StringSchemaSource
import org.apache.daffodil.xml.DaffodilXMLLoader
import org.junit.Assert._
import org.junit.Test
import org.xml.sax.SAXParseException

import scala.collection.mutable.ArrayBuffer
import scala.xml.SAXParseException

class TestXMLLoader {

@Test
def test_schemaLoad(): Unit = {
val data =
"""<?xml version="1.0"?>
|<xs:schema
|targetNamespace="http://example.com"
|xmlns:ex="http://example.com"
|xmlns:xs="http://www.w3.org/2001/XMLSchema"
|xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|xmlns:dfdl="http://www.ogf.org/dfdl/dfdl-1.0/">
| <xs:include schemaLocation="org/apache/daffodil/xsd/DFDLGeneralFormat.dfdl.xsd"/>
| <xs:annotation>
| <xs:appinfo source="http://www.ogf.org/dfdl/">
| <dfdl:format lengthKind="delimited" ref="ex:GeneralFormat"/>
| </xs:appinfo>
| </xs:annotation>
| <xs:element name="e1">
| <xs:complexType>
| <xs:sequence>
| <xs:element name="s1" type="xs:int"/>
| </xs:sequence>
| </xs:complexType>
| </xs:element>
|</xs:schema>
|""".stripMargin
val loader = new DaffodilXMLLoader()
val ss = StringSchemaSource(data)
val root =
loader.load(ss, None, false)
assertEquals("http://example.com", (root \ "@targetNamespace").text)
}

@Test
def test_startsWithPINotProlog(): Unit = {
val data =
"""<?xml-model href="../Schematron/iCalendar.sch" type="application/xml" schematypens="http://purl.oclc.org/dsdl/schematron"?>
|<xs:schema
|targetNamespace="http://example.com"
|xmlns:ex="http://example.com"
|xmlns:xs="http://www.w3.org/2001/XMLSchema"
|xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|xmlns:dfdl="http://www.ogf.org/dfdl/dfdl-1.0/">
| <xs:include schemaLocation="org/apache/daffodil/xsd/DFDLGeneralFormat.dfdl.xsd"/>
| <xs:annotation>
| <xs:appinfo source="http://www.ogf.org/dfdl/">
| <dfdl:format lengthKind="delimited" ref="ex:GeneralFormat"/>
| </xs:appinfo>
| </xs:annotation>
| <xs:element name="e1">
| <xs:complexType>
| <xs:sequence>
| <xs:element name="s1" type="xs:int"/>
| </xs:sequence>
| </xs:complexType>
| </xs:element>
|</xs:schema>
|""".stripMargin
val loader = new DaffodilXMLLoader()
val ss = StringSchemaSource(data)
val root =
loader.load(ss, None, false)
assertEquals("http://example.com", (root \ "@targetNamespace").text)
}

/**
* Characterize behavior of scala's xml loader w.r.t. CDATA preservation.
*
Expand Down Expand Up @@ -83,7 +149,7 @@ b&"<>]]>"""))
loader.load(ss, None, false, true)
}
val m = e.getMessage()
assertTrue(m.contains("DOCTYPE is disallowed"))
assertTrue(m.contains("DOCTYPE"))
}

/**
Expand Down Expand Up @@ -137,4 +203,66 @@ b&"<>]]>"""))
xmlFromDafLoaderNormalized.text)

}

@Test def testLoaderToleratesXMLWithLeadingWhitespace(): Unit = {
val xmlTextWithLeadingWhitespace = " \n" + "<data>foo</data>\n"
val loader = new DaffodilXMLLoader()
val ss = StringSchemaSource(xmlTextWithLeadingWhitespace)
val xml = loader.load(ss, None, addPositionAttributes = false)
assertEquals("foo", xml.text)
}

@Test def testLoaderToleratesXMLWithLeadingComments(): Unit = {
val xmlText = " \n" +
" <!-- a comment --> \n <data>foo</data>\n"
val loader = new DaffodilXMLLoader()
val ss = StringSchemaSource(xmlText)
val xml = loader.load(ss, None, addPositionAttributes = false)
assertEquals("foo", xml.text)
}

@Test def testLoaderToleratesXMLWithPI(): Unit = {
val xmlText = " \n" +
"<?aProcInstr yadda yadda ?>\n" +
" <!-- a comment --> \n <data>foo</data>\n"
val loader = new DaffodilXMLLoader()
val ss = StringSchemaSource(xmlText)
val xml = loader.load(ss, None, addPositionAttributes = false)
assertEquals("foo", xml.text)
}

@Test def testLoaderCatchesVarousBadXML(): Unit = {
val xmlText = " \n" + // no prolog some whitespace (tolerated)
"&AnEntityRef;\n" + // entity refs not allowed
"random text\n" + // just text not allowed
"<data>foo</data>\n" +
"<!-- comment afterwards --><another>element</another>\n&AnotherEntityRef;\nmore random text\n" // other bad stuff.
val teh = new TestErrorHandler()
val loader = new DaffodilXMLLoader(teh)
val ss = StringSchemaSource(xmlText)
val xml = loader.load(ss, None, addPositionAttributes = false)
val msgs = teh.exceptions.map{ _.getMessage() }.mkString("\n")
println(msgs)
assertTrue(msgs.contains("non-empty text nodes not allowed"))
assertTrue(msgs.contains("random text"))
assertTrue(msgs.contains("more random text"))
assertTrue(msgs.contains("exactly one element"))
}
}

class TestErrorHandler extends org.xml.sax.ErrorHandler {

val exceptions = new ArrayBuffer[SAXParseException]

def warning(exception: SAXParseException) = {
exceptions += exception
}

def error(exception: SAXParseException) = {
exceptions += exception
}

def fatalError(exception: SAXParseException) = {
exceptions += exception
}
}

0 comments on commit 6823c91

Please sign in to comment.