Skip to content

Commit

Permalink
Make override of document() method not use super.document
Browse files Browse the repository at this point in the history
More tests.

DAFFODIL-2527
  • Loading branch information
mbeckerle committed May 28, 2021
1 parent 0cfbdd2 commit 04187ae
Show file tree
Hide file tree
Showing 2 changed files with 115 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ object Position {
* properly, creating PCData nodes for the contents of these, and not otherwise
* messing with the contents.
*
* This code is effectively our fork of the Scala ConstructingParser. This
* works around some bugs in it.
*
* Xerces, unfortunately, messes with the contents of these CDATA regions,
* normalizes whitespace inside them, and generally makes it impossible to do things
* in XML that depend on the line structure of element content being preserved.
Expand Down Expand Up @@ -304,29 +307,49 @@ class DaffodilConstructingLoader private[xml] (uri: URI,
/**
* Override of document to make it tolerant of the start of the file
* being whitespace instead of a "<" character
*
* This does not handle DOCTYPEs (aka DTDs) at all. Hence, it is not
* a true replacement (bug fix) for the original ConstructingParser method
* that it overrides.
*/
override def document(): Document = {
doc = new Document()
this.dtd = null
var children: NodeSeq = null

if ('<' == ch) super.document()
else {
// File did not start with "<".
// We want to tolerate whitespace and comments prior to the
// document element.

if ('<' == ch) {
nextch()
if ('?' == ch) {
// It's an XML Prolog
nextch()
val info_prolog = prolog()
doc.version = info_prolog._1
doc.encoding = info_prolog._2
doc.standAlone = info_prolog._3
children = content(TopScope)
} else {
val ts = new NodeBuffer()
content1(TopScope, ts) // the 1 suffix means "without the first < character"
ts &+ content(TopScope)
children = NodeSeq.fromSeq(ts)
}
} else {
children = content(TopScope)
}

val children = content(TopScope)

var isErr = false
var elemCount = 0
var theNode: Node = null
children.foreach{
var isErr = false
var elemCount = 0
var theNode: Node = null
children.foreach { c =>
c match {
case _: ProcInstr => // skip
case _: Comment => // skip
// $COVERAGE-OFF$ // constructing parser never creates these - probably due to a bug
case _: EntityRef => {
reportSyntaxError("no entity references allowed here")
isErr = true
}
// $COVERAGE-ON$
case s: SpecialNode => {
val txt = s.toString.trim()
if (txt.length > 0) {
Expand All @@ -338,23 +361,21 @@ class DaffodilConstructingLoader private[xml] (uri: URI,
elemCount += 1
theNode = m
}
if (1 != elemCount) {
reportSyntaxError("document must contain exactly one element")
isErr = true
}
}
if (1 != elemCount) {
reportSyntaxError("document must contain exactly one element")
isErr = true
}

if (!isErr) {
val doc = new Document()
doc.children = children
doc.docElem = theNode
doc
} else {
null
}
if (!isErr) {
doc.children = children
doc.docElem = theNode
doc
} else {
null
}
}


def load(): Node = {
val res =
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,42 @@ import org.apache.daffodil.api.StringSchemaSource
import org.apache.daffodil.xml.DaffodilXMLLoader
import org.junit.Assert._
import org.junit.Test
import org.xml.sax.SAXParseException

import scala.collection.mutable.ArrayBuffer
import scala.xml.SAXParseException

class TestXMLLoader {

/**
 * Loads a small DFDL schema supplied as an in-memory string and verifies
 * that the targetNamespace attribute of the loaded root node carries the
 * value written in the schema text.
 */
@Test
def test_schemaLoad(): Unit = {
  // The continuation lines start with the margin character; stripMargin
  // removes everything up to and including it on each line.
  val schemaText =
"""<xs:schema targetNamespace="http://example.com"
|xmlns:ex="http://example.com"
|xmlns:xs="http://www.w3.org/2001/XMLSchema"
|xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|xmlns:dfdl="http://www.ogf.org/dfdl/dfdl-1.0/">
| <xs:include schemaLocation="org/apache/daffodil/xsd/DFDLGeneralFormat.dfdl.xsd"/>
| <xs:annotation>
| <xs:appinfo source="http://www.ogf.org/dfdl/">
| <dfdl:format lengthKind="delimited" ref="ex:GeneralFormat"/>
| </xs:appinfo>
| </xs:annotation>
| <xs:element name="e1">
| <xs:complexType>
| <xs:sequence>
| <xs:element name="s1" type="xs:int"/>
| </xs:sequence>
| </xs:complexType>
| </xs:element>
|</xs:schema>
|""".stripMargin
  val xmlLoader = new DaffodilXMLLoader()
  val schemaSource = StringSchemaSource(schemaText)
  val schemaRoot = xmlLoader.load(schemaSource, None, false)
  // Attribute lookup on the root node; .text yields the attribute value.
  val actualNamespace = (schemaRoot \ "@targetNamespace").text
  assertEquals("http://example.com", actualNamespace)
}

/**
* Characterize behavior of scala's xml loader w.r.t. CDATA preservation.
*
Expand Down Expand Up @@ -83,7 +115,7 @@ b&"<>]]>"""))
loader.load(ss, None, false, true)
}
val m = e.getMessage()
assertTrue(m.contains("DOCTYPE is disallowed"))
assertTrue(m.contains("DOCTYPE"))
}

/**
Expand Down Expand Up @@ -164,4 +196,39 @@ b&"<>]]>"""))
val xml = loader.load(ss, None, addPositionAttributes = false)
assertEquals("foo", xml.text)
}

/**
 * Feeds the loader a document that is malformed in several ways at once and
 * checks the diagnostics delivered to the supplied error handler.
 *
 * The input has leading whitespace and no prolog, entity references and bare
 * text at top level, and two sibling elements. Only the messages collected
 * by the TestErrorHandler are inspected; the node returned by load is not.
 *
 * Each assertion passes the collected messages as its failure message, so a
 * failing run shows what the loader actually reported.
 */
@Test def testLoaderCatchesVarousBadXML(): Unit = {
  val xmlText = " \n" + // no prolog, just some whitespace (tolerated)
    "&AnEntityRef;\n" + // entity refs not allowed
    "random text\n" + // bare text not allowed
    "<data>foo</data>\n" +
    "<!-- comment afterwards --><another>element</another>\n&AnotherEntityRef;\nmore random text\n" // other bad stuff
  val teh = new TestErrorHandler()
  val loader = new DaffodilXMLLoader(teh)
  val ss = StringSchemaSource(xmlText)
  // Called for its effect on the error handler; the result is deliberately ignored.
  loader.load(ss, None, addPositionAttributes = false)
  val msgs = teh.exceptions.map { _.getMessage() }.mkString("\n")
  assertTrue(msgs, msgs.contains("non-empty text nodes not allowed"))
  // NOTE(review): this check is also satisfied by "more random text" alone,
  // so it does not prove the first text run was reported on its own.
  assertTrue(msgs, msgs.contains("random text"))
  assertTrue(msgs, msgs.contains("more random text"))
  assertTrue(msgs, msgs.contains("exactly one element"))
}
}

/**
 * SAX ErrorHandler for tests. Rather than throwing or printing, each
 * callback appends the exception it receives to `exceptions`, whatever its
 * severity, so a test can inspect everything that was reported.
 */
class TestErrorHandler extends org.xml.sax.ErrorHandler {

  /** Exceptions received so far, in the order the callbacks were invoked. */
  val exceptions = new ArrayBuffer[SAXParseException]

  /** Records a warning-level report. */
  override def warning(exception: SAXParseException): Unit = {
    exceptions += exception
  }

  /** Records an error-level report. */
  override def error(exception: SAXParseException): Unit = {
    exceptions += exception
  }

  /** Records a fatal-error-level report. */
  override def fatalError(exception: SAXParseException): Unit = {
    exceptions += exception
  }
}

0 comments on commit 04187ae

Please sign in to comment.