Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: checkpoint on preserve CR issue #1244

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.daffodil.xml

import org.apache.commons.io.FileUtils
import org.junit.Test
import org.apache.daffodil.CLI.Util._
import org.apache.daffodil.Main.ExitCode
import org.junit.Assert.assertTrue

import java.nio.charset.StandardCharsets

/**
 * CLI integration tests for the XML conversion control settings that govern
 * how carriage-return (CR) characters in parsed data are represented in the
 * XML infoset.
 *
 * Each test selects a configuration file with `-c`, runs the CLI against the
 * `aString.dfdl.xsd` schema, and checks the text written to a temporary file.
 */
class TestXMLConversionControl {

  //
  // To run tests conveniently under IntelliJ IDEA,
  // rename the src/test dir to src/test1. Rename the src/it dir to src/test.
  // Then modify this val to be "test".
  // Then you can run these as ordinary junit-style tests under the IDE.
  val test = "it"

  /** Resolves a file under the CLI test-resource directory for the current `test` source set. */
  private def cliResource(relativePath: String) =
    path(s"daffodil-cli/src/$test/resources/org/apache/daffodil/CLI/$relativePath")

  /** Schema shared by every test in this class. */
  private def aStringSchema = cliResource("aString.dfdl.xsd")

  /** Configuration file used by the tests that expect CRs to be dropped from the XML. */
  private def convertCRConfig = cliResource("config-convertCR.cfg.xml")

  /** Configuration file used by the tests that expect CRs to be carried as U+E00D in the XML. */
  private def preserveCRConfig = cliResource("config-preserveCR.cfg.xml")

  /** Input data file shared by the parse tests. */
  private def crlfInput = cliResource("input/inputWithCRLFs.bin")

  /** Reads an entire file as UTF-8 text. */
  private def readUtf8(file: java.io.File): String =
    FileUtils.readFileToString(file, StandardCharsets.UTF_8)

  /**
   * Renders line-ending related characters visibly so that an assertion
   * failure message shows exactly which of CR, LF, or U+E00D was present.
   */
  private def visible(s: String): String =
    s.replace("\r", "\\r").replace("\n", "\\n").replace("\uE00D", "<U+E00D>")

  /** Asserts that `actual` contains `expected`, reporting both on failure. */
  private def assertContains(actual: String, expected: String): Unit =
    assertTrue(
      s"Expected text containing [${visible(expected)}] but got [${visible(actual)}]",
      actual.contains(expected))

  /** Parsing with the convertCR configuration must yield LF-only line endings in the XML. */
  @Test def test_CLI_XMLConversionControlConvertCR(): Unit = {
    withTempFile { output =>
      val schema = aStringSchema
      val config = convertCRConfig
      val input = crlfInput

      // Nothing to send interactively; only the exit code matters here.
      runCLI(args"parse -s $schema -c $config --root a -o $output $input") { _ => () }(ExitCode.Success)

      assertContains(readUtf8(output.toFile), "<ex:a xmlns:ex=\"urn:ex\">abc\ndef\nghi</ex:a>")
    }
  }

  /** Parsing with the preserveCR configuration must represent each CR as U+E00D before the LF. */
  @Test def test_CLI_XMLConversionControlPreserveCRParse(): Unit = {
    withTempFile { output =>
      val schema = aStringSchema
      val config = preserveCRConfig
      val input = crlfInput

      runCLI(args"parse -s $schema -c $config --root a -o $output $input") { _ => () }(ExitCode.Success)

      assertContains(
        readUtf8(output.toFile),
        "<ex:a xmlns:ex=\"urn:ex\">abc\uE00D\ndef\uE00D\nghi</ex:a>")
    }
  }

  /**
   * Parse followed by unparse with the preserveCR configuration: the
   * intermediate XML must carry U+E00D, and the unparsed data must contain
   * real CRLF line endings again.
   */
  @Test def test_CLI_XMLConversionControlPreserveCRRoundTrip(): Unit = {
    withTempFile { output =>
      withTempFile { xmlOut =>
        val schema = aStringSchema
        val config = preserveCRConfig
        val input = crlfInput

        val parseCmd = args"parse -s $schema -c $config --root a -o $xmlOut $input "
        runCLI(parseCmd) { _ => () }(ExitCode.Success)

        val unparseCmd = args"unparse -s $schema -c $config --root a -o $output $xmlOut"
        runCLI(unparseCmd) { _ => () }(ExitCode.Success)

        // Checked inside the inner scope, while the intermediate XML temp file is still in use.
        assertContains(readUtf8(xmlOut.toFile), "abc\uE00D\ndef\uE00D\nghi")
      }

      // This is the unparsed data, not XML.
      assertContains(readUtf8(output.toFile), "abc\r\ndef\r\nghi")
    }
  }

  /** Unparsing XML supplied on stdin that contains U+E00D must write CRLF to the output file. */
  @Test def test_CLI_XMLConversionControlPreserveCRUnparseToFile(): Unit = {
    withTempFile { output =>
      val schema = aStringSchema
      val config = preserveCRConfig

      runCLI(args"unparse -s $schema -c $config --root a -o $output ") { cli =>
        cli.send("<ex:a xmlns:ex='urn:ex'>abc\uE00D\ndef\uE00D\nghi</ex:a>", inputDone = true)
      }(ExitCode.Success)

      assertContains(readUtf8(output.toFile), "abc\r\ndef\r\nghi")
    }
  }
}
32 changes: 17 additions & 15 deletions daffodil-cli/src/main/scala/org/apache/daffodil/InfosetTypes.scala
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ case class XMLTextInfosetHandler(dataProcessor: DataProcessor)
extends InfosetHandler {

def parse(input: InputSourceDataInputStream, os: OutputStream): InfosetParseResult = {
val output = new XMLTextInfosetOutputter(os, pretty = true)
val output = new XMLTextInfosetOutputter(os, pretty = true, dataProcessor.daffodilConfig.xmlConversionControl)
val pr = parseWithInfosetOutputter(input, output)
new InfosetParseResult(pr)
}
Expand All @@ -286,7 +286,7 @@ case class XMLTextInfosetHandler(dataProcessor: DataProcessor)
case bytes: Array[Byte] => new ByteArrayInputStream(bytes)
case is: InputStream => is
}
val input = new XMLTextInfosetInputter(is)
val input = new XMLTextInfosetInputter(is, dataProcessor.daffodilConfig.xmlConversionControl)
val ur = unparseWithInfosetInputter(input, output)
ur
}
Expand Down Expand Up @@ -330,14 +330,14 @@ case class JDOMInfosetHandler(dataProcessor: DataProcessor)
extends InfosetHandler {

def parse(input: InputSourceDataInputStream, os: OutputStream): InfosetParseResult = {
val output = new JDOMInfosetOutputter()
val output = new JDOMInfosetOutputter(dataProcessor.daffodilConfig.xmlConversionControl)
val pr = parseWithInfosetOutputter(input, output)
new JDOMInfosetParseResult(pr, output)
}

def unparse(data: AnyRef, output: DFDL.Output): UnparseResult = {
val doc = data.asInstanceOf[org.jdom2.Document]
val input = new JDOMInfosetInputter(doc)
val input = new JDOMInfosetInputter(doc, dataProcessor.daffodilConfig.xmlConversionControl)
val ur = unparseWithInfosetInputter(input, output)
ur
}
Expand Down Expand Up @@ -372,14 +372,14 @@ case class ScalaXMLInfosetHandler(dataProcessor: DataProcessor)
extends InfosetHandler {

def parse(input: InputSourceDataInputStream, os: OutputStream): InfosetParseResult = {
val output = new ScalaXMLInfosetOutputter()
val output = new ScalaXMLInfosetOutputter(dataProcessor.daffodilConfig.xmlConversionControl)
val pr = parseWithInfosetOutputter(input, output)
new ScalaXMLInfosetParseResult(pr, output)
}

def unparse(data: AnyRef, output: DFDL.Output): UnparseResult = {
val node = data.asInstanceOf[scala.xml.Node]
val input = new ScalaXMLInfosetInputter(node)
val input = new ScalaXMLInfosetInputter(node, dataProcessor.daffodilConfig.xmlConversionControl)
val ur = unparseWithInfosetInputter(input, output)
ur
}
Expand Down Expand Up @@ -414,15 +414,17 @@ class ScalaXMLInfosetParseResult(parseResult: ParseResult, output: ScalaXMLInfos
case class W3CDOMInfosetHandler(dataProcessor: DataProcessor)
extends InfosetHandler {

private val xcc = dataProcessor.daffodilConfig.xmlConversionControl

def parse(input: InputSourceDataInputStream, os: OutputStream): InfosetParseResult = {
val output = new W3CDOMInfosetOutputter()
val output = new W3CDOMInfosetOutputter(xcc)
val pr = parseWithInfosetOutputter(input, output)
new W3CDOMInfosetParseResult(pr, output)
}

def unparse(data: AnyRef, output: DFDL.Output): UnparseResult = {
val doc = data.asInstanceOf[ThreadLocal[org.w3c.dom.Document]].get
val input = new W3CDOMInfosetInputter(doc)
val input = new W3CDOMInfosetInputter(doc, xcc)
val ur = unparseWithInfosetInputter(input, output)
ur
}
Expand Down Expand Up @@ -473,18 +475,18 @@ case class NULLInfosetHandler(dataProcessor: DataProcessor)
}

def unparse(data: AnyRef, output: DFDL.Output): UnparseResult = {
val events = data.asInstanceOf[Array[NullInfosetInputter.Event]]
val input = new NullInfosetInputter(events)
val is = data match {
case bytes: Array[Byte] => new ByteArrayInputStream(bytes)
case is: InputStream => is
}
val input = new NullInfosetInputter(is, dataProcessor.daffodilConfig.xmlConversionControl)
val ur = unparseWithInfosetInputter(input, output)
ur
}

def dataToInfoset(bytes: Array[Byte]): AnyRef = dataToInfoset(new ByteArrayInputStream(bytes))
def dataToInfoset(bytes: Array[Byte]): AnyRef = bytes

def dataToInfoset(stream: InputStream): AnyRef = {
val events = NullInfosetInputter.toEvents(stream)
events
}
def dataToInfoset(stream: InputStream): AnyRef = stream
}

/**
Expand Down
2 changes: 1 addition & 1 deletion daffodil-cli/src/main/scala/org/apache/daffodil/Main.scala
Original file line number Diff line number Diff line change
Expand Up @@ -874,7 +874,7 @@ object Main {
val tunables = DaffodilTunables.configPlusMoreTunablesMap(performanceOpts.tunables, optDafConfig)
createProcessorFromSchema(performanceOpts.schema(), performanceOpts.rootNS.toOption, performanceOpts.path.toOption, tunables, validate)
}
}.map{ _.withExternalVariables(combineExternalVariables(performanceOpts.vars, optDafConfig)) }
}.map{ _.withExternalVariables(combineExternalVariables(performanceOpts.vars, dafConfig)) }
.map{ _.withValidationMode(validate) }

val rc: ExitCode.Value = processor match {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!--
Minimal DFDL schema in the target namespace urn:ex. It declares a single
global element "a" of type xs:string.
-->
<schema
xmlns:dfdl="http://www.ogf.org/dfdl/dfdl-1.0/"
targetNamespace="urn:ex"
xmlns:ex="urn:ex"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns="http://www.w3.org/2001/XMLSchema">

<!-- Brings in the GeneralFormat definition referenced by the dfdl:format below. -->
<include schemaLocation="org/apache/daffodil/xsd/DFDLGeneralFormat.dfdl.xsd" />

<!-- Schema-wide format: GeneralFormat with lengthKind set to "delimited". -->
<annotation>
<appinfo source="http://www.ogf.org/dfdl/">
<dfdl:format ref="ex:GeneralFormat"
lengthKind="delimited"/>
</appinfo>
</annotation>

<!-- The only element; no element-specific DFDL properties are declared on it. -->
<element name="a" type="xs:string"/>

</schema>
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
abc
def
ghi
45 changes: 43 additions & 2 deletions daffodil-core/src/main/scala/org/apache/daffodil/dsom/Facets.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,41 @@ import java.math.BigInteger
import scala.xml.Node
import org.apache.daffodil.exceptions.Assert
import org.apache.daffodil.dpath.NodeInfo.PrimType
import org.apache.daffodil.xml.XMLUtils
import org.apache.daffodil.xml.RemapPUAToXMLIllegalChar

object Facets {

/**
* Remapper used to convert pattern facet values
* so that they can describe the DFDL infoset (for use
* in our limited Daffodil-itself validation), while
* the same pattern remains useful for full validation
* by a regular XSD validator.
*
* A regular XML validator (ex: Xerces) will need to look at the
* infoset as we've mapped it to the PUA. Hence, if the
* pattern is looking for say, control characters, it cannot
* look for control-A (U+0001), because that will have been
* remapped to U+E001.
*
* So the pattern facet value will have E001 in it, likely
* expressed as `&#xE001;`. That will work fine for
* external validation by Xerces or other.
*
* But Daffodil's internal (aka limited) validation operates
* on the regular DFDL infoset, before any remapping for XML occurs.
*
* So we instead map the pattern facet value itself down
* so that the `&#xE001;` in the pattern turns into an actual
* control-A (U+0001, i.e. \x01) in the regex as is used for limited
* validation. Likewise `&#xE000;` turns into an actual NUL (U+0000).
*/
private val remapper =
new RemapPUAToXMLIllegalChar()
}

trait Facets { self: Restriction =>
import org.apache.daffodil.dsom.FacetTypes._
import Facets._

private def retrieveFacetValueFromRestrictionBase(xml: Node, facetName: Facet.Type): String = {
val res = xml \\ "restriction" \ facetName.toString() \ "@value"
Expand Down Expand Up @@ -151,7 +182,17 @@ trait Facets { self: Restriction =>
// The XSD numeric character entity &#xE000; can be used to match ASCII NUL
// (char code 0).
//
val remapped: String = XMLUtils.remapPUAToXMLIllegalCharacters(v)
// This remapping is for pattern facets, which are inside a DFDL schema,
// and so will not contain CR characters, since XML reading will convert those
// to LF. To discuss CR in this pattern we can't use `&#x0d;` syntax because that
// turns into a CR which gets turned into a LF. Plus the pattern value is
// an XML attribute, the value of which gets its whitespace collapsed, all
// line-ending chars converted to spaces, and adjacent spaces collapsed to one.
//
// So a pattern facet must use `\r` and `\n` to describe line-endings within the pattern.
// And in general one must be careful about whitespace.
//
val remapped: String = remapper.remap(v)
(f, remapped.r)
}
}
Expand Down
Loading