From ca709af6caca2d93dab31e479b86284fd3610ece Mon Sep 17 00:00:00 2001 From: Michael Beckerle Date: Wed, 11 Oct 2023 08:21:17 -0400 Subject: [PATCH 1/8] Adding Daffodil to Drill as a 'contrib' 3.7.0-SNAPSHOT of Daffodil which has metadata support we're using. New format-daffodil module created Still uses absolute paths for the schemaFileURI. (which is cheating. Wouldn't work in a true distributed drill environment.) We have yet to work out how to enable Drill to provide access for DFDL schemas in XML form with include/import to be resolved. The input data stream is, however, being accessed in the proper Drill manner. Gunzip happened automatically. Nice. Note: Fix boxed Boolean vs. boolean problem. Don't use boxed primitives in Format config objects. Test show this works for data as complex as having nested repeating sub-records. These DFDL types are supported: - int - long - short - byte - boolean - double - float (does not work. Bug DAFFODIL-2367) - hexBinary - string https://github.com/apache/drill/issues/2835 --- contrib/format-daffodil/README.md | 19 ++ contrib/format-daffodil/pom.xml | 94 ++++++ .../store/daffodil/DaffodilBatchReader.java | 184 +++++++++++ .../DaffodilDrillInfosetOutputter.java | 296 ++++++++++++++++++ .../store/daffodil/DaffodilFormatConfig.java | 119 +++++++ .../store/daffodil/DaffodilFormatPlugin.java | 83 +++++ .../store/daffodil/DaffodilMessageParser.java | 198 ++++++++++++ .../schema/DaffodilDataProcessorFactory.java | 154 +++++++++ .../schema/DrillDaffodilSchemaUtils.java | 102 ++++++ .../schema/DrillDaffodilSchemaVisitor.java | 225 +++++++++++++ .../resources/bootstrap-format-plugins.json | 37 +++ .../src/main/resources/drill-module.conf | 25 ++ .../store/daffodil/TestDaffodilReader.java | 256 +++++++++++++++ ...TestDaffodilToDrillMetadataConversion.java | 71 +++++ .../src/test/resources/data/data01Int.dat | Bin 0 -> 4 bytes .../src/test/resources/data/data01Int.dat.gz | Bin 0 -> 35 bytes .../src/test/resources/data/data02Int.dat | Bin 
0 -> 8 bytes .../src/test/resources/data/data06Int.dat | Bin 0 -> 24 bytes .../src/test/resources/data/data12Int.dat | Bin 0 -> 48 bytes .../test/resources/data/moreTypes1.txt.dat | 2 + .../test/resources/schema/complex1.dfdl.xsd | 54 ++++ .../test/resources/schema/complex2.dfdl.xsd | 65 ++++ .../resources/schema/complexArray1.dfdl.xsd | 65 ++++ .../resources/schema/complexArray2.dfdl.xsd | 65 ++++ .../test/resources/schema/moreTypes1.dfdl.xsd | 70 +++++ .../src/test/resources/schema/simple.dfdl.xsd | 71 +++++ .../schema/simpleArrayField1.dfdl.xsd | 71 +++++ contrib/pom.xml | 1 + distribution/pom.xml | 5 + distribution/src/assemble/component.xml | 1 + pom.xml | 6 + 31 files changed, 2339 insertions(+) create mode 100644 contrib/format-daffodil/README.md create mode 100644 contrib/format-daffodil/pom.xml create mode 100644 contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilBatchReader.java create mode 100644 contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java create mode 100644 contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatConfig.java create mode 100644 contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatPlugin.java create mode 100644 contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilMessageParser.java create mode 100644 contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java create mode 100644 contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java create mode 100644 contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java create mode 100644 contrib/format-daffodil/src/main/resources/bootstrap-format-plugins.json create mode 100644 contrib/format-daffodil/src/main/resources/drill-module.conf 
create mode 100644 contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java create mode 100644 contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/schema/TestDaffodilToDrillMetadataConversion.java create mode 100644 contrib/format-daffodil/src/test/resources/data/data01Int.dat create mode 100644 contrib/format-daffodil/src/test/resources/data/data01Int.dat.gz create mode 100644 contrib/format-daffodil/src/test/resources/data/data02Int.dat create mode 100644 contrib/format-daffodil/src/test/resources/data/data06Int.dat create mode 100644 contrib/format-daffodil/src/test/resources/data/data12Int.dat create mode 100644 contrib/format-daffodil/src/test/resources/data/moreTypes1.txt.dat create mode 100644 contrib/format-daffodil/src/test/resources/schema/complex1.dfdl.xsd create mode 100644 contrib/format-daffodil/src/test/resources/schema/complex2.dfdl.xsd create mode 100644 contrib/format-daffodil/src/test/resources/schema/complexArray1.dfdl.xsd create mode 100644 contrib/format-daffodil/src/test/resources/schema/complexArray2.dfdl.xsd create mode 100644 contrib/format-daffodil/src/test/resources/schema/moreTypes1.dfdl.xsd create mode 100644 contrib/format-daffodil/src/test/resources/schema/simple.dfdl.xsd create mode 100644 contrib/format-daffodil/src/test/resources/schema/simpleArrayField1.dfdl.xsd diff --git a/contrib/format-daffodil/README.md b/contrib/format-daffodil/README.md new file mode 100644 index 00000000000..d036c63b2e8 --- /dev/null +++ b/contrib/format-daffodil/README.md @@ -0,0 +1,19 @@ +# Daffodil 'Format' Reader +This plugin enables Drill to read DFDL-described data from files by way of the Apache Daffodil DFDL implementation. + +## Limitations: TBD + +At the moment, the DFDL schema is found on the local file system, which won't continue to work. + +There are restrictions on the DFDL schemas that this can handle. 
+ +In particular, all element children must have distinct element names, including across choice branches. +(This rules out a number of large DFDL schemas.) + +The data is parsed fully from its native form into a Drill data structure held in memory. +No attempt is made to avoid access to parts of the DFDL-described data that are not needed to answer the query. + +If the data is not well-formed, an error occurs and the query fails. + +If the data is invalid, and validity checking by Daffodil is enabled, then an error occurs and the query fails. + diff --git a/contrib/format-daffodil/pom.xml b/contrib/format-daffodil/pom.xml new file mode 100644 index 00000000000..01955c746dc --- /dev/null +++ b/contrib/format-daffodil/pom.xml @@ -0,0 +1,94 @@ + + + + 4.0.0 + + + drill-contrib-parent + org.apache.drill.contrib + 1.22.0-SNAPSHOT + + + drill-format-daffodil + Drill : Contrib : Format : Daffodil + + + + org.apache.drill.exec + drill-java-exec + ${project.version} + + + org.apache.daffodil + daffodil-japi_2.12 + 3.7.0-SNAPSHOT + + + org.apache.daffodil + daffodil-runtime1_2.12 + 3.7.0-SNAPSHOT + + + + org.apache.drill.exec + drill-java-exec + tests + ${project.version} + test + + + + org.apache.drill + drill-common + tests + ${project.version} + test + + + + + + + maven-resources-plugin + + + copy-java-sources + process-sources + + copy-resources + + + ${basedir}/target/classes/org/apache/drill/exec/store/daffodil + + + + src/main/java/org/apache/drill/exec/store/daffodil + true + + + + + + + + + diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilBatchReader.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilBatchReader.java new file mode 100644 index 00000000000..20a0d81d4e0 --- /dev/null +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilBatchReader.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.drill.exec.store.daffodil; + +import java.io.InputStream; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.Objects; + +import org.apache.daffodil.japi.DataProcessor; +import org.apache.drill.common.AutoCloseables; +import org.apache.drill.common.exceptions.CustomErrorContext; +import org.apache.drill.common.exceptions.UserException; +import org.apache.drill.exec.physical.impl.scan.v3.ManagedReader; +import org.apache.drill.exec.physical.impl.scan.v3.file.FileDescrip; +import org.apache.drill.exec.physical.impl.scan.v3.file.FileSchemaNegotiator; +import org.apache.drill.exec.physical.resultSet.RowSetLoader; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.apache.drill.exec.store.daffodil.schema.DaffodilDataProcessorFactory; +import org.apache.drill.exec.store.dfs.DrillFileSystem; +import org.apache.drill.exec.store.dfs.easy.EasySubScan; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils.daffodilDataProcessorToDrillSchema; + + +public class DaffodilBatchReader implements ManagedReader { + + private static final Logger logger = 
LoggerFactory.getLogger(DaffodilBatchReader.class); + private final DaffodilFormatConfig dafConfig; + private final RowSetLoader rowSetLoader; + private final CustomErrorContext errorContext; + private final DaffodilMessageParser dafParser; + private final InputStream dataInputStream; + + static class DaffodilReaderConfig { + final DaffodilFormatPlugin plugin; + DaffodilReaderConfig(DaffodilFormatPlugin plugin) { + this.plugin = plugin; + } + } + + public DaffodilBatchReader (DaffodilReaderConfig readerConfig, EasySubScan scan, FileSchemaNegotiator negotiator) { + + errorContext = negotiator.parentErrorContext(); + this.dafConfig = readerConfig.plugin.getConfig(); + + String schemaURIString = dafConfig.getSchemaURI(); // "schema/complexArray1.dfdl.xsd"; + String rootName = dafConfig.getRootName(); + String rootNamespace = dafConfig.getRootNamespace(); + boolean validationMode = dafConfig.getValidationMode(); + + URI dfdlSchemaURI; + try { + dfdlSchemaURI = new URI(schemaURIString); + } catch (URISyntaxException e) { + throw UserException.validationError(e) + .build(logger); + } + + FileDescrip file = negotiator.file(); + DrillFileSystem fs = file.fileSystem(); + URI fsSchemaURI = fs.getUri().resolve(dfdlSchemaURI); + + + DaffodilDataProcessorFactory dpf = new DaffodilDataProcessorFactory(); + DataProcessor dp; + try { + dp = dpf.getDataProcessor(fsSchemaURI, validationMode, rootName, rootNamespace); + } catch (Exception e) { + throw UserException.dataReadError(e) + .message(String.format("Failed to get Daffodil DFDL processor for: %s", fsSchemaURI)) + .addContext(errorContext).addContext(e.getMessage()).build(logger); + } + // Create the corresponding Drill schema. + // Note: this could be a very large schema. Think of a large complex RDBMS schema, + // all of it, hundreds of tables, but all part of the same metadata tree. 
+ TupleMetadata drillSchema = daffodilDataProcessorToDrillSchema(dp); + // Inform Drill about the schema + negotiator.tableSchema(drillSchema, true); + + // + // DATA TIME: Next we construct the runtime objects, and open files. + // + // We get the DaffodilMessageParser, which is a stateful driver for daffodil that + // actually does the parsing. + rowSetLoader = negotiator.build().writer(); + + // We construct the Daffodil InfosetOutputter which the daffodil parser uses to + // convert infoset event calls to fill in a Drill row via a rowSetLoader. + DaffodilDrillInfosetOutputter outputter = new DaffodilDrillInfosetOutputter(rowSetLoader); + + // Now we can setup the dafParser with the outputter it will drive with + // the parser-produced infoset. + dafParser = new DaffodilMessageParser(dp); // needs further initialization after this. + dafParser.setInfosetOutputter(outputter); + + Path dataPath = file.split().getPath(); + // Lastly, we open the data stream + try { + dataInputStream = fs.openPossiblyCompressedStream(dataPath); + } catch (Exception e) { + throw UserException.dataReadError(e) + .message(String.format("Failed to open input file: %s", dataPath.toString())) + .addContext(errorContext).addContext(e.getMessage()).build(logger); + } + // And lastly,... tell daffodil the input data stream. + dafParser.setInputStream(dataInputStream); + } + + + /** + * This is the core of actual processing - data movement from Daffodil to Drill. + *

+ * If there is space in the batch, and there is data available to parse + * then this calls the daffodil parser, which parses data, delivering it to the rowWriter + * by way of the infoset outputter. + *

+ * Repeats until the rowWriter is full (a batch is full), or there is no more data, or + * a parse error ends execution with a throw. + *

+ * Validation errors and other warnings are not errors and are logged but do not cause + * parsing to fail/throw. + * @return true if there are rows retrieved, false if no rows were retrieved, which means + * no more will ever be retrieved (end of data). + * @throws RuntimeException on parse errors. + */ + @Override + public boolean next() { + // Check assumed invariants + // We don't know if there is data or not. This could be called on an empty data file. + // We DO know that this won't be called if there is no space in the batch for even 1 + // row. + if (dafParser.isEOF()) { + return false; // return without even checking for more rows or trying to parse. + } + while (rowSetLoader.start() && !dafParser.isEOF()) { // we never zero-trip this loop. + // the predicate is always true once. + try { + dafParser.parse(); + if (dafParser.isProcessingError()) { + assert(Objects.nonNull(dafParser.getDiagnostics())); + throw UserException.dataReadError().message(dafParser.getDiagnosticsAsString()) + .addContext(errorContext).build(logger); + } + if (dafParser.isValidationError()) { + logger.warn(dafParser.getDiagnosticsAsString()); + // Note that even if daffodil is set to not validate, validation errors may still occur + // from DFDL's "recoverableError" assertions. + } + } catch (Exception e) { + throw UserException.dataReadError(e).message("Error parsing file: " + e.getMessage()) + .addContext(errorContext).build(logger); + } + rowSetLoader.save(); + } + int nRows = rowSetLoader.rowCount(); + assert nRows > 0; // This cannot be zero. If the parse failed we will have already thrown out of here. 
+ return true; + } + + @Override + public void close() { + AutoCloseables.closeSilently(dataInputStream); + } +} diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java new file mode 100644 index 00000000000..df9aebf08f4 --- /dev/null +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.drill.exec.store.daffodil; + +import org.apache.daffodil.runtime1.api.ComplexElementMetadata; +import org.apache.daffodil.runtime1.api.ElementMetadata; +import org.apache.daffodil.runtime1.api.InfosetArray; +import org.apache.daffodil.runtime1.api.InfosetComplexElement; +import org.apache.daffodil.japi.infoset.InfosetOutputter; +import org.apache.daffodil.runtime1.api.InfosetSimpleElement; +import org.apache.daffodil.runtime1.api.PrimitiveType; +import org.apache.drill.common.types.TypeProtos; +import org.apache.drill.exec.physical.resultSet.RowSetLoader; +import org.apache.drill.exec.record.metadata.ColumnMetadata; +import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils; +import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaVisitor; +import org.apache.drill.exec.vector.accessor.ArrayWriter; +import org.apache.drill.exec.vector.accessor.ColumnWriter; +import org.apache.drill.exec.vector.accessor.ObjectType; +import org.apache.drill.exec.vector.accessor.TupleWriter; +import org.apache.drill.exec.vector.complex.writer.BaseWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Stack; + +/** + * Adapts Daffodil parser infoset event calls to Drill writer calls + * to fill in Drill data rows. + */ +public class DaffodilDrillInfosetOutputter + extends InfosetOutputter { + + private boolean isOriginalRoot() { + boolean result = currentTupleWriter() == rowSetWriter; + if (result) + assert(tupleWriterStack.size() == 1); + return result; + } + + /** + * True if the next startComplex call will be for the + * DFDL infoset root element whose children are the columns of + * the row set. + */ + private boolean isRootElement = true; + + /** + * Stack that is used only if we have sub-structures that are not + * simple-type fields of the row. 
+ */ + private final Stack tupleWriterStack = new Stack<>(); + + private final Stack arrayWriterStack = new Stack<>(); + + private TupleWriter currentTupleWriter() { + return tupleWriterStack.peek(); + } + + private ArrayWriter currentArrayWriter() { + return arrayWriterStack.peek(); + } + + + private static final Logger logger = LoggerFactory.getLogger(DaffodilDrillInfosetOutputter.class); + + private DaffodilDrillInfosetOutputter() {} // no default constructor + + private RowSetLoader rowSetWriter; + + public DaffodilDrillInfosetOutputter(RowSetLoader writer) { + this.rowSetWriter = writer; + this.tupleWriterStack.push(writer); + } + + @Override + public void reset() { + tupleWriterStack.clear(); + tupleWriterStack.push(rowSetWriter); + arrayWriterStack.clear(); + this.isRootElement = true; + checkCleanState(); + } + + private void checkCleanState() { + assert(isOriginalRoot()); + assert(arrayWriterStack.isEmpty()); + assert(isRootElement); + } + + @Override + public void startDocument() { + checkCleanState(); + } + + @Override + public void endDocument() { + checkCleanState(); + } + + private String colName(ElementMetadata md) { + return DrillDaffodilSchemaVisitor.makeColumnName(md); + } + + @Override + public void startSimple(InfosetSimpleElement ise) { + assert (!isRootElement); + ElementMetadata md = ise.metadata(); + String colName = colName(md); + ColumnWriter cw; + if (md.isArray()) { + // A simple type array + assert(!arrayWriterStack.isEmpty()); + cw = currentArrayWriter().scalar(); + } else { + // A simple element within a map + // Note the map itself might be an array + // but we don't care about that here. 
+ cw = currentTupleWriter().column(colName); + } + ColumnMetadata cm = cw.schema(); + assert(cm.isScalar()); + if (md.isNillable() && ise.isNilled()) { + assert cm.isNullable(); + cw.setNull(); + } else { + convertDaffodilValueToDrillValue(ise, cm, cw); + } + } + + @Override + public void endSimple(InfosetSimpleElement diSimple) { + assert (!isRootElement); + // do nothing + } + + @Override + public void startComplex(InfosetComplexElement ce) { + ComplexElementMetadata md = ce.metadata(); + String colName = colName(ce.metadata()); + if (isRootElement) { + assert(isOriginalRoot()); + // This complex element corresponds to the root element of the + // DFDL schema. We don't treat this as a column of the row set. + // Rather, its children are the columns of the row set. + // + // If we do nothing at all here, then we'll start getting + // event calls for the children. + isRootElement = false; + return; + } + if (md.isArray()) { + assert(!arrayWriterStack.isEmpty()); + // FIXME: is this the way to add a complex array child item (i.e., each array item is a map) + tupleWriterStack.push(currentArrayWriter().tuple()); + } else { + tupleWriterStack.push(currentTupleWriter().tuple(colName)); + } + } + + @Override + public void endComplex(InfosetComplexElement ce) { + ComplexElementMetadata md = ce.metadata(); + if (isOriginalRoot()) { + isRootElement = true; + // do nothing else. The row gets closed-out in the DaffodilBatchReader.next() method. + } else { + // it's a map. + // We seem to not need to do anything to end the map. No action taken here works. + if (md.isArray()) { + assert (!arrayWriterStack.isEmpty()); + currentArrayWriter().save(); // required for map array entries. + } + tupleWriterStack.pop(); + } + } + + @Override + public void startArray(InfosetArray diArray) { + ElementMetadata md = diArray.metadata(); + assert (md.isArray()); + // DFDL has no notion of an array directly within another array. 
A named field (map) is necessary + // before you can have another array. + assert (currentTupleWriter().type() == ObjectType.TUPLE); // parent is a map, or the top level row. + String colName = colName(md); + TupleWriter enclosingParentTupleWriter = currentTupleWriter(); + ArrayWriter aw = enclosingParentTupleWriter.array(colName); + arrayWriterStack.push(aw); + } + + @Override + public void endArray(InfosetArray ia) { + ElementMetadata md = ia.metadata(); + assert (md.isArray()); + assert (!arrayWriterStack.empty()); + // FIXME: How do we end/close-out an array? + // note that each array instance, when the instance is a map, must have + // save called after it is written to the array but that happens + // in endComplex events since it must be called not once per array, but + // once per array item. + arrayWriterStack.pop(); + } + + private void convertDaffodilValueToDrillValue(InfosetSimpleElement ise, ColumnMetadata cm, ColumnWriter cw) { + PrimitiveType dafType = ise.metadata().primitiveType(); + TypeProtos.MinorType drillType = DrillDaffodilSchemaUtils.getDrillDataType(dafType); + assert(drillType == cm.type()); + switch (drillType) { + case INT: { + // + // FIXME: Javadoc for setObject says "primarily for testing" + // So how are we supposed to assign the column value then? + // Is there a way to get from a ColumnWriter to a typed scalar writer (downcast perhaps?) 
+ cw.setObject(ise.getInt()); + break; + } + case BIGINT: { + cw.setObject(ise.getLong()); + break; + } + case SMALLINT: { + cw.setObject(ise.getShort()); + break; + } + case TINYINT: { + cw.setObject(ise.getByte()); + break; + } +// .put("UNSIGNEDLONG", TypeProtos.MinorType.UINT8) +// .put("UNSIGNEDINT", TypeProtos.MinorType.UINT4) +// .put("UNSIGNEDSHORT", TypeProtos.MinorType.UINT2) +// .put("UNSIGNEDBYTE", TypeProtos.MinorType.UINT1) +// .put("INTEGER", TypeProtos.MinorType.BIGINT) +// .put("NONNEGATIVEINTEGER", TypeProtos.MinorType.BIGINT) + case BIT: { + cw.setObject(ise.getBoolean()); + break; + } +// .put("DATE", TypeProtos.MinorType.DATE) // requires conversion +// .put("DATETIME", TypeProtos.MinorType.TIMESTAMP) // requires conversion +// .put("DECIMAL", TypeProtos.MinorType.VARDECIMAL) // requires conversion (maybe) + case FLOAT8: { + cw.setObject(ise.getDouble()); + break; + } + case FLOAT4: { + cw.setObject(ise.getFloat()); + break; + } + case VARBINARY: { + cw.setObject(ise.getHexBinary()); + break; + } + case VARCHAR: { + // + // FIXME: VARCHAR is defined in drill as utf8 string. + // Is Drill expecting something other than a Java string in this setObject call? + // Should we be mapping Daffodil strings to Drill VAR16CHAR type? 
+ // + String s = ise.getString(); + cw.setObject(s); + break; + } +// .put("TIME", TypeProtos.MinorType.TIME) // requires conversion + + } + } + + private void DFDLParseError(String s) { + throw new RuntimeException(s); + } + + private static void nyi() { + throw new RuntimeException("not yet implemented."); + } + + private static void fatalError(String s) { + throw new RuntimeException(s); + } +} + diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatConfig.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatConfig.java new file mode 100644 index 00000000000..80e2ba5110d --- /dev/null +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatConfig.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.drill.exec.store.daffodil; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonTypeName; +import com.google.common.collect.ImmutableList; +import org.apache.drill.common.PlanStringBuilder; +import org.apache.drill.common.logical.FormatPluginConfig; +import org.apache.drill.exec.store.daffodil.DaffodilBatchReader.DaffodilReaderConfig; + +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +@JsonTypeName(DaffodilFormatPlugin.DEFAULT_NAME) +@JsonInclude(JsonInclude.Include.NON_DEFAULT) +public class DaffodilFormatConfig implements FormatPluginConfig { + + public final List extensions; + public final String schemaURI; + public final boolean validationMode; + public final String rootName; + public final String rootNamespace; + /** + * In the constructor for a format config, you should not use + * boxed versions of primitive types. It creates problems with + * defaulting them (they default to null) which cannot be unboxed. + */ + @JsonCreator + public DaffodilFormatConfig(@JsonProperty("extensions") List extensions, + @JsonProperty("schemaURI") String schemaURI, + @JsonProperty("rootName") String rootName, + @JsonProperty("rootNamespace") String rootNamespace, + @JsonProperty("validationMode") boolean validationMode) { + + this.extensions = extensions == null ? Collections.singletonList("dat") : ImmutableList.copyOf(extensions); + this.rootName = rootName; + this.rootNamespace = rootNamespace; + this.schemaURI = schemaURI; + // no default. Users must pick. 
+ this.validationMode = validationMode; + } + + @JsonInclude(JsonInclude.Include.NON_DEFAULT) + public List getExtensions() { + return extensions; + } + + public String getSchemaURI() { + return schemaURI; + } + + public String getRootName() { + return rootName; + } + + public String getRootNamespace() { + return rootNamespace; + } + + @JsonInclude(JsonInclude.Include.NON_DEFAULT) + public boolean getValidationMode() { + return validationMode; + } + + public DaffodilReaderConfig getReaderConfig(DaffodilFormatPlugin plugin) { + DaffodilReaderConfig readerConfig = new DaffodilReaderConfig(plugin); + return readerConfig; + } + + @Override + public int hashCode() { + return Objects.hash(schemaURI, validationMode, rootName, rootNamespace); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + DaffodilFormatConfig other = (DaffodilFormatConfig) obj; + return Objects.equals(schemaURI, other.schemaURI) + && Objects.equals(rootName, other.rootName) + && Objects.equals(rootNamespace, other.rootNamespace) + && Objects.equals(validationMode, other.validationMode); + } + + @Override + public String toString() { + return new PlanStringBuilder(this) + .field("schemaURI", schemaURI) + .field("rootName", rootName) + .field("rootNamespace", rootNamespace) + .field("validationMode", validationMode) + .toString(); + } +} diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatPlugin.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatPlugin.java new file mode 100644 index 00000000000..95b5e82aaac --- /dev/null +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatPlugin.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.drill.exec.store.daffodil; + + +import org.apache.drill.common.logical.StoragePluginConfig; +import org.apache.drill.common.types.TypeProtos.MinorType; +import org.apache.drill.common.types.Types; +import org.apache.drill.exec.physical.impl.scan.v3.ManagedReader; +import org.apache.drill.exec.physical.impl.scan.v3.file.FileReaderFactory; +import org.apache.drill.exec.physical.impl.scan.v3.file.FileScanLifecycleBuilder; +import org.apache.drill.exec.physical.impl.scan.v3.file.FileSchemaNegotiator; +import org.apache.drill.exec.server.DrillbitContext; +import org.apache.drill.exec.store.dfs.easy.EasyFormatPlugin; +import org.apache.drill.exec.store.dfs.easy.EasySubScan; +import org.apache.hadoop.conf.Configuration; + + +public class DaffodilFormatPlugin extends EasyFormatPlugin { + + public static final String DEFAULT_NAME = "daffodil"; + public static final String OPERATOR_TYPE = "DAFFODIL_SUB_SCAN"; + + public static class DaffodilReaderFactory extends FileReaderFactory { + private final DaffodilBatchReader.DaffodilReaderConfig readerConfig; + + private final EasySubScan scan; + + public DaffodilReaderFactory(DaffodilBatchReader.DaffodilReaderConfig config, EasySubScan scan) { + this.readerConfig = config; + this.scan = scan; + } + + @Override + public 
ManagedReader newReader(FileSchemaNegotiator negotiator) { + return new DaffodilBatchReader(readerConfig, scan, negotiator); + } + } + + public DaffodilFormatPlugin(String name, DrillbitContext context, + Configuration fsConf, StoragePluginConfig storageConfig, + DaffodilFormatConfig formatConfig) { + super(name, easyConfig(fsConf, formatConfig), context, storageConfig, formatConfig); + } + + private static EasyFormatConfig easyConfig(Configuration fsConf, DaffodilFormatConfig pluginConfig) { + return EasyFormatConfig.builder() + .readable(true) + .writable(false) + .blockSplittable(false) + .compressible(true) + .extensions(pluginConfig.getExtensions()) + .fsConf(fsConf) + .readerOperatorType(OPERATOR_TYPE) + .scanVersion(ScanFrameworkVersion.EVF_V2) + .supportsLimitPushdown(true) + .supportsProjectPushdown(true) + .defaultName(DaffodilFormatPlugin.DEFAULT_NAME) + .build(); + } + + @Override + protected void configureScan(FileScanLifecycleBuilder builder, EasySubScan scan) { + builder.nullType(Types.optional(MinorType.VARCHAR)); + builder.readerFactory(new DaffodilReaderFactory(formatConfig.getReaderConfig(this), scan)); + } +} diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilMessageParser.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilMessageParser.java new file mode 100644 index 00000000000..67b74d2571a --- /dev/null +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilMessageParser.java @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
package org.apache.drill.exec.store.daffodil;

import org.apache.daffodil.japi.DataProcessor;
import org.apache.daffodil.japi.Diagnostic;
import org.apache.daffodil.japi.ParseResult;
import org.apache.daffodil.japi.infoset.InfosetOutputter;
import org.apache.daffodil.japi.io.InputSourceDataInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.InputStream;
import java.util.List;
import java.util.stream.Collectors;

/**
 * DFDL Daffodil streaming message parser.
 * <p>
 * Construct this with a DataProcessor obtained from the
 * DaffodilDataProcessorFactory. The DataProcessor contains the compiled DFDL
 * schema, ready to use, as well as whether validation while parsing has been
 * requested.
 * <p>
 * The DataProcessor object may be shared/reused by multiple threads, each of
 * which has its own copy of this class. This object is, however, stateful and
 * must not be shared by multiple threads.
 * <p>
 * You must call {@link #setInputStream} and {@link #setInfosetOutputter}
 * before you call {@link #parse}. The input stream and the InfosetOutputter
 * are private to one thread, stateful, and owned by this object. Once you
 * have called setInputStream, view the input stream as the private property
 * of this object: Daffodil may pre-fetch bytes belonging to the next message,
 * and may even leave the stream positioned mid-byte, since DFDL messages need
 * not be whole-byte lengths.
 * <p>
 * parse() invokes the InfosetOutputter's methods to deliver parsed data
 * incrementally, so the outputter may be called many times before a
 * processing error is detected. Validation errors do not affect the
 * InfosetOutputter calls; they only indicate that invalid data was seen.
 * <p>
 * When parse() returns, the parse has ended and one can check for
 * errors/diagnostics via {@link #isProcessingError}, {@link #isValidationError}
 * and {@link #getDiagnostics}. One can call parse() again if there is still
 * data to consume, which is checked via {@link #isEOF()}.
 */
public class DaffodilMessageParser {

  private static final Logger logger = LoggerFactory.getLogger(DaffodilMessageParser.class);

  // Diagnostics from the most recent parse() call; null before the first call.
  private List<Diagnostic> diagnostics;
  private boolean isProcessingError;
  private boolean isValidationError;
  private InputSourceDataInputStream dis;
  private InfosetOutputter outputter;
  private final DataProcessor dp;

  /**
   * Constructs the parser using a DataProcessor obtained from a
   * DaffodilDataProcessorFactory.
   *
   * @param dp compiled, ready-to-use data processor
   */
  DaffodilMessageParser(DataProcessor dp) {
    this.dp = dp;
  }

  /**
   * Provides the input stream from which data is to be parsed.
   * <p>
   * This input stream is then owned by this object and becomes part of its
   * state. It is, however, the responsibility of the caller to close this
   * input stream after the completion of all parse calls. In particular, if
   * a parse error is considered fatal, then the caller should close the
   * input stream. There are advanced error-recovery techniques that may
   * attempt to find parseable data later in the stream; in those cases the
   * stream would not be closed after a processing error, but such usage is
   * beyond the scope of this javadoc.
   *
   * @param inputStream the raw data stream
   */
  public void setInputStream(InputStream inputStream) {
    dis = new InputSourceDataInputStream(inputStream);
  }

  /**
   * Provides the InfosetOutputter which will be called to deliver the Infoset
   * via calls to its methods.
   *
   * @param outputter receiver of parse events
   */
  public void setInfosetOutputter(InfosetOutputter outputter) {
    this.outputter = outputter;
  }

  /**
   * Called to pull messages from the data stream. The message 'Infoset' is
   * delivered by way of calls to the InfosetOutputter's methods.
   * <p>
   * After calling this, one may call isProcessingError, isValidationError,
   * isEOF, and getDiagnostics.
   */
  public void parse() {
    if (dis == null) {
      throw new IllegalStateException("Input stream must be provided by setInputStream() call.");
    }
    if (outputter == null) {
      throw new IllegalStateException(
          "InfosetOutputter must be provided by setInfosetOutputter() call.");
    }
    reset();
    ParseResult res = dp.parse(dis, outputter);
    isProcessingError = res.isProcessingError();
    isValidationError = res.isValidationError();
    diagnostics = res.getDiagnostics();
  }

  /**
   * True if the input stream is known to contain no more data.
   * <p>
   * If the input stream is a true stream, not a file, then temporary
   * unavailability of data may cause this call to block until the stream is
   * closed from the other end, or data becomes available.
   * <p>
   * It is an error to call parse() after isEOF has returned true.
   *
   * @return true when no more data can be obtained from the stream
   */
  public boolean isEOF() {
    return !dis.hasData();
  }

  /**
   * True if the parse() call failed with a processing error, indicating that
   * the data was not well-formed and could not be parsed successfully.
   * <p>
   * It is possible for isProcessingError and isValidationError to both be
   * true.
   *
   * @return true on processing error
   */
  public boolean isProcessingError() {
    return isProcessingError;
  }

  /**
   * True if a validation error occurred during parsing. Subsequent to a
   * validation error occurring, parsing may succeed or fail.
   *
   * @return true on validation error
   */
  public boolean isValidationError() {
    return isValidationError;
  }

  /**
   * After a parse() call this returns null or a list of 1 or more
   * diagnostics.
   * <p>
   * If isProcessingError or isValidationError are true, then this will
   * contain at least 1 diagnostic. If both are true this will contain at
   * least 2 diagnostics.
   *
   * @return diagnostics from the most recent parse, or null
   */
  public List<Diagnostic> getDiagnostics() {
    return diagnostics;
  }

  /** Returns all diagnostic messages joined with newlines. */
  public String getDiagnosticsAsString() {
    return diagnostics.stream()
        .map(Diagnostic::getMessage)
        .collect(Collectors.joining("\n"));
  }

  /** Clears per-parse state before each parse() call. */
  private void reset() {
    outputter.reset();
    isProcessingError = false;
    isValidationError = false;
    diagnostics = null;
  }
}
package org.apache.drill.exec.store.daffodil.schema;

import org.apache.daffodil.japi.Compiler;
import org.apache.daffodil.japi.Daffodil;
import org.apache.daffodil.japi.DataProcessor;
import org.apache.daffodil.japi.Diagnostic;
import org.apache.daffodil.japi.InvalidParserException;
import org.apache.daffodil.japi.InvalidUsageException;
import org.apache.daffodil.japi.ProcessorFactory;
import org.apache.daffodil.japi.ValidationMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.channels.Channels;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

/**
 * Compiles a DFDL schema (mostly for tests) or loads a pre-compiled DFDL
 * schema so that one can obtain a DataProcessor for use with
 * DaffodilMessageParser.
 * <p>
 * TODO: Needs to use a cache to avoid reloading/recompiling every time.
 */
public class DaffodilDataProcessorFactory {
  // Default constructor is used.

  private static final Logger logger =
      LoggerFactory.getLogger(DaffodilDataProcessorFactory.class);

  private DataProcessor dp;

  /**
   * Thrown if schema compilation fails.
   * <p>
   * Contains diagnostic objects which give the cause(s) of the failure.
   */
  public static class CompileFailure extends Exception {
    List<Diagnostic> diags;

    CompileFailure(List<Diagnostic> diagnostics) {
      super("DFDL Schema Compile Failure");
      diags = diagnostics;
    }
  }

  /**
   * Gets a Daffodil DataProcessor given the necessary arguments to compile or
   * reload it.
   *
   * @param schemaFileURI pre-compiled dfdl schema (.bin extension) or DFDL
   *     schema source (.xsd extension)
   * @param validationMode Use true to request Daffodil built-in 'limited'
   *     validation. Use false for no validation.
   * @param rootName Local name of root element of the message. Can be null to
   *     use the first element declaration of the primary schema file. Ignored
   *     if reloading a pre-compiled schema.
   * @param rootNS Namespace URI as a string. Can be null to use the target
   *     namespace of the primary schema file or if it is unambiguous what
   *     element is the rootName. Ignored if reloading a pre-compiled schema.
   * @return the DataProcessor
   * @throws CompileFailure if schema compilation fails
   * @throws IOException if the schemaFileURI cannot be opened or is not found
   * @throws URISyntaxException if the schemaFileURI is not legal syntax
   * @throws InvalidParserException if the reloading of the parser from
   *     pre-compiled binary fails
   */
  public DataProcessor getDataProcessor(
      URI schemaFileURI,
      boolean validationMode,
      String rootName,
      String rootNS)
      throws CompileFailure, IOException, URISyntaxException, InvalidParserException {

    DaffodilDataProcessorFactory dmp = new DaffodilDataProcessorFactory();
    boolean isPrecompiled = schemaFileURI.toString().endsWith(".bin");
    if (isPrecompiled) {
      if (Objects.nonNull(rootName) && !rootName.isEmpty()) {
        // A usage error. You shouldn't supply the name and optionally
        // namespace if loading precompiled schema because those are built
        // into it. Should be null or "".
        logger.warn("Root element name '{}' is ignored when used with precompiled DFDL schema.",
            rootName);
      }
      dmp.loadSchema(schemaFileURI);
      dmp.setupDP(validationMode, null);
    } else {
      List<Diagnostic> pfDiags = dmp.compileSchema(schemaFileURI, rootName, rootNS);
      dmp.setupDP(validationMode, pfDiags);
    }
    return dmp.dp;
  }

  /** Reloads a pre-compiled (.bin) schema into a DataProcessor. */
  private void loadSchema(URI schemaFileURI)
      throws IOException, InvalidParserException {
    Compiler c = Daffodil.compiler();
    dp = c.reload(Channels.newChannel(schemaFileURI.toURL().openStream()));
  }

  /**
   * Compiles DFDL schema source (.xsd).
   *
   * @return the compilation warnings (errors throw CompileFailure instead)
   */
  private List<Diagnostic> compileSchema(URI schemaFileURI, String rootName, String rootNS)
      throws URISyntaxException, IOException, CompileFailure {
    Compiler c = Daffodil.compiler();
    ProcessorFactory pf = c.compileSource(schemaFileURI, rootName, rootNS);
    List<Diagnostic> pfDiags = pf.getDiagnostics();
    if (pf.isError()) {
      pfDiags.forEach(diag -> logger.error(diag.getSomeMessage()));
      throw new CompileFailure(pfDiags);
    }
    dp = pf.onPath("/");
    return pfDiags; // must be just warnings. If it was errors we would have thrown.
  }

  /**
   * Common setup steps used whether or not we reloaded or compiled a DFDL
   * schema.
   *
   * @param pfDiags compilation warnings, or null when reloading a
   *     pre-compiled schema
   */
  private void setupDP(boolean validationMode, List<Diagnostic> pfDiags) throws CompileFailure {
    Objects.requireNonNull(dp); // true because failure to produce a dp throws CompileFailure.
    if (validationMode) {
      try {
        dp = dp.withValidationMode(ValidationMode.Limited);
      } catch (InvalidUsageException e) {
        // impossible: Limited is always a valid mode
        throw new Error(e);
      }
    }
    List<Diagnostic> dpDiags = dp.getDiagnostics();
    if (dp.isError()) {
      throw new CompileFailure(dpDiags);
    }
    // Combine the compiler-phase warnings (only present when we compiled and
    // pfDiags was non-null) with the data-processor warnings. Copy rather
    // than mutate the caller's list.
    List<Diagnostic> compilationWarnings;
    if (pfDiags != null && !pfDiags.isEmpty()) {
      compilationWarnings = new ArrayList<>(pfDiags);
      compilationWarnings.addAll(dpDiags); // dpDiags might be empty. That's ok.
    } else {
      compilationWarnings = dpDiags; // dpDiags might be empty. That's ok.
    }
    // Previously these warnings were computed and then discarded; surface
    // them so schema authors can see them.
    compilationWarnings.forEach(diag -> logger.warn(diag.getSomeMessage()));
  }
}
package org.apache.drill.exec.store.daffodil.schema;

import org.apache.daffodil.japi.InvalidParserException;
import org.apache.daffodil.japi.DataProcessor;
import org.apache.daffodil.runtime1.api.PrimitiveType;
import org.apache.drill.common.types.TypeProtos.MinorType;
import org.apache.drill.exec.record.metadata.TupleMetadata;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Locale;

/**
 * Utilities for converting a compiled Daffodil/DFDL schema into a Drill
 * {@link TupleMetadata} schema.
 */
public class DrillDaffodilSchemaUtils {
  private static final MinorType DEFAULT_TYPE = MinorType.VARCHAR;
  private static final Logger logger = LoggerFactory.getLogger(DrillDaffodilSchemaUtils.class);

  /**
   * This map maps the data types defined by the DFDL definition to Drill data
   * types. Keys are upper-cased DFDL primitive type names.
   */
  public static final ImmutableMap<String, MinorType> DFDL_TYPE_MAPPINGS =
      ImmutableMap.<String, MinorType>builder()
          .put("LONG", MinorType.BIGINT)
          .put("INT", MinorType.INT)
          .put("SHORT", MinorType.SMALLINT)
          .put("BYTE", MinorType.TINYINT)
          .put("UNSIGNEDLONG", MinorType.UINT8)
          .put("UNSIGNEDINT", MinorType.UINT4)
          .put("UNSIGNEDSHORT", MinorType.UINT2)
          .put("UNSIGNEDBYTE", MinorType.UINT1)
          .put("INTEGER", MinorType.BIGINT)
          .put("NONNEGATIVEINTEGER", MinorType.BIGINT)
          .put("BOOLEAN", MinorType.BIT)
          .put("DATE", MinorType.DATE) // requires conversion
          .put("DATETIME", MinorType.TIMESTAMP) // requires conversion
          .put("DECIMAL", MinorType.VARDECIMAL) // requires conversion (maybe)
          .put("DOUBLE", MinorType.FLOAT8)
          .put("FLOAT", MinorType.FLOAT4)
          .put("HEXBINARY", MinorType.VARBINARY)
          .put("STRING", MinorType.VARCHAR)
          .put("TIME", MinorType.TIME) // requires conversion
          .build();

  /**
   * Compiles (or reloads) the given DFDL schema and converts it to a Drill
   * schema. Mostly used by tests.
   */
  @VisibleForTesting
  public static TupleMetadata processSchema(URI dfdlSchemaURI, String rootName, String namespace)
      throws IOException, DaffodilDataProcessorFactory.CompileFailure,
      URISyntaxException, InvalidParserException {
    DaffodilDataProcessorFactory dpf = new DaffodilDataProcessorFactory();
    DataProcessor dp = dpf.getDataProcessor(dfdlSchemaURI, true, rootName, namespace);
    return daffodilDataProcessorToDrillSchema(dp);
  }

  /**
   * Walks the metadata of a compiled DFDL schema and builds the equivalent
   * Drill schema.
   */
  public static TupleMetadata daffodilDataProcessorToDrillSchema(DataProcessor dp) {
    DrillDaffodilSchemaVisitor schemaVisitor = new DrillDaffodilSchemaVisitor();
    dp.walkMetadata(schemaVisitor);
    return schemaVisitor.getDrillSchema();
  }

  /**
   * Returns a {@link MinorType} of the corresponding DFDL Data Type. Defaults
   * to VARCHAR if unknown.
   *
   * @param dfdlType the DFDL primitive type (local name only, i.e., no "xs:"
   *     prefix)
   * @return A {@link MinorType} of the Drill data type.
   */
  public static MinorType getDrillDataType(PrimitiveType dfdlType) {
    // Explicit null check instead of catching NullPointerException:
    // exceptions are not control flow.
    if (dfdlType == null) {
      logger.warn("Null DFDL primitive type. Returning VARCHAR.");
      return DEFAULT_TYPE;
    }
    // Locale.ROOT keeps the lookup locale-independent (e.g. Turkish 'i').
    MinorType type = DFDL_TYPE_MAPPINGS.get(dfdlType.name().toUpperCase(Locale.ROOT));
    if (type == null) {
      logger.warn("Unknown data type found in XSD reader: {}. Returning VARCHAR.", dfdlType);
      return DEFAULT_TYPE;
    }
    return type;
  }
}
package org.apache.drill.exec.store.daffodil.schema;

import org.apache.daffodil.runtime1.api.ChoiceMetadata;
import org.apache.daffodil.runtime1.api.ComplexElementMetadata;
import org.apache.daffodil.runtime1.api.ElementMetadata;
import org.apache.daffodil.runtime1.api.InfosetSimpleElement;
import org.apache.daffodil.runtime1.api.MetadataHandler;
import org.apache.daffodil.runtime1.api.SequenceMetadata;
import org.apache.daffodil.runtime1.api.SimpleElementMetadata;
import org.apache.drill.common.types.TypeProtos.MinorType;
import org.apache.drill.exec.record.metadata.MapBuilder;
import org.apache.drill.exec.record.metadata.SchemaBuilder;
import org.apache.drill.exec.record.metadata.TupleMetadata;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayDeque;
import java.util.Deque;

/**
 * This class transforms a DFDL/Daffodil schema into a Drill Schema.
 */
public class DrillDaffodilSchemaVisitor extends MetadataHandler {
  private static final Logger logger = LoggerFactory.getLogger(DrillDaffodilSchemaVisitor.class);

  /**
   * Changes to false after the very first invocation for the root element.
   */
  private boolean isOriginalRoot = true;

  /**
   * Unfortunately, SchemaBuilder and MapBuilder, while similar, do not share
   * a base class, so we have a stack of MapBuilders, and when empty we use
   * the SchemaBuilder.
   */
  private final SchemaBuilder builder = new SchemaBuilder();

  // ArrayDeque instead of the legacy synchronized java.util.Stack; this
  // visitor is single-threaded.
  private final Deque<MapBuilder> mapBuilderStack = new ArrayDeque<>();

  private MapBuilder mapBuilder() {
    return mapBuilderStack.peek();
  }

  /**
   * Returns a {@link TupleMetadata} representation of the DFDL schema.
   * Should only be called after the walk of the DFDL schema with this visitor
   * has been called.
   *
   * @return A {@link TupleMetadata} representation of the DFDL schema.
   */
  public TupleMetadata getDrillSchema() {
    return builder.build();
  }

  /** Makes a Drill-safe column name from a DFDL QName ("ns:name" -> "ns_name"). */
  public static String makeColumnName(ElementMetadata md) {
    return md.toQName().replace(":", "_");
  }

  @Override
  public void simpleElementMetadata(SimpleElementMetadata md) {
    assert (!isOriginalRoot);
    if (mapBuilderStack.isEmpty()) {
      simpleElementAsRowSetColumnMetadata(md);
    } else {
      simpleElementWithinComplexElementMetadata(md);
    }
  }

  /** Adds a simple-type element as a top-level column of the row set. */
  private void simpleElementAsRowSetColumnMetadata(SimpleElementMetadata md) {
    assert (!isOriginalRoot);
    assert (mapBuilderStack.isEmpty());
    String colName = makeColumnName(md);
    MinorType drillType = DrillDaffodilSchemaUtils.getDrillDataType(md.primitiveType());
    //
    // below code adds to the schema builder directly, not a map builder
    //
    if (md.isArray()) {
      builder.addArray(colName, drillType);
    } else if (md.isOptional() || md.isNillable()) {
      builder.addNullable(colName, drillType);
    } else {
      builder.add(colName, drillType);
    }
  }

  /** Adds a simple-type element as a field of the enclosing map. */
  private void simpleElementWithinComplexElementMetadata(SimpleElementMetadata md) {
    assert (!isOriginalRoot);
    assert (!mapBuilderStack.isEmpty());
    String colName = makeColumnName(md);
    MinorType drillType = DrillDaffodilSchemaUtils.getDrillDataType(md.primitiveType());
    //
    // The code below adds to a map builder from the stack
    //
    if (md.isArray()) {
      mapBuilder().addArray(colName, drillType);
    } else if (md.isOptional() || md.isNillable()) {
      mapBuilder().addNullable(colName, drillType);
    } else {
      mapBuilder().add(colName, drillType);
    }
  }

  @Override
  public void startComplexElementMetadata(ComplexElementMetadata md) {
    if (isOriginalRoot) {
      startComplexOriginalRootMetadata(md);
    } else if (mapBuilderStack.isEmpty()) {
      startComplexElementAsRowSetColumnMetadata(md);
    } else {
      startComplexChildElementMetadata(md);
    }
  }

  /**
   * The original root given to Drill needs to be a schema element
   * corresponding to one row of data.
   * <p>
   * Drill will call daffodil parse() to parse one such element. The children
   * elements of this element will become the column contents of the row.
   * <p>
   * So the metadata for this row, to drill, is ONLY the columns of this top
   * level element type.
   *
   * @param md metadata of the root element
   */
  private void startComplexOriginalRootMetadata(ComplexElementMetadata md) {
    assert (isOriginalRoot);
    assert (mapBuilderStack.isEmpty());
    isOriginalRoot = false;
    if (md instanceof InfosetSimpleElement) {
      dfdlSchemaError("Row as a simple type element is not supported.");
    }
    if (md.isArray()) {
      dfdlSchemaError("Row element must not be an array.");
    }
    //
    // We do nothing else here. Essentially the name of this top level element
    // is not relevant to drill metadata. Think of it as a table name, or a
    // name for the entire final set of rows that are the query result.
  }

  /**
   * This complex type element becomes a column of the row set which is itself
   * a map.
   *
   * @param md metadata of the complex element
   */
  private void startComplexElementAsRowSetColumnMetadata(ComplexElementMetadata md) {
    assert (!isOriginalRoot);
    assert (mapBuilderStack.isEmpty());
    String colName = makeColumnName(md);
    //
    // This directly adds to the builder, as this complex element itself is a
    // column of the top level row.
    //
    // Then it creates a map builder for recursively adding the contents.
    //
    if (md.isArray()) {
      mapBuilderStack.push(builder.addMapArray(colName));
    } else {
      mapBuilderStack.push(builder.addMap(colName)); // also handles optional complex elements
    }
  }

  /** Adds a complex child element as a (possibly repeated) map within a map. */
  private void startComplexChildElementMetadata(ComplexElementMetadata md) {
    assert (!mapBuilderStack.isEmpty());
    assert (!isOriginalRoot);
    String colName = makeColumnName(md);
    //
    // This is for non-top-level columns, but adding a field within a map.
    // That map represents a complex type element that does NOT correspond to
    // the top-level rows.
    //
    if (md.isArray()) {
      // there is no notion of optional/nullable in drill for arrays.
      // Note that our arrays are always maps. We don't have pure-value
      // simple-type arrays.
      mapBuilderStack.push(mapBuilder().addMapArray(colName));
    } else {
      // there is no concept of optional/nullable in drill for maps.
      mapBuilderStack.push(mapBuilder().addMap(colName));
    }
  }

  @Override
  public void endComplexElementMetadata(ComplexElementMetadata md) {
    // the mapBuilderStack can be empty if we had the most basic kind of
    // schema with a repeating row-set containing only simple type children.
    if (!mapBuilderStack.isEmpty()) {
      if (mapBuilderStack.size() == 1) {
        mapBuilder().resumeSchema();
      } else {
        mapBuilder().resumeMap();
      }
      mapBuilderStack.pop(); // only pop if there was something on the stack.
    }
  }

  @Override
  public void startSequenceMetadata(SequenceMetadata m) {}

  @Override
  public void endSequenceMetadata(SequenceMetadata m) {}

  @Override
  public void startChoiceMetadata(ChoiceMetadata m) {}

  @Override
  public void endChoiceMetadata(ChoiceMetadata m) {}

  /** Reports an unsupported DFDL schema construct. */
  private void dfdlSchemaError(String s) {
    throw new IllegalStateException(s);
  }
}
b/contrib/format-daffodil/src/main/resources/drill-module.conf @@ -0,0 +1,25 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This file tells Drill to consider this module when class path scanning. +# This file can also include any supplementary configuration information. +# This file is in HOCON format, see https://github.com/typesafehub/config/blob/master/HOCON.md for more information. + +drill.classpath.scanning: { + packages += "org.apache.drill.exec.store.daffodil" +} diff --git a/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java b/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java new file mode 100644 index 00000000000..7df13619324 --- /dev/null +++ b/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.drill.exec.store.daffodil; + +import org.apache.drill.categories.RowSetTest; +import org.apache.drill.common.types.TypeProtos.MinorType; +import org.apache.drill.exec.physical.rowSet.RowSet; +import org.apache.drill.exec.physical.rowSet.RowSetReader; +import org.apache.drill.exec.record.metadata.SchemaBuilder; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.apache.drill.test.ClusterFixture; +import org.apache.drill.test.ClusterTest; +import org.apache.drill.test.QueryBuilder; +import org.apache.drill.test.rowSet.RowSetComparison; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import java.nio.file.Paths; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; + +@Category(RowSetTest.class) +public class TestDaffodilReader extends ClusterTest { + + String schemaURIRoot = "file:///opt/drill/contrib/format-daffodil/src/test/resources/"; + @BeforeClass + public static void setup() throws Exception { + // boilerplate call to start test rig + ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); + + DaffodilFormatConfig formatConfig = + new DaffodilFormatConfig(null, + "", + "", + "", + false); + + cluster.defineFormat("dfs", "daffodil", formatConfig); + + // Needed to test against compressed files. 
+ // Copies data from src/test/resources to the dfs root. + dirTestWatcher.copyResourceToRoot(Paths.get("data/")); + dirTestWatcher.copyResourceToRoot(Paths.get("schema/")); + } + + private String selectRow(String schema, String file) { + return "SELECT * FROM table(dfs.`data/" + file + "` " + + " (type => 'daffodil'," + + " validationMode => 'true', " + + " schemaURI => '" + schemaURIRoot + "schema/" + schema + ".dfdl.xsd'," + + " rootName => 'row'," + + " rootNamespace => null " + + "))"; + } + + /** + * This unit test tests a simple data file + * + * @throws Exception Throw exception if anything goes wrong + */ + @Test + public void testSimpleQuery1() throws Exception { + + QueryBuilder qb = client.queryBuilder(); + QueryBuilder query = qb.sql(selectRow("simple", "data01Int.dat.gz")); + RowSet results = query.rowSet(); + results.print(); + assertEquals(1, results.rowCount()); + + // create the expected metadata and data for this test + // metadata first + TupleMetadata expectedSchema = new SchemaBuilder() + .add("col", MinorType.INT) + .buildSchema(); + + RowSet expected = client.rowSetBuilder(expectedSchema) + .addRow(0x00000101) // aka 257 + .build(); + + new RowSetComparison(expected).verifyAndClearAll(results); + } + + @Test + public void testSimpleQuery2() throws Exception { + + QueryBuilder qb = client.queryBuilder(); + QueryBuilder query = qb.sql(selectRow("simple","data06Int.dat")); + RowSet results = query.rowSet(); + results.print(); + assertEquals(6, results.rowCount()); + + // create the expected metadata and data for this test + // metadata first + TupleMetadata expectedSchema = new SchemaBuilder() + .add("col", MinorType.INT) + .buildSchema(); + + RowSet expected = client.rowSetBuilder(expectedSchema) + .addRow(0x00000101) + .addRow(0x00000102) + .addRow(0x00000103) + .addRow(0x00000104) + .addRow(0x00000105) + .addRow(0x00000106) + .build(); + + new RowSetComparison(expected).verifyAndClearAll(results); + } + + @Test + public void 
testComplexQuery1() throws Exception { + + QueryBuilder qb = client.queryBuilder(); + QueryBuilder query = qb.sql(selectRow("complex1", "data02Int.dat")); + RowSet results = query.rowSet(); + results.print(); + assertEquals(1, results.rowCount()); + + RowSetReader rdr = results.reader(); + rdr.next(); + String col = rdr.getAsString(); + assertEquals("{257, 258}", col); + assertFalse(rdr.next()); + results.clear(); + } + + @Test + public void testComplexQuery2() throws Exception { + + QueryBuilder qb = client.queryBuilder(); + QueryBuilder query = qb.sql(selectRow("complex1", "data06Int.dat")); + RowSet results = query.rowSet(); + results.print(); + assertEquals(3, results.rowCount()); + + RowSetReader rdr = results.reader(); + rdr.next(); + String map = rdr.getAsString(); + assertEquals("{257, 258}", map); + rdr.next(); + map = rdr.getAsString(); + assertEquals("{259, 260}", map); + rdr.next(); + map = rdr.getAsString(); + assertEquals("{261, 262}", map); + assertFalse(rdr.next()); + results.clear(); + } + + /** + * Tests data which is rows of two ints and an array containing a + * map containing two ints. 
+ * Each row can be visualized like this: "{257, 258, [{259, 260},...]}" + * @throws Exception + */ + @Test + public void testComplexArrayQuery1() throws Exception { + + QueryBuilder qb = client.queryBuilder(); + QueryBuilder query = qb.sql(selectRow("complexArray1", "data12Int.dat")); + RowSet results = query.rowSet(); + results.print(); + assertEquals(1, results.rowCount()); + + RowSetReader rdr = results.reader(); + rdr.next(); + String map = rdr.getAsString(); + assertEquals("{257, 258, [{259, 260}, {261, 262}, {257, 258}, {259, 260}, {261, 262}]}", map); + assertFalse(rdr.next()); + results.clear(); + } + + /** + * Tests data which is an array of ints in one column of the row set + * @throws Exception + */ + @Test + public void testSimpleArrayQuery1() throws Exception { + + QueryBuilder qb = client.queryBuilder(); + QueryBuilder query = qb.sql(selectRow("simpleArrayField1", "data12Int.dat")); + RowSet results = query.rowSet(); + results.print(); + assertEquals(1, results.rowCount()); + + RowSetReader rdr = results.reader(); + rdr.next(); + String map = rdr.getAsString(); + assertEquals("{[257, 258, 259, 260, 261, 262, 257, 258, 259, 260, 261, 262]}", map); + assertFalse(rdr.next()); + results.clear(); + } + + /** + * Tests data which is rows of two ints and an array containing a + * map containing an int and a vector of ints. 
+ * Each row can be visualized like this: "{257, 258, [{259, [260, 261, 262]},...]}" + * @throws Exception + */ + @Test + public void testComplexArrayQuery2() throws Exception { + + QueryBuilder qb = client.queryBuilder(); + QueryBuilder query = qb.sql(selectRow("complexArray2", "data12Int.dat")); + RowSet results = query.rowSet(); + results.print(); + assertEquals(1, results.rowCount()); + + RowSetReader rdr = results.reader(); + rdr.next(); + String map = rdr.getAsString(); + assertEquals("{257, 258, [{259, [260, 261, 262]}, {257, [258, 259, 260]}, {261, [262]}]}", map); + assertFalse(rdr.next()); + results.clear(); + } + + @Test + public void testMoreTypes1() throws Exception { + + QueryBuilder qb = client.queryBuilder(); + QueryBuilder query = qb.sql(selectRow("moreTypes1", "moreTypes1.txt.dat")); + RowSet results = query.rowSet(); + results.print(); + assertEquals(2, results.rowCount()); + + RowSetReader rdr = results.reader(); + rdr.next(); + String map = rdr.getAsString(); + assertEquals("{2147483647, 9223372036854775807, 32767, 127, true, " + + "1.7976931348623157E308, 3.4028235E38, [31, 32, 33, 34, 35, 36, 37, 38], \"daffodil\"}", map); + rdr.next(); + map = rdr.getAsString(); + assertEquals("{-2147483648, -9223372036854775808, -32768, -128, false, " + + "-1.7976931348623157E308, -3.4028235E38, [38, 37, 36, 35, 34, 33, 32, 31], \"drill\"}", map); + assertFalse(rdr.next()); + results.clear(); + } +} diff --git a/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/schema/TestDaffodilToDrillMetadataConversion.java b/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/schema/TestDaffodilToDrillMetadataConversion.java new file mode 100644 index 00000000000..8076fe5e8cf --- /dev/null +++ b/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/schema/TestDaffodilToDrillMetadataConversion.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.drill.exec.store.daffodil.schema; + +import org.apache.drill.common.types.TypeProtos.MinorType; +import org.apache.drill.exec.record.metadata.SchemaBuilder; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.junit.Test; + +import java.net.URI; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestDaffodilToDrillMetadataConversion { + + @Test + public void testSimple() throws Exception { + URI schemaURI = getClass().getResource("/schema/simple.dfdl.xsd").toURI(); + TupleMetadata schema = DrillDaffodilSchemaUtils.processSchema(schemaURI, "row", null); + + TupleMetadata expectedSchema = new SchemaBuilder() + .add("col", MinorType.INT) + .buildSchema(); + assertTrue(expectedSchema.isEquivalent(schema)); + } + + @Test + public void testComplex1() throws Exception { + URI schemaURI = getClass().getResource("/schema/complex1.dfdl.xsd").toURI(); + TupleMetadata schema = DrillDaffodilSchemaUtils.processSchema(schemaURI, "row", null); + + TupleMetadata expectedSchema = new SchemaBuilder() + .add("a1", MinorType.INT) + .add("a2", MinorType.INT) + .buildSchema(); + assertTrue(expectedSchema.isEquivalent(schema)); + } + + @Test + public void testComplex2() throws Exception { + URI 
schemaURI = getClass().getResource("/schema/complex2.dfdl.xsd").toURI(); + TupleMetadata schema = DrillDaffodilSchemaUtils.processSchema(schemaURI, "row", null); + + TupleMetadata expectedSchema = new SchemaBuilder() + .add("a1", MinorType.INT) + .add("a2", MinorType.INT) + .addMap("b") + .add("b1", MinorType.INT) + .add("b2", MinorType.INT) + .resumeSchema() + .buildSchema(); + assertTrue(expectedSchema.isEquivalent(schema)); + } + +} diff --git a/contrib/format-daffodil/src/test/resources/data/data01Int.dat b/contrib/format-daffodil/src/test/resources/data/data01Int.dat new file mode 100644 index 0000000000000000000000000000000000000000..dee9c4c8adafc2e6af2ee0379ef082b7b43cd95f GIT binary patch literal 4 LcmZQzU}OXU00#gA literal 0 HcmV?d00001 diff --git a/contrib/format-daffodil/src/test/resources/data/data01Int.dat.gz b/contrib/format-daffodil/src/test/resources/data/data01Int.dat.gz new file mode 100644 index 0000000000000000000000000000000000000000..5e4b3b37acf4947e8f87c4a6e483941bbec8cbd2 GIT binary patch literal 35 qcmb2|=HQqVVw1|iT%4I(kdvyHl32o!oRE;h!f^Utgg*-d0|NlNJ_;28 literal 0 HcmV?d00001 diff --git a/contrib/format-daffodil/src/test/resources/data/data02Int.dat b/contrib/format-daffodil/src/test/resources/data/data02Int.dat new file mode 100644 index 0000000000000000000000000000000000000000..8577a2591813ab0b81c36ace127183058a4b462a GIT binary patch literal 8 PcmZQzU}R)qU}ORS02crT literal 0 HcmV?d00001 diff --git a/contrib/format-daffodil/src/test/resources/data/data06Int.dat b/contrib/format-daffodil/src/test/resources/data/data06Int.dat new file mode 100644 index 0000000000000000000000000000000000000000..8c29db2ec519e3a9f2b6ea1d7f2f6d1436e43d97 GIT binary patch literal 24 ZcmZQzU}R)qU}OSfW*}w(Vpbq#0{{U502}}S literal 0 HcmV?d00001 diff --git a/contrib/format-daffodil/src/test/resources/data/data12Int.dat b/contrib/format-daffodil/src/test/resources/data/data12Int.dat new file mode 100644 index 
0000000000000000000000000000000000000000..39fa5271b4f6b364acb6d2b8008168d044c6b69a GIT binary patch literal 48 ccmZQzU}R)qU}OSfW*}w(Vpbq#17fV=00fEvHvj+t literal 0 HcmV?d00001 diff --git a/contrib/format-daffodil/src/test/resources/data/moreTypes1.txt.dat b/contrib/format-daffodil/src/test/resources/data/moreTypes1.txt.dat new file mode 100644 index 00000000000..cd0b5c1591a --- /dev/null +++ b/contrib/format-daffodil/src/test/resources/data/moreTypes1.txt.dat @@ -0,0 +1,2 @@ +2147483647 9223372036854775807 32767 127 T 1.7976931348623157E308 3.4028235E38 12345678 'daffodil' +-2147483648 -9223372036854775808 -32768 -128 F -1.7976931348623157E308 -3.4028235E38 87654321 'drill' diff --git a/contrib/format-daffodil/src/test/resources/schema/complex1.dfdl.xsd b/contrib/format-daffodil/src/test/resources/schema/complex1.dfdl.xsd new file mode 100644 index 00000000000..1f4ab8954ef --- /dev/null +++ b/contrib/format-daffodil/src/test/resources/schema/complex1.dfdl.xsd @@ -0,0 +1,54 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/contrib/format-daffodil/src/test/resources/schema/complex2.dfdl.xsd b/contrib/format-daffodil/src/test/resources/schema/complex2.dfdl.xsd new file mode 100644 index 00000000000..d4d74aefe1f --- /dev/null +++ b/contrib/format-daffodil/src/test/resources/schema/complex2.dfdl.xsd @@ -0,0 +1,65 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/contrib/format-daffodil/src/test/resources/schema/complexArray1.dfdl.xsd b/contrib/format-daffodil/src/test/resources/schema/complexArray1.dfdl.xsd new file mode 100644 index 00000000000..59d285bd31e --- /dev/null +++ b/contrib/format-daffodil/src/test/resources/schema/complexArray1.dfdl.xsd @@ -0,0 +1,65 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/contrib/format-daffodil/src/test/resources/schema/complexArray2.dfdl.xsd 
b/contrib/format-daffodil/src/test/resources/schema/complexArray2.dfdl.xsd new file mode 100644 index 00000000000..89d74fa2de9 --- /dev/null +++ b/contrib/format-daffodil/src/test/resources/schema/complexArray2.dfdl.xsd @@ -0,0 +1,65 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/contrib/format-daffodil/src/test/resources/schema/moreTypes1.dfdl.xsd b/contrib/format-daffodil/src/test/resources/schema/moreTypes1.dfdl.xsd new file mode 100644 index 00000000000..b1a8085a9bd --- /dev/null +++ b/contrib/format-daffodil/src/test/resources/schema/moreTypes1.dfdl.xsd @@ -0,0 +1,70 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/contrib/format-daffodil/src/test/resources/schema/simple.dfdl.xsd b/contrib/format-daffodil/src/test/resources/schema/simple.dfdl.xsd new file mode 100644 index 00000000000..eea582b9a56 --- /dev/null +++ b/contrib/format-daffodil/src/test/resources/schema/simple.dfdl.xsd @@ -0,0 +1,71 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/contrib/format-daffodil/src/test/resources/schema/simpleArrayField1.dfdl.xsd b/contrib/format-daffodil/src/test/resources/schema/simpleArrayField1.dfdl.xsd new file mode 100644 index 00000000000..6c72c375159 --- /dev/null +++ b/contrib/format-daffodil/src/test/resources/schema/simpleArrayField1.dfdl.xsd @@ -0,0 +1,71 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/contrib/pom.xml b/contrib/pom.xml index 59747eba87f..7379fd37325 100644 --- a/contrib/pom.xml +++ b/contrib/pom.xml @@ -42,6 +42,7 @@ data format-access + format-daffodil format-deltalake format-esri format-excel diff --git a/distribution/pom.xml b/distribution/pom.xml index c5c1d678099..560bfaa3780 100644 --- a/distribution/pom.xml +++ b/distribution/pom.xml @@ -468,6 +468,11 @@ drill-format-log ${project.version} + + org.apache.drill.contrib + drill-format-daffodil + ${project.version} 
+ org.apache.drill.contrib drill-druid-storage diff --git a/distribution/src/assemble/component.xml b/distribution/src/assemble/component.xml index 48a6360d8a9..06b71fee826 100644 --- a/distribution/src/assemble/component.xml +++ b/distribution/src/assemble/component.xml @@ -29,6 +29,7 @@ org.apache.drill.contrib.data:tpch-sample-data:jar org.apache.drill.contrib:drill-deltalake-format:jar org.apache.drill.contrib:drill-druid-storage:jar + org.apache.drill.contrib:drill-format-daffodil:jar org.apache.drill.contrib:drill-format-esri:jar org.apache.drill.contrib:drill-format-excel:jar org.apache.drill.contrib:drill-format-hdf5:jar diff --git a/pom.xml b/pom.xml index b787756ab80..558f9a37a7e 100644 --- a/pom.xml +++ b/pom.xml @@ -133,6 +133,7 @@ 4.3.3 2.1.12 + 3.7.0-SNAPSHOT @@ -317,6 +318,11 @@ logback-core ${logback.version} + + org.apache.daffodil + daffodil-japi_2.12 + ${daffodil.version} + From 13183ac4ac8c28cabe8d9640e5daf05002c1059a Mon Sep 17 00:00:00 2001 From: Michael Beckerle Date: Thu, 4 Jan 2024 12:08:32 -0500 Subject: [PATCH 2/8] Uses ScalarWriter now for typed setters for all DFDL types Date, Time, DateTime, Boolean, Unsigned integers, Integer, NonNegativeInteger,Decimal, float, double, hexBinary. 
--- .../DaffodilDrillInfosetOutputter.java | 174 ++++++++++++++---- .../schema/DrillDaffodilSchemaUtils.java | 23 ++- .../store/daffodil/TestDaffodilReader.java | 20 ++ .../test/resources/data/moreTypes2.txt.dat | 1 + .../test/resources/schema/moreTypes1.dfdl.xsd | 7 +- .../test/resources/schema/moreTypes2.dfdl.xsd | 64 +++++++ 6 files changed, 240 insertions(+), 49 deletions(-) create mode 100644 contrib/format-daffodil/src/test/resources/data/moreTypes2.txt.dat create mode 100644 contrib/format-daffodil/src/test/resources/schema/moreTypes2.dfdl.xsd diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java index df9aebf08f4..409cc1eb5f0 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java @@ -17,6 +17,8 @@ */ package org.apache.drill.exec.store.daffodil; +import com.ibm.icu.util.Calendar; +import com.ibm.icu.util.TimeZone; import org.apache.daffodil.runtime1.api.ComplexElementMetadata; import org.apache.daffodil.runtime1.api.ElementMetadata; import org.apache.daffodil.runtime1.api.InfosetArray; @@ -32,11 +34,17 @@ import org.apache.drill.exec.vector.accessor.ArrayWriter; import org.apache.drill.exec.vector.accessor.ColumnWriter; import org.apache.drill.exec.vector.accessor.ObjectType; +import org.apache.drill.exec.vector.accessor.ScalarWriter; import org.apache.drill.exec.vector.accessor.TupleWriter; import org.apache.drill.exec.vector.complex.writer.BaseWriter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.math.BigDecimal; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalTime; +import java.time.ZoneId; import java.util.Stack; /** @@ -122,7 +130,7 @@ 
public void startSimple(InfosetSimpleElement ise) { assert (!isRootElement); ElementMetadata md = ise.metadata(); String colName = colName(md); - ColumnWriter cw; + ScalarWriter cw; if (md.isArray()) { // A simple type array assert(!arrayWriterStack.isEmpty()); @@ -131,7 +139,7 @@ public void startSimple(InfosetSimpleElement ise) { // A simple element within a map // Note the map itself might be an array // but we don't care about that here. - cw = currentTupleWriter().column(colName); + cw = currentTupleWriter().scalar(colName); } ColumnMetadata cm = cw.schema(); assert(cm.isScalar()); @@ -216,68 +224,160 @@ public void endArray(InfosetArray ia) { arrayWriterStack.pop(); } - private void convertDaffodilValueToDrillValue(InfosetSimpleElement ise, ColumnMetadata cm, ColumnWriter cw) { + private void invariantFailed(String dafTypeName, ColumnMetadata cm) { + String msg = String.format("Daffodil to Drill Conversion Invariant Failed: dafType %s, drill type %s.", dafTypeName, cm.typeString()); + logger.error(msg); + fatalError(msg); + } + + private void convertDaffodilValueToDrillValue(InfosetSimpleElement ise, ColumnMetadata cm, ScalarWriter cw) { PrimitiveType dafType = ise.metadata().primitiveType(); + String dafTypeName = dafType.name(); TypeProtos.MinorType drillType = DrillDaffodilSchemaUtils.getDrillDataType(dafType); assert(drillType == cm.type()); switch (drillType) { - case INT: { - // - // FIXME: Javadoc for setObject says "primarily for testing" - // So how are we supposed to assign the column value then? - // Is there a way to get from a ColumnWriter to a typed scalar writer (downcast perhaps?) - cw.setObject(ise.getInt()); + case BIGINT: { // not a bignum, BIGINT is a signed 8-byte long in Drill. 
+ switch (dafTypeName) { + case "unsignedInt": { + cw.setLong(ise.getUnsignedInt()); + break; + } + case "long": { + cw.setLong(ise.getLong()); + break; + } + default: invariantFailed(dafTypeName, cm); + } break; } - case BIGINT: { - cw.setObject(ise.getLong()); + case INT: { + cw.setInt(ise.getInt()); break; } case SMALLINT: { - cw.setObject(ise.getShort()); + cw.setInt(ise.getShort()); // there is no setShort break; } case TINYINT: { - cw.setObject(ise.getByte()); + cw.setInt(ise.getByte()); // there is no setByte + break; + } + case UINT4: { + // daffodil represents unsigned int as long. + // drill represents unsigned int as int. + cw.setInt(ise.getUnsignedInt().intValue()); + break; + } + case UINT2: { + cw.setInt(ise.getUnsignedShort()); + break; + } + case UINT1: { + cw.setInt(ise.getUnsignedByte()); + break; + } + case VARDECIMAL: { + switch (dafTypeName) { + case "unsignedLong": { + cw.setDecimal(new BigDecimal(ise.getUnsignedLong())); + break; + } + case "integer": { + cw.setDecimal(new BigDecimal(ise.getInteger())); + break; + } + case "nonNegativeInteger": { + cw.setDecimal(new BigDecimal(ise.getNonNegativeInteger())); + break; + } + default: invariantFailed(dafTypeName, cm); + } break; } -// .put("UNSIGNEDLONG", TypeProtos.MinorType.UINT8) -// .put("UNSIGNEDINT", TypeProtos.MinorType.UINT4) -// .put("UNSIGNEDSHORT", TypeProtos.MinorType.UINT2) -// .put("UNSIGNEDBYTE", TypeProtos.MinorType.UINT1) -// .put("INTEGER", TypeProtos.MinorType.BIGINT) -// .put("NONNEGATIVEINTEGER", TypeProtos.MinorType.BIGINT) case BIT: { - cw.setObject(ise.getBoolean()); + cw.setBoolean(ise.getBoolean()); break; } -// .put("DATE", TypeProtos.MinorType.DATE) // requires conversion -// .put("DATETIME", TypeProtos.MinorType.TIMESTAMP) // requires conversion -// .put("DECIMAL", TypeProtos.MinorType.VARDECIMAL) // requires conversion (maybe) case FLOAT8: { - cw.setObject(ise.getDouble()); + switch (dafTypeName) { + case "double": { + cw.setDouble(ise.getDouble()); + break; + } + 
case "float": { + // converting a float to a double by doubleValue() fails here + // Float.MaxValue converted to a double via doubleValue() + // then placed in a FLOAT8 column displays as + // 3.4028234663852886E38 not 3.4028235E38. + // But converting to string first, then to double works properly. + cw.setDouble(Double.valueOf(ise.getFloat().toString())); + break; + } + default: + invariantFailed(dafTypeName, cm); + } break; } case FLOAT4: { - cw.setObject(ise.getFloat()); + // we don't use float4, we always use float8. + invariantFailed(dafTypeName, cm); + // cw.setFloat(ise.getFloat()); break; } case VARBINARY: { - cw.setObject(ise.getHexBinary()); + byte[] hexBinary = ise.getHexBinary(); + cw.setBytes(hexBinary, hexBinary.length); break; } case VARCHAR: { - // - // FIXME: VARCHAR is defined in drill as utf8 string. - // Is Drill expecting something other than a Java string in this setObject call? - // Should we be mapping Daffodil strings to Drill VAR16CHAR type? - // - String s = ise.getString(); - cw.setObject(s); + switch (dafTypeName) { + case "decimal": { + BigDecimal decimal = ise.getDecimal(); + cw.setString(decimal.toString()); + break; + } + case "string": { + String s = ise.getString(); + cw.setString(s); + break; + } + default: + invariantFailed(dafTypeName, cm); + } break; } -// .put("TIME", TypeProtos.MinorType.TIME) // requires conversion - + case TIME: { + Calendar icuCal = ise.getTime(); + Instant instant = Instant.ofEpochMilli(icuCal.getTimeInMillis()); + TimeZone icuZone = icuCal.getTimeZone(); + String zoneString = icuZone.getID(); + ZoneId zoneId = ZoneId.of(zoneString); + LocalTime localTime = instant.atZone(zoneId).toLocalTime(); + cw.setTime(localTime); + break; + } + case DATE: { + Calendar icuCalendar = ise.getDate(); + // Extract year, month, and day from ICU Calendar + int year = icuCalendar.get(Calendar.YEAR); + // Note: ICU Calendar months are zero-based, similar to java.util.Calendar + int month = icuCalendar.get(Calendar.MONTH) + 
1; + int day = icuCalendar.get(Calendar.DAY_OF_MONTH); + // Create a LocalDate + LocalDate localDate = LocalDate.of(year, month, day); + cw.setDate(localDate); + break; + } + case TIMESTAMP: { + Calendar icuCalendar = ise.getDateTime(); + // Get time in milliseconds from the epoch + long millis = icuCalendar.getTimeInMillis(); + // Create an Instant from milliseconds + Instant instant = Instant.ofEpochMilli(millis); + cw.setTimestamp(instant); + break; + } + default: invariantFailed(dafTypeName, cm); } } @@ -286,11 +386,11 @@ private void DFDLParseError(String s) { } private static void nyi() { - throw new RuntimeException("not yet implemented."); + throw new IllegalStateException("not yet implemented."); } private static void fatalError(String s) { - throw new RuntimeException(s); + throw new IllegalStateException(s); } } diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java index 7a1c6314578..a901cb15cc7 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java @@ -47,18 +47,29 @@ public class DrillDaffodilSchemaUtils { .put("INT", MinorType.INT) .put("SHORT", MinorType.SMALLINT) .put("BYTE", MinorType.TINYINT) - .put("UNSIGNEDLONG", MinorType.UINT8) - .put("UNSIGNEDINT", MinorType.UINT4) + // daffodil unsigned longs are modeled as DECIMAL(38, 0) which is the default for VARDECIMAL + .put("UNSIGNEDLONG", MinorType.VARDECIMAL) + .put("UNSIGNEDINT", MinorType.BIGINT) .put("UNSIGNEDSHORT", MinorType.UINT2) .put("UNSIGNEDBYTE", MinorType.UINT1) - .put("INTEGER", MinorType.BIGINT) - .put("NONNEGATIVEINTEGER", MinorType.BIGINT) + // daffodil integer, nonNegativeInteger, are modeled as DECIMAL(38, 0) which 
is the default for VARDECIMAL + .put("INTEGER", MinorType.VARDECIMAL) + .put("NONNEGATIVEINTEGER", MinorType.VARDECIMAL) + // decimal has to be modeled as string since we really have no idea what to set the + // scale to. + .put("DECIMAL", MinorType.VARCHAR) .put("BOOLEAN", MinorType.BIT) .put("DATE", MinorType.DATE) // requires conversion .put("DATETIME", MinorType.TIMESTAMP) // requires conversion - .put("DECIMAL", MinorType.VARDECIMAL) // requires conversion (maybe) .put("DOUBLE", MinorType.FLOAT8) - .put("FLOAT", MinorType.FLOAT4) + // + // daffodil float type is mapped to double aka Float8 in drill because there + // seems to be bugs in FLOAT4. Float.MaxValue in a Float4 column displays as + // 3.4028234663852886E38 not 3.4028235E38. + // + // We don't really care about single float precision, so we just use double precision. + // + .put("FLOAT", MinorType.FLOAT8) .put("HEXBINARY", MinorType.VARBINARY) .put("STRING", MinorType.VARCHAR) .put("TIME", MinorType.TIME) // requires conversion diff --git a/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java b/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java index 7df13619324..8bedadbb787 100644 --- a/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java +++ b/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java @@ -253,4 +253,24 @@ public void testMoreTypes1() throws Exception { assertFalse(rdr.next()); results.clear(); } + + @Test + public void testMoreTypes2() throws Exception { + + QueryBuilder qb = client.queryBuilder(); + QueryBuilder query = qb.sql(selectRow("moreTypes2", "moreTypes2.txt.dat")); + RowSet results = query.rowSet(); + results.print(); + assertEquals(1, results.rowCount()); + + RowSetReader rdr = results.reader(); + rdr.next(); + String map = rdr.getAsString(); + assertEquals("{4294967295, 18446744073709551615, 
65535, 255, " + + "-18446744073709551616, 18446744073709551616, " + + "\"0.18446744073709551616\", " + // xs:decimal is modeled as VARCHAR i.e., a string. So needs quotation marks. + "1970-01-01, 00:00, 1970-01-01T00:00:00Z}", map); + assertFalse(rdr.next()); + results.clear(); + } } diff --git a/contrib/format-daffodil/src/test/resources/data/moreTypes2.txt.dat b/contrib/format-daffodil/src/test/resources/data/moreTypes2.txt.dat new file mode 100644 index 00000000000..fca536e8543 --- /dev/null +++ b/contrib/format-daffodil/src/test/resources/data/moreTypes2.txt.dat @@ -0,0 +1 @@ +4294967295 18446744073709551615 65535 255 -18446744073709551616 18446744073709551616 0.18446744073709551616 1970-01-01 00:00:00+0000 1970-01-01T00:00:00 diff --git a/contrib/format-daffodil/src/test/resources/schema/moreTypes1.dfdl.xsd b/contrib/format-daffodil/src/test/resources/schema/moreTypes1.dfdl.xsd index b1a8085a9bd..a294e29e548 100644 --- a/contrib/format-daffodil/src/test/resources/schema/moreTypes1.dfdl.xsd +++ b/contrib/format-daffodil/src/test/resources/schema/moreTypes1.dfdl.xsd @@ -55,12 +55,7 @@ - - + diff --git a/contrib/format-daffodil/src/test/resources/schema/moreTypes2.dfdl.xsd b/contrib/format-daffodil/src/test/resources/schema/moreTypes2.dfdl.xsd new file mode 100644 index 00000000000..00bd3d7a155 --- /dev/null +++ b/contrib/format-daffodil/src/test/resources/schema/moreTypes2.dfdl.xsd @@ -0,0 +1,64 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 225504a06f0c452a3aaaffd80b93e85cadd7c93d Mon Sep 17 00:00:00 2001 From: Michael Beckerle Date: Thu, 4 Jan 2024 17:24:47 -0500 Subject: [PATCH 3/8] Code reformatting and reorg I imported the dev-support/formatter/eclipse settings and used them to reformat the code in IntelliJ IDEA. No functional changes in this commit. 
--- .../store/daffodil/DaffodilBatchReader.java | 61 +++--- .../DaffodilDrillInfosetOutputter.java | 114 ++++++----- .../store/daffodil/DaffodilFormatConfig.java | 32 ++-- .../store/daffodil/DaffodilFormatPlugin.java | 53 +++--- .../store/daffodil/DaffodilMessageParser.java | 178 +++++++++--------- .../schema/DaffodilDataProcessorFactory.java | 82 ++++---- .../schema/DrillDaffodilSchemaVisitor.java | 76 ++++---- .../store/daffodil/TestDaffodilReader.java | 81 ++++---- 8 files changed, 328 insertions(+), 349 deletions(-) diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilBatchReader.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilBatchReader.java index 20a0d81d4e0..455a7c35721 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilBatchReader.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilBatchReader.java @@ -18,11 +18,6 @@ package org.apache.drill.exec.store.daffodil; -import java.io.InputStream; -import java.net.URI; -import java.net.URISyntaxException; -import java.util.Objects; - import org.apache.daffodil.japi.DataProcessor; import org.apache.drill.common.AutoCloseables; import org.apache.drill.common.exceptions.CustomErrorContext; @@ -39,8 +34,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils.daffodilDataProcessorToDrillSchema; +import java.io.InputStream; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.Objects; +import static org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils.daffodilDataProcessorToDrillSchema; public class DaffodilBatchReader implements ManagedReader { @@ -51,14 +50,8 @@ public class DaffodilBatchReader implements ManagedReader { private final DaffodilMessageParser dafParser; private final InputStream dataInputStream; - static 
class DaffodilReaderConfig { - final DaffodilFormatPlugin plugin; - DaffodilReaderConfig(DaffodilFormatPlugin plugin) { - this.plugin = plugin; - } - } - - public DaffodilBatchReader (DaffodilReaderConfig readerConfig, EasySubScan scan, FileSchemaNegotiator negotiator) { + public DaffodilBatchReader(DaffodilReaderConfig readerConfig, EasySubScan scan, + FileSchemaNegotiator negotiator) { errorContext = negotiator.parentErrorContext(); this.dafConfig = readerConfig.plugin.getConfig(); @@ -72,15 +65,13 @@ public DaffodilBatchReader (DaffodilReaderConfig readerConfig, EasySubScan scan, try { dfdlSchemaURI = new URI(schemaURIString); } catch (URISyntaxException e) { - throw UserException.validationError(e) - .build(logger); + throw UserException.validationError(e).build(logger); } FileDescrip file = negotiator.file(); DrillFileSystem fs = file.fileSystem(); URI fsSchemaURI = fs.getUri().resolve(dfdlSchemaURI); - DaffodilDataProcessorFactory dpf = new DaffodilDataProcessorFactory(); DataProcessor dp; try { @@ -126,22 +117,23 @@ public DaffodilBatchReader (DaffodilReaderConfig readerConfig, EasySubScan scan, dafParser.setInputStream(dataInputStream); } - /** * This is the core of actual processing - data movement from Daffodil to Drill. *

- * If there is space in the batch, and there is data available to parse - * then this calls the daffodil parser, which parses data, delivering it to the rowWriter - * by way of the infoset outputter. + * If there is space in the batch, and there is data available to parse then this calls the + * daffodil parser, which parses data, delivering it to the rowWriter by way of the infoset + * outputter. *

- * Repeats until the rowWriter is full (a batch is full), or there is no more data, or - * a parse error ends execution with a throw. + * Repeats until the rowWriter is full (a batch is full), or there is no more data, or a parse + * error ends execution with a throw. *

- * Validation errors and other warnings are not errors and are logged but do not cause - * parsing to fail/throw. - * @return true if there are rows retrieved, false if no rows were retrieved, which means - * no more will ever be retrieved (end of data). - * @throws RuntimeException on parse errors. + * Validation errors and other warnings are not errors and are logged but do not cause parsing to + * fail/throw. + * + * @return true if there are rows retrieved, false if no rows were retrieved, which means no more + * will ever be retrieved (end of data). + * @throws RuntimeException + * on parse errors. */ @Override public boolean next() { @@ -157,7 +149,7 @@ public boolean next() { try { dafParser.parse(); if (dafParser.isProcessingError()) { - assert(Objects.nonNull(dafParser.getDiagnostics())); + assert (Objects.nonNull(dafParser.getDiagnostics())); throw UserException.dataReadError().message(dafParser.getDiagnosticsAsString()) .addContext(errorContext).build(logger); } @@ -173,7 +165,8 @@ public boolean next() { rowSetLoader.save(); } int nRows = rowSetLoader.rowCount(); - assert nRows > 0; // This cannot be zero. If the parse failed we will have already thrown out of here. + assert nRows > 0; // This cannot be zero. If the parse failed we will have already thrown out + // of here. 
return true; } @@ -181,4 +174,12 @@ public boolean next() { public void close() { AutoCloseables.closeSilently(dataInputStream); } + + static class DaffodilReaderConfig { + final DaffodilFormatPlugin plugin; + + DaffodilReaderConfig(DaffodilFormatPlugin plugin) { + this.plugin = plugin; + } + } } diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java index 409cc1eb5f0..803f269a166 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java @@ -19,11 +19,11 @@ import com.ibm.icu.util.Calendar; import com.ibm.icu.util.TimeZone; +import org.apache.daffodil.japi.infoset.InfosetOutputter; import org.apache.daffodil.runtime1.api.ComplexElementMetadata; import org.apache.daffodil.runtime1.api.ElementMetadata; import org.apache.daffodil.runtime1.api.InfosetArray; import org.apache.daffodil.runtime1.api.InfosetComplexElement; -import org.apache.daffodil.japi.infoset.InfosetOutputter; import org.apache.daffodil.runtime1.api.InfosetSimpleElement; import org.apache.daffodil.runtime1.api.PrimitiveType; import org.apache.drill.common.types.TypeProtos; @@ -32,11 +32,9 @@ import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils; import org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaVisitor; import org.apache.drill.exec.vector.accessor.ArrayWriter; -import org.apache.drill.exec.vector.accessor.ColumnWriter; import org.apache.drill.exec.vector.accessor.ObjectType; import org.apache.drill.exec.vector.accessor.ScalarWriter; import org.apache.drill.exec.vector.accessor.TupleWriter; -import org.apache.drill.exec.vector.complex.writer.BaseWriter; import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; @@ -48,52 +46,53 @@ import java.util.Stack; /** - * Adapts Daffodil parser infoset event calls to Drill writer calls - * to fill in Drill data rows. + * Adapts Daffodil parser infoset event calls to Drill writer calls to fill in Drill data rows. */ -public class DaffodilDrillInfosetOutputter - extends InfosetOutputter { - - private boolean isOriginalRoot() { - boolean result = currentTupleWriter() == rowSetWriter; - if (result) - assert(tupleWriterStack.size() == 1); - return result; - } +public class DaffodilDrillInfosetOutputter extends InfosetOutputter { + private static final Logger logger = LoggerFactory.getLogger(DaffodilDrillInfosetOutputter.class); /** - * True if the next startComplex call will be for the - * DFDL infoset root element whose children are the columns of - * the row set. + * Stack that is used only if we have sub-structures that are not simple-type fields of the row. */ - private boolean isRootElement = true; - + private final Stack tupleWriterStack = new Stack<>(); + private final Stack arrayWriterStack = new Stack<>(); /** - * Stack that is used only if we have sub-structures that are not - * simple-type fields of the row. + * True if the next startComplex call will be for the DFDL infoset root element whose children are + * the columns of the row set. 
*/ - private final Stack tupleWriterStack = new Stack<>(); + private boolean isRootElement = true; + private RowSetLoader rowSetWriter; - private final Stack arrayWriterStack = new Stack<>(); + private DaffodilDrillInfosetOutputter() { + } // no default constructor - private TupleWriter currentTupleWriter() { - return tupleWriterStack.peek(); + public DaffodilDrillInfosetOutputter(RowSetLoader writer) { + this.rowSetWriter = writer; + this.tupleWriterStack.push(writer); } - private ArrayWriter currentArrayWriter() { - return arrayWriterStack.peek(); + private static void nyi() { + throw new IllegalStateException("not yet implemented."); } + private static void fatalError(String s) { + throw new IllegalStateException(s); + } - private static final Logger logger = LoggerFactory.getLogger(DaffodilDrillInfosetOutputter.class); - - private DaffodilDrillInfosetOutputter() {} // no default constructor + private boolean isOriginalRoot() { + boolean result = currentTupleWriter() == rowSetWriter; + if (result) { + assert (tupleWriterStack.size() == 1); + } + return result; + } - private RowSetLoader rowSetWriter; + private TupleWriter currentTupleWriter() { + return tupleWriterStack.peek(); + } - public DaffodilDrillInfosetOutputter(RowSetLoader writer) { - this.rowSetWriter = writer; - this.tupleWriterStack.push(writer); + private ArrayWriter currentArrayWriter() { + return arrayWriterStack.peek(); } @Override @@ -106,9 +105,9 @@ public void reset() { } private void checkCleanState() { - assert(isOriginalRoot()); - assert(arrayWriterStack.isEmpty()); - assert(isRootElement); + assert (isOriginalRoot()); + assert (arrayWriterStack.isEmpty()); + assert (isRootElement); } @Override @@ -133,7 +132,7 @@ public void startSimple(InfosetSimpleElement ise) { ScalarWriter cw; if (md.isArray()) { // A simple type array - assert(!arrayWriterStack.isEmpty()); + assert (!arrayWriterStack.isEmpty()); cw = currentArrayWriter().scalar(); } else { // A simple element within a map @@ -142,7 
+141,7 @@ public void startSimple(InfosetSimpleElement ise) { cw = currentTupleWriter().scalar(colName); } ColumnMetadata cm = cw.schema(); - assert(cm.isScalar()); + assert (cm.isScalar()); if (md.isNillable() && ise.isNilled()) { assert cm.isNullable(); cw.setNull(); @@ -162,7 +161,7 @@ public void startComplex(InfosetComplexElement ce) { ComplexElementMetadata md = ce.metadata(); String colName = colName(ce.metadata()); if (isRootElement) { - assert(isOriginalRoot()); + assert (isOriginalRoot()); // This complex element's corresponds to the root element of the // DFDL schema. We don't treat this as a column of the row set. // Rather, it's children are the columns of the row set. @@ -173,7 +172,7 @@ public void startComplex(InfosetComplexElement ce) { return; } if (md.isArray()) { - assert(!arrayWriterStack.isEmpty()); + assert (!arrayWriterStack.isEmpty()); // FIXME: is this the way to add a complex array child item (i.e., each array item is a map) tupleWriterStack.push(currentArrayWriter().tuple()); } else { @@ -202,9 +201,10 @@ public void endComplex(InfosetComplexElement ce) { public void startArray(InfosetArray diArray) { ElementMetadata md = diArray.metadata(); assert (md.isArray()); - // DFDL has no notion of an array directly within another array. A named field (map) is necessary - // before you can have another array. - assert (currentTupleWriter().type() == ObjectType.TUPLE); // parent is a map, or the top level row. + // DFDL has no notion of an array directly within another array. A named field (map) is + // necessary before you can have another array. + assert (currentTupleWriter().type() == ObjectType.TUPLE); // parent is a map, or the top + // level row. 
String colName = colName(md); TupleWriter enclosingParentTupleWriter = currentTupleWriter(); ArrayWriter aw = enclosingParentTupleWriter.array(colName); @@ -225,16 +225,19 @@ public void endArray(InfosetArray ia) { } private void invariantFailed(String dafTypeName, ColumnMetadata cm) { - String msg = String.format("Daffodil to Drill Conversion Invariant Failed: dafType %s, drill type %s.", dafTypeName, cm.typeString()); + String msg = String.format( + "Daffodil to Drill Conversion Invariant Failed: dafType %s, drill type %s.", dafTypeName, + cm.typeString()); logger.error(msg); fatalError(msg); } - private void convertDaffodilValueToDrillValue(InfosetSimpleElement ise, ColumnMetadata cm, ScalarWriter cw) { + private void convertDaffodilValueToDrillValue(InfosetSimpleElement ise, ColumnMetadata cm, + ScalarWriter cw) { PrimitiveType dafType = ise.metadata().primitiveType(); String dafTypeName = dafType.name(); TypeProtos.MinorType drillType = DrillDaffodilSchemaUtils.getDrillDataType(dafType); - assert(drillType == cm.type()); + assert (drillType == cm.type()); switch (drillType) { case BIGINT: { // not a bignum, BIGINT is a signed 8-byte long in Drill. 
switch (dafTypeName) { @@ -246,7 +249,8 @@ private void convertDaffodilValueToDrillValue(InfosetSimpleElement ise, ColumnMe cw.setLong(ise.getLong()); break; } - default: invariantFailed(dafTypeName, cm); + default: + invariantFailed(dafTypeName, cm); } break; } @@ -290,7 +294,8 @@ private void convertDaffodilValueToDrillValue(InfosetSimpleElement ise, ColumnMe cw.setDecimal(new BigDecimal(ise.getNonNegativeInteger())); break; } - default: invariantFailed(dafTypeName, cm); + default: + invariantFailed(dafTypeName, cm); } break; } @@ -377,20 +382,13 @@ private void convertDaffodilValueToDrillValue(InfosetSimpleElement ise, ColumnMe cw.setTimestamp(instant); break; } - default: invariantFailed(dafTypeName, cm); + default: + invariantFailed(dafTypeName, cm); } } private void DFDLParseError(String s) { throw new RuntimeException(s); } - - private static void nyi() { - throw new IllegalStateException("not yet implemented."); - } - - private static void fatalError(String s) { - throw new IllegalStateException(s); - } } diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatConfig.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatConfig.java index 80e2ba5110d..c4a02f20f84 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatConfig.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatConfig.java @@ -40,19 +40,20 @@ public class DaffodilFormatConfig implements FormatPluginConfig { public final boolean validationMode; public final String rootName; public final String rootNamespace; + /** - * In the constructor for a format config, you should not use - * boxed versions of primitive types. It creates problems with - * defaulting them (they default to null) which cannot be unboxed. + * In the constructor for a format config, you should not use boxed versions of primitive types. 
+ * It creates problems with defaulting them (they default to null) which cannot be unboxed. */ @JsonCreator public DaffodilFormatConfig(@JsonProperty("extensions") List extensions, - @JsonProperty("schemaURI") String schemaURI, - @JsonProperty("rootName") String rootName, - @JsonProperty("rootNamespace") String rootNamespace, - @JsonProperty("validationMode") boolean validationMode) { + @JsonProperty("schemaURI") String schemaURI, @JsonProperty("rootName") String rootName, + @JsonProperty("rootNamespace") String rootNamespace, + @JsonProperty("validationMode") boolean validationMode) { - this.extensions = extensions == null ? Collections.singletonList("dat") : ImmutableList.copyOf(extensions); + this.extensions = extensions == null + ? Collections.singletonList("dat") + : ImmutableList.copyOf(extensions); this.rootName = rootName; this.rootNamespace = rootNamespace; this.schemaURI = schemaURI; @@ -101,19 +102,14 @@ public boolean equals(Object obj) { return false; } DaffodilFormatConfig other = (DaffodilFormatConfig) obj; - return Objects.equals(schemaURI, other.schemaURI) - && Objects.equals(rootName, other.rootName) - && Objects.equals(rootNamespace, other.rootNamespace) - && Objects.equals(validationMode, other.validationMode); + return Objects.equals(schemaURI, other.schemaURI) && Objects.equals(rootName, + other.rootName) && Objects.equals(rootNamespace, other.rootNamespace) && Objects.equals( + validationMode, other.validationMode); } @Override public String toString() { - return new PlanStringBuilder(this) - .field("schemaURI", schemaURI) - .field("rootName", rootName) - .field("rootNamespace", rootNamespace) - .field("validationMode", validationMode) - .toString(); + return new PlanStringBuilder(this).field("schemaURI", schemaURI).field("rootName", rootName) + .field("rootNamespace", rootNamespace).field("validationMode", validationMode).toString(); } } diff --git 
a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatPlugin.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatPlugin.java index 95b5e82aaac..7ee94d12310 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatPlugin.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatPlugin.java @@ -18,7 +18,6 @@ package org.apache.drill.exec.store.daffodil; - import org.apache.drill.common.logical.StoragePluginConfig; import org.apache.drill.common.types.TypeProtos.MinorType; import org.apache.drill.common.types.Types; @@ -31,18 +30,38 @@ import org.apache.drill.exec.store.dfs.easy.EasySubScan; import org.apache.hadoop.conf.Configuration; - public class DaffodilFormatPlugin extends EasyFormatPlugin { public static final String DEFAULT_NAME = "daffodil"; public static final String OPERATOR_TYPE = "DAFFODIL_SUB_SCAN"; + public DaffodilFormatPlugin(String name, DrillbitContext context, Configuration fsConf, + StoragePluginConfig storageConfig, DaffodilFormatConfig formatConfig) { + super(name, easyConfig(fsConf, formatConfig), context, storageConfig, formatConfig); + } + + private static EasyFormatConfig easyConfig(Configuration fsConf, + DaffodilFormatConfig pluginConfig) { + return EasyFormatConfig.builder().readable(true).writable(false).blockSplittable(false) + .compressible(true).extensions(pluginConfig.getExtensions()).fsConf(fsConf) + .readerOperatorType(OPERATOR_TYPE).scanVersion(ScanFrameworkVersion.EVF_V2) + .supportsLimitPushdown(true).supportsProjectPushdown(true) + .defaultName(DaffodilFormatPlugin.DEFAULT_NAME).build(); + } + + @Override + protected void configureScan(FileScanLifecycleBuilder builder, EasySubScan scan) { + builder.nullType(Types.optional(MinorType.VARCHAR)); + builder.readerFactory(new DaffodilReaderFactory(formatConfig.getReaderConfig(this), scan)); + } + public static class 
DaffodilReaderFactory extends FileReaderFactory { private final DaffodilBatchReader.DaffodilReaderConfig readerConfig; private final EasySubScan scan; - public DaffodilReaderFactory(DaffodilBatchReader.DaffodilReaderConfig config, EasySubScan scan) { + public DaffodilReaderFactory(DaffodilBatchReader.DaffodilReaderConfig config, + EasySubScan scan) { this.readerConfig = config; this.scan = scan; } @@ -52,32 +71,4 @@ public ManagedReader newReader(FileSchemaNegotiator negotiator) { return new DaffodilBatchReader(readerConfig, scan, negotiator); } } - - public DaffodilFormatPlugin(String name, DrillbitContext context, - Configuration fsConf, StoragePluginConfig storageConfig, - DaffodilFormatConfig formatConfig) { - super(name, easyConfig(fsConf, formatConfig), context, storageConfig, formatConfig); - } - - private static EasyFormatConfig easyConfig(Configuration fsConf, DaffodilFormatConfig pluginConfig) { - return EasyFormatConfig.builder() - .readable(true) - .writable(false) - .blockSplittable(false) - .compressible(true) - .extensions(pluginConfig.getExtensions()) - .fsConf(fsConf) - .readerOperatorType(OPERATOR_TYPE) - .scanVersion(ScanFrameworkVersion.EVF_V2) - .supportsLimitPushdown(true) - .supportsProjectPushdown(true) - .defaultName(DaffodilFormatPlugin.DEFAULT_NAME) - .build(); - } - - @Override - protected void configureScan(FileScanLifecycleBuilder builder, EasySubScan scan) { - builder.nullType(Types.optional(MinorType.VARCHAR)); - builder.readerFactory(new DaffodilReaderFactory(formatConfig.getReaderConfig(this), scan)); - } } diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilMessageParser.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilMessageParser.java index 67b74d2571a..97f06d7d35e 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilMessageParser.java +++ 
b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilMessageParser.java @@ -17,7 +17,6 @@ */ package org.apache.drill.exec.store.daffodil; - import org.apache.daffodil.japi.DataProcessor; import org.apache.daffodil.japi.Diagnostic; import org.apache.daffodil.japi.ParseResult; @@ -32,50 +31,51 @@ /** * DFDL Daffodil Streaming message parser - *
- * You construct this providing a DataProcessor obtained from the - * DaffodilDataProcessorFactory. - * The DataProcessor contains the compiled DFDL schema, ready to use, as - * well as whether validation while parsing has been requested. - *
- * The DataProcessor object may be shared/reused by multiple threads each of which - * has its own copy of this class. - * This object is, however, stateful, and must not be shared by multiple threads. - *
- * You must call setInputStream, and setInfosetOutputter before - * you call parse(). - * The input stream and the InfosetOutputter objects are also private to one thread and are stateful - * and owned by this object. - * Once you have called setInputStream, you should view the input stream as the private property of - * this object. - * The parse() will invoke the InfosetOutputter's methods to deliver - * parsed data, and it may optionally create diagnostics (obtained via getDiagnostics) - * indicating which kind they are via the getIsProcessingError, getIsValidationError. - *
- * Note that the InfosetOutputter may be called many times before a processing error is detected, - * as Daffodil delivers result data incrementally. - *
+ *

+ * You construct this providing a DataProcessor obtained from the DaffodilDataProcessorFactory. The + * DataProcessor contains the compiled DFDL schema, ready to use, as well as whether validation + * while parsing has been requested. + *

+ * The DataProcessor object may be shared/reused by multiple threads each of which has its own copy + * of this class. This object is, however, stateful, and must not be shared by multiple threads. + *

+ * You must call setInputStream, and setInfosetOutputter before you call parse(). The input stream + * and the InfosetOutputter objects are also private to one thread and are stateful and owned by + * this object. Once you have called setInputStream, you should view the input stream as the private + * property of this object. The parse() will invoke the InfosetOutputter's methods to deliver parsed + * data, and it may optionally create diagnostics (obtained via getDiagnostics) indicating which + * kind they are via the getIsProcessingError, getIsValidationError. + *

+ * Note that the InfosetOutputter may be called many times before a processing error is detected, as + * Daffodil delivers result data incrementally. + *

* Validation errors do not affect the InfosetOutputter output calls, but indicate that data was * detected that is invalid. - *
- * When parse() returns, the parse has ended and one can check for errors/diagnostics. - * One can call parse() again if there is still data to consume, which is checked via the - * isEOF() method. - *
- * There are no guarantees about where the input stream is positioned between parse() calls. - * In particular, it may not be positioned at the start of the next message, as Daffodil may - * have pre-fetched additional bytes from the input stream which it found are not part of the - * current infoset, but the next one. - * The positioning of the input stream may in fact be somewhere in the middle of a byte, - * as Daffodil does not require messages to be of lengths that are in whole byte units. - * Hence, once you give the input stream to this object via setInputStream, that input stream is - * owned privately by this class for ever after. + *

+ * When parse() returns, the parse has ended and one can check for errors/diagnostics. One can call + * parse() again if there is still data to consume, which is checked via the isEOF() method. + *

+ * There are no guarantees about where the input stream is positioned between parse() calls. In + * particular, it may not be positioned at the start of the next message, as Daffodil may have + * pre-fetched additional bytes from the input stream which it found are not part of the current + * infoset, but the next one. The positioning of the input stream may in fact be somewhere in the + * middle of a byte, as Daffodil does not require messages to be of lengths that are in whole byte + * units. Hence, once you give the input stream to this object via setInputStream, that input stream + * is owned privately by this class for ever after. */ public class DaffodilMessageParser { + private static final Logger logger = LoggerFactory.getLogger(DaffodilMessageParser.class); + private List diagnostics; // diagnostics. + private boolean isProcessingError; + private boolean isValidationError; + private InputSourceDataInputStream dis; + private InfosetOutputter outputter; + private DataProcessor dp; + /** - * Constructs the parser using a DataProcessor obtained from - * a DaffodilDataProcessorFactory. + * Constructs the parser using a DataProcessor obtained from a DaffodilDataProcessorFactory. + * * @param dp */ DaffodilMessageParser(DataProcessor dp) { @@ -84,17 +84,16 @@ public class DaffodilMessageParser { /** * Provide the input stream from which data is to be parsed. - *
+ *

* This input stream is then owned by this object and becomes part of its state. - *
- * It is; however, the responsibility of the caller to close this - * input stream after the completion of all parse calls. - * In particular, if a parse error is considered fatal, then - * the caller should close the input stream. - * There are advanced error-recovery techniques that may attempt to find - * data that can be parsed later in the data stream. - * In those cases the input stream would not be closed after a processing error, - * but such usage is beyond the scope of this javadoc. + *

+ * It is, however, the responsibility of the caller to close this input stream after the + * completion of all parse calls. In particular, if a parse error is considered fatal, then the + * caller should close the input stream. There are advanced error-recovery techniques that may + * attempt to find data that can be parsed later in the data stream. In those cases the input + * stream would not be closed after a processing error, but such usage is beyond the scope of this + * javadoc. + * + * @param inputStream + */ public void setInputStream(InputStream inputStream) { @@ -102,8 +101,9 @@ public void setInputStream(InputStream inputStream) { } /** - * Provides the InfosetOutputter which will be called to deliver - * the Infoset via calls to its methods. + * Provides the InfosetOutputter which will be called to deliver the Infoset via calls to its + * methods. + * + * @param outputter */ public void setInfosetOutputter(InfosetOutputter outputter) { @@ -111,17 +111,20 @@ public void setInfosetOutputter(InfosetOutputter outputter) { } /** - * Called to pull messages from the data stream. - * The message 'Infoset' is delivered by way of calls to the InfosetOutputter's methods. - *
+ * Called to pull messages from the data stream. The message 'Infoset' is delivered by way of + * calls to the InfosetOutputter's methods. + *

* After calling this, one may call getIsProcessingError, getIsValiationError, isEOF, and * getDiagnostics. */ public void parse() { - if (dis == null) + if (dis == null) { throw new IllegalStateException("Input stream must be provided by setInputStream() call."); - if (outputter == null) - throw new IllegalStateException("InfosetOutputter must be provided by setInfosetOutputter() call."); + } + if (outputter == null) { + throw new IllegalStateException( + "InfosetOutputter must be provided by setInfosetOutputter() call."); + } reset(); ParseResult res = dp.parse(dis, outputter); @@ -131,13 +134,13 @@ public void parse() { } /** - * True if the input stream is known to contain no more data. - * If the input stream is a true stream, not a file, then temporary unavailability of data - * may cause this call to block until the stream is closed from the other end, or data becomes - * available. - *
- * False if the input stream is at EOF, and no more data can be obtained. - * It is an error to call parse() after isEOF has returned true. + * True if the input stream is known to contain no more data. If the input stream is a true + * stream, not a file, then temporary unavailability of data may cause this call to block until + * the stream is closed from the other end, or data becomes available. + *

+ * False if the input stream is at EOF, and no more data can be obtained. It is an error to call + * parse() after isEOF has returned true. + * * @return */ public boolean isEOF() { @@ -145,50 +148,45 @@ public boolean isEOF() { } /** - * True if the parse() call failed with a processing error. - * This indicates that the data was not well-formed and could not be - * parsed successfully. - *
+ * True if the parse() call failed with a processing error. This indicates that the data was not + * well-formed and could not be parsed successfully. + *

* It is possible for isProcessingError and isValidationError to both be true. + * * @return */ - public boolean isProcessingError() { return isProcessingError; } + public boolean isProcessingError() { + return isProcessingError; + } /** - * True if a validation error occurred during parsing. - * Subsequently to a validation error occurring, parsing may succeed or fail. - * after the validation error was detected. + * True if a validation error occurred during parsing. Subsequently to a validation error + * occurring, parsing may succeed or fail. after the validation error was detected. + * * @return */ - public boolean isValidationError() { return isValidationError; } + public boolean isValidationError() { + return isValidationError; + } /** * After a parse() call this returns null or a list of 1 or more diagnostics. - *
+ *

* If isProcessingError or isValidationError are true, then this will contain at least 1 - * diagnostic. - * If both are true this will contain at least 2 diagnostics. + * diagnostic. If both are true this will contain at least 2 diagnostics. + * * @return */ - public List getDiagnostics() { return diagnostics; } + public List getDiagnostics() { + return diagnostics; + } + public String getDiagnosticsAsString() { - String result = diagnostics.stream() - .map(Diagnostic::getMessage) + String result = diagnostics.stream().map(Diagnostic::getMessage) .collect(Collectors.joining("\n")); return result; } - - private static final Logger logger = LoggerFactory.getLogger(DaffodilMessageParser.class); - - private List diagnostics; // diagnostics. - private boolean isProcessingError; - private boolean isValidationError; - - private InputSourceDataInputStream dis; - private InfosetOutputter outputter; - private DataProcessor dp; - private void reset() { outputter.reset(); isProcessingError = false; diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java index 149e0b2ceec..a7267586948 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java @@ -17,7 +17,6 @@ */ package org.apache.drill.exec.store.daffodil.schema; - import org.apache.daffodil.japi.Compiler; import org.apache.daffodil.japi.Daffodil; import org.apache.daffodil.japi.DataProcessor; @@ -37,9 +36,9 @@ import java.util.Objects; /** - * Compiles a DFDL schema (mostly for tests) or loads a pre-compiled DFDL schema so - * that one can obtain a DataProcessor for use with DaffodilMessageParser. - *
+ * Compiles a DFDL schema (mostly for tests) or loads a pre-compiled DFDL schema so that one can + * obtain a DataProcessor for use with DaffodilMessageParser. + *

* TODO: Needs to use a cache to avoid reloading/recompiling every time. */ public class DaffodilDataProcessorFactory { @@ -49,40 +48,31 @@ public class DaffodilDataProcessorFactory { private DataProcessor dp; - /** - * Thrown if schema compilation fails. - *
- * Contains diagnostic objects which give the cause(s) of the failure. - */ - public static class CompileFailure extends Exception { - List diags; - - CompileFailure(List diagnostics) { - super("DFDL Schema Compile Failure"); - diags = diagnostics; - } - } - /** * Gets a Daffodil DataProcessor given the necessary arguments to compile or reload it. - * @param schemaFileURI pre-compiled dfdl schema (.bin extension) or DFDL schema source (.xsd extension) - * @param validationMode Use true to request Daffodil built-in 'limited' validation. - * Use false for no validation. - * @param rootName Local name of root element of the message. Can be null to use the first element - * declaration of the primary schema file. Ignored if reloading a pre-compiled schema. - * @param rootNS Namespace URI as a string. Can be null to use the target namespace - * of the primary schema file or if it is unambiguous what element is the rootName. - * Ignored if reloading a pre-compiled schema. + * + * @param schemaFileURI + * pre-compiled dfdl schema (.bin extension) or DFDL schema source (.xsd extension) + * @param validationMode + * Use true to request Daffodil built-in 'limited' validation. Use false for no validation. + * @param rootName + * Local name of root element of the message. Can be null to use the first element declaration + * of the primary schema file. Ignored if reloading a pre-compiled schema. + * @param rootNS + * Namespace URI as a string. Can be null to use the target namespace of the primary schema + * file or if it is unambiguous what element is the rootName. Ignored if reloading a + * pre-compiled schema. * @return the DataProcessor - * @throws CompileFailure - if schema compilation fails - * @throws IOException - if the schemaFileURI cannot be opened or is not found. - * @throws URISyntaxException - if the schemaFileURI is not legal syntax. - * @throws InvalidParserException - if the reloading of the parser from pre-compiled binary fails. 
+ * @throws CompileFailure + * - if schema compilation fails + * @throws IOException + * - if the schemaFileURI cannot be opened or is not found. + * @throws URISyntaxException + * - if the schemaFileURI is not legal syntax. + * @throws InvalidParserException + * - if the reloading of the parser from pre-compiled binary fails. */ - public DataProcessor getDataProcessor( - URI schemaFileURI, - boolean validationMode, - String rootName, + public DataProcessor getDataProcessor(URI schemaFileURI, boolean validationMode, String rootName, String rootNS) throws CompileFailure, IOException, URISyntaxException, InvalidParserException { @@ -92,7 +82,8 @@ public DataProcessor getDataProcessor( if (Objects.nonNull(rootName) && !rootName.isEmpty()) { // A usage error. You shouldn't supply the name and optionally namespace if loading // precompiled schema because those are built into it. Should be null or "". - logger.warn("Root element name '{}' is ignored when used with precompiled DFDL schema.", rootName); + logger.warn("Root element name '{}' is ignored when used with precompiled DFDL schema.", + rootName); } dmp.loadSchema(schemaFileURI); dmp.setupDP(validationMode, null); @@ -103,8 +94,7 @@ public DataProcessor getDataProcessor( return dmp.dp; } - private void loadSchema(URI schemaFileURI) - throws IOException, InvalidParserException { + private void loadSchema(URI schemaFileURI) throws IOException, InvalidParserException { Compiler c = Daffodil.compiler(); dp = c.reload(Channels.newChannel(schemaFileURI.toURL().openStream())); } @@ -116,9 +106,7 @@ private List compileSchema(URI schemaFileURI, String rootName, Strin ProcessorFactory pf = c.compileSource(schemaFileURI, rootName, rootNS); List pfDiags = pf.getDiagnostics(); if (pf.isError()) { - pfDiags.forEach(diag -> - logger.error(diag.getSomeMessage()) - ); + pfDiags.forEach(diag -> logger.error(diag.getSomeMessage())); throw new CompileFailure(pfDiags); } dp = pf.onPath("/"); @@ -151,4 +139,18 @@ private void 
setupDP(boolean validationMode, List pfDiags) throws Co compilationWarnings = dpDiags; // dpDiags might be empty. That's ok. } } + + /** + * Thrown if schema compilation fails. + *

+ * Contains diagnostic objects which give the cause(s) of the failure. + */ + public static class CompileFailure extends Exception { + List diags; + + CompileFailure(List diagnostics) { + super("DFDL Schema Compile Failure"); + diags = diagnostics; + } + } } diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java index c199a9635b0..712ca639a78 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java @@ -37,30 +37,30 @@ /** * This class transforms a DFDL/Daffodil schema into a Drill Schema. */ -public class DrillDaffodilSchemaVisitor - extends MetadataHandler { +public class DrillDaffodilSchemaVisitor extends MetadataHandler { private static final Logger logger = LoggerFactory.getLogger(DrillDaffodilSchemaVisitor.class); - /** - * Changes to false after the very first invocation for the root element. + * Unfortunately, SchemaBuilder and MapBuilder, while similar, do not share a base class so we + * have a stack of MapBuilders, and when empty we use the SchemaBuilder */ - private boolean isOriginalRoot = true; - + private final SchemaBuilder builder = new SchemaBuilder(); + private final Stack mapBuilderStack = new Stack<>(); /** - * Unfortunately, SchemaBuilder and MapBuilder, while similar, do not share a base class - * so we have a stack of MapBuilders, and when empty we use the SchemaBuilder + * Changes to false after the very first invocation for the root element. 
*/ - private final SchemaBuilder builder = new SchemaBuilder(); + private boolean isOriginalRoot = true; - private final Stack mapBuilderStack = new Stack<>(); + public static String makeColumnName(ElementMetadata md) { + return md.toQName().replace(":", "_"); + } private MapBuilder mapBuilder() { return mapBuilderStack.peek(); } /** - * Returns a {@link TupleMetadata} representation of the DFDL schema. - * Should only be called after the walk of the DFDL schema with this visitor has been called. + * Returns a {@link TupleMetadata} representation of the DFDL schema. Should only be called after + * the walk of the DFDL schema with this visitor has been called. * * @return A {@link TupleMetadata} representation of the DFDL schema. */ @@ -68,10 +68,6 @@ public TupleMetadata getDrillSchema() { return builder.build(); } - public static String makeColumnName(ElementMetadata md) { - return md.toQName().replace(":", "_"); - } - @Override public void simpleElementMetadata(SimpleElementMetadata md) { assert (!isOriginalRoot); @@ -90,12 +86,13 @@ private void simpleElementAsRowSetColumnMetadata(SimpleElementMetadata md) { // // below code adds to the schema builder directly, not a map builder // - if (md.isArray()) + if (md.isArray()) { builder.addArray(colName, drillType); - else if (md.isOptional() || md.isNillable()) + } else if (md.isOptional() || md.isNillable()) { builder.addNullable(colName, drillType); - else + } else { builder.add(colName, drillType); + } } private void simpleElementWithinComplexElementMetadata(SimpleElementMetadata md) { @@ -127,25 +124,26 @@ public void startComplexElementMetadata(ComplexElementMetadata md) { } /** - * The original root given to Drill needs to be a schema element corresponding - * to one row of data. + * The original root given to Drill needs to be a schema element corresponding to one row of + * data. *

- * Drill will call daffodil parse() to parse one such element. The - * children elements of this element will become the column contents of the - * row. + * Drill will call daffodil parse() to parse one such element. The children elements of this + * element will become the column contents of the row. *

- * So the metadata for this row, to drill, is ONLY the columns of this - * top level element type. + * So the metadata for this row, to drill, is ONLY the columns of this top level element type. + * * @param md */ private void startComplexOriginalRootMetadata(ComplexElementMetadata md) { assert (isOriginalRoot); assert (mapBuilderStack.isEmpty()); isOriginalRoot = false; - if (md instanceof InfosetSimpleElement) + if (md instanceof InfosetSimpleElement) { DFDLSchemaError("Row as a simple type element is not supported."); - if (md.isArray()) + } + if (md.isArray()) { DFDLSchemaError("Row element must not be an array."); + } // // We do nothing else here. Essentially the name of this top level element // is not relevant to drill metadata. Think of it as a table name, or a name for the @@ -154,11 +152,12 @@ private void startComplexOriginalRootMetadata(ComplexElementMetadata md) { /** * This complex type element becomes a column of the row set which is itself a map. + * * @param md */ private void startComplexElementAsRowSetColumnMetadata(ComplexElementMetadata md) { - assert(!isOriginalRoot); - assert(mapBuilderStack.isEmpty()); + assert (!isOriginalRoot); + assert (mapBuilderStack.isEmpty()); String colName = makeColumnName(md); // // This directly adds to the builder, as this complex element itself is a column @@ -166,10 +165,11 @@ private void startComplexElementAsRowSetColumnMetadata(ComplexElementMetadata md // // Then it creates a map builder for recursively adding the contents. 
// - if (md.isArray()) + if (md.isArray()) { mapBuilderStack.push(builder.addMapArray(colName)); - else + } else { mapBuilderStack.push(builder.addMap(colName)); // also handles optional complex elements + } } private void startComplexChildElementMetadata(ComplexElementMetadata md) { @@ -208,16 +208,20 @@ public void endComplexElementMetadata(ComplexElementMetadata md) { } @Override - public void startSequenceMetadata(SequenceMetadata m) {} + public void startSequenceMetadata(SequenceMetadata m) { + } @Override - public void endSequenceMetadata(SequenceMetadata m) {} + public void endSequenceMetadata(SequenceMetadata m) { + } @Override - public void startChoiceMetadata(ChoiceMetadata m) {} + public void startChoiceMetadata(ChoiceMetadata m) { + } @Override - public void endChoiceMetadata(ChoiceMetadata m) {} + public void endChoiceMetadata(ChoiceMetadata m) { + } private void DFDLSchemaError(String s) { throw new RuntimeException(s); diff --git a/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java b/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java index 8bedadbb787..2b1e9820516 100644 --- a/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java +++ b/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java @@ -41,17 +41,13 @@ public class TestDaffodilReader extends ClusterTest { String schemaURIRoot = "file:///opt/drill/contrib/format-daffodil/src/test/resources/"; + @BeforeClass public static void setup() throws Exception { // boilerplate call to start test rig ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); - DaffodilFormatConfig formatConfig = - new DaffodilFormatConfig(null, - "", - "", - "", - false); + DaffodilFormatConfig formatConfig = new DaffodilFormatConfig(null, "", "", "", false); cluster.defineFormat("dfs", "daffodil", formatConfig); @@ -62,19 +58,16 @@ 
public static void setup() throws Exception { } private String selectRow(String schema, String file) { - return "SELECT * FROM table(dfs.`data/" + file + "` " + - " (type => 'daffodil'," + - " validationMode => 'true', " + - " schemaURI => '" + schemaURIRoot + "schema/" + schema + ".dfdl.xsd'," + - " rootName => 'row'," + - " rootNamespace => null " + - "))"; + return "SELECT * FROM table(dfs.`data/" + file + "` " + " (type => 'daffodil'," + " " + + "validationMode => 'true', " + " schemaURI => '" + schemaURIRoot + "schema/" + schema + + ".dfdl.xsd'," + " rootName => 'row'," + " rootNamespace => null " + "))"; } /** * This unit test tests a simple data file * - * @throws Exception Throw exception if anything goes wrong + * @throws Exception + * Throw exception if anything goes wrong */ @Test public void testSimpleQuery1() throws Exception { @@ -87,12 +80,9 @@ public void testSimpleQuery1() throws Exception { // create the expected metadata and data for this test // metadata first - TupleMetadata expectedSchema = new SchemaBuilder() - .add("col", MinorType.INT) - .buildSchema(); + TupleMetadata expectedSchema = new SchemaBuilder().add("col", MinorType.INT).buildSchema(); - RowSet expected = client.rowSetBuilder(expectedSchema) - .addRow(0x00000101) // aka 257 + RowSet expected = client.rowSetBuilder(expectedSchema).addRow(0x00000101) // aka 257 .build(); new RowSetComparison(expected).verifyAndClearAll(results); @@ -102,25 +92,17 @@ public void testSimpleQuery1() throws Exception { public void testSimpleQuery2() throws Exception { QueryBuilder qb = client.queryBuilder(); - QueryBuilder query = qb.sql(selectRow("simple","data06Int.dat")); + QueryBuilder query = qb.sql(selectRow("simple", "data06Int.dat")); RowSet results = query.rowSet(); results.print(); assertEquals(6, results.rowCount()); // create the expected metadata and data for this test // metadata first - TupleMetadata expectedSchema = new SchemaBuilder() - .add("col", MinorType.INT) - .buildSchema(); - - 
RowSet expected = client.rowSetBuilder(expectedSchema) - .addRow(0x00000101) - .addRow(0x00000102) - .addRow(0x00000103) - .addRow(0x00000104) - .addRow(0x00000105) - .addRow(0x00000106) - .build(); + TupleMetadata expectedSchema = new SchemaBuilder().add("col", MinorType.INT).buildSchema(); + + RowSet expected = client.rowSetBuilder(expectedSchema).addRow(0x00000101).addRow(0x00000102) + .addRow(0x00000103).addRow(0x00000104).addRow(0x00000105).addRow(0x00000106).build(); new RowSetComparison(expected).verifyAndClearAll(results); } @@ -166,9 +148,9 @@ public void testComplexQuery2() throws Exception { } /** - * Tests data which is rows of two ints and an array containing a - * map containing two ints. - * Each row can be visualized like this: "{257, 258, [{259, 260},...]}" + * Tests data which is rows of two ints and an array containing a map containing two ints. Each + * row can be visualized like this: "{257, 258, [{259, 260},...]}" + * * @throws Exception */ @Test @@ -190,6 +172,7 @@ public void testComplexArrayQuery1() throws Exception { /** * Tests data which is an array of ints in one column of the row set + * * @throws Exception */ @Test @@ -210,9 +193,10 @@ public void testSimpleArrayQuery1() throws Exception { } /** - * Tests data which is rows of two ints and an array containing a - * map containing an int and a vector of ints. - * Each row can be visualized like this: "{257, 258, [{259, [260, 261, 262]},...]}" + * Tests data which is rows of two ints and an array containing a map containing an int and a + * vector of ints. 
Each row can be visualized like this: "{257, 258, [{259, [260, 261, + * 262]},...]}" + * * @throws Exception */ @Test @@ -244,12 +228,16 @@ public void testMoreTypes1() throws Exception { RowSetReader rdr = results.reader(); rdr.next(); String map = rdr.getAsString(); - assertEquals("{2147483647, 9223372036854775807, 32767, 127, true, " + - "1.7976931348623157E308, 3.4028235E38, [31, 32, 33, 34, 35, 36, 37, 38], \"daffodil\"}", map); + assertEquals( + "{2147483647, 9223372036854775807, 32767, 127, true, " + "1.7976931348623157E308, 3" + + ".4028235E38, [31, 32, 33, 34, 35, 36, 37, 38], \"daffodil\"}", + map); rdr.next(); map = rdr.getAsString(); - assertEquals("{-2147483648, -9223372036854775808, -32768, -128, false, " + - "-1.7976931348623157E308, -3.4028235E38, [38, 37, 36, 35, 34, 33, 32, 31], \"drill\"}", map); + assertEquals( + "{-2147483648, -9223372036854775808, -32768, -128, false, " + "-1.7976931348623157E308, " + + "-3.4028235E38, [38, 37, 36, 35, 34, 33, 32, 31], \"drill\"}", + map); assertFalse(rdr.next()); results.clear(); } @@ -266,10 +254,11 @@ public void testMoreTypes2() throws Exception { RowSetReader rdr = results.reader(); rdr.next(); String map = rdr.getAsString(); - assertEquals("{4294967295, 18446744073709551615, 65535, 255, " + - "-18446744073709551616, 18446744073709551616, " + - "\"0.18446744073709551616\", " + // xs:decimal is modeled as VARCHAR i.e., a string. So needs quotation marks. - "1970-01-01, 00:00, 1970-01-01T00:00:00Z}", map); + assertEquals( + "{4294967295, 18446744073709551615, 65535, 255, " + "-18446744073709551616, " + + "18446744073709551616, " + "\"0.18446744073709551616\", " + // xs:decimal is modeled + // as VARCHAR i.e., a string. So needs quotation marks. 
+ "1970-01-01, 00:00, 1970-01-01T00:00:00Z}", map); assertFalse(rdr.next()); results.clear(); } From b80e74a091754a255a09a807ca59b782496fe100 Mon Sep 17 00:00:00 2001 From: Michael Beckerle Date: Thu, 4 Jan 2024 17:56:15 -0500 Subject: [PATCH 4/8] Remove catches of Exception and test printing Also a few code cleanups. --- .../store/daffodil/DaffodilBatchReader.java | 44 +++++++++---------- .../store/daffodil/DaffodilFormatConfig.java | 4 +- .../store/daffodil/DaffodilFormatPlugin.java | 4 +- .../schema/DaffodilDataProcessorFactory.java | 28 +++++++----- .../store/daffodil/TestDaffodilReader.java | 17 +------ 5 files changed, 41 insertions(+), 56 deletions(-) diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilBatchReader.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilBatchReader.java index 455a7c35721..9a779f335de 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilBatchReader.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilBatchReader.java @@ -34,17 +34,18 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.net.URISyntaxException; import java.util.Objects; +import static org.apache.drill.exec.store.daffodil.schema.DaffodilDataProcessorFactory.*; import static org.apache.drill.exec.store.daffodil.schema.DrillDaffodilSchemaUtils.daffodilDataProcessorToDrillSchema; public class DaffodilBatchReader implements ManagedReader { private static final Logger logger = LoggerFactory.getLogger(DaffodilBatchReader.class); - private final DaffodilFormatConfig dafConfig; private final RowSetLoader rowSetLoader; private final CustomErrorContext errorContext; private final DaffodilMessageParser dafParser; @@ -54,7 +55,7 @@ public DaffodilBatchReader(DaffodilReaderConfig readerConfig, EasySubScan scan, 
FileSchemaNegotiator negotiator) { errorContext = negotiator.parentErrorContext(); - this.dafConfig = readerConfig.plugin.getConfig(); + DaffodilFormatConfig dafConfig = readerConfig.plugin.getConfig(); String schemaURIString = dafConfig.getSchemaURI(); // "schema/complexArray1.dfdl.xsd"; String rootName = dafConfig.getRootName(); @@ -76,7 +77,7 @@ public DaffodilBatchReader(DaffodilReaderConfig readerConfig, EasySubScan scan, DataProcessor dp; try { dp = dpf.getDataProcessor(fsSchemaURI, validationMode, rootName, rootNamespace); - } catch (Exception e) { + } catch (CompileFailure e) { throw UserException.dataReadError(e) .message(String.format("Failed to get Daffodil DFDL processor for: %s", fsSchemaURI)) .addContext(errorContext).addContext(e.getMessage()).build(logger); @@ -99,7 +100,7 @@ public DaffodilBatchReader(DaffodilReaderConfig readerConfig, EasySubScan scan, // convert infoset event calls to fill in a Drill row via a rowSetLoader. DaffodilDrillInfosetOutputter outputter = new DaffodilDrillInfosetOutputter(rowSetLoader); - // Now we can setup the dafParser with the outputter it will drive with + // Now we can set up the dafParser with the outputter it will drive with // the parser-produced infoset. dafParser = new DaffodilMessageParser(dp); // needs further initialization after this. dafParser.setInfosetOutputter(outputter); @@ -108,7 +109,7 @@ public DaffodilBatchReader(DaffodilReaderConfig readerConfig, EasySubScan scan, // Lastly, we open the data stream try { dataInputStream = fs.openPossiblyCompressedStream(dataPath); - } catch (Exception e) { + } catch (IOException e) { throw UserException.dataReadError(e) .message(String.format("Failed to open input file: %s", dataPath.toString())) .addContext(errorContext).addContext(e.getMessage()).build(logger); @@ -146,22 +147,17 @@ public boolean next() { } while (rowSetLoader.start() && !dafParser.isEOF()) { // we never zero-trip this loop. // the predicate is always true once. 
- try { - dafParser.parse(); - if (dafParser.isProcessingError()) { - assert (Objects.nonNull(dafParser.getDiagnostics())); - throw UserException.dataReadError().message(dafParser.getDiagnosticsAsString()) - .addContext(errorContext).build(logger); - } - if (dafParser.isValidationError()) { - logger.warn(dafParser.getDiagnosticsAsString()); - // Note that even if daffodil is set to not validate, validation errors may still occur - // from DFDL's "recoverableError" assertions. - } - } catch (Exception e) { - throw UserException.dataReadError(e).message("Error parsing file: " + e.getMessage()) + dafParser.parse(); + if (dafParser.isProcessingError()) { + assert (Objects.nonNull(dafParser.getDiagnostics())); + throw UserException.dataReadError().message(dafParser.getDiagnosticsAsString()) .addContext(errorContext).build(logger); } + if (dafParser.isValidationError()) { + logger.warn(dafParser.getDiagnosticsAsString()); + // Note that even if daffodil is set to not validate, validation errors may still occur + // from DFDL's "recoverableError" assertions. 
+ } rowSetLoader.save(); } int nRows = rowSetLoader.rowCount(); @@ -174,12 +170,12 @@ public boolean next() { public void close() { AutoCloseables.closeSilently(dataInputStream); } +} - static class DaffodilReaderConfig { - final DaffodilFormatPlugin plugin; +class DaffodilReaderConfig { + final DaffodilFormatPlugin plugin; - DaffodilReaderConfig(DaffodilFormatPlugin plugin) { - this.plugin = plugin; - } + DaffodilReaderConfig(DaffodilFormatPlugin plugin) { + this.plugin = plugin; } } diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatConfig.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatConfig.java index c4a02f20f84..1d197bbd8e7 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatConfig.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatConfig.java @@ -25,7 +25,6 @@ import com.google.common.collect.ImmutableList; import org.apache.drill.common.PlanStringBuilder; import org.apache.drill.common.logical.FormatPluginConfig; -import org.apache.drill.exec.store.daffodil.DaffodilBatchReader.DaffodilReaderConfig; import java.util.Collections; import java.util.List; @@ -84,8 +83,7 @@ public boolean getValidationMode() { } public DaffodilReaderConfig getReaderConfig(DaffodilFormatPlugin plugin) { - DaffodilReaderConfig readerConfig = new DaffodilReaderConfig(plugin); - return readerConfig; + return new DaffodilReaderConfig(plugin); } @Override diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatPlugin.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatPlugin.java index 7ee94d12310..b6a74467286 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatPlugin.java +++ 
b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilFormatPlugin.java @@ -56,11 +56,11 @@ protected void configureScan(FileScanLifecycleBuilder builder, EasySubScan scan) } public static class DaffodilReaderFactory extends FileReaderFactory { - private final DaffodilBatchReader.DaffodilReaderConfig readerConfig; + private final DaffodilReaderConfig readerConfig; private final EasySubScan scan; - public DaffodilReaderFactory(DaffodilBatchReader.DaffodilReaderConfig config, + public DaffodilReaderFactory(DaffodilReaderConfig config, EasySubScan scan) { this.readerConfig = config; this.scan = scan; diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java index a7267586948..dbd00929062 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java @@ -65,16 +65,10 @@ public class DaffodilDataProcessorFactory { * @return the DataProcessor * @throws CompileFailure * - if schema compilation fails - * @throws IOException - * - if the schemaFileURI cannot be opened or is not found. - * @throws URISyntaxException - * - if the schemaFileURI is not legal syntax. - * @throws InvalidParserException - * - if the reloading of the parser from pre-compiled binary fails. 
*/ public DataProcessor getDataProcessor(URI schemaFileURI, boolean validationMode, String rootName, String rootNS) - throws CompileFailure, IOException, URISyntaxException, InvalidParserException { + throws CompileFailure { DaffodilDataProcessorFactory dmp = new DaffodilDataProcessorFactory(); boolean isPrecompiled = schemaFileURI.toString().endsWith(".bin"); @@ -85,10 +79,19 @@ public DataProcessor getDataProcessor(URI schemaFileURI, boolean validationMode, logger.warn("Root element name '{}' is ignored when used with precompiled DFDL schema.", rootName); } - dmp.loadSchema(schemaFileURI); + try { + dmp.loadSchema(schemaFileURI); + } catch (IOException | InvalidParserException e) { + throw new CompileFailure(e); + } dmp.setupDP(validationMode, null); } else { - List pfDiags = dmp.compileSchema(schemaFileURI, rootName, rootNS); + List pfDiags; + try { + pfDiags = dmp.compileSchema(schemaFileURI, rootName, rootNS); + } catch (URISyntaxException | IOException e) { + throw new CompileFailure(e); + } dmp.setupDP(validationMode, pfDiags); } return dmp.dp; @@ -99,7 +102,6 @@ private void loadSchema(URI schemaFileURI) throws IOException, InvalidParserExce dp = c.reload(Channels.newChannel(schemaFileURI.toURL().openStream())); } - @SuppressWarnings("ReassignedVariable") private List compileSchema(URI schemaFileURI, String rootName, String rootNS) throws URISyntaxException, IOException, CompileFailure { Compiler c = Daffodil.compiler(); @@ -143,7 +145,8 @@ private void setupDP(boolean validationMode, List pfDiags) throws Co /** * Thrown if schema compilation fails. *

- * Contains diagnostic objects which give the cause(s) of the failure. + * Contains diagnostic objects which give the cause(s) of the failure, or + * contains a cause Throwable providing the reason. */ public static class CompileFailure extends Exception { List diags; @@ -152,5 +155,8 @@ public static class CompileFailure extends Exception { super("DFDL Schema Compile Failure"); diags = diagnostics; } + CompileFailure(Throwable cause) { + super(cause); + } } } diff --git a/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java b/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java index 2b1e9820516..d5aa3a23058 100644 --- a/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java +++ b/contrib/format-daffodil/src/test/java/org/apache/drill/exec/store/daffodil/TestDaffodilReader.java @@ -75,7 +75,6 @@ public void testSimpleQuery1() throws Exception { QueryBuilder qb = client.queryBuilder(); QueryBuilder query = qb.sql(selectRow("simple", "data01Int.dat.gz")); RowSet results = query.rowSet(); - results.print(); assertEquals(1, results.rowCount()); // create the expected metadata and data for this test @@ -94,7 +93,6 @@ public void testSimpleQuery2() throws Exception { QueryBuilder qb = client.queryBuilder(); QueryBuilder query = qb.sql(selectRow("simple", "data06Int.dat")); RowSet results = query.rowSet(); - results.print(); assertEquals(6, results.rowCount()); // create the expected metadata and data for this test @@ -113,7 +111,6 @@ public void testComplexQuery1() throws Exception { QueryBuilder qb = client.queryBuilder(); QueryBuilder query = qb.sql(selectRow("complex1", "data02Int.dat")); RowSet results = query.rowSet(); - results.print(); assertEquals(1, results.rowCount()); RowSetReader rdr = results.reader(); @@ -130,7 +127,6 @@ public void testComplexQuery2() throws Exception { QueryBuilder qb = client.queryBuilder(); 
QueryBuilder query = qb.sql(selectRow("complex1", "data06Int.dat")); RowSet results = query.rowSet(); - results.print(); assertEquals(3, results.rowCount()); RowSetReader rdr = results.reader(); @@ -150,16 +146,13 @@ public void testComplexQuery2() throws Exception { /** * Tests data which is rows of two ints and an array containing a map containing two ints. Each * row can be visualized like this: "{257, 258, [{259, 260},...]}" - * - * @throws Exception */ @Test - public void testComplexArrayQuery1() throws Exception { + public void testComplexArrayQuery1() throws Exception { QueryBuilder qb = client.queryBuilder(); QueryBuilder query = qb.sql(selectRow("complexArray1", "data12Int.dat")); RowSet results = query.rowSet(); - results.print(); assertEquals(1, results.rowCount()); RowSetReader rdr = results.reader(); @@ -172,8 +165,6 @@ public void testComplexArrayQuery1() throws Exception { /** * Tests data which is an array of ints in one column of the row set - * - * @throws Exception */ @Test public void testSimpleArrayQuery1() throws Exception { @@ -181,7 +172,6 @@ public void testSimpleArrayQuery1() throws Exception { QueryBuilder qb = client.queryBuilder(); QueryBuilder query = qb.sql(selectRow("simpleArrayField1", "data12Int.dat")); RowSet results = query.rowSet(); - results.print(); assertEquals(1, results.rowCount()); RowSetReader rdr = results.reader(); @@ -196,8 +186,6 @@ public void testSimpleArrayQuery1() throws Exception { * Tests data which is rows of two ints and an array containing a map containing an int and a * vector of ints. 
Each row can be visualized like this: "{257, 258, [{259, [260, 261, * 262]},...]}" - * - * @throws Exception */ @Test public void testComplexArrayQuery2() throws Exception { @@ -205,7 +193,6 @@ public void testComplexArrayQuery2() throws Exception { QueryBuilder qb = client.queryBuilder(); QueryBuilder query = qb.sql(selectRow("complexArray2", "data12Int.dat")); RowSet results = query.rowSet(); - results.print(); assertEquals(1, results.rowCount()); RowSetReader rdr = results.reader(); @@ -222,7 +209,6 @@ public void testMoreTypes1() throws Exception { QueryBuilder qb = client.queryBuilder(); QueryBuilder query = qb.sql(selectRow("moreTypes1", "moreTypes1.txt.dat")); RowSet results = query.rowSet(); - results.print(); assertEquals(2, results.rowCount()); RowSetReader rdr = results.reader(); @@ -248,7 +234,6 @@ public void testMoreTypes2() throws Exception { QueryBuilder qb = client.queryBuilder(); QueryBuilder query = qb.sql(selectRow("moreTypes2", "moreTypes2.txt.dat")); RowSet results = query.rowSet(); - results.print(); assertEquals(1, results.rowCount()); RowSetReader rdr = results.reader(); From ab567d97252e980687466b9c510ac1d8e0cb084e Mon Sep 17 00:00:00 2001 From: Michael Beckerle Date: Sun, 21 Jan 2024 14:53:45 -0500 Subject: [PATCH 5/8] Factored common MapBuilderLike out of SchemaBuilder and MapBuilder This significantly simplifies the metadata walking to convert Daffodil metadata to drill metadata. 
--- contrib/format-daffodil/README.md | 10 +- .../DaffodilDrillInfosetOutputter.java | 9 +- .../schema/DaffodilDataProcessorFactory.java | 3 + .../schema/DrillDaffodilSchemaUtils.java | 3 +- .../schema/DrillDaffodilSchemaVisitor.java | 153 +++--------------- .../exec/record/metadata/MapBuilder.java | 29 +++- .../exec/record/metadata/MapBuilderLike.java | 41 +++++ .../exec/record/metadata/SchemaBuilder.java | 14 +- 8 files changed, 125 insertions(+), 137 deletions(-) create mode 100644 exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilderLike.java diff --git a/contrib/format-daffodil/README.md b/contrib/format-daffodil/README.md index d036c63b2e8..784e7a7dcf5 100644 --- a/contrib/format-daffodil/README.md +++ b/contrib/format-daffodil/README.md @@ -1,15 +1,23 @@ # Daffodil 'Format' Reader This plugin enables Drill to read DFDL-described data from files by way of the Apache Daffodil DFDL implementation. +## Validation + +Data read by Daffodil is always validated using Daffodil's Limited Validation mode. + +TBD: do we need an option to control escalating validation errors to fatal? Currently this is not provided. + ## Limitations: TBD -At the moment, the DFDL schema is found on the local file system, which won't continue to work. +At the moment, the DFDL schema is found on the local file system, which won't support Drill's distributed architecture. There are restrictions on the DFDL schemas that this can handle. In particular, all element children must have distinct element names, including across choice branches. (This rules out a number of large DFDL schemas.) +TBD: Auto renaming as part of the Daffodil-to-Drill metadata mapping? + The data is parsed fully from its native form into a Drill data structure held in memory. No attempt is made to avoid access to parts of the DFDL-described data that are not needed to answer the query. 
diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java index 803f269a166..a8a72b8f0af 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java @@ -136,7 +136,7 @@ public void startSimple(InfosetSimpleElement ise) { cw = currentArrayWriter().scalar(); } else { // A simple element within a map - // Note the map itself might be an array + // Note the map itself might be an array, // but we don't care about that here. cw = currentTupleWriter().scalar(colName); } @@ -167,13 +167,12 @@ public void startComplex(InfosetComplexElement ce) { // Rather, it's children are the columns of the row set. // // If we do nothing at all here, then we'll start getting - // even calls for the children. + // event calls for the children. isRootElement = false; return; } if (md.isArray()) { assert (!arrayWriterStack.isEmpty()); - // FIXME: is this the way to add a complex array child item (i.e., each array item is a map) tupleWriterStack.push(currentArrayWriter().tuple()); } else { tupleWriterStack.push(currentTupleWriter().tuple(colName)); @@ -239,7 +238,7 @@ private void convertDaffodilValueToDrillValue(InfosetSimpleElement ise, ColumnMe TypeProtos.MinorType drillType = DrillDaffodilSchemaUtils.getDrillDataType(dafType); assert (drillType == cm.type()); switch (drillType) { - case BIGINT: { // not a bignum, BIGINT is a signed 8-byte long in Drill. + case BIGINT: { // BIGINT type is not a Java BigInteger, BIGINT is a signed 8-byte long in Drill. 
switch (dafTypeName) { case "unsignedInt": { cw.setLong(ise.getUnsignedInt()); @@ -315,7 +314,7 @@ private void convertDaffodilValueToDrillValue(InfosetSimpleElement ise, ColumnMe // then placed in a FLOAT8 column displays as // 3.4028234663852886E38 not 3.4028235E38. // But converting to string first, then to double works properly. - cw.setDouble(Double.valueOf(ise.getFloat().toString())); + cw.setDouble(Double.parseDouble(ise.getFloat().toString())); break; } default: diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java index dbd00929062..e338f266b7e 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DaffodilDataProcessorFactory.java @@ -122,6 +122,9 @@ private void setupDP(boolean validationMode, List pfDiags) throws Co Objects.requireNonNull(dp); // true because failure to produce a dp throws CompileFailure. if (validationMode) { try { + // We don't have the DFDL schema text, and we're not creating XML as an intermediate form, + // we're taking data direct from Daffodil into Drill rows, so using any Xerces-based + // XML Validator is not possible. 
dp = dp.withValidationMode(ValidationMode.Limited); } catch (InvalidUsageException e) { // impossible diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java index a901cb15cc7..f5b5094dffc 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java @@ -81,7 +81,8 @@ public static TupleMetadata processSchema(URI dfdlSchemaURI, String rootName, St throws IOException, DaffodilDataProcessorFactory.CompileFailure, URISyntaxException, InvalidParserException { DaffodilDataProcessorFactory dpf = new DaffodilDataProcessorFactory(); - DataProcessor dp = dpf.getDataProcessor(dfdlSchemaURI, true, rootName, namespace); + boolean validationMode = true; // use Daffodil's limited validation always + DataProcessor dp = dpf.getDataProcessor(dfdlSchemaURI, validationMode, rootName, namespace); return daffodilDataProcessorToDrillSchema(dp); } diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java index 712ca639a78..1f0107de5f2 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java @@ -21,12 +21,11 @@ import org.apache.daffodil.runtime1.api.ChoiceMetadata; import org.apache.daffodil.runtime1.api.ComplexElementMetadata; import org.apache.daffodil.runtime1.api.ElementMetadata; -import org.apache.daffodil.runtime1.api.InfosetSimpleElement; import 
org.apache.daffodil.runtime1.api.MetadataHandler; import org.apache.daffodil.runtime1.api.SequenceMetadata; import org.apache.daffodil.runtime1.api.SimpleElementMetadata; import org.apache.drill.common.types.TypeProtos.MinorType; -import org.apache.drill.exec.record.metadata.MapBuilder; +import org.apache.drill.exec.record.metadata.MapBuilderLike; import org.apache.drill.exec.record.metadata.SchemaBuilder; import org.apache.drill.exec.record.metadata.TupleMetadata; import org.slf4j.Logger; @@ -38,26 +37,28 @@ * This class transforms a DFDL/Daffodil schema into a Drill Schema. */ public class DrillDaffodilSchemaVisitor extends MetadataHandler { + private static final Logger logger = LoggerFactory.getLogger(DrillDaffodilSchemaVisitor.class); + /** - * Unfortunately, SchemaBuilder and MapBuilder, while similar, do not share a base class so we - * have a stack of MapBuilders, and when empty we use the SchemaBuilder + * SchemaBuilder and MapBuilder share a polymorphic interface MapBuilderLike */ private final SchemaBuilder builder = new SchemaBuilder(); - private final Stack mapBuilderStack = new Stack<>(); + private final Stack mapBuilderStack = new Stack<>(); + + private MapBuilderLike mapBuilder() { + return mapBuilderStack.peek(); + } + /** - * Changes to false after the very first invocation for the root element. + * Converts Daffodil names into appropriate Drill column names. + * @param md Daffodil element metadata, which contains an element name. + * @return a string usable as a Drill column name */ - private boolean isOriginalRoot = true; - public static String makeColumnName(ElementMetadata md) { return md.toQName().replace(":", "_"); } - private MapBuilder mapBuilder() { - return mapBuilderStack.peek(); - } - /** * Returns a {@link TupleMetadata} representation of the DFDL schema. Should only be called after * the walk of the DFDL schema with this visitor has been called. 
@@ -70,39 +71,9 @@ public TupleMetadata getDrillSchema() { @Override public void simpleElementMetadata(SimpleElementMetadata md) { - assert (!isOriginalRoot); - if (mapBuilderStack.isEmpty()) { - simpleElementAsRowSetColumnMetadata(md); - } else { - simpleElementWithinComplexElementMetadata(md); - } - } - - private void simpleElementAsRowSetColumnMetadata(SimpleElementMetadata md) { - assert (!isOriginalRoot); - assert (mapBuilderStack.isEmpty()); - String colName = makeColumnName(md); - MinorType drillType = DrillDaffodilSchemaUtils.getDrillDataType(md.primitiveType()); - // - // below code adds to the schema builder directly, not a map builder - // - if (md.isArray()) { - builder.addArray(colName, drillType); - } else if (md.isOptional() || md.isNillable()) { - builder.addNullable(colName, drillType); - } else { - builder.add(colName, drillType); - } - } - - private void simpleElementWithinComplexElementMetadata(SimpleElementMetadata md) { - assert (!isOriginalRoot); assert (!mapBuilderStack.isEmpty()); String colName = makeColumnName(md); MinorType drillType = DrillDaffodilSchemaUtils.getDrillDataType(md.primitiveType()); - // - // The code below adds to a map builder from the stack - // if (md.isArray()) { mapBuilder().addArray(colName, drillType); } else if (md.isOptional() || md.isNillable()) { @@ -114,97 +85,25 @@ private void simpleElementWithinComplexElementMetadata(SimpleElementMetadata md) @Override public void startComplexElementMetadata(ComplexElementMetadata md) { - if (isOriginalRoot) { - startComplexOriginalRootMetadata(md); - } else if (mapBuilderStack.isEmpty()) { - startComplexElementAsRowSetColumnMetadata(md); - } else { - startComplexChildElementMetadata(md); - } - } - - /** - * The original root given to Drill needs to be a schema element corresponding to one row of - * data. - *

- * Drill will call daffodil parse() to parse one such element. The children elements of this - * element will become the column contents of the row. - *

- * So the metadata for this row, to drill, is ONLY the columns of this top level element type. - * - * @param md - */ - private void startComplexOriginalRootMetadata(ComplexElementMetadata md) { - assert (isOriginalRoot); - assert (mapBuilderStack.isEmpty()); - isOriginalRoot = false; - if (md instanceof InfosetSimpleElement) { - DFDLSchemaError("Row as a simple type element is not supported."); - } - if (md.isArray()) { - DFDLSchemaError("Row element must not be an array."); - } - // - // We do nothing else here. Essentially the name of this top level element - // is not relevant to drill metadata. Think of it as a table name, or a name for the - // entire final set of rows that are the query result. - } - - /** - * This complex type element becomes a column of the row set which is itself a map. - * - * @param md - */ - private void startComplexElementAsRowSetColumnMetadata(ComplexElementMetadata md) { - assert (!isOriginalRoot); - assert (mapBuilderStack.isEmpty()); - String colName = makeColumnName(md); - // - // This directly adds to the builder, as this complex element itself is a column - // of the top level row - // - // Then it creates a map builder for recursively adding the contents. - // - if (md.isArray()) { - mapBuilderStack.push(builder.addMapArray(colName)); - } else { - mapBuilderStack.push(builder.addMap(colName)); // also handles optional complex elements - } - } - - private void startComplexChildElementMetadata(ComplexElementMetadata md) { - assert (!mapBuilderStack.isEmpty()); - assert (!isOriginalRoot); - String colName = makeColumnName(md); - // - // This is for non-top-level columns, but adding a field within a map. - // That map represents a complex type element that does NOT correspond to - // the top-level rows. - // - if (md.isArray()) { - // there is no notion of optional/nullable in drill for arrays. - // Note that our arrays are always maps. We don't have pure-value - // simple-type arrays. 
- mapBuilderStack.push(mapBuilder().addMapArray(colName)); + if (mapBuilderStack.isEmpty()) { + // root element case. The SchemaBuilder top level row is the container of the root element's children + mapBuilderStack.push(builder); } else { - // there is no concept of optional/nullable in drill for maps. - mapBuilderStack.push(mapBuilder().addMap(colName)); + // enclosed complex element case. Create a map field. + String colName = makeColumnName(md); + if (md.isArray()) { + mapBuilderStack.push(mapBuilder().addMapArray(colName)); + } else { + mapBuilderStack.push(mapBuilder().addMap(colName)); // also handles optional complex elements + } } } @Override public void endComplexElementMetadata(ComplexElementMetadata md) { - // the mapBuilderStack can be empty if we had the most basic - // kind of schema with a repeating row-set containing only - // simple type children. - if (!mapBuilderStack.isEmpty()) { - if (mapBuilderStack.size() == 1) { - mapBuilder().resumeSchema(); - } else { - mapBuilder().resumeMap(); - } - mapBuilderStack.pop(); // only pop if there was something on the stack. - } + assert (!mapBuilderStack.isEmpty()); + mapBuilder().resume(); + mapBuilderStack.pop(); } @Override diff --git a/exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java b/exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java index f333cb9cd2b..f49fd3bdf20 100644 --- a/exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java +++ b/exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilder.java @@ -22,6 +22,8 @@ import org.apache.drill.common.types.TypeProtos.MinorType; import org.apache.drill.exec.record.MaterializedField; +import java.util.Objects; + /** * Internal structure for building a map. A map is just a schema, * but one that is part of a parent column. @@ -33,7 +35,7 @@ * All resumeXXX methods do not produce any action and return null. 
* To access built column {@link #buildColumn()} should be used. */ -public class MapBuilder implements SchemaContainer { +public class MapBuilder implements MapBuilderLike, SchemaContainer { private final SchemaContainer parent; private final TupleBuilder tupleBuilder = new TupleBuilder(); private final String memberName; @@ -68,6 +70,7 @@ public MapBuilder add(String name, MinorType type, DataMode mode) { return this; } + @Override public MapBuilder add(String name, MinorType type) { tupleBuilder.add(name, type); return this; @@ -82,6 +85,7 @@ public MapBuilder add(String name, MinorType type, int precision, int scale) { return addDecimal(name, type, DataMode.REQUIRED, precision, scale); } + @Override public MapBuilder addNullable(String name, MinorType type) { tupleBuilder.addNullable(name, type); return this; @@ -96,6 +100,7 @@ public MapBuilder addNullable(String name, MinorType type, int precision, int sc return addDecimal(name, type, DataMode.OPTIONAL, precision, scale); } + @Override public MapBuilder addArray(String name, MinorType type) { tupleBuilder.addArray(name, type); return this; @@ -129,10 +134,12 @@ public MapBuilder addDecimal(String name, MinorType type, * @param name the name of the map column * @return a builder for the map */ + @Override public MapBuilder addMap(String name) { return tupleBuilder.addMap(this, name); } + @Override public MapBuilder addMapArray(String name) { return tupleBuilder.addMapArray(this, name); } @@ -185,6 +192,26 @@ public MapBuilder resumeMap() { return (MapBuilder) parent; } + /** + * Depending on whether the parent is a schema builder or map builder + * we resume appropriately. + */ + @Override + public void resume() { + if (Objects.isNull(parent)) + throw new IllegalStateException("Call to resume() on MapBuilder with no parent."); + if (parent instanceof MapBuilder) + resumeMap(); + else { + assert(parent instanceof SchemaBuilder); + // + // This would be extended for other kinds of possible containers of a Map. 
+ // First version only needed SchemaBuilder parents + // + resumeSchema(); + } + } + public RepeatedListBuilder resumeList() { build(); return (RepeatedListBuilder) parent; diff --git a/exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilderLike.java b/exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilderLike.java new file mode 100644 index 00000000000..086c6a5b5e7 --- /dev/null +++ b/exec/vector/src/main/java/org/apache/drill/exec/record/metadata/MapBuilderLike.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.drill.exec.record.metadata; + +import org.apache.drill.common.types.TypeProtos; + +/** + * A common interface shared by SchemaBuilder and MapBuilder allowing one to do most + * operations for constructing metadata for hierarchical data + * without having to keep track of whether you are dealing with the top-level schema/row + * level or a map within it. 
+ */ +public interface MapBuilderLike { + + MapBuilderLike addArray(String colName, TypeProtos.MinorType drillType); + + MapBuilderLike addNullable(String colName, TypeProtos.MinorType drillType); + + MapBuilderLike add(String colName, TypeProtos.MinorType drillType); + + MapBuilderLike addMapArray(String colName); + + MapBuilderLike addMap(String colName); + + void resume(); +} diff --git a/exec/vector/src/main/java/org/apache/drill/exec/record/metadata/SchemaBuilder.java b/exec/vector/src/main/java/org/apache/drill/exec/record/metadata/SchemaBuilder.java index c1b97609357..f2aee1d2190 100644 --- a/exec/vector/src/main/java/org/apache/drill/exec/record/metadata/SchemaBuilder.java +++ b/exec/vector/src/main/java/org/apache/drill/exec/record/metadata/SchemaBuilder.java @@ -61,7 +61,7 @@ * */ -public class SchemaBuilder implements SchemaContainer { +public class SchemaBuilder implements MapBuilderLike, SchemaContainer { /** * Actual tuple schema builder. The odd layered structure is needed @@ -114,6 +114,7 @@ public SchemaBuilder add(String name, MinorType type, DataMode mode) { return this; } + @Override public SchemaBuilder add(String name, MinorType type) { tupleBuilder.add(name, type); return this; @@ -128,6 +129,7 @@ public SchemaBuilder add(String name, MinorType type, int precision, int scale) return addDecimal(name, type, DataMode.REQUIRED, precision, scale); } + @Override public SchemaBuilder addNullable(String name, MinorType type) { tupleBuilder.addNullable(name, type); return this; @@ -142,6 +144,7 @@ public SchemaBuilder addNullable(String name, MinorType type, int precision, int return addDecimal(name, type, DataMode.OPTIONAL, precision, scale); } + @Override public SchemaBuilder addArray(String name, MinorType type) { tupleBuilder.addArray(name, type); return this; @@ -157,7 +160,7 @@ public SchemaBuilder addDecimal(String name, MinorType type, DataMode mode, int } /** - * Add a multi-dimensional array, implemented as a repeated vector + * Add a 
multidimensional array, implemented as a repeated vector * along with 0 or more repeated list vectors. * * @param name column name @@ -191,10 +194,12 @@ public SchemaBuilder addAll(TupleMetadata from) { * @param name the name of the map column * @return a builder for the map */ + @Override public MapBuilder addMap(String name) { return tupleBuilder.addMap(this, name); } + @Override public MapBuilder addMapArray(String name) { return tupleBuilder.addMapArray(this, name); } @@ -237,4 +242,9 @@ public TupleMetadata buildSchema() { public TupleMetadata build() { return tupleBuilder.schema(); } + + @Override + public void resume() { + // do nothing. There is nothing else to resume. + } } From ad25972bfec7b5be7a9de02cd436b5ad7d211d18 Mon Sep 17 00:00:00 2001 From: Michael Beckerle Date: Mon, 12 Feb 2024 21:13:07 -0500 Subject: [PATCH 6/8] Update to latest flavor of Daffodil Metadata Uses JPrimType now, not strings. --- .../DaffodilDrillInfosetOutputter.java | 30 +++++------ .../schema/DrillDaffodilSchemaUtils.java | 50 +++++++++---------- .../schema/DrillDaffodilSchemaVisitor.java | 2 +- 3 files changed, 41 insertions(+), 41 deletions(-) diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java index a8a72b8f0af..a0a0a0292fe 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java @@ -25,7 +25,7 @@ import org.apache.daffodil.runtime1.api.InfosetArray; import org.apache.daffodil.runtime1.api.InfosetComplexElement; import org.apache.daffodil.runtime1.api.InfosetSimpleElement; -import org.apache.daffodil.runtime1.api.PrimitiveType; +import org.apache.daffodil.runtime1.api.JPrimType; import 
org.apache.drill.common.types.TypeProtos; import org.apache.drill.exec.physical.resultSet.RowSetLoader; import org.apache.drill.exec.record.metadata.ColumnMetadata; @@ -233,18 +233,18 @@ private void invariantFailed(String dafTypeName, ColumnMetadata cm) { private void convertDaffodilValueToDrillValue(InfosetSimpleElement ise, ColumnMetadata cm, ScalarWriter cw) { - PrimitiveType dafType = ise.metadata().primitiveType(); + JPrimType dafType = ise.metadata().jPrimType(); String dafTypeName = dafType.name(); TypeProtos.MinorType drillType = DrillDaffodilSchemaUtils.getDrillDataType(dafType); assert (drillType == cm.type()); switch (drillType) { case BIGINT: { // BIGINT type is not a Java BigInteger, BIGINT is a signed 8-byte long in Drill. - switch (dafTypeName) { - case "unsignedInt": { + switch (dafType) { + case UnsignedInt: { cw.setLong(ise.getUnsignedInt()); break; } - case "long": { + case Long: { cw.setLong(ise.getLong()); break; } @@ -280,16 +280,16 @@ private void convertDaffodilValueToDrillValue(InfosetSimpleElement ise, ColumnMe break; } case VARDECIMAL: { - switch (dafTypeName) { - case "unsignedLong": { + switch (dafType) { + case UnsignedLong: { cw.setDecimal(new BigDecimal(ise.getUnsignedLong())); break; } - case "integer": { + case Integer: { cw.setDecimal(new BigDecimal(ise.getInteger())); break; } - case "nonNegativeInteger": { + case NonNegativeInteger: { cw.setDecimal(new BigDecimal(ise.getNonNegativeInteger())); break; } @@ -303,12 +303,12 @@ private void convertDaffodilValueToDrillValue(InfosetSimpleElement ise, ColumnMe break; } case FLOAT8: { - switch (dafTypeName) { - case "double": { + switch (dafType) { + case Double: { cw.setDouble(ise.getDouble()); break; } - case "float": { + case Float: { // converting a float to a double by doubleValue() fails here // Float.MaxValue converted to a double via doubleValue() // then placed in a FLOAT8 column displays as @@ -334,13 +334,13 @@ private void 
convertDaffodilValueToDrillValue(InfosetSimpleElement ise, ColumnMe break; } case VARCHAR: { - switch (dafTypeName) { - case "decimal": { + switch (dafType) { + case Decimal: { BigDecimal decimal = ise.getDecimal(); cw.setString(decimal.toString()); break; } - case "string": { + case String: { String s = ise.getString(); cw.setString(s); break; diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java index f5b5094dffc..b4072668e65 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java @@ -20,7 +20,7 @@ import org.apache.daffodil.japi.InvalidParserException; import org.apache.daffodil.japi.DataProcessor; -import org.apache.daffodil.runtime1.api.PrimitiveType; +import org.apache.daffodil.runtime1.api.JPrimType; import org.apache.drill.common.types.TypeProtos.MinorType; import org.apache.drill.exec.record.metadata.TupleMetadata; import com.google.common.annotations.VisibleForTesting; @@ -41,27 +41,27 @@ public class DrillDaffodilSchemaUtils { /** * This map maps the data types defined by the DFDL definition to Drill data types. 
*/ - public static final ImmutableMap DFDL_TYPE_MAPPINGS = - ImmutableMap.builder() - .put("LONG", MinorType.BIGINT) - .put("INT", MinorType.INT) - .put("SHORT", MinorType.SMALLINT) - .put("BYTE", MinorType.TINYINT) + public static final ImmutableMap DFDL_TYPE_MAPPINGS = + ImmutableMap.builder() + .put(JPrimType.Long, MinorType.BIGINT) + .put(JPrimType.Int, MinorType.INT) + .put(JPrimType.Short, MinorType.SMALLINT) + .put(JPrimType.Byte, MinorType.TINYINT) // daffodil unsigned longs are modeled as DECIMAL(38, 0) which is the default for VARDECIMAL - .put("UNSIGNEDLONG", MinorType.VARDECIMAL) - .put("UNSIGNEDINT", MinorType.BIGINT) - .put("UNSIGNEDSHORT", MinorType.UINT2) - .put("UNSIGNEDBYTE", MinorType.UINT1) + .put(JPrimType.UnsignedLong, MinorType.VARDECIMAL) + .put(JPrimType.UnsignedInt, MinorType.BIGINT) + .put(JPrimType.UnsignedShort, MinorType.UINT2) + .put(JPrimType.UnsignedByte, MinorType.UINT1) // daffodil integer, nonNegativeInteger, are modeled as DECIMAL(38, 0) which is the default for VARDECIMAL - .put("INTEGER", MinorType.VARDECIMAL) - .put("NONNEGATIVEINTEGER", MinorType.VARDECIMAL) + .put(JPrimType.Integer, MinorType.VARDECIMAL) + .put(JPrimType.NonNegativeInteger, MinorType.VARDECIMAL) // decimal has to be modeled as string since we really have no idea what to set the // scale to. - .put("DECIMAL", MinorType.VARCHAR) - .put("BOOLEAN", MinorType.BIT) - .put("DATE", MinorType.DATE) // requires conversion - .put("DATETIME", MinorType.TIMESTAMP) // requires conversion - .put("DOUBLE", MinorType.FLOAT8) + .put(JPrimType.Decimal, MinorType.VARCHAR) + .put(JPrimType.Boolean, MinorType.BIT) + .put(JPrimType.Date, MinorType.DATE) // requires conversion + .put(JPrimType.DateTime, MinorType.TIMESTAMP) // requires conversion + .put(JPrimType.Double, MinorType.FLOAT8) // // daffodil float type is mapped to double aka Float8 in drill because there // seems to be bugs in FLOAT4. 
Float.MaxValue in a Float4 column displays as @@ -69,10 +69,10 @@ public class DrillDaffodilSchemaUtils { // // We don't really care about single float precision, so we just use double precision. // - .put("FLOAT", MinorType.FLOAT8) - .put("HEXBINARY", MinorType.VARBINARY) - .put("STRING", MinorType.VARCHAR) - .put("TIME", MinorType.TIME) // requires conversion + .put(JPrimType.Float, MinorType.FLOAT8) + .put(JPrimType.HexBinary, MinorType.VARBINARY) + .put(JPrimType.String, MinorType.VARCHAR) + .put(JPrimType.Time, MinorType.TIME) // requires conversion .build(); @@ -95,12 +95,12 @@ public static TupleMetadata daffodilDataProcessorToDrillSchema(DataProcessor dp) /** * Returns a {@link MinorType} of the corresponding DFDL Data Type. Defaults to VARCHAR if unknown - * @param dfdlType A String of the DFDL Data Type (local name only, i.e., no "xs:" prefix. + * @param dfdlType The type as provided by Daffodil. * @return A {@link MinorType} of the Drill data type. */ - public static MinorType getDrillDataType(PrimitiveType dfdlType) { + public static MinorType getDrillDataType(JPrimType dfdlType) { try { - MinorType type = DrillDaffodilSchemaUtils.DFDL_TYPE_MAPPINGS.get(dfdlType.name().toUpperCase()); + MinorType type = DrillDaffodilSchemaUtils.DFDL_TYPE_MAPPINGS.get(dfdlType); if (type == null) { return DEFAULT_TYPE; } else { diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java index 1f0107de5f2..93e0791845f 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java @@ -73,7 +73,7 @@ public TupleMetadata getDrillSchema() { public void simpleElementMetadata(SimpleElementMetadata md) { assert 
(!mapBuilderStack.isEmpty()); String colName = makeColumnName(md); - MinorType drillType = DrillDaffodilSchemaUtils.getDrillDataType(md.primitiveType()); + MinorType drillType = DrillDaffodilSchemaUtils.getDrillDataType(md.jPrimType()); if (md.isArray()) { mapBuilder().addArray(colName, drillType); } else if (md.isOptional() || md.isNillable()) { From 756791183af88168cbe79413bb67e5cb656f8fe5 Mon Sep 17 00:00:00 2001 From: Michael Beckerle Date: Wed, 14 Feb 2024 10:16:47 -0500 Subject: [PATCH 7/8] Uses DFDLPrimType now, not JPrimType --- .../DaffodilDrillInfosetOutputter.java | 4 +- .../schema/DrillDaffodilSchemaUtils.java | 46 +++++++++---------- .../schema/DrillDaffodilSchemaVisitor.java | 2 +- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java index a0a0a0292fe..189d2e0c18b 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/DaffodilDrillInfosetOutputter.java @@ -25,7 +25,7 @@ import org.apache.daffodil.runtime1.api.InfosetArray; import org.apache.daffodil.runtime1.api.InfosetComplexElement; import org.apache.daffodil.runtime1.api.InfosetSimpleElement; -import org.apache.daffodil.runtime1.api.JPrimType; +import org.apache.daffodil.runtime1.api.DFDLPrimType; import org.apache.drill.common.types.TypeProtos; import org.apache.drill.exec.physical.resultSet.RowSetLoader; import org.apache.drill.exec.record.metadata.ColumnMetadata; @@ -233,7 +233,7 @@ private void invariantFailed(String dafTypeName, ColumnMetadata cm) { private void convertDaffodilValueToDrillValue(InfosetSimpleElement ise, ColumnMetadata cm, ScalarWriter cw) { - JPrimType dafType = ise.metadata().jPrimType(); + 
DFDLPrimType dafType = ise.metadata().dfdlType(); String dafTypeName = dafType.name(); TypeProtos.MinorType drillType = DrillDaffodilSchemaUtils.getDrillDataType(dafType); assert (drillType == cm.type()); diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java index b4072668e65..5481f1da7ad 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaUtils.java @@ -20,7 +20,7 @@ import org.apache.daffodil.japi.InvalidParserException; import org.apache.daffodil.japi.DataProcessor; -import org.apache.daffodil.runtime1.api.JPrimType; +import org.apache.daffodil.runtime1.api.DFDLPrimType; import org.apache.drill.common.types.TypeProtos.MinorType; import org.apache.drill.exec.record.metadata.TupleMetadata; import com.google.common.annotations.VisibleForTesting; @@ -41,27 +41,27 @@ public class DrillDaffodilSchemaUtils { /** * This map maps the data types defined by the DFDL definition to Drill data types. 
*/ - public static final ImmutableMap DFDL_TYPE_MAPPINGS = - ImmutableMap.builder() - .put(JPrimType.Long, MinorType.BIGINT) - .put(JPrimType.Int, MinorType.INT) - .put(JPrimType.Short, MinorType.SMALLINT) - .put(JPrimType.Byte, MinorType.TINYINT) + public static final ImmutableMap DFDL_TYPE_MAPPINGS = + ImmutableMap.builder() + .put(DFDLPrimType.Long, MinorType.BIGINT) + .put(DFDLPrimType.Int, MinorType.INT) + .put(DFDLPrimType.Short, MinorType.SMALLINT) + .put(DFDLPrimType.Byte, MinorType.TINYINT) // daffodil unsigned longs are modeled as DECIMAL(38, 0) which is the default for VARDECIMAL - .put(JPrimType.UnsignedLong, MinorType.VARDECIMAL) - .put(JPrimType.UnsignedInt, MinorType.BIGINT) - .put(JPrimType.UnsignedShort, MinorType.UINT2) - .put(JPrimType.UnsignedByte, MinorType.UINT1) + .put(DFDLPrimType.UnsignedLong, MinorType.VARDECIMAL) + .put(DFDLPrimType.UnsignedInt, MinorType.BIGINT) + .put(DFDLPrimType.UnsignedShort, MinorType.UINT2) + .put(DFDLPrimType.UnsignedByte, MinorType.UINT1) // daffodil integer, nonNegativeInteger, are modeled as DECIMAL(38, 0) which is the default for VARDECIMAL - .put(JPrimType.Integer, MinorType.VARDECIMAL) - .put(JPrimType.NonNegativeInteger, MinorType.VARDECIMAL) + .put(DFDLPrimType.Integer, MinorType.VARDECIMAL) + .put(DFDLPrimType.NonNegativeInteger, MinorType.VARDECIMAL) // decimal has to be modeled as string since we really have no idea what to set the // scale to. 
- .put(JPrimType.Decimal, MinorType.VARCHAR) - .put(JPrimType.Boolean, MinorType.BIT) - .put(JPrimType.Date, MinorType.DATE) // requires conversion - .put(JPrimType.DateTime, MinorType.TIMESTAMP) // requires conversion - .put(JPrimType.Double, MinorType.FLOAT8) + .put(DFDLPrimType.Decimal, MinorType.VARCHAR) + .put(DFDLPrimType.Boolean, MinorType.BIT) + .put(DFDLPrimType.Date, MinorType.DATE) // requires conversion + .put(DFDLPrimType.DateTime, MinorType.TIMESTAMP) // requires conversion + .put(DFDLPrimType.Double, MinorType.FLOAT8) // // daffodil float type is mapped to double aka Float8 in drill because there // seems to be bugs in FLOAT4. Float.MaxValue in a Float4 column displays as @@ -69,10 +69,10 @@ public class DrillDaffodilSchemaUtils { // // We don't really care about single float precision, so we just use double precision. // - .put(JPrimType.Float, MinorType.FLOAT8) - .put(JPrimType.HexBinary, MinorType.VARBINARY) - .put(JPrimType.String, MinorType.VARCHAR) - .put(JPrimType.Time, MinorType.TIME) // requires conversion + .put(DFDLPrimType.Float, MinorType.FLOAT8) + .put(DFDLPrimType.HexBinary, MinorType.VARBINARY) + .put(DFDLPrimType.String, MinorType.VARCHAR) + .put(DFDLPrimType.Time, MinorType.TIME) // requires conversion .build(); @@ -98,7 +98,7 @@ public static TupleMetadata daffodilDataProcessorToDrillSchema(DataProcessor dp) * @param dfdlType The type as provided by Daffodil. * @return A {@link MinorType} of the Drill data type. 
*/ - public static MinorType getDrillDataType(JPrimType dfdlType) { + public static MinorType getDrillDataType(DFDLPrimType dfdlType) { try { MinorType type = DrillDaffodilSchemaUtils.DFDL_TYPE_MAPPINGS.get(dfdlType); if (type == null) { diff --git a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java index 93e0791845f..0bd8992eee7 100644 --- a/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java +++ b/contrib/format-daffodil/src/main/java/org/apache/drill/exec/store/daffodil/schema/DrillDaffodilSchemaVisitor.java @@ -73,7 +73,7 @@ public TupleMetadata getDrillSchema() { public void simpleElementMetadata(SimpleElementMetadata md) { assert (!mapBuilderStack.isEmpty()); String colName = makeColumnName(md); - MinorType drillType = DrillDaffodilSchemaUtils.getDrillDataType(md.jPrimType()); + MinorType drillType = DrillDaffodilSchemaUtils.getDrillDataType(md.dfdlType()); if (md.isArray()) { mapBuilder().addArray(colName, drillType); } else if (md.isOptional() || md.isNillable()) { From e15707a352e82b28234673b2daf78c2abf17875f Mon Sep 17 00:00:00 2001 From: Michael Beckerle Date: Sat, 27 Apr 2024 14:11:07 -0400 Subject: [PATCH 8/8] Change to Daffodil 3.7.0 official release. 
--- contrib/format-daffodil/pom.xml | 4 ++-- pom.xml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/format-daffodil/pom.xml b/contrib/format-daffodil/pom.xml index 01955c746dc..308f8efa8f0 100644 --- a/contrib/format-daffodil/pom.xml +++ b/contrib/format-daffodil/pom.xml @@ -40,12 +40,12 @@ org.apache.daffodil daffodil-japi_2.12 - 3.7.0-SNAPSHOT + 3.7.0 org.apache.daffodil daffodil-runtime1_2.12 - 3.7.0-SNAPSHOT + 3.7.0 diff --git a/pom.xml b/pom.xml index 558f9a37a7e..02aafad90b4 100644 --- a/pom.xml +++ b/pom.xml @@ -133,7 +133,7 @@ 4.3.3 2.1.12 - 3.7.0-SNAPSHOT + 3.7.0