From 8fd4018e7383212a795270e79b1c6c6707dd7b93 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Thu, 15 Jul 2021 13:09:58 -0400 Subject: [PATCH 01/41] Start of fixed width format plugin --- contrib/format-fixedwidth/pom.xml | 84 +++++++++++++ .../fixedwidth/FixedwidthBatchReader.java | 113 ++++++++++++++++++ .../fixedwidth/FixedwidthFieldConfig.java | 71 +++++++++++ .../fixedwidth/FixedwidthFormatConfig.java | 84 +++++++++++++ .../fixedwidth/FixedwidthFormatPlugin.java | 94 +++++++++++++++ .../src/main/resources/drill-module.conf | 23 ++++ .../test/java/TestFixedwidthRecordReader.java | 85 +++++++++++++ .../src/test/resources/fwf/test.fwf | 1 + contrib/pom.xml | 1 + 9 files changed, 556 insertions(+) create mode 100644 contrib/format-fixedwidth/pom.xml create mode 100644 contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java create mode 100644 contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java create mode 100644 contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java create mode 100644 contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java create mode 100644 contrib/format-fixedwidth/src/main/resources/drill-module.conf create mode 100644 contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java create mode 100644 contrib/format-fixedwidth/src/test/resources/fwf/test.fwf diff --git a/contrib/format-fixedwidth/pom.xml b/contrib/format-fixedwidth/pom.xml new file mode 100644 index 00000000000..d69c492b1dc --- /dev/null +++ b/contrib/format-fixedwidth/pom.xml @@ -0,0 +1,84 @@ + + + + 4.0.0 + + + drill-contrib-parent + org.apache.drill.contrib + 1.20.0-SNAPSHOT + + + drill-format-fixedwidth + Drill : Contrib : Format : Fixedwidth + + + + org.apache.drill.exec + drill-java-exec + ${project.version} + + + + + + org.apache.drill.exec + drill-java-exec + tests + ${project.version} + test + + + + org.apache.drill + drill-common + tests + ${project.version} + test + + + + + + maven-resources-plugin + + + copy-java-sources + process-sources + + copy-resources + + + ${basedir}/target/classes/org/apache/drill/exec/store/fixedwidth + + + + src/main/java/org/apache/drill/exec/store/fixedwidth + true + + + + + + + + + \ No newline at end of file diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java new file mode 100644 index 00000000000..a0c474ed5b2 --- /dev/null +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.drill.exec.store.fixedwidth; + +import org.apache.drill.common.exceptions.CustomErrorContext; +import org.apache.drill.common.exceptions.UserException; +import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator; +import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader; +import org.apache.drill.exec.record.metadata.SchemaBuilder; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.apache.hadoop.mapred.FileSplit; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.InputStream; + +public class FixedwidthBatchReader implements ManagedReader{ + + private static final Logger logger = LoggerFactory.getLogger(FixedwidthBatchReader.class); + + private FileSplit split; + + private final int maxRecords; + + private final FixedwidthFormatConfig config; + + private CustomErrorContext errorContext; + + private InputStream fsStream; + + public FixedwidthBatchReader(FixedwidthFormatConfig config, int maxRecords) { + this.config = config; + this.maxRecords = maxRecords; + } + + @Override + public boolean open(FileSchemaNegotiator negotiator) { + split = negotiator.split(); + errorContext = negotiator.parentErrorContext(); + try { + fsStream = negotiator.fileSystem().openPossiblyCompressedStream(split.getPath()); + negotiator.tableSchema(buildSchema(),true); + negotiator.build(); + } catch (Exception e) { + throw UserException + .dataReadError(e) + .message("Failed to open input file: {}", split.getPath().toString()) + .addContext(errorContext) + .addContext(e.getMessage()) + .build(logger); + } + return true; + } + + @Override + public boolean next() { + byte[] byteArray = new byte[10000]; + int bytesRead; + + try { + bytesRead = fsStream.read(byteArray); + System.out.println(new String(byteArray)); + } catch (Exception e) { + throw UserException + .dataReadError(e) + .message("Failed to read input file: {}", split.getPath().toString()) + .addContext(errorContext) + .addContext(e.getMessage()) + .build(logger); + } + return (bytesRead != -1); + } + + @Override + public void close() { + try { + fsStream.close(); + } catch (Exception e) { + throw UserException + .dataReadError(e) + .message("Failed to close input file: {}", split.getPath().toString()) + .addContext(errorContext) + .addContext(e.getMessage()) + .build(logger); + } + } + + private TupleMetadata buildSchema(){ + SchemaBuilder builder = new SchemaBuilder(); + + for (FixedwidthFieldConfig field : config.getFields()){ + builder.addNullable(field.getFieldName(),field.getDataType()); + } + + return builder.buildSchema(); + } +} \ No newline at end of file diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java new file mode 100644 index 00000000000..667ad3d098d --- /dev/null +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java @@ -0,0 +1,71 @@ +package org.apache.drill.exec.store.fixedwidth; + +import com.fasterxml.jackson.annotation.JsonProperty; +import org.apache.drill.common.types.TypeProtos; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonTypeName; + +@JsonTypeName("fixedwidthReaderFieldDescription") +@JsonInclude(JsonInclude.Include.NON_DEFAULT) +public class FixedwidthFieldConfig { + + private final TypeProtos.MinorType dataType; + private final String fieldName; + private final String dateTimeFormat; + private final int startIndex; + private final int fieldWidth; + + public FixedwidthFieldConfig(@JsonProperty("dataType") TypeProtos.MinorType dataType, + @JsonProperty("fieldName") String fieldName, + @JsonProperty("dateTimeFormat") String dateTimeFormat, + @JsonProperty("startIndex") int startIndex, + @JsonProperty("fieldWidth") int fieldWidth) { + this.dataType = dataType; + this.fieldName = fieldName; + this.dateTimeFormat = dateTimeFormat; + this.startIndex = startIndex; + this.fieldWidth = fieldWidth; + } + + public TypeProtos.MinorType getDataType(){ + return dataType; + } + +// public void setDataType(TypeProtos.MinorType dataType){ +// this.dataType = dataType; +// } + + public String getFieldName(){ + return fieldName; + } + +// public void setFieldName(String fieldName){ +// this.fieldName = fieldName; +// } + + public String getDateTimeFormat() { + return dateTimeFormat; + } + +// public void setDateTimeFormat(String dateTimeFormat) { +// this.dateTimeFormat = dateTimeFormat; +// } + + public int getStartIndex(){ + return startIndex; + } + +// public void setStartIndex(int startIndex){ +// this.startIndex = startIndex; +// } + + public int getFieldWidth(){ + return fieldWidth; + } + +// public void setFieldWidth(int fieldWidth){ +// this.fieldWidth = fieldWidth; +// } + +} diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java new file mode 100644 index 00000000000..a05a7bd0e4d --- /dev/null +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.drill.exec.store.fixedwidth; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonTypeName; +import org.apache.drill.common.PlanStringBuilder; +import org.apache.drill.common.logical.FormatPluginConfig; +import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList; + +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +@JsonTypeName(FixedwidthFormatPlugin.DEFAULT_NAME) +@JsonInclude(JsonInclude.Include.NON_DEFAULT) +public class FixedwidthFormatConfig implements FormatPluginConfig { + private final List extensions; + private final List fields; + + @JsonCreator + public FixedwidthFormatConfig(@JsonProperty("extensions") List extensions, + @JsonProperty("fields") List fields) { + this.extensions = extensions == null ? Collections.singletonList("sav") : ImmutableList.copyOf(extensions); + this.fields = fields; + } //Change this + + @JsonInclude(JsonInclude.Include.NON_DEFAULT) + public List getExtensions() { + return extensions; + } + + public List getFields() { + return fields; + } + +// public FixedwidthReaderConfig getReaderConfig(FixedwidthFormatPlugin plugin) { +// return new FixedwidthReaderConfig(plugin); +// } + + @Override + public int hashCode() { + return Objects.hash(extensions, fields); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + FixedwidthFormatConfig other = (FixedwidthFormatConfig) obj; + return Objects.equals(extensions, other.extensions) + && Objects.equals(fields, other.fields); + } + + @Override + public String toString() { + return new PlanStringBuilder(this) + .field("extensions", extensions) + .field("fields", fields) + .toString(); + } +} \ No newline at end of file diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java new file mode 100644 index 00000000000..86861cb5040 --- /dev/null +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.drill.exec.store.fixedwidth; + +import org.apache.drill.common.logical.StoragePluginConfig; +import org.apache.drill.common.types.TypeProtos; +import org.apache.drill.common.types.Types; +import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileReaderFactory; +import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileScanBuilder; +import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator; + +import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader; +import org.apache.drill.exec.server.DrillbitContext; +import org.apache.drill.exec.server.options.OptionManager; +import org.apache.drill.exec.store.dfs.easy.EasyFormatPlugin; +import org.apache.drill.exec.store.dfs.easy.EasySubScan; +import org.apache.hadoop.conf.Configuration; + + +public class FixedwidthFormatPlugin extends EasyFormatPlugin { + + protected static final String DEFAULT_NAME = "fixedwidth"; + + private static class FixedwidthReaderFactory extends FileReaderFactory { + + private final FixedwidthFormatConfig config; + private final int maxRecords; + + public FixedwidthReaderFactory(FixedwidthFormatConfig config, int maxRecords) { + this.config = config; + this.maxRecords = maxRecords; + } + + @Override + public ManagedReader newReader() { + return new FixedwidthBatchReader(config, maxRecords); + } + } + + public FixedwidthFormatPlugin(String name, + DrillbitContext context, + Configuration fsConf, + StoragePluginConfig storageConfig, + FixedwidthFormatConfig formatConfig) { + super(name, easyConfig(fsConf, formatConfig), context, storageConfig, formatConfig); + } //final? + + private static EasyFormatConfig easyConfig(Configuration fsConf, FixedwidthFormatConfig pluginConfig) { + return EasyFormatConfig.builder() + .readable(true) + .writable(false) + .blockSplittable(false) + .compressible(true) + .supportsProjectPushdown(true) + .extensions(pluginConfig.getExtensions()) + .fsConf(fsConf) + .defaultName(DEFAULT_NAME) + .useEnhancedScan(true) + .supportsLimitPushdown(true) + .build(); + } + + @Override + public ManagedReader newBatchReader( + EasySubScan scan, OptionManager options) { + return new FixedwidthBatchReader(getConfig(), scan.getMaxRecords()); + } + + @Override + protected FileScanBuilder frameworkBuilder(OptionManager options, EasySubScan scan) { + FileScanBuilder builder = new FileScanBuilder(); + builder.setReaderFactory(new FixedwidthReaderFactory(getConfig(), scan.getMaxRecords())); + + initScanBuilder(builder, scan); + builder.nullType(Types.optional(TypeProtos.MinorType.VARCHAR)); + return builder; + } +} \ No newline at end of file diff --git a/contrib/format-fixedwidth/src/main/resources/drill-module.conf b/contrib/format-fixedwidth/src/main/resources/drill-module.conf new file mode 100644 index 00000000000..ed3e073f8dd --- /dev/null +++ b/contrib/format-fixedwidth/src/main/resources/drill-module.conf @@ -0,0 +1,23 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This file tells Drill to consider this module when class path scanning. +# This file can also include any supplementary configuration information. +# This file is in HOCON format, see https://github.com/typesafehub/config/blob/master/HOCON.md for more information. + +drill.classpath.scanning.packages += "org.apache.drill.exec.store.fixedwidth" diff --git a/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java new file mode 100644 index 00000000000..764a37c744e --- /dev/null +++ b/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java @@ -0,0 +1,85 @@ + +import com.google.common.collect.Lists; +import org.apache.drill.categories.RowSetTests; +import org.apache.drill.common.types.TypeProtos; +import org.apache.drill.exec.physical.rowSet.RowSet; +import org.apache.drill.exec.physical.rowSet.RowSetBuilder; +import org.apache.drill.exec.record.metadata.SchemaBuilder; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.apache.drill.exec.store.fixedwidth.FixedwidthFieldConfig; +import org.apache.drill.exec.store.fixedwidth.FixedwidthFormatConfig; +import org.apache.drill.test.ClusterFixture; +import org.apache.drill.test.ClusterTest; +import org.apache.drill.test.QueryBuilder; +import org.apache.drill.test.rowSet.RowSetComparison; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import java.nio.file.Paths; + +import static org.junit.Assert.assertEquals; + +@Category(RowSetTests.class) +public class TestFixedwidthRecordReader extends ClusterTest { + +// @BeforeClass +// public static void setup() throws Exception { +// ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); +// +// // Needed for compressed file unit test +// //dirTestWatcher.copyResourceToRoot(Paths.get("spss/")); +// } + + @BeforeClass + public static void setup() throws Exception { + ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); + + FixedwidthFormatConfig formatConfig = new FixedwidthFormatConfig(Lists.newArrayList("fwf") + , Lists.newArrayList()); + cluster.defineFormat("cp", "fwf", formatConfig); + //cluster.defineFormat("dfs", "xml", formatConfig); + + // Needed for compressed file unit test + dirTestWatcher.copyResourceToRoot(Paths.get("fwf/")); + } + + @Test + public void testExplicitQuery() throws Exception { + String sql = "SELECT ID, Urban, Urban_value FROM dfs.`spss/testdata.sav` WHERE d16=4"; + + QueryBuilder q = client.queryBuilder().sql(sql); + RowSet results = q.rowSet(); + + TupleMetadata expectedSchema = new SchemaBuilder() + .addNullable("ID", TypeProtos.MinorType.FLOAT8) + .addNullable("Urban", TypeProtos.MinorType.FLOAT8) + .addNullable("Urban_value", TypeProtos.MinorType.VARCHAR) + .buildSchema(); + + + RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema) + .addRow(47.0, 1.0, "Urban").addRow(53.0, 1.0, "Urban") + .addRow(66.0, 1.0, "Urban") + .build(); + + assertEquals(3, results.rowCount()); + + new RowSetComparison(expected).verifyAndClearAll(results); + } + + @Test + public void testBatchReader() throws Exception { + FixedwidthFieldConfig testField = new FixedwidthFieldConfig(TypeProtos.MinorType.FLOAT8,"date","MM/DD/YYYY",1,10); + System.out.println(testField.getFieldName()); + System.out.println(testField.getStartIndex()); + System.out.println(testField.getFieldWidth()); + System.out.println(testField.getDateTimeFormat()); + System.out.println(testField.getDataType()); + + String sql = "SELECT * FROM cp.`fwf/test.fwf`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + } + + +} diff --git a/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf b/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf new file mode 100644 index 00000000000..3008d7b1a7a --- /dev/null +++ b/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf @@ -0,0 +1 @@ +1234 test 567 diff --git a/contrib/pom.xml b/contrib/pom.xml index c09508ddaf0..315df87cb84 100644 --- a/contrib/pom.xml +++ b/contrib/pom.xml @@ -46,6 +46,7 @@ format-syslog format-ltsv format-excel + format-fixedwidth format-httpd format-esri format-pdf From 5d1ea8b23477ce3cb4909adc6c01aedc9c45c862 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Thu, 22 Jul 2021 15:27:54 -0400 Subject: [PATCH 02/41] Work in Progress. Producing Rows. Currently complains about buffer not being de-allocated. --- .../fixedwidth/FixedwidthBatchReader.java | 18 +++++++- .../test/java/TestFixedwidthRecordReader.java | 46 ++++++++++++++++++- 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index a0c474ed5b2..dee81c7be74 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -22,6 +22,8 @@ import org.apache.drill.common.exceptions.UserException; import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator; import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader; +import org.apache.drill.exec.physical.resultSet.ResultSetLoader; +import org.apache.drill.exec.physical.resultSet.RowSetLoader; import org.apache.drill.exec.record.metadata.SchemaBuilder; import org.apache.drill.exec.record.metadata.TupleMetadata; import org.apache.hadoop.mapred.FileSplit; @@ -44,6 +46,8 @@ public class FixedwidthBatchReader implements ManagedReader Date: Mon, 26 Jul 2021 11:15:52 -0400 Subject: [PATCH 03/41] First working version --- .../fixedwidth/FixedwidthBatchReader.java | 68 ++++++++++++++++--- .../test/java/TestFixedwidthRecordReader.java | 17 +++-- 2 files changed, 68 insertions(+), 17 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index dee81c7be74..59286777cd8 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -20,17 +20,21 @@ import org.apache.drill.common.exceptions.CustomErrorContext; import org.apache.drill.common.exceptions.UserException; +import org.apache.drill.common.types.TypeProtos; import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator; import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader; import org.apache.drill.exec.physical.resultSet.ResultSetLoader; import org.apache.drill.exec.physical.resultSet.RowSetLoader; import org.apache.drill.exec.record.metadata.SchemaBuilder; import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.apache.drill.shaded.guava.com.google.common.base.Charsets; import org.apache.hadoop.mapred.FileSplit; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.BufferedReader; import java.io.InputStream; +import java.io.InputStreamReader; public class FixedwidthBatchReader implements ManagedReader{ @@ -48,6 +52,8 @@ public class FixedwidthBatchReader implements ManagedReader Date: Thu, 29 Jul 2021 12:40:27 -0400 Subject: [PATCH 04/41] Added more data types, refactored code --- .../fixedwidth/FixedwidthBatchReader.java | 67 +++++++++++++------ .../test/java/TestFixedwidthRecordReader.java | 55 +++++++++++---- .../src/test/resources/fwf/test.fwf | 26 ++++++- 3 files changed, 113 insertions(+), 35 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index 59286777cd8..001f8c09987 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -35,6 +35,14 @@ import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.ZoneId; +import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; +import java.util.Locale; public class FixedwidthBatchReader implements ManagedReader{ @@ -94,17 +102,8 @@ public boolean next() { // Use loader to read data from file to turn into Drill while (!writer.isFull() && line != null) { - Object[] row = parseLine(line); - writer.start(); - - for (int i = 0; i < row.length; i++) { - if (row[i] instanceof Integer) { - writer.scalar(i).setInt((Integer) row[i]); - } else if (row[i] instanceof String) { - writer.scalar(i).setString((String) row[i]); - } - } + parseLine(line, writer); writer.save(); line = reader.readLine(); @@ -145,31 +144,57 @@ private TupleMetadata buildSchema(){ return builder.buildSchema(); } - private Object[] parseLine(String line){ - Object[] row = new Object[config.getFields().size()]; + private void parseLine(String line, RowSetLoader writer) { int i = 0; TypeProtos.MinorType dataType; String dateTimeFormat; + String value; - for (FixedwidthFieldConfig field : config.getFields()){ - row[i] = line.substring(field.getStartIndex()-1,field.getStartIndex()+field.getFieldWidth()-1); + for (FixedwidthFieldConfig field : config.getFields()) { + value = line.substring(field.getStartIndex() - 1, field.getStartIndex() + field.getFieldWidth() - 1); // Convert String to data type in field dataType = field.getDataType(); dateTimeFormat = field.getDateTimeFormat(); - if (dataType == TypeProtos.MinorType.INT){ - row[i] = Integer.parseInt((String) row[i]); - } else if (dataType == TypeProtos.MinorType.VARCHAR){ - } else if (dataType == TypeProtos.MinorType.DATE || dataType == TypeProtos.MinorType.TIME){ - // Check to ensure date time format matches input date? - } else{ + switch (dataType) { + case INT: + writer.scalar(i).setInt(Integer.parseInt(value)); + break; + case VARCHAR: + writer.scalar(i).setString(value); + break; + case DATE: + DateTimeFormatter formatDate = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); + LocalDate date = LocalDate.parse(value, formatDate); + + writer.scalar(i).setDate(date); + break; + case TIME: + DateTimeFormatter formatTime = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); + LocalTime time = LocalTime.parse(value, formatTime); + + writer.scalar(i).setTime(time); + break; + case TIMESTAMP: + DateTimeFormatter formatTS = DateTimeFormatter.ofPattern(dateTimeFormat,Locale.ENGLISH); + LocalDateTime ldt = LocalDateTime.parse(value,formatTS); + ZoneId z = ZoneId.of( "America/Toronto" ) ; + ZonedDateTime zdt = ldt.atZone( z ) ; + Instant timeStamp = zdt.toInstant(); + + writer.scalar(i).setTimestamp(timeStamp); + break; + default: + throw new RuntimeException("Unknown data type specified in fixed width. Found data type " + dataType); + + + } i++; } - return row; } } \ No newline at end of file diff --git a/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java index 1e6856c46ac..26fcf009244 100644 --- a/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java @@ -18,6 +18,10 @@ import org.junit.experimental.categories.Category; import java.nio.file.Paths; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; import static org.junit.Assert.assertEquals; @@ -40,10 +44,12 @@ public static void setup() throws Exception { , Lists.newArrayList( new FixedwidthFieldConfig(TypeProtos.MinorType.INT, "Number", "", 1, 4), new FixedwidthFieldConfig(TypeProtos.MinorType.VARCHAR, "Letter", "", 6, 4), - new FixedwidthFieldConfig(TypeProtos.MinorType.INT, "Address", "", 11, 3) - )); + new FixedwidthFieldConfig(TypeProtos.MinorType.INT,"Address","",11,3), + new FixedwidthFieldConfig(TypeProtos.MinorType.DATE,"Date","MM-dd-yyyy",15,10), + new FixedwidthFieldConfig(TypeProtos.MinorType.TIME,"Time","HH:mm:ss",26,8), + new FixedwidthFieldConfig(TypeProtos.MinorType.TIMESTAMP,"DateTime","MM-dd-yyyy'T'HH:mm:ss.SSX",35,23) + )); cluster.defineFormat("cp", "fwf", formatConfig); - //cluster.defineFormat("dfs", "xml", formatConfig); // Needed for compressed file unit test dirTestWatcher.copyResourceToRoot(Paths.get("fwf/")); @@ -75,13 +81,6 @@ public void testExplicitQuery() throws Exception { @Test public void testBatchReader() throws Exception { - FixedwidthFieldConfig testField = new FixedwidthFieldConfig(TypeProtos.MinorType.FLOAT8, "date", "MM/DD/YYYY", 1, 10); - System.out.println(testField.getFieldName()); - System.out.println(testField.getStartIndex()); - System.out.println(testField.getFieldWidth()); - System.out.println(testField.getDateTimeFormat()); - System.out.println(testField.getDataType()); - String sql = "SELECT * FROM cp.`fwf/test.fwf`"; RowSet results = client.queryBuilder().sql(sql).rowSet(); @@ -89,16 +88,46 @@ public void testBatchReader() throws Exception { .addNullable("Number", TypeProtos.MinorType.INT) .addNullable("Letter", TypeProtos.MinorType.VARCHAR) .addNullable("Address", TypeProtos.MinorType.INT) + .addNullable("Date", TypeProtos.MinorType.DATE) + .addNullable("Time",TypeProtos.MinorType.TIME) + .addNullable("DateTime",TypeProtos.MinorType.TIMESTAMP) .buildSchema(); RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema) - .addRow(1234, "test", 567) + .addRow(1234, "test", 567, LocalDate.parse("2021-02-10"), LocalTime.parse("10:30:27"), Instant.parse("2021-02-10T15:30:27.00Z")) + .addRow(5678, "TEST", 890, LocalDate.parse("2021-07-27"), LocalTime.parse("12:40:15"), Instant.parse("2021-07-27T16:40:15.00Z")) + .addRow(1111, "abcd", 111, LocalDate.parse("1111-11-11"), LocalTime.parse("11:11:11"), Instant.parse("1111-11-11T16:28:43.11Z")) + .addRow(2222, "efgh", 222, LocalDate.parse("2222-01-22"), LocalTime.parse("22:22:22"), Instant.parse("2222-01-23T03:22:22.22Z")) + .addRow(3333, "ijkl", 333, LocalDate.parse("3333-02-01"), LocalTime.parse("01:33:33"), Instant.parse("3333-02-01T06:33:33.33Z")) + .addRow(4444, "mnop", 444, LocalDate.parse("4444-03-02"), LocalTime.parse("02:44:44"), Instant.parse("4444-03-02T07:44:44.44Z")) + .addRow(5555, "qrst", 555, LocalDate.parse("5555-04-03"), LocalTime.parse("03:55:55"), Instant.parse("5555-04-03T07:55:55.55Z")) + .addRow(6666, "uvwx", 666, LocalDate.parse("6666-05-04"), LocalTime.parse("04:01:01"), Instant.parse("6666-05-04T08:01:01.01Z")) + .addRow(7777, "yzzz", 777, LocalDate.parse("7777-06-05"), LocalTime.parse("05:11:11"), Instant.parse("7777-06-05T09:11:11.11Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) .build(); - assertEquals(1, results.rowCount()); + System.out.println(expected); + assertEquals(25, results.rowCount()); + + //System.out.println(results.batchSchema()); + System.out.println(results); - System.out.println(results.batchSchema()); new RowSetComparison(expected).verifyAndClearAll(results); System.out.println("Test complete."); diff --git a/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf b/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf index 3008d7b1a7a..9eba0f61944 100644 --- a/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf +++ b/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf @@ -1 +1,25 @@ -1234 test 567 +1234 test 567 02-10-2021 10:30:27 02-10-2021T10:30:27.00Z +5678 TEST 890 07-27-2021 12:40:15 07-27-2021T12:40:15.00Z +1111 abcd 111 11-11-1111 11:11:11 11-11-1111T11:11:11.11Z +2222 efgh 222 01-22-2222 22:22:22 01-22-2222T22:22:22.22Z +3333 ijkl 333 02-01-3333 01:33:33 02-01-3333T01:33:33.33Z +4444 mnop 444 03-02-4444 02:44:44 03-02-4444T02:44:44.44Z +5555 qrst 555 04-03-5555 03:55:55 04-03-5555T03:55:55.55Z +6666 uvwx 666 05-04-6666 04:01:01 05-04-6666T04:01:01.01Z +7777 yzzz 777 06-05-7777 05:11:11 06-05-7777T05:11:11.11Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z \ No newline at end of file From f59d4e43fd3db193d44bfc53b21abc12e1fa9a5c Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Thu, 29 Jul 2021 13:00:10 -0400 Subject: [PATCH 05/41] Checkstyle fixes --- .../fixedwidth/FixedwidthBatchReader.java | 4 +- .../TestFixedwidthRecordReader.java | 47 +++---------------- 2 files changed, 8 insertions(+), 43 deletions(-) rename contrib/format-fixedwidth/src/test/java/{ => org/apache/drill/exec/store/fixedwidth}/TestFixedwidthRecordReader.java (86%) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index 001f8c09987..3dbd8b9dc5f 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -179,8 +179,8 @@ private void parseLine(String line, RowSetLoader writer) { case TIMESTAMP: DateTimeFormatter formatTS = DateTimeFormatter.ofPattern(dateTimeFormat,Locale.ENGLISH); LocalDateTime ldt = LocalDateTime.parse(value,formatTS); - ZoneId z = ZoneId.of( "America/Toronto" ) ; - ZonedDateTime zdt = ldt.atZone( z ) ; + ZoneId z = ZoneId.of( "America/Toronto" ); + ZonedDateTime zdt = ldt.atZone( z ); Instant timeStamp = zdt.toInstant(); writer.scalar(i).setTimestamp(timeStamp); diff --git a/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java similarity index 86% rename from contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java rename to contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java index 26fcf009244..309837323be 100644 --- a/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java @@ -1,14 +1,12 @@ +package org.apache.drill.exec.store.fixedwidth; -import com.google.common.collect.Lists; import org.apache.drill.categories.RowSetTests; import org.apache.drill.common.types.TypeProtos; import org.apache.drill.exec.physical.rowSet.RowSet; import org.apache.drill.exec.physical.rowSet.RowSetBuilder; import org.apache.drill.exec.record.metadata.SchemaBuilder; import org.apache.drill.exec.record.metadata.TupleMetadata; -import org.apache.drill.exec.store.fixedwidth.FixedwidthBatchReader; -import org.apache.drill.exec.store.fixedwidth.FixedwidthFieldConfig; -import org.apache.drill.exec.store.fixedwidth.FixedwidthFormatConfig; +import org.apache.drill.shaded.guava.com.google.common.collect.Lists; import org.apache.drill.test.ClusterFixture; import org.apache.drill.test.ClusterTest; import org.apache.drill.test.QueryBuilder; @@ -20,7 +18,6 @@ import java.nio.file.Paths; import java.time.Instant; import java.time.LocalDate; -import java.time.LocalDateTime; import java.time.LocalTime; import static org.junit.Assert.assertEquals; @@ -28,27 +25,19 @@ @Category(RowSetTests.class) public class TestFixedwidthRecordReader extends ClusterTest { -// @BeforeClass -// public static void setup() throws Exception { -// ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); -// -// // Needed for compressed file unit test -// //dirTestWatcher.copyResourceToRoot(Paths.get("spss/")); -// } - @BeforeClass public static void setup() throws Exception { ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); - FixedwidthFormatConfig formatConfig = new FixedwidthFormatConfig(Lists.newArrayList("fwf") - , Lists.newArrayList( + FixedwidthFormatConfig formatConfig = new FixedwidthFormatConfig(Lists.newArrayList("fwf"), + Lists.newArrayList( new FixedwidthFieldConfig(TypeProtos.MinorType.INT, "Number", "", 1, 4), new FixedwidthFieldConfig(TypeProtos.MinorType.VARCHAR, "Letter", "", 6, 4), new FixedwidthFieldConfig(TypeProtos.MinorType.INT,"Address","",11,3), new FixedwidthFieldConfig(TypeProtos.MinorType.DATE,"Date","MM-dd-yyyy",15,10), new FixedwidthFieldConfig(TypeProtos.MinorType.TIME,"Time","HH:mm:ss",26,8), new FixedwidthFieldConfig(TypeProtos.MinorType.TIMESTAMP,"DateTime","MM-dd-yyyy'T'HH:mm:ss.SSX",35,23) - )); + )); cluster.defineFormat("cp", "fwf", formatConfig); // Needed for compressed file unit test @@ -134,28 +123,4 @@ public void testBatchReader() throws Exception { client.close(); } - - - -/* -BatchSchema [ -fields=[ - [`Number` (INT:OPTIONAL), - children=([`$bits$` (UINT1:REQUIRED)], - [`Number` (INT:OPTIONAL)])], - [`Letter` (VARCHAR:OPTIONAL), - children=([`$bits$` (UINT1:REQUIRED)], - [`Letter` (VARCHAR:OPTIONAL), - children=([`$offsets$` (UINT4:REQUIRED)]) - ] - ) - ], - [`Address` (INT:OPTIONAL), - children=([`$bits$` (UINT1:REQUIRED)], - [`Address` (INT:OPTIONAL)]) - ] -], -selectionVector=NONE] -*/ - -} \ No newline at end of file +} From 9f2648c2d4b363b20107a295dcaa0c78027c5b02 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Thu, 29 Jul 2021 13:54:49 -0400 Subject: [PATCH 06/41] Removed println statement from Batch Reader, Simplified logic --- .../store/fixedwidth/FixedwidthBatchReader.java | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index 3dbd8b9dc5f..30b39611de9 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -97,7 +97,6 @@ public boolean next() { // Use loader to read data from file to turn into Drill try { line = reader.readLine(); - System.out.println(line); RowSetLoader writer = loader.writer(); while (!writer.isFull() && line != null) { @@ -153,9 +152,9 @@ private void parseLine(String line, RowSetLoader writer) { for (FixedwidthFieldConfig field : config.getFields()) { value = line.substring(field.getStartIndex() - 1, field.getStartIndex() + field.getFieldWidth() - 1); - // Convert String to data type in field dataType = field.getDataType(); dateTimeFormat = field.getDateTimeFormat(); + DateTimeFormatter formatter = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); switch (dataType) { case INT: @@ -165,24 +164,18 @@ private void parseLine(String line, RowSetLoader writer) { writer.scalar(i).setString(value); break; case DATE: - DateTimeFormatter formatDate = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); - LocalDate date = LocalDate.parse(value, formatDate); - + LocalDate date = LocalDate.parse(value, formatter); writer.scalar(i).setDate(date); break; case TIME: - DateTimeFormatter formatTime = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); - LocalTime time = LocalTime.parse(value, formatTime); - + LocalTime time = LocalTime.parse(value, formatter); writer.scalar(i).setTime(time); break; case TIMESTAMP: - DateTimeFormatter formatTS = DateTimeFormatter.ofPattern(dateTimeFormat,Locale.ENGLISH); - LocalDateTime ldt = LocalDateTime.parse(value,formatTS); + LocalDateTime ldt = LocalDateTime.parse(value,formatter); ZoneId z = ZoneId.of( "America/Toronto" ); ZonedDateTime zdt = ldt.atZone( z ); Instant timeStamp = zdt.toInstant(); - writer.scalar(i).setTimestamp(timeStamp); break; default: From 8c3f6ebd59b223d0e0352441de749c24f15fc457 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Fri, 6 Aug 2021 16:14:29 -0400 Subject: [PATCH 07/41] Modified format, fixed maxRecords in next(), modified Exception handling in Batch Reader --- contrib/format-fixedwidth/pom.xml | 115 +++++++-------- .../fixedwidth/FixedwidthBatchReader.java | 139 +++++++++--------- .../fixedwidth/FixedwidthFieldConfig.java | 64 ++++---- .../fixedwidth/FixedwidthFormatConfig.java | 8 +- .../fixedwidth/FixedwidthFormatPlugin.java | 28 ++-- .../TestFixedwidthRecordReader.java | 124 +++++++++------- .../src/test/resources/fwf/test.fwf | 2 +- distribution/pom.xml | 5 + distribution/src/assemble/component.xml | 1 + 9 files changed, 237 insertions(+), 249 deletions(-) diff --git a/contrib/format-fixedwidth/pom.xml b/contrib/format-fixedwidth/pom.xml index d69c492b1dc..c30db19a536 100644 --- a/contrib/format-fixedwidth/pom.xml +++ b/contrib/format-fixedwidth/pom.xml @@ -19,66 +19,63 @@ --> - 4.0.0 + 4.0.0 + + drill-contrib-parent + org.apache.drill.contrib + 1.20.0-SNAPSHOT + + drill-format-fixedwidth + Drill : Contrib : Format : Fixedwidth - - drill-contrib-parent - org.apache.drill.contrib - 1.20.0-SNAPSHOT - + + + org.apache.drill.exec + drill-java-exec + ${project.version} + - drill-format-fixedwidth - Drill : Contrib : Format : Fixedwidth + + + org.apache.drill.exec + drill-java-exec + tests + ${project.version} + test + + + org.apache.drill + drill-common + tests + ${project.version} + test + + + + + + maven-resources-plugin + + + copy-java-sources + process-sources + + copy-resources + + + ${basedir}/target/classes/org/apache/drill/exec/store/fixedwidth + + + + src/main/java/org/apache/drill/exec/store/fixedwidth + true + + + + + + + + - - - org.apache.drill.exec - drill-java-exec - ${project.version} - - - - - - org.apache.drill.exec - drill-java-exec - tests - ${project.version} - test - - - - org.apache.drill - drill-common - tests - ${project.version} - test - - - - - - maven-resources-plugin - - - copy-java-sources - process-sources - - copy-resources - - - ${basedir}/target/classes/org/apache/drill/exec/store/fixedwidth - - - - src/main/java/org/apache/drill/exec/store/fixedwidth - true - - - - - - - - \ No newline at end of file diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index 30b39611de9..2be542d8d39 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -33,6 +33,7 @@ import org.slf4j.LoggerFactory; import java.io.BufferedReader; +import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.time.Instant; @@ -44,23 +45,18 @@ import java.time.format.DateTimeFormatter; import java.util.Locale; -public class FixedwidthBatchReader implements ManagedReader{ +public class FixedwidthBatchReader implements ManagedReader { private static final Logger logger = LoggerFactory.getLogger(FixedwidthBatchReader.class); - private FileSplit split; - private final int maxRecords; - private final FixedwidthFormatConfig config; - private CustomErrorContext errorContext; - private InputStream fsStream; - private ResultSetLoader loader; - + private RowSetLoader writer; private BufferedReader reader; + private int lineNum; public FixedwidthBatchReader(FixedwidthFormatConfig config, int maxRecords) { this.config = config; @@ -71,51 +67,47 @@ public FixedwidthBatchReader(FixedwidthFormatConfig config, int maxRecords) { public boolean open(FileSchemaNegotiator negotiator) { split = negotiator.split(); errorContext = negotiator.parentErrorContext(); + lineNum = 0; try { fsStream = negotiator.fileSystem().openPossiblyCompressedStream(split.getPath()); - negotiator.tableSchema(buildSchema(),true); + negotiator.tableSchema(buildSchema(), true); loader = negotiator.build(); } catch (Exception e) { throw UserException - .dataReadError(e) - .message("Failed to open input file: {}", split.getPath().toString()) - .addContext(errorContext) - .addContext(e.getMessage()) - .build(logger); + .dataReadError(e) + .message("Failed to open input file: {}", split.getPath().toString()) + .addContext(errorContext) + .addContext(e.getMessage()) + .build(logger); } - reader = new BufferedReader(new InputStreamReader(fsStream, Charsets.UTF_8)); - return true; - } @Override public boolean next() { // Use loader to read data from file to turn into Drill rows - String line; + RowSetLoader writer = loader.writer(); try { line = reader.readLine(); - RowSetLoader writer = loader.writer(); - while (!writer.isFull() && line != null) { - writer.start(); parseLine(line, writer); writer.save(); - line = reader.readLine(); + lineNum++; } - } catch (Exception e) { + } catch (IOException e) { throw UserException - .dataReadError(e) - .message("Failed to read input file: {}", split.getPath().toString()) - .addContext(errorContext) - .addContext(e.getMessage()) - .build(logger); + .dataReadError(e) + .message("Failed to read input file: {}", split.getPath().toString()) + .addContext(errorContext) + .addContext(e.getMessage()) + .addContext("Line Number", lineNum) + .build(logger); } - return (line != null); + return writer.limitReached(maxRecords); // returns false when maxRecords limit has been reached } @Override @@ -125,69 +117,72 @@ public void close() { loader.close(); } catch (Exception e) { throw UserException - .dataReadError(e) - .message("Failed to close input file: {}", split.getPath().toString()) - .addContext(errorContext) - .addContext(e.getMessage()) - .build(logger); + .dataReadError(e) + .message("Failed to close input file: {}", split.getPath().toString()) + .addContext(errorContext) + .addContext(e.getMessage()) + .build(logger); } } - private TupleMetadata buildSchema(){ + private TupleMetadata buildSchema() { SchemaBuilder builder = new SchemaBuilder(); - - for (FixedwidthFieldConfig field : config.getFields()){ - builder.addNullable(field.getFieldName(),field.getDataType()); + for (FixedwidthFieldConfig field : config.getFields()) { + builder.addNullable(field.getFieldName(), field.getDataType()); } - - return builder.buildSchema(); + return builder.buildSchema(); } - private void parseLine(String line, RowSetLoader writer) { + + private boolean parseLine(String line, RowSetLoader writer) throws IOException { int i = 0; TypeProtos.MinorType dataType; String dateTimeFormat; String value; - for (FixedwidthFieldConfig field : config.getFields()) { value = line.substring(field.getStartIndex() - 1, field.getStartIndex() + field.getFieldWidth() - 1); - dataType = field.getDataType(); dateTimeFormat = field.getDateTimeFormat(); DateTimeFormatter formatter = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); - - switch (dataType) { - case INT: - writer.scalar(i).setInt(Integer.parseInt(value)); - break; - case VARCHAR: - writer.scalar(i).setString(value); - break; - case DATE: - LocalDate date = LocalDate.parse(value, formatter); - writer.scalar(i).setDate(date); - break; - case TIME: - LocalTime time = LocalTime.parse(value, formatter); - writer.scalar(i).setTime(time); - break; - case TIMESTAMP: - LocalDateTime ldt = LocalDateTime.parse(value,formatter); - ZoneId z = ZoneId.of( "America/Toronto" ); - ZonedDateTime zdt = ldt.atZone( z ); - Instant timeStamp = zdt.toInstant(); - writer.scalar(i).setTimestamp(timeStamp); - break; - default: - throw new RuntimeException("Unknown data type specified in fixed width. Found data type " + dataType); - - - + try { + switch (dataType) { + case INT: + writer.scalar(i).setInt(Integer.parseInt(value)); + break; + case VARCHAR: + writer.scalar(i).setString(value); + break; + case DATE: + LocalDate date = LocalDate.parse(value, formatter); + writer.scalar(i).setDate(date); + break; + case TIME: + LocalTime time = LocalTime.parse(value, formatter); + writer.scalar(i).setTime(time); + break; + case TIMESTAMP: + LocalDateTime ldt = LocalDateTime.parse(value, formatter); + ZoneId z = ZoneId.of("America/Toronto"); + ZonedDateTime zdt = ldt.atZone(z); + Instant timeStamp = zdt.toInstant(); + writer.scalar(i).setTimestamp(timeStamp); + break; + case FLOAT4: + writer.scalar(i).setFloat(Float.parseFloat(value)); + break; + case FLOAT8: + writer.scalar(i).setDouble(Double.parseDouble(value)); + break; + default: + throw new RuntimeException("Unknown data type specified in fixed width. Found data type " + dataType); + } + } catch (RuntimeException e){ + throw new IOException("Failed to parse value: " + value + " as " + dataType); } - i++; } + return true; } } \ No newline at end of file diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java index 667ad3d098d..9a9c1260434 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java @@ -1,10 +1,27 @@ -package org.apache.drill.exec.store.fixedwidth; +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ -import com.fasterxml.jackson.annotation.JsonProperty; -import org.apache.drill.common.types.TypeProtos; +package org.apache.drill.exec.store.fixedwidth; import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonTypeName; +import org.apache.drill.common.types.TypeProtos; @JsonTypeName("fixedwidthReaderFieldDescription") @JsonInclude(JsonInclude.Include.NON_DEFAULT) @@ -28,44 +45,13 @@ public FixedwidthFieldConfig(@JsonProperty("dataType") TypeProtos.MinorType data this.fieldWidth = fieldWidth; } - public TypeProtos.MinorType getDataType(){ - return dataType; - } - -// public void setDataType(TypeProtos.MinorType dataType){ -// this.dataType = dataType; -// } + public TypeProtos.MinorType getDataType() {return dataType;} - public String getFieldName(){ - return fieldName; - } + public String getFieldName() {return fieldName;} -// public void setFieldName(String fieldName){ -// this.fieldName = fieldName; -// } - - public String getDateTimeFormat() { - return dateTimeFormat; - } - -// public void setDateTimeFormat(String dateTimeFormat) { -// this.dateTimeFormat = dateTimeFormat; -// } - - public int getStartIndex(){ - return startIndex; - } - -// public void setStartIndex(int startIndex){ -// this.startIndex = startIndex; -// } - - public int getFieldWidth(){ - return fieldWidth; - } + public String getDateTimeFormat() {return dateTimeFormat;} -// public void setFieldWidth(int fieldWidth){ -// this.fieldWidth = fieldWidth; -// } + public int getStartIndex() {return startIndex;} + public int getFieldWidth() {return fieldWidth;} } diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index a05a7bd0e4d..06f867a2d37 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -39,9 +39,9 @@ public class FixedwidthFormatConfig implements FormatPluginConfig { @JsonCreator public FixedwidthFormatConfig(@JsonProperty("extensions") List extensions, @JsonProperty("fields") List fields) { - this.extensions = extensions == null ? Collections.singletonList("sav") : ImmutableList.copyOf(extensions); + this.extensions = extensions == null ? Collections.singletonList("fwf") : ImmutableList.copyOf(extensions); this.fields = fields; - } //Change this + } @JsonInclude(JsonInclude.Include.NON_DEFAULT) public List getExtensions() { @@ -52,10 +52,6 @@ public List getFields() { return fields; } -// public FixedwidthReaderConfig getReaderConfig(FixedwidthFormatPlugin plugin) { -// return new FixedwidthReaderConfig(plugin); -// } - @Override public int hashCode() { return Objects.hash(extensions, fields); diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java index 86861cb5040..a10aad9ea11 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java @@ -24,7 +24,6 @@ import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileReaderFactory; import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileScanBuilder; import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator; - import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader; import org.apache.drill.exec.server.DrillbitContext; import org.apache.drill.exec.server.options.OptionManager; @@ -59,26 +58,26 @@ public FixedwidthFormatPlugin(String name, StoragePluginConfig storageConfig, FixedwidthFormatConfig formatConfig) { super(name, easyConfig(fsConf, formatConfig), context, storageConfig, formatConfig); - } //final? + } private static EasyFormatConfig easyConfig(Configuration fsConf, FixedwidthFormatConfig pluginConfig) { return EasyFormatConfig.builder() - .readable(true) - .writable(false) - .blockSplittable(false) - .compressible(true) - .supportsProjectPushdown(true) - .extensions(pluginConfig.getExtensions()) - .fsConf(fsConf) - .defaultName(DEFAULT_NAME) - .useEnhancedScan(true) - .supportsLimitPushdown(true) - .build(); + .readable(true) + .writable(false) + .blockSplittable(false) + .compressible(true) + .supportsProjectPushdown(true) + .extensions(pluginConfig.getExtensions()) + .fsConf(fsConf) + .defaultName(DEFAULT_NAME) + .useEnhancedScan(true) + .supportsLimitPushdown(true) + .build(); } @Override public ManagedReader newBatchReader( - EasySubScan scan, OptionManager options) { + EasySubScan scan, OptionManager options) { return new FixedwidthBatchReader(getConfig(), scan.getMaxRecords()); } @@ -86,7 +85,6 @@ public ManagedReader newBatchReader( protected FileScanBuilder frameworkBuilder(OptionManager options, EasySubScan scan) { FileScanBuilder builder = new FileScanBuilder(); builder.setReaderFactory(new FixedwidthReaderFactory(getConfig(), scan.getMaxRecords())); - initScanBuilder(builder, scan); builder.nullType(Types.optional(TypeProtos.MinorType.VARCHAR)); return builder; diff --git a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java index 309837323be..b8ce99f09c6 100644 --- a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java @@ -1,3 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.drill.exec.store.fixedwidth; import org.apache.drill.categories.RowSetTests; @@ -28,16 +46,15 @@ public class TestFixedwidthRecordReader extends ClusterTest { @BeforeClass public static void setup() throws Exception { ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); - FixedwidthFormatConfig formatConfig = new FixedwidthFormatConfig(Lists.newArrayList("fwf"), - Lists.newArrayList( - new FixedwidthFieldConfig(TypeProtos.MinorType.INT, "Number", "", 1, 4), - new FixedwidthFieldConfig(TypeProtos.MinorType.VARCHAR, "Letter", "", 6, 4), - new FixedwidthFieldConfig(TypeProtos.MinorType.INT,"Address","",11,3), - new FixedwidthFieldConfig(TypeProtos.MinorType.DATE,"Date","MM-dd-yyyy",15,10), - new FixedwidthFieldConfig(TypeProtos.MinorType.TIME,"Time","HH:mm:ss",26,8), - new FixedwidthFieldConfig(TypeProtos.MinorType.TIMESTAMP,"DateTime","MM-dd-yyyy'T'HH:mm:ss.SSX",35,23) - )); + Lists.newArrayList( + new FixedwidthFieldConfig(TypeProtos.MinorType.INT, "Number", "", 1, 4), + new FixedwidthFieldConfig(TypeProtos.MinorType.VARCHAR, "Letter", "", 6, 4), + new FixedwidthFieldConfig(TypeProtos.MinorType.INT, "Address", "", 11, 3), + new FixedwidthFieldConfig(TypeProtos.MinorType.DATE, "Date", "MM-dd-yyyy", 15, 10), + new FixedwidthFieldConfig(TypeProtos.MinorType.TIME, "Time", "HH:mm:ss", 26, 8), + new FixedwidthFieldConfig(TypeProtos.MinorType.TIMESTAMP, "DateTime", "MM-dd-yyyy'T'HH:mm:ss.SSX", 35, 23) + )); cluster.defineFormat("cp", "fwf", formatConfig); // Needed for compressed file unit test @@ -47,22 +64,18 @@ public static void setup() throws Exception { @Test public void testExplicitQuery() throws Exception { String sql = "SELECT ID, Urban, Urban_value FROM dfs.`spss/testdata.sav` WHERE d16=4"; - QueryBuilder q = client.queryBuilder().sql(sql); RowSet results = q.rowSet(); TupleMetadata expectedSchema = new SchemaBuilder() - .addNullable("ID", TypeProtos.MinorType.FLOAT8) - .addNullable("Urban", TypeProtos.MinorType.FLOAT8) - .addNullable("Urban_value", TypeProtos.MinorType.VARCHAR) - .buildSchema(); - - + .addNullable("ID", TypeProtos.MinorType.FLOAT8) + .addNullable("Urban", TypeProtos.MinorType.FLOAT8) + .addNullable("Urban_value", TypeProtos.MinorType.VARCHAR) + .buildSchema(); RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema) - .addRow(47.0, 1.0, "Urban").addRow(53.0, 1.0, "Urban") - .addRow(66.0, 1.0, "Urban") - .build(); - + .addRow(47.0, 1.0, "Urban").addRow(53.0, 1.0, "Urban") + .addRow(66.0, 1.0, "Urban") + .build(); assertEquals(3, results.rowCount()); new RowSetComparison(expected).verifyAndClearAll(results); @@ -70,46 +83,44 @@ public void testExplicitQuery() throws Exception { @Test public void testBatchReader() throws Exception { - String sql = "SELECT * FROM cp.`fwf/test.fwf`"; + String sql = "SELECT * FROM cp.`fwf/test.fwf` LIMIT 30"; RowSet results = client.queryBuilder().sql(sql).rowSet(); TupleMetadata expectedSchema = new SchemaBuilder() - .addNullable("Number", TypeProtos.MinorType.INT) - .addNullable("Letter", TypeProtos.MinorType.VARCHAR) - .addNullable("Address", TypeProtos.MinorType.INT) - .addNullable("Date", TypeProtos.MinorType.DATE) - .addNullable("Time",TypeProtos.MinorType.TIME) - .addNullable("DateTime",TypeProtos.MinorType.TIMESTAMP) - .buildSchema(); - - + .addNullable("Number", TypeProtos.MinorType.INT) + .addNullable("Letter", TypeProtos.MinorType.VARCHAR) + .addNullable("Address", TypeProtos.MinorType.INT) + .addNullable("Date", TypeProtos.MinorType.DATE) + .addNullable("Time", TypeProtos.MinorType.TIME) + .addNullable("DateTime", TypeProtos.MinorType.TIMESTAMP) + .buildSchema(); RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema) - .addRow(1234, "test", 567, LocalDate.parse("2021-02-10"), LocalTime.parse("10:30:27"), Instant.parse("2021-02-10T15:30:27.00Z")) - .addRow(5678, "TEST", 890, LocalDate.parse("2021-07-27"), LocalTime.parse("12:40:15"), Instant.parse("2021-07-27T16:40:15.00Z")) - .addRow(1111, "abcd", 111, LocalDate.parse("1111-11-11"), LocalTime.parse("11:11:11"), Instant.parse("1111-11-11T16:28:43.11Z")) - .addRow(2222, "efgh", 222, LocalDate.parse("2222-01-22"), LocalTime.parse("22:22:22"), Instant.parse("2222-01-23T03:22:22.22Z")) - .addRow(3333, "ijkl", 333, LocalDate.parse("3333-02-01"), LocalTime.parse("01:33:33"), Instant.parse("3333-02-01T06:33:33.33Z")) - .addRow(4444, "mnop", 444, LocalDate.parse("4444-03-02"), LocalTime.parse("02:44:44"), Instant.parse("4444-03-02T07:44:44.44Z")) - .addRow(5555, "qrst", 555, LocalDate.parse("5555-04-03"), LocalTime.parse("03:55:55"), Instant.parse("5555-04-03T07:55:55.55Z")) - .addRow(6666, "uvwx", 666, LocalDate.parse("6666-05-04"), LocalTime.parse("04:01:01"), Instant.parse("6666-05-04T08:01:01.01Z")) - .addRow(7777, "yzzz", 777, LocalDate.parse("7777-06-05"), LocalTime.parse("05:11:11"), Instant.parse("7777-06-05T09:11:11.11Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .build(); + .addRow(1234, "test", 567, LocalDate.parse("2021-02-10"), LocalTime.parse("10:30:27"), Instant.parse("2021-02-10T15:30:27.00Z")) + .addRow(5678, "TEST", 890, LocalDate.parse("2021-07-27"), LocalTime.parse("12:40:15"), Instant.parse("2021-07-27T16:40:15.00Z")) + .addRow(1111, "abcd", 111, LocalDate.parse("1111-11-11"), LocalTime.parse("11:11:11"), Instant.parse("1111-11-11T16:28:43.11Z")) + .addRow(2222, "efgh", 222, LocalDate.parse("2222-01-22"), LocalTime.parse("22:22:22"), Instant.parse("2222-01-23T03:22:22.22Z")) + .addRow(3333, "ijkl", 333, LocalDate.parse("3333-02-01"), LocalTime.parse("01:33:33"), Instant.parse("3333-02-01T06:33:33.33Z")) + .addRow(4444, "mnop", 444, LocalDate.parse("4444-03-02"), LocalTime.parse("02:44:44"), Instant.parse("4444-03-02T07:44:44.44Z")) + .addRow(5555, "qrst", 555, LocalDate.parse("5555-04-03"), LocalTime.parse("03:55:55"), Instant.parse("5555-04-03T07:55:55.55Z")) + .addRow(6666, "uvwx", 666, LocalDate.parse("6666-05-04"), LocalTime.parse("04:01:01"), Instant.parse("6666-05-04T08:01:01.01Z")) + .addRow(7777, "yzzz", 777, LocalDate.parse("7777-06-05"), LocalTime.parse("05:11:11"), Instant.parse("7777-06-05T09:11:11.11Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .build(); System.out.println(expected); assertEquals(25, results.rowCount()); @@ -117,7 +128,6 @@ public void testBatchReader() throws Exception { //System.out.println(results.batchSchema()); System.out.println(results); - new RowSetComparison(expected).verifyAndClearAll(results); System.out.println("Test complete."); client.close(); diff --git a/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf b/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf index 9eba0f61944..98cea6d8607 100644 --- a/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf +++ b/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf @@ -1,4 +1,4 @@ -1234 test 567 02-10-2021 10:30:27 02-10-2021T10:30:27.00Z +1T34 test 567 02-10-2021 10:30:27 02-10-2021T10:30:27.00Z 5678 TEST 890 07-27-2021 12:40:15 07-27-2021T12:40:15.00Z 1111 abcd 111 11-11-1111 11:11:11 11-11-1111T11:11:11.11Z 2222 efgh 222 01-22-2222 22:22:22 01-22-2222T22:22:22.22Z diff --git a/distribution/pom.xml b/distribution/pom.xml index 2e933179616..40290716d94 100644 --- a/distribution/pom.xml +++ b/distribution/pom.xml @@ -460,6 +460,11 @@ drill-format-excel ${project.version} + + org.apache.drill.contrib + drill-format-fixedwidth + ${project.version} + org.apache.drill.contrib drill-druid-storage diff --git a/distribution/src/assemble/component.xml b/distribution/src/assemble/component.xml index 853793d4d51..4752b1a4e44 100644 --- a/distribution/src/assemble/component.xml +++ b/distribution/src/assemble/component.xml @@ -55,6 +55,7 @@ org.apache.drill.contrib:drill-format-excel:jar org.apache.drill.contrib:drill-format-spss:jar org.apache.drill.contrib:drill-format-sas:jar + org.apache.drill.contrib:drill-format-fixedwidth:jar org.apache.drill.contrib:drill-jdbc-storage:jar org.apache.drill.contrib:drill-kudu-storage:jar org.apache.drill.contrib:drill-storage-phoenix:jar From 7c3b5a23e8297d45f883461f045aba64a1dd051a Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Mon, 13 Sep 2021 15:32:54 -0400 Subject: [PATCH 08/41] Addressing Review Comments. - Simplified FieldConfig variables - Added compressed file test - Added unit test for explicit column references - Modified close() to include AutoCloseables - Added Long data type - Added Decimal data type - not fully implemented --- .../fixedwidth/FixedwidthBatchReader.java | 34 +++-- .../fixedwidth/FixedwidthFieldConfig.java | 96 +++++++++++--- .../fixedwidth/FixedwidthFormatPlugin.java | 2 +- .../TestFixedwidthRecordReader.java | 124 +++++++++++------- .../src/test/resources/fwf/test.fwf | 50 +++---- 5 files changed, 198 insertions(+), 108 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index 2be542d8d39..d389211b930 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -18,6 +18,7 @@ package org.apache.drill.exec.store.fixedwidth; +import org.apache.drill.common.AutoCloseables; import org.apache.drill.common.exceptions.CustomErrorContext; import org.apache.drill.common.exceptions.UserException; import org.apache.drill.common.types.TypeProtos; @@ -36,6 +37,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.math.BigDecimal; import java.time.Instant; import java.time.LocalDate; import java.time.LocalDateTime; @@ -63,6 +65,7 @@ public FixedwidthBatchReader(FixedwidthFormatConfig config, int maxRecords) { this.maxRecords = maxRecords; } + @Override public boolean open(FileSchemaNegotiator negotiator) { split = negotiator.split(); @@ -112,23 +115,21 @@ public boolean next() { // Use loader to read data from file to turn into Drill @Override public void close() { - try { - fsStream.close(); - loader.close(); - } catch (Exception e) { - throw UserException - .dataReadError(e) - .message("Failed to close input file: {}", split.getPath().toString()) - .addContext(errorContext) - .addContext(e.getMessage()) - .build(logger); + if (fsStream != null){ + AutoCloseables.closeSilently(fsStream); + fsStream = null; } } private TupleMetadata buildSchema() { SchemaBuilder builder = new SchemaBuilder(); for (FixedwidthFieldConfig field : config.getFields()) { - builder.addNullable(field.getFieldName(), field.getDataType()); + if (field.getType() == TypeProtos.MinorType.VARDECIMAL){ + builder.addNullable(field.getName(), TypeProtos.MinorType.VARDECIMAL,38,4); + //revisit this + } else { + builder.addNullable(field.getName(), field.getType()); + } } return builder.buildSchema(); } @@ -140,8 +141,8 @@ private boolean parseLine(String line, RowSetLoader writer) throws IOException { String dateTimeFormat; String value; for (FixedwidthFieldConfig field : config.getFields()) { - value = line.substring(field.getStartIndex() - 1, field.getStartIndex() + field.getFieldWidth() - 1); - dataType = field.getDataType(); + value = line.substring(field.getIndex() - 1, field.getIndex() + field.getWidth() - 1); + dataType = field.getType(); dateTimeFormat = field.getDateTimeFormat(); DateTimeFormatter formatter = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); try { @@ -173,6 +174,13 @@ private boolean parseLine(String line, RowSetLoader writer) throws IOException { case FLOAT8: writer.scalar(i).setDouble(Double.parseDouble(value)); break; + case BIGINT: + writer.scalar(i).setLong(Long.parseLong(value)); + break; + case VARDECIMAL: + BigDecimal bigDecimal = new BigDecimal(value); + writer.scalar(i).setDecimal(bigDecimal); + break; default: throw new RuntimeException("Unknown data type specified in fixed width. Found data type " + dataType); } diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java index 9a9c1260434..69ad9b55b6d 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java @@ -21,37 +21,97 @@ import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonTypeName; +import org.apache.drill.common.PlanStringBuilder; import org.apache.drill.common.types.TypeProtos; +import java.util.Objects; + + @JsonTypeName("fixedwidthReaderFieldDescription") @JsonInclude(JsonInclude.Include.NON_DEFAULT) public class FixedwidthFieldConfig { - private final TypeProtos.MinorType dataType; - private final String fieldName; + private final String name; + private final int index; + private final int width; + private final TypeProtos.MinorType type; private final String dateTimeFormat; - private final int startIndex; - private final int fieldWidth; - - public FixedwidthFieldConfig(@JsonProperty("dataType") TypeProtos.MinorType dataType, - @JsonProperty("fieldName") String fieldName, - @JsonProperty("dateTimeFormat") String dateTimeFormat, - @JsonProperty("startIndex") int startIndex, - @JsonProperty("fieldWidth") int fieldWidth) { - this.dataType = dataType; - this.fieldName = fieldName; + + public FixedwidthFieldConfig(@JsonProperty("name") String name, + @JsonProperty("index") int index, + @JsonProperty("width") int width, + @JsonProperty("type") TypeProtos.MinorType type, + @JsonProperty("dateTimeFormat") String dateTimeFormat) { + + this.name = name; + this.index = index; + this.width = width; + this.type = type; this.dateTimeFormat = dateTimeFormat; - this.startIndex = startIndex; - this.fieldWidth = fieldWidth; + + // Need to verify names are different - where can we access all the names of other columns +// if(name != null){ +// this.name = name; +// } else{ +// throw new IllegalArgumentException("Invalid name"); //Is this the right way to throw an exception if blank? What about if not valid SQL? +// } +// +// if (index >= 0){ +// this.index = index; +// } else { +// throw new IllegalArgumentException("Index must be 0 or greater"); +// } +// +// //Can modify this to be optional and be calculated based on start index of this field and next +// this.width = width; +// +// if (type == null){ +// this.type = TypeProtos.MinorType.VARCHAR; +// } else { +// this.type = type; +// } +// this.dateTimeFormat = dateTimeFormat; // No default required, null is allowed } - public TypeProtos.MinorType getDataType() {return dataType;} + public String getName() {return name;} + + public int getIndex() {return index;} - public String getFieldName() {return fieldName;} + public int getWidth() {return width;} + + public TypeProtos.MinorType getType() {return type;} public String getDateTimeFormat() {return dateTimeFormat;} - public int getStartIndex() {return startIndex;} + @Override + public int hashCode() { + return Objects.hash(name, index, width, type, dateTimeFormat); + } - public int getFieldWidth() {return fieldWidth;} + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + FixedwidthFieldConfig other = (FixedwidthFieldConfig) obj; + return Objects.equals(name, other.name) + && Objects.equals(index, other.index) + && Objects.equals(width, other.width) + && Objects.equals(type, other.type) + && Objects.equals(dateTimeFormat, other.dateTimeFormat); + } + + @Override + public String toString() { + return new PlanStringBuilder(this) + .field("name", name) + .field("index", index) + .field("width", width) + .field("type", type) + .field("dateTimeFormat", dateTimeFormat) + .toString(); + } } diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java index a10aad9ea11..f96e4a81f77 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java @@ -64,7 +64,7 @@ private static EasyFormatConfig easyConfig(Configuration fsConf, FixedwidthForma return EasyFormatConfig.builder() .readable(true) .writable(false) - .blockSplittable(false) + .blockSplittable(false) // Change to true .compressible(true) .supportsProjectPushdown(true) .extensions(pluginConfig.getExtensions()) diff --git a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java index b8ce99f09c6..60faf73c723 100644 --- a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java @@ -38,7 +38,7 @@ import java.time.LocalDate; import java.time.LocalTime; -import static org.junit.Assert.assertEquals; +import static org.apache.drill.test.QueryTestUtil.generateCompressedFile; @Category(RowSetTests.class) public class TestFixedwidthRecordReader extends ClusterTest { @@ -46,91 +46,113 @@ public class TestFixedwidthRecordReader extends ClusterTest { @BeforeClass public static void setup() throws Exception { ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); + FixedwidthFormatConfig formatConfig = new FixedwidthFormatConfig(Lists.newArrayList("fwf"), Lists.newArrayList( - new FixedwidthFieldConfig(TypeProtos.MinorType.INT, "Number", "", 1, 4), - new FixedwidthFieldConfig(TypeProtos.MinorType.VARCHAR, "Letter", "", 6, 4), - new FixedwidthFieldConfig(TypeProtos.MinorType.INT, "Address", "", 11, 3), - new FixedwidthFieldConfig(TypeProtos.MinorType.DATE, "Date", "MM-dd-yyyy", 15, 10), - new FixedwidthFieldConfig(TypeProtos.MinorType.TIME, "Time", "HH:mm:ss", 26, 8), - new FixedwidthFieldConfig(TypeProtos.MinorType.TIMESTAMP, "DateTime", "MM-dd-yyyy'T'HH:mm:ss.SSX", 35, 23) + new FixedwidthFieldConfig("Number", 1, 5, TypeProtos.MinorType.VARDECIMAL, ""), + new FixedwidthFieldConfig("Letter", 7,4, TypeProtos.MinorType.VARCHAR, ""), + new FixedwidthFieldConfig("Address",12, 3,TypeProtos.MinorType.INT, ""), + new FixedwidthFieldConfig("Date",16, 10,TypeProtos.MinorType.DATE, "MM-dd-yyyy"), + new FixedwidthFieldConfig( "Time", 27, 8,TypeProtos.MinorType.TIME,"HH:mm:ss" ), + new FixedwidthFieldConfig("DateTime", 36, 23,TypeProtos.MinorType.TIMESTAMP, "MM-dd-yyyy'T'HH:mm:ss.SSX" ) )); + cluster.defineFormat("dfs", "fwf", formatConfig); cluster.defineFormat("cp", "fwf", formatConfig); // Needed for compressed file unit test dirTestWatcher.copyResourceToRoot(Paths.get("fwf/")); } + @Test + public void testStarQuery() throws Exception { + String sql = "SELECT * FROM cp.`fwf/test.fwf`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + RowSet expected = setupTestData(); + new RowSetComparison(expected).verifyAndClearAll(results); + } + + @Test + public void testExplicitAllQuery() throws Exception { + String sql = "SELECT Number, Letter, Address, `Date`, `Time`, DateTime FROM cp.`fwf/test.fwf`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + RowSet expected = setupTestData(); + new RowSetComparison(expected).verifyAndClearAll(results); + } + @Test public void testExplicitQuery() throws Exception { - String sql = "SELECT ID, Urban, Urban_value FROM dfs.`spss/testdata.sav` WHERE d16=4"; + String sql = "SELECT Number, Letter, Address FROM cp.`fwf/test.fwf` WHERE Letter='yzzz'"; QueryBuilder q = client.queryBuilder().sql(sql); RowSet results = q.rowSet(); TupleMetadata expectedSchema = new SchemaBuilder() - .addNullable("ID", TypeProtos.MinorType.FLOAT8) - .addNullable("Urban", TypeProtos.MinorType.FLOAT8) - .addNullable("Urban_value", TypeProtos.MinorType.VARCHAR) + .addNullable("Number", TypeProtos.MinorType.VARDECIMAL,38,4) + .addNullable("Letter", TypeProtos.MinorType.VARCHAR) + .addNullable("Address", TypeProtos.MinorType.INT) .buildSchema(); RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema) - .addRow(47.0, 1.0, "Urban").addRow(53.0, 1.0, "Urban") - .addRow(66.0, 1.0, "Urban") + .addRow(77.77, "yzzz", 777) .build(); - assertEquals(3, results.rowCount()); new RowSetComparison(expected).verifyAndClearAll(results); } + //Test Serialization/Deserialization + + //Test Compressed File @Test - public void testBatchReader() throws Exception { - String sql = "SELECT * FROM cp.`fwf/test.fwf` LIMIT 30"; + public void testStarQueryWithCompressedFile() throws Exception { + generateCompressedFile("fwf/test.fwf", "zip", "fwf/test.fwf.zip" ); + + String sql = "SELECT * FROM dfs.`fwf/test.fwf.zip`"; + System.out.println("Compressed file generated"); RowSet results = client.queryBuilder().sql(sql).rowSet(); + RowSet expected = setupTestData(); + new RowSetComparison(expected).verifyAndClearAll(results); + } + + // Test Entering invalid schemata - incorrect limits + private RowSet setupTestData(){ TupleMetadata expectedSchema = new SchemaBuilder() - .addNullable("Number", TypeProtos.MinorType.INT) + .addNullable("Number", TypeProtos.MinorType.VARDECIMAL,38,4) .addNullable("Letter", TypeProtos.MinorType.VARCHAR) .addNullable("Address", TypeProtos.MinorType.INT) .addNullable("Date", TypeProtos.MinorType.DATE) .addNullable("Time", TypeProtos.MinorType.TIME) .addNullable("DateTime", TypeProtos.MinorType.TIMESTAMP) .buildSchema(); + RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema) - .addRow(1234, "test", 567, LocalDate.parse("2021-02-10"), LocalTime.parse("10:30:27"), Instant.parse("2021-02-10T15:30:27.00Z")) - .addRow(5678, "TEST", 890, LocalDate.parse("2021-07-27"), LocalTime.parse("12:40:15"), Instant.parse("2021-07-27T16:40:15.00Z")) - .addRow(1111, "abcd", 111, LocalDate.parse("1111-11-11"), LocalTime.parse("11:11:11"), Instant.parse("1111-11-11T16:28:43.11Z")) - .addRow(2222, "efgh", 222, LocalDate.parse("2222-01-22"), LocalTime.parse("22:22:22"), Instant.parse("2222-01-23T03:22:22.22Z")) - .addRow(3333, "ijkl", 333, LocalDate.parse("3333-02-01"), LocalTime.parse("01:33:33"), Instant.parse("3333-02-01T06:33:33.33Z")) - .addRow(4444, "mnop", 444, LocalDate.parse("4444-03-02"), LocalTime.parse("02:44:44"), Instant.parse("4444-03-02T07:44:44.44Z")) - .addRow(5555, "qrst", 555, LocalDate.parse("5555-04-03"), LocalTime.parse("03:55:55"), Instant.parse("5555-04-03T07:55:55.55Z")) - .addRow(6666, "uvwx", 666, LocalDate.parse("6666-05-04"), LocalTime.parse("04:01:01"), Instant.parse("6666-05-04T08:01:01.01Z")) - .addRow(7777, "yzzz", 777, LocalDate.parse("7777-06-05"), LocalTime.parse("05:11:11"), Instant.parse("7777-06-05T09:11:11.11Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(12.34, "test", 567, LocalDate.parse("2021-02-10"), LocalTime.parse("10:30:27"), Instant.parse("2021-02-10T15:30:27.00Z")) + .addRow(56.78, "TEST", 890, LocalDate.parse("2021-07-27"), LocalTime.parse("12:40:15"), Instant.parse("2021-07-27T16:40:15.00Z")) + .addRow(11.11, "abcd", 111, LocalDate.parse("1111-11-11"), LocalTime.parse("11:11:11"), Instant.parse("1111-11-11T16:28:43.11Z")) + .addRow(22.22, "efgh", 222, LocalDate.parse("2222-01-22"), LocalTime.parse("22:22:22"), Instant.parse("2222-01-23T03:22:22.22Z")) + .addRow(33.33, "ijkl", 333, LocalDate.parse("3333-02-01"), LocalTime.parse("01:33:33"), Instant.parse("3333-02-01T06:33:33.33Z")) + .addRow(44.44, "mnop", 444, LocalDate.parse("4444-03-02"), LocalTime.parse("02:44:44"), Instant.parse("4444-03-02T07:44:44.44Z")) + .addRow(55.55, "qrst", 555, LocalDate.parse("5555-04-03"), LocalTime.parse("03:55:55"), Instant.parse("5555-04-03T07:55:55.55Z")) + .addRow(66.66, "uvwx", 666, LocalDate.parse("6666-05-04"), LocalTime.parse("04:01:01"), Instant.parse("6666-05-04T08:01:01.01Z")) + .addRow(77.77, "yzzz", 777, LocalDate.parse("7777-06-05"), LocalTime.parse("05:11:11"), Instant.parse("7777-06-05T09:11:11.11Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) .build(); - System.out.println(expected); - assertEquals(25, results.rowCount()); - - //System.out.println(results.batchSchema()); - System.out.println(results); - - new RowSetComparison(expected).verifyAndClearAll(results); - System.out.println("Test complete."); - client.close(); + return expected; } } + diff --git a/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf b/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf index 98cea6d8607..71be3669fec 100644 --- a/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf +++ b/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf @@ -1,25 +1,25 @@ -1T34 test 567 02-10-2021 10:30:27 02-10-2021T10:30:27.00Z -5678 TEST 890 07-27-2021 12:40:15 07-27-2021T12:40:15.00Z -1111 abcd 111 11-11-1111 11:11:11 11-11-1111T11:11:11.11Z -2222 efgh 222 01-22-2222 22:22:22 01-22-2222T22:22:22.22Z -3333 ijkl 333 02-01-3333 01:33:33 02-01-3333T01:33:33.33Z -4444 mnop 444 03-02-4444 02:44:44 03-02-4444T02:44:44.44Z -5555 qrst 555 04-03-5555 03:55:55 04-03-5555T03:55:55.55Z -6666 uvwx 666 05-04-6666 04:01:01 05-04-6666T04:01:01.01Z -7777 yzzz 777 06-05-7777 05:11:11 06-05-7777T05:11:11.11Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z \ No newline at end of file +12.34 test 567 02-10-2021 10:30:27 02-10-2021T10:30:27.00Z +56.78 TEST 890 07-27-2021 12:40:15 07-27-2021T12:40:15.00Z +11.11 abcd 111 11-11-1111 11:11:11 11-11-1111T11:11:11.11Z +22.22 efgh 222 01-22-2222 22:22:22 01-22-2222T22:22:22.22Z +33.33 ijkl 333 02-01-3333 01:33:33 02-01-3333T01:33:33.33Z +44.44 mnop 444 03-02-4444 02:44:44 03-02-4444T02:44:44.44Z +55.55 qrst 555 04-03-5555 03:55:55 04-03-5555T03:55:55.55Z +66.66 uvwx 666 05-04-6666 04:01:01 05-04-6666T04:01:01.01Z +77.77 yzzz 777 06-05-7777 05:11:11 06-05-7777T05:11:11.11Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z From 57d49db031965298191e5a9bf709e0f4328399b3 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Fri, 15 Oct 2021 10:44:29 -0400 Subject: [PATCH 09/41] Added Serialization/Deserialization test, added blank row test file, cleaned up compressed file test --- .../TestFixedwidthRecordReader.java | 67 ++++++++++++++++++- .../src/test/resources/fwf/test_blankrow.fwf | 26 +++++++ 2 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 contrib/format-fixedwidth/src/test/resources/fwf/test_blankrow.fwf diff --git a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java index 60faf73c723..a24d3498297 100644 --- a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java @@ -39,6 +39,7 @@ import java.time.LocalTime; import static org.apache.drill.test.QueryTestUtil.generateCompressedFile; +import static org.junit.Assert.assertEquals; @Category(RowSetTests.class) public class TestFixedwidthRecordReader extends ClusterTest { @@ -98,20 +99,82 @@ public void testExplicitQuery() throws Exception { } //Test Serialization/Deserialization + @Test + public void testSerDe() throws Exception { + String sql = "SELECT COUNT(*) FROM dfs.`fwf/test.fwf`"; + String plan = queryBuilder().sql(sql).explainJson(); + long cnt = queryBuilder().physical(plan).singletonLong(); + assertEquals(5L, cnt); + } - //Test Compressed File @Test public void testStarQueryWithCompressedFile() throws Exception { generateCompressedFile("fwf/test.fwf", "zip", "fwf/test.fwf.zip" ); String sql = "SELECT * FROM dfs.`fwf/test.fwf.zip`"; - System.out.println("Compressed file generated"); RowSet results = client.queryBuilder().sql(sql).rowSet(); RowSet expected = setupTestData(); new RowSetComparison(expected).verifyAndClearAll(results); } // Test Entering invalid schemata - incorrect limits + // Undefined field, what happens + // Parse invalid file, make sure correct error + + + @Test + public void testOutOfOrder() throws Exception{ + String sql = "SELECT Address, DateTime, `Date`, Letter FROM cp.`fwf/test.fwf`"; + QueryBuilder q = client.queryBuilder().sql(sql); + RowSet results = q.rowSet(); + + TupleMetadata expectedSchema = new SchemaBuilder() + .addNullable("Address", TypeProtos.MinorType.INT) + .addNullable("DateTime", TypeProtos.MinorType.TIMESTAMP) + .addNullable("Date", TypeProtos.MinorType.DATE) + .addNullable("Letter", TypeProtos.MinorType.VARCHAR) + .buildSchema(); + RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema) + .addRow(567, Instant.parse("2021-02-10T15:30:27.00Z"), LocalDate.parse("2021-02-10"), "test") + .addRow(890, Instant.parse("2021-07-27T16:40:15.00Z"), LocalDate.parse("2021-07-27"), "TEST") + .addRow(111, Instant.parse("1111-11-11T16:28:43.11Z"), LocalDate.parse("1111-11-11"), "abcd") + .addRow(222, Instant.parse("2222-01-23T03:22:22.22Z"), LocalDate.parse("2222-01-22"), "efgh") + .addRow(333, Instant.parse("3333-02-01T06:33:33.33Z"), LocalDate.parse("3333-02-01"), "ijkl") + .addRow(444, Instant.parse("4444-03-02T07:44:44.44Z"), LocalDate.parse("4444-03-02"), "mnop") + .addRow(555, Instant.parse("5555-04-03T07:55:55.55Z"), LocalDate.parse("5555-04-03"), "qrst") + .addRow(666, Instant.parse("6666-05-04T08:01:01.01Z"), LocalDate.parse("6666-05-04"), "uvwx") + .addRow(777, Instant.parse("7777-06-05T09:11:11.11Z"), LocalDate.parse("7777-06-05"), "yzzz") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .build(); + + new RowSetComparison(expected).verifyAndClearAll(results); + } + + // How should we be handling an empty/blank row? + @Test + public void testEmptyRow() throws Exception { + String sql = "SELECT * FROM cp.`fwf/test_blankrow.fwf`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + RowSet expected = setupTestData(); + new RowSetComparison(expected).verifyAndClearAll(results); + } + + // private RowSet setupTestData(){ TupleMetadata expectedSchema = new SchemaBuilder() diff --git a/contrib/format-fixedwidth/src/test/resources/fwf/test_blankrow.fwf b/contrib/format-fixedwidth/src/test/resources/fwf/test_blankrow.fwf new file mode 100644 index 00000000000..6c582f8d615 --- /dev/null +++ b/contrib/format-fixedwidth/src/test/resources/fwf/test_blankrow.fwf @@ -0,0 +1,26 @@ +12.34 test 567 02-10-2021 10:30:27 02-10-2021T10:30:27.00Z +56.78 TEST 890 07-27-2021 12:40:15 07-27-2021T12:40:15.00Z +11.11 abcd 111 11-11-1111 11:11:11 11-11-1111T11:11:11.11Z +22.22 efgh 222 01-22-2222 22:22:22 01-22-2222T22:22:22.22Z +33.33 ijkl 333 02-01-3333 01:33:33 02-01-3333T01:33:33.33Z +44.44 mnop 444 03-02-4444 02:44:44 03-02-4444T02:44:44.44Z +55.55 qrst 555 04-03-5555 03:55:55 04-03-5555T03:55:55.55Z +66.66 uvwx 666 05-04-6666 04:01:01 05-04-6666T04:01:01.01Z +77.77 yzzz 777 06-05-7777 05:11:11 06-05-7777T05:11:11.11Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z + +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z From 1a4818ee795dbdb6750ebfc37cc03f23a1ac7fa8 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Wed, 20 Oct 2021 15:44:09 -0400 Subject: [PATCH 10/41] Fixed Serialization/Deserialization test --- .../exec/store/fixedwidth/FixedwidthBatchReader.java | 12 ++++++++---- .../store/fixedwidth/TestFixedwidthRecordReader.java | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index d389211b930..f63eac3b528 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -143,8 +143,6 @@ private boolean parseLine(String line, RowSetLoader writer) throws IOException { for (FixedwidthFieldConfig field : config.getFields()) { value = line.substring(field.getIndex() - 1, field.getIndex() + field.getWidth() - 1); dataType = field.getType(); - dateTimeFormat = field.getDateTimeFormat(); - DateTimeFormatter formatter = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); try { switch (dataType) { case INT: @@ -154,15 +152,21 @@ private boolean parseLine(String line, RowSetLoader writer) throws IOException { writer.scalar(i).setString(value); break; case DATE: + dateTimeFormat = field.getDateTimeFormat(); + DateTimeFormatter formatter = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); LocalDate date = LocalDate.parse(value, formatter); writer.scalar(i).setDate(date); break; case TIME: - LocalTime time = LocalTime.parse(value, formatter); + dateTimeFormat = field.getDateTimeFormat(); + DateTimeFormatter formatter2 = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); + LocalTime time = LocalTime.parse(value, formatter2); writer.scalar(i).setTime(time); break; case TIMESTAMP: - LocalDateTime ldt = LocalDateTime.parse(value, formatter); + dateTimeFormat = field.getDateTimeFormat(); + DateTimeFormatter formatter3 = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); + LocalDateTime ldt = LocalDateTime.parse(value, formatter3); ZoneId z = ZoneId.of("America/Toronto"); ZonedDateTime zdt = ldt.atZone(z); Instant timeStamp = zdt.toInstant(); diff --git a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java index a24d3498297..f29219e6bd7 100644 --- a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java @@ -101,10 +101,10 @@ public void testExplicitQuery() throws Exception { //Test Serialization/Deserialization @Test public void testSerDe() throws Exception { - String sql = "SELECT COUNT(*) FROM dfs.`fwf/test.fwf`"; + String sql = "SELECT COUNT(*) FROM cp.`fwf/test.fwf`"; String plan = queryBuilder().sql(sql).explainJson(); long cnt = queryBuilder().physical(plan).singletonLong(); - assertEquals(5L, cnt); + assertEquals(25L, cnt); } @Test From 1f1051eabd8bc6afa1904b22db6c013754754bb5 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Thu, 4 Nov 2021 14:30:23 -0400 Subject: [PATCH 11/41] Added another constructor to enable user to not have to enter dateTimeFormat when not appropriate, started adding methods to perform field name verification (not complete). --- .../fixedwidth/FixedwidthFieldConfig.java | 9 +++++ .../fixedwidth/FixedwidthFormatConfig.java | 36 ++++++++++++++++++- .../TestFixedwidthRecordReader.java | 2 +- 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java index 69ad9b55b6d..ae2c7c0f095 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java @@ -37,6 +37,13 @@ public class FixedwidthFieldConfig { private final TypeProtos.MinorType type; private final String dateTimeFormat; + public FixedwidthFieldConfig(@JsonProperty("name") String name, + @JsonProperty("index") int index, + @JsonProperty("width") int width, + @JsonProperty("type") TypeProtos.MinorType type) { + this(name, index, width, type, null); + } + public FixedwidthFieldConfig(@JsonProperty("name") String name, @JsonProperty("index") int index, @JsonProperty("width") int width, @@ -49,6 +56,8 @@ public FixedwidthFieldConfig(@JsonProperty("name") String name, this.type = type; this.dateTimeFormat = dateTimeFormat; + + // Need to verify names are different - where can we access all the names of other columns // if(name != null){ // this.name = name; diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index 06f867a2d37..053d83676e3 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -19,13 +19,16 @@ package org.apache.drill.exec.store.fixedwidth; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonTypeName; import org.apache.drill.common.PlanStringBuilder; import org.apache.drill.common.logical.FormatPluginConfig; +import org.apache.drill.exec.store.log.LogFormatField; import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList; +import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Objects; @@ -77,4 +80,35 @@ public String toString() { .field("fields", fields) .toString(); } -} \ No newline at end of file + + + @JsonIgnore + public boolean hasFields() { + return fields != null && ! fields.isEmpty(); + } + + @JsonIgnore + public List getFieldNames() { + List result = new ArrayList<>(); + if (! hasFields()) { + return result; + } + + for (FixedwidthFieldConfig field : fields) { + result.add(field.getName()); + } + return result; + } + + @JsonIgnore + public boolean validateFieldNames(String fieldName){ + boolean result = false; + List names = this.getFieldNames(); + if (names.contains(fieldName)){ + result = true; + } + return result; + } + + +} diff --git a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java index f29219e6bd7..9aa471488f9 100644 --- a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java @@ -174,7 +174,7 @@ public void testEmptyRow() throws Exception { new RowSetComparison(expected).verifyAndClearAll(results); } - // + // Create unit test for overloaded constructor private RowSet setupTestData(){ TupleMetadata expectedSchema = new SchemaBuilder() From cb1b93270bd4c750092637ea49148a7d3d2c15bb Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Fri, 5 Nov 2021 11:13:58 -0400 Subject: [PATCH 12/41] Added method to validate field name input and verify there are no duplicates. Modified tests to enable testing of new constructor. --- .../fixedwidth/FixedwidthFieldConfig.java | 1 - .../fixedwidth/FixedwidthFormatConfig.java | 29 +++++++++++++++---- .../TestFixedwidthRecordReader.java | 2 +- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java index ae2c7c0f095..d0ab5bcda84 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java @@ -57,7 +57,6 @@ public FixedwidthFieldConfig(@JsonProperty("name") String name, this.dateTimeFormat = dateTimeFormat; - // Need to verify names are different - where can we access all the names of other columns // if(name != null){ // this.name = name; diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index 053d83676e3..00abe0a834e 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -24,18 +24,25 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonTypeName; import org.apache.drill.common.PlanStringBuilder; +import org.apache.drill.common.exceptions.UserException; import org.apache.drill.common.logical.FormatPluginConfig; import org.apache.drill.exec.store.log.LogFormatField; +import org.apache.drill.exec.store.log.LogFormatPlugin; import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Objects; +import java.util.Set; @JsonTypeName(FixedwidthFormatPlugin.DEFAULT_NAME) @JsonInclude(JsonInclude.Include.NON_DEFAULT) public class FixedwidthFormatConfig implements FormatPluginConfig { + private static final Logger logger = LoggerFactory.getLogger(FixedwidthFormatConfig.class); private final List extensions; private final List fields; @@ -44,6 +51,8 @@ public FixedwidthFormatConfig(@JsonProperty("extensions") List extension @JsonProperty("fields") List fields) { this.extensions = extensions == null ? Collections.singletonList("fwf") : ImmutableList.copyOf(extensions); this.fields = fields; + + validateFieldInput(); } @JsonInclude(JsonInclude.Include.NON_DEFAULT) @@ -101,13 +110,21 @@ public List getFieldNames() { } @JsonIgnore - public boolean validateFieldNames(String fieldName){ - boolean result = false; - List names = this.getFieldNames(); - if (names.contains(fieldName)){ - result = true; + public void validateFieldInput(){ + Set uniqueNames = new HashSet<>(); + for (String name : this.getFieldNames()){ + if (name.length() == 0){ + + } + if (uniqueNames.contains(name)){ + throw UserException + .validationError() + .message("Duplicate column name: " + name) + .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .build(logger); + } + uniqueNames.add(name); } - return result; } diff --git a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java index 9aa471488f9..a6f14e7aaf9 100644 --- a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java @@ -50,7 +50,7 @@ public static void setup() throws Exception { FixedwidthFormatConfig formatConfig = new FixedwidthFormatConfig(Lists.newArrayList("fwf"), Lists.newArrayList( - new FixedwidthFieldConfig("Number", 1, 5, TypeProtos.MinorType.VARDECIMAL, ""), + new FixedwidthFieldConfig("Number", 1, 5, TypeProtos.MinorType.VARDECIMAL), new FixedwidthFieldConfig("Letter", 7,4, TypeProtos.MinorType.VARCHAR, ""), new FixedwidthFieldConfig("Address",12, 3,TypeProtos.MinorType.INT, ""), new FixedwidthFieldConfig("Date",16, 10,TypeProtos.MinorType.DATE, "MM-dd-yyyy"), From 84198624b52a8b53cdc152edf52f3006b1f46417 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Tue, 16 Nov 2021 15:35:15 -0500 Subject: [PATCH 13/41] Added two getters to FixedwidthFormatConfig to prep for offset verification --- .../fixedwidth/FixedwidthFormatConfig.java | 32 +++++++++++++++++-- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index 00abe0a834e..10a7494bd8b 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -109,13 +109,39 @@ public List getFieldNames() { return result; } + @JsonIgnore + public List getFieldIndices() { + List result = new ArrayList<>(); + if (! hasFields()) { + return result; + } + + for (FixedwidthFieldConfig field : fields) { + result.add(field.getIndex()); + } + return result; + } + + @JsonIgnore + public List getFieldWidths() { + List result = new ArrayList<>(); + if (! hasFields()) { + return result; + } + + for (FixedwidthFieldConfig field : fields) { + result.add(field.getWidth()); + } + return result; + } + @JsonIgnore public void validateFieldInput(){ Set uniqueNames = new HashSet<>(); for (String name : this.getFieldNames()){ - if (name.length() == 0){ - - } + /*if (name.length() == 0){ + + }*/ if (uniqueNames.contains(name)){ throw UserException .validationError() From 31e1549bb56ed795228cf8a21efaac8f2065ca52 Mon Sep 17 00:00:00 2001 From: Esther Buchwalter Date: Wed, 17 Nov 2021 15:57:15 -0500 Subject: [PATCH 14/41] Added a check for overlapping fields --- .../fixedwidth/FixedwidthFormatConfig.java | 33 +++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index 10a7494bd8b..78a718f972d 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -26,8 +26,6 @@ import org.apache.drill.common.PlanStringBuilder; import org.apache.drill.common.exceptions.UserException; import org.apache.drill.common.logical.FormatPluginConfig; -import org.apache.drill.exec.store.log.LogFormatField; -import org.apache.drill.exec.store.log.LogFormatPlugin; import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -151,6 +149,37 @@ public void validateFieldInput(){ } uniqueNames.add(name); } + List fieldIndices = this.getFieldIndices(); + List fieldWidths = this.getFieldWidths(); + int prevIndexAndWidth = -1; + + //assuming that fieldIndices is the same size as fieldWidths, width is required + for (int i = 0; i= 0.") + .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .build(logger); + } + /* + else if (fieldWidths.get(i) == null || fieldWidths.get(i) < 1) { + if (i == fieldIndices.size()-1) { + Integer width = + } + Integer width = fieldIndices.get(i+1) - fieldIndices.get(i); + fieldWidths.set(i, width); + } + */ + else if (fieldIndices.get(i) <= prevIndexAndWidth) { + throw UserException + .validationError() + .message("Overlapping fields at indices " + fieldIndices.get(i-1) + "and" + fieldIndices.get(i) + ".") + .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .build(logger); + } + prevIndexAndWidth = fieldIndices.get(i) + fieldWidths.get(i); + } } From 07edbded4b2c90a3cf41c0ace8768abd57a851c4 Mon Sep 17 00:00:00 2001 From: Esther Buchwalter Date: Thu, 18 Nov 2021 14:21:43 -0500 Subject: [PATCH 15/41] Updated check for overlapping fields --- .../exec/store/fixedwidth/FixedwidthFormatConfig.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index 78a718f972d..f6ca6c7a018 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -151,14 +151,15 @@ public void validateFieldInput(){ } List fieldIndices = this.getFieldIndices(); List fieldWidths = this.getFieldWidths(); + List fieldNames = this.getFieldNames(); int prevIndexAndWidth = -1; //assuming that fieldIndices is the same size as fieldWidths, width is required for (int i = 0; i= 0.") + .message("Invalid index for field '" + fieldNames.get(i) + "' at index: " + fieldIndices.get(i) + ". Index must be > 0.") .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) .build(logger); } @@ -174,7 +175,7 @@ else if (fieldWidths.get(i) == null || fieldWidths.get(i) < 1) { else if (fieldIndices.get(i) <= prevIndexAndWidth) { throw UserException .validationError() - .message("Overlapping fields at indices " + fieldIndices.get(i-1) + "and" + fieldIndices.get(i) + ".") + .message("Overlapping fields: " + fieldNames.get(i-1) + " and " + fieldNames.get(i)) .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) .build(logger); } @@ -182,5 +183,4 @@ else if (fieldIndices.get(i) <= prevIndexAndWidth) { } } - } From fa47a143c8ed3cd7685fc2b8ef8f14c96e0fe6f7 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Tue, 23 Nov 2021 16:14:24 -0500 Subject: [PATCH 16/41] Added field validation for data types, indices, width. Includes creating two setters in field config to set default value for data types and calculate/set width based on indices. --- .../fixedwidth/FixedwidthBatchReader.java | 3 +- .../fixedwidth/FixedwidthFieldConfig.java | 46 ++++---- .../fixedwidth/FixedwidthFormatConfig.java | 100 ++++++++++++++---- .../TestFixedwidthRecordReader.java | 2 +- 4 files changed, 98 insertions(+), 53 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index f63eac3b528..cd487a7e590 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -65,7 +65,6 @@ public FixedwidthBatchReader(FixedwidthFormatConfig config, int maxRecords) { this.maxRecords = maxRecords; } - @Override public boolean open(FileSchemaNegotiator negotiator) { split = negotiator.split(); @@ -197,4 +196,4 @@ private boolean parseLine(String line, RowSetLoader writer) throws IOException { return true; } -} \ No newline at end of file +} diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java index d0ab5bcda84..e214c239fa4 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java @@ -18,6 +18,7 @@ package org.apache.drill.exec.store.fixedwidth; +import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonTypeName; @@ -29,12 +30,12 @@ @JsonTypeName("fixedwidthReaderFieldDescription") @JsonInclude(JsonInclude.Include.NON_DEFAULT) -public class FixedwidthFieldConfig { +public class FixedwidthFieldConfig implements Comparable { private final String name; private final int index; - private final int width; - private final TypeProtos.MinorType type; + private int width; + private TypeProtos.MinorType type; private final String dateTimeFormat; public FixedwidthFieldConfig(@JsonProperty("name") String name, @@ -44,41 +45,17 @@ public FixedwidthFieldConfig(@JsonProperty("name") String name, this(name, index, width, type, null); } + @JsonCreator public FixedwidthFieldConfig(@JsonProperty("name") String name, @JsonProperty("index") int index, @JsonProperty("width") int width, @JsonProperty("type") TypeProtos.MinorType type, @JsonProperty("dateTimeFormat") String dateTimeFormat) { - this.name = name; this.index = index; this.width = width; this.type = type; this.dateTimeFormat = dateTimeFormat; - - - // Need to verify names are different - where can we access all the names of other columns -// if(name != null){ -// this.name = name; -// } else{ -// throw new IllegalArgumentException("Invalid name"); //Is this the right way to throw an exception if blank? What about if not valid SQL? -// } -// -// if (index >= 0){ -// this.index = index; -// } else { -// throw new IllegalArgumentException("Index must be 0 or greater"); -// } -// -// //Can modify this to be optional and be calculated based on start index of this field and next -// this.width = width; -// -// if (type == null){ -// this.type = TypeProtos.MinorType.VARCHAR; -// } else { -// this.type = type; -// } -// this.dateTimeFormat = dateTimeFormat; // No default required, null is allowed } public String getName() {return name;} @@ -87,8 +64,16 @@ public FixedwidthFieldConfig(@JsonProperty("name") String name, public int getWidth() {return width;} + public void setWidth(int value) { + this.width = value; + } + public TypeProtos.MinorType getType() {return type;} + public void setType() { + this.type = TypeProtos.MinorType.VARCHAR; + } + public String getDateTimeFormat() {return dateTimeFormat;} @Override @@ -122,4 +107,9 @@ public String toString() { .field("dateTimeFormat", dateTimeFormat) .toString(); } + + @Override + public int compareTo(FixedwidthFieldConfig o) { + return new Integer(this.getIndex()).compareTo(o.getIndex()); + } } diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index f6ca6c7a018..1af93ce8ed7 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -26,11 +26,13 @@ import org.apache.drill.common.PlanStringBuilder; import org.apache.drill.common.exceptions.UserException; import org.apache.drill.common.logical.FormatPluginConfig; +import org.apache.drill.common.types.TypeProtos; import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.List; @@ -43,11 +45,15 @@ public class FixedwidthFormatConfig implements FormatPluginConfig { private static final Logger logger = LoggerFactory.getLogger(FixedwidthFormatConfig.class); private final List extensions; private final List fields; + private final List validDataTypes = Arrays.asList(new TypeProtos.MinorType[]{TypeProtos.MinorType.INT, TypeProtos.MinorType.VARCHAR, + TypeProtos.MinorType.DATE, TypeProtos.MinorType.TIME, TypeProtos.MinorType.TIMESTAMP, TypeProtos.MinorType.FLOAT4, + TypeProtos.MinorType.FLOAT8, TypeProtos.MinorType.BIGINT, TypeProtos.MinorType.VARDECIMAL}); @JsonCreator public FixedwidthFormatConfig(@JsonProperty("extensions") List extensions, @JsonProperty("fields") List fields) { this.extensions = extensions == null ? Collections.singletonList("fwf") : ImmutableList.copyOf(extensions); + Collections.sort(fields); this.fields = fields; validateFieldInput(); @@ -133,13 +139,56 @@ public List getFieldWidths() { return result; } + @JsonIgnore + public void setFieldWidths(int i, int value) { + for (FixedwidthFieldConfig field : fields) { + if (field.getIndex() == i) { + field.setWidth(value); + } + } + } + + @JsonIgnore + public List getFieldTypes() { + List result = new ArrayList<>(); + if (! hasFields()) { + return result; + } + + for (FixedwidthFieldConfig field : fields) { + result.add(field.getType()); + } + return result; + } + + @JsonIgnore + public void setFieldTypes(int i) { + for (FixedwidthFieldConfig field : fields) { + if (field.getIndex() == i) { + field.setType(); + } + } + } + @JsonIgnore public void validateFieldInput(){ Set uniqueNames = new HashSet<>(); - for (String name : this.getFieldNames()){ - /*if (name.length() == 0){ + List fieldIndices = this.getFieldIndices(); + List fieldWidths = this.getFieldWidths(); + List fieldNames = this.getFieldNames(); + List fieldTypes = this.getFieldTypes(); + int width = 0; + int prevIndexAndWidth = -1; - }*/ + // Ensure no two fields have the same name + for (String name : this.getFieldNames()){ + if (name.length() == 0){ + throw UserException + .validationError() + .message("Blank field name detected.") + .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .build(logger); + } if (uniqueNames.contains(name)){ throw UserException .validationError() @@ -149,10 +198,6 @@ public void validateFieldInput(){ } uniqueNames.add(name); } - List fieldIndices = this.getFieldIndices(); - List fieldWidths = this.getFieldWidths(); - List fieldNames = this.getFieldNames(); - int prevIndexAndWidth = -1; //assuming that fieldIndices is the same size as fieldWidths, width is required for (int i = 0; i Date: Wed, 24 Nov 2021 11:04:57 -0500 Subject: [PATCH 17/41] Modified validation for field width and field index. Added comments to code. --- .../fixedwidth/FixedwidthFieldConfig.java | 4 --- .../fixedwidth/FixedwidthFormatConfig.java | 33 +++++++------------ 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java index e214c239fa4..615a04b775f 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java @@ -64,10 +64,6 @@ public FixedwidthFieldConfig(@JsonProperty("name") String name, public int getWidth() {return width;} - public void setWidth(int value) { - this.width = value; - } - public TypeProtos.MinorType getType() {return type;} public void setType() { diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index 1af93ce8ed7..b9386749a57 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -45,9 +45,9 @@ public class FixedwidthFormatConfig implements FormatPluginConfig { private static final Logger logger = LoggerFactory.getLogger(FixedwidthFormatConfig.class); private final List extensions; private final List fields; - private final List validDataTypes = Arrays.asList(new TypeProtos.MinorType[]{TypeProtos.MinorType.INT, TypeProtos.MinorType.VARCHAR, + private final List validDataTypes = Arrays.asList(TypeProtos.MinorType.INT, TypeProtos.MinorType.VARCHAR, TypeProtos.MinorType.DATE, TypeProtos.MinorType.TIME, TypeProtos.MinorType.TIMESTAMP, TypeProtos.MinorType.FLOAT4, - TypeProtos.MinorType.FLOAT8, TypeProtos.MinorType.BIGINT, TypeProtos.MinorType.VARDECIMAL}); + TypeProtos.MinorType.FLOAT8, TypeProtos.MinorType.BIGINT, TypeProtos.MinorType.VARDECIMAL); @JsonCreator public FixedwidthFormatConfig(@JsonProperty("extensions") List extensions, @@ -139,15 +139,6 @@ public List getFieldWidths() { return result; } - @JsonIgnore - public void setFieldWidths(int i, int value) { - for (FixedwidthFieldConfig field : fields) { - if (field.getIndex() == i) { - field.setWidth(value); - } - } - } - @JsonIgnore public List getFieldTypes() { List result = new ArrayList<>(); @@ -180,7 +171,7 @@ public void validateFieldInput(){ int width = 0; int prevIndexAndWidth = -1; - // Ensure no two fields have the same name + // Validate Field Name - Ensure field is not empty, no two fields have the same name, and field is valid SQL syntax for (String name : this.getFieldNames()){ if (name.length() == 0){ throw UserException @@ -199,7 +190,7 @@ public void validateFieldInput(){ uniqueNames.add(name); } - //assuming that fieldIndices is the same size as fieldWidths, width is required + // Validate Field Index - Must be greater than 0, and must not overlap with other fields for (int i = 0; i Date: Wed, 24 Nov 2021 17:35:37 -0500 Subject: [PATCH 18/41] Added to field validation for field names. Checks for valid length and valid SQL syntax. --- .../fixedwidth/FixedwidthFormatConfig.java | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index b9386749a57..2d0e5caa4f2 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -38,6 +38,7 @@ import java.util.List; import java.util.Objects; import java.util.Set; +import java.util.regex.Pattern; @JsonTypeName(FixedwidthFormatPlugin.DEFAULT_NAME) @JsonInclude(JsonInclude.Include.NON_DEFAULT) @@ -168,10 +169,11 @@ public void validateFieldInput(){ List fieldWidths = this.getFieldWidths(); List fieldNames = this.getFieldNames(); List fieldTypes = this.getFieldTypes(); - int width = 0; int prevIndexAndWidth = -1; - // Validate Field Name - Ensure field is not empty, no two fields have the same name, and field is valid SQL syntax + /* Validate Field Name - Ensure field is not empty, does not exceed maximum length, + is valid SQL syntax, and no two fields have the same name + */ for (String name : this.getFieldNames()){ if (name.length() == 0){ throw UserException @@ -180,6 +182,20 @@ public void validateFieldInput(){ .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) .build(logger); } + if (name.length() > 1024) { + throw UserException + .validationError() + .message("Exceeds maximum length of 1024 characters: " + name.substring(0, 1024)) + .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .build(logger); + } + if (!Pattern.matches("[a-zA-Z]\\w*", name)) { + throw UserException + .validationError() + .message("Invalid input: " + name) + .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .build(logger); + } if (uniqueNames.contains(name)){ throw UserException .validationError() From 4b221b5f365ecaa5484279fe87e4ba8c1a7ddc55 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Fri, 10 Dec 2021 09:31:12 -0500 Subject: [PATCH 19/41] WIP converting to EVF v2. Pushing to repo for troubleshooting purposes. --- .../fixedwidth/FixedwidthBatchReader.java | 3 +- .../fixedwidth/FixedwidthBatchReaderImpl.java | 89 +++++++++++++++++++ .../fixedwidth/FixedwidthFormatConfig.java | 3 +- .../fixedwidth/FixedwidthFormatPlugin.java | 2 +- .../TestFixedwidthRecordReader.java | 11 ++- .../impl/scan/v3/SchemaNegotiator.java | 2 + 6 files changed, 99 insertions(+), 11 deletions(-) create mode 100644 contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReaderImpl.java diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index cd487a7e590..79eaef2c45b 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -56,12 +56,11 @@ public class FixedwidthBatchReader implements ManagedReader uniqueNames = new HashSet<>(); List fieldIndices = this.getFieldIndices(); @@ -192,7 +191,7 @@ public void validateFieldInput(){ if (!Pattern.matches("[a-zA-Z]\\w*", name)) { throw UserException .validationError() - .message("Invalid input: " + name) + .message("Column Name '" + name + "' is not valid. Must contain letters, numbers, and underscores only.") .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) .build(logger); } diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java index f96e4a81f77..2f64e23d8e6 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java @@ -89,4 +89,4 @@ protected FileScanBuilder frameworkBuilder(OptionManager options, EasySubScan sc builder.nullType(Types.optional(TypeProtos.MinorType.VARCHAR)); return builder; } -} \ No newline at end of file +} diff --git a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java index 6f7927b8576..72ccd23004b 100644 --- a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java @@ -51,11 +51,11 @@ public static void setup() throws Exception { FixedwidthFormatConfig formatConfig = new FixedwidthFormatConfig(Lists.newArrayList("fwf"), Lists.newArrayList( new FixedwidthFieldConfig("Number", 1, 5, TypeProtos.MinorType.VARDECIMAL), - new FixedwidthFieldConfig("Address",12, 3,TypeProtos.MinorType.INT, ""), - new FixedwidthFieldConfig("Letter", 7,4, TypeProtos.MinorType.VARCHAR, ""), - new FixedwidthFieldConfig("Date",16, 10,TypeProtos.MinorType.DATE, "MM-dd-yyyy"), - new FixedwidthFieldConfig( "Time", 27, 8,TypeProtos.MinorType.TIME,"HH:mm:ss" ), - new FixedwidthFieldConfig("DateTime", 36, 23,TypeProtos.MinorType.TIMESTAMP, "MM-dd-yyyy'T'HH:mm:ss.SSX" ) + new FixedwidthFieldConfig("Address", 12, 3, TypeProtos.MinorType.INT), + new FixedwidthFieldConfig("Letter", 7, 4, TypeProtos.MinorType.VARCHAR), + new FixedwidthFieldConfig("Date", 16, 10, TypeProtos.MinorType.DATE, "MM-dd-yyyy"), + new FixedwidthFieldConfig("Time", 27, 8, TypeProtos.MinorType.TIME,"HH:mm:ss"), + new FixedwidthFieldConfig("DateTime", 36, 23, TypeProtos.MinorType.TIMESTAMP, "MM-dd-yyyy'T'HH:mm:ss.SSX") )); cluster.defineFormat("dfs", "fwf", formatConfig); cluster.defineFormat("cp", "fwf", formatConfig); @@ -218,4 +218,3 @@ private RowSet setupTestData(){ } } - diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/SchemaNegotiator.java b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/SchemaNegotiator.java index 9dee1d78df5..4455ad588cf 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/SchemaNegotiator.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/SchemaNegotiator.java @@ -224,4 +224,6 @@ public interface SchemaNegotiator { * schema order */ ResultSetLoader build(); + + Object split(); } From 4875367d0721b2a2a71177b47949592c6a1c3965 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Thu, 15 Jul 2021 13:09:58 -0400 Subject: [PATCH 20/41] Start of fixed width format plugin --- contrib/format-fixedwidth/pom.xml | 84 +++++++++++++ .../fixedwidth/FixedwidthBatchReader.java | 113 ++++++++++++++++++ .../fixedwidth/FixedwidthFieldConfig.java | 71 +++++++++++ .../fixedwidth/FixedwidthFormatConfig.java | 84 +++++++++++++ .../fixedwidth/FixedwidthFormatPlugin.java | 94 +++++++++++++++ .../src/main/resources/drill-module.conf | 23 ++++ .../test/java/TestFixedwidthRecordReader.java | 85 +++++++++++++ .../src/test/resources/fwf/test.fwf | 1 + contrib/pom.xml | 1 + 9 files changed, 556 insertions(+) create mode 100644 contrib/format-fixedwidth/pom.xml create mode 100644 contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java create mode 100644 contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java create mode 100644 contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java create mode 100644 contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java create mode 100644 contrib/format-fixedwidth/src/main/resources/drill-module.conf create mode 100644 contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java create mode 100644 contrib/format-fixedwidth/src/test/resources/fwf/test.fwf diff --git a/contrib/format-fixedwidth/pom.xml b/contrib/format-fixedwidth/pom.xml new file mode 100644 index 00000000000..d69c492b1dc --- /dev/null +++ b/contrib/format-fixedwidth/pom.xml @@ -0,0 +1,84 @@ + + + + 4.0.0 + + + drill-contrib-parent + org.apache.drill.contrib + 1.20.0-SNAPSHOT + + + drill-format-fixedwidth + Drill : Contrib : Format : Fixedwidth + + + + org.apache.drill.exec + drill-java-exec + ${project.version} + + + + + + org.apache.drill.exec + drill-java-exec + tests + ${project.version} + test + + + + org.apache.drill + drill-common + tests + ${project.version} + test + + + + + + maven-resources-plugin + + + copy-java-sources + process-sources + + copy-resources + + + ${basedir}/target/classes/org/apache/drill/exec/store/fixedwidth + + + + src/main/java/org/apache/drill/exec/store/fixedwidth + true + + + + + + + + + \ No newline at end of file diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java new file mode 100644 index 00000000000..a0c474ed5b2 --- /dev/null +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.drill.exec.store.fixedwidth; + +import org.apache.drill.common.exceptions.CustomErrorContext; +import org.apache.drill.common.exceptions.UserException; +import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator; +import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader; +import org.apache.drill.exec.record.metadata.SchemaBuilder; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.apache.hadoop.mapred.FileSplit; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.InputStream; + +public class FixedwidthBatchReader implements ManagedReader{ + + private static final Logger logger = LoggerFactory.getLogger(FixedwidthBatchReader.class); + + private FileSplit split; + + private final int maxRecords; + + private final FixedwidthFormatConfig config; + + private CustomErrorContext errorContext; + + private InputStream fsStream; + + public FixedwidthBatchReader(FixedwidthFormatConfig config, int maxRecords) { + this.config = config; + this.maxRecords = maxRecords; + } + + @Override + public boolean open(FileSchemaNegotiator negotiator) { + split = negotiator.split(); + errorContext = negotiator.parentErrorContext(); + try { + fsStream = negotiator.fileSystem().openPossiblyCompressedStream(split.getPath()); + negotiator.tableSchema(buildSchema(),true); + negotiator.build(); + } catch (Exception e) { + throw UserException + .dataReadError(e) + .message("Failed to open input file: {}", split.getPath().toString()) + .addContext(errorContext) + .addContext(e.getMessage()) + .build(logger); + } + return true; + } + + @Override + public boolean next() { + byte[] byteArray = new byte[10000]; + int bytesRead; + + try { + bytesRead = fsStream.read(byteArray); + System.out.println(new String(byteArray)); + } catch (Exception e) { + throw UserException + .dataReadError(e) + .message("Failed to read input file: {}", split.getPath().toString()) + .addContext(errorContext) + .addContext(e.getMessage()) + .build(logger); + } + return (bytesRead != -1); + } + + @Override + public void close() { + try { + fsStream.close(); + } catch (Exception e) { + throw UserException + .dataReadError(e) + .message("Failed to close input file: {}", split.getPath().toString()) + .addContext(errorContext) + .addContext(e.getMessage()) + .build(logger); + } + } + + private TupleMetadata buildSchema(){ + SchemaBuilder builder = new SchemaBuilder(); + + for (FixedwidthFieldConfig field : config.getFields()){ + builder.addNullable(field.getFieldName(),field.getDataType()); + } + + return builder.buildSchema(); + } +} \ No newline at end of file diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java new file mode 100644 index 00000000000..667ad3d098d --- /dev/null +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java @@ -0,0 +1,71 @@ +package org.apache.drill.exec.store.fixedwidth; + +import com.fasterxml.jackson.annotation.JsonProperty; +import org.apache.drill.common.types.TypeProtos; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonTypeName; + +@JsonTypeName("fixedwidthReaderFieldDescription") +@JsonInclude(JsonInclude.Include.NON_DEFAULT) +public class FixedwidthFieldConfig { + + private final TypeProtos.MinorType dataType; + private final String fieldName; + private final String dateTimeFormat; + private final int startIndex; + private final int fieldWidth; + + public FixedwidthFieldConfig(@JsonProperty("dataType") TypeProtos.MinorType dataType, + @JsonProperty("fieldName") String fieldName, + @JsonProperty("dateTimeFormat") String dateTimeFormat, + @JsonProperty("startIndex") int startIndex, + @JsonProperty("fieldWidth") int fieldWidth) { + this.dataType = dataType; + this.fieldName = fieldName; + this.dateTimeFormat = dateTimeFormat; + this.startIndex = startIndex; + this.fieldWidth = fieldWidth; + } + + public TypeProtos.MinorType getDataType(){ + return dataType; + } + +// public void setDataType(TypeProtos.MinorType dataType){ +// this.dataType = dataType; +// } + + public String getFieldName(){ + return fieldName; + } + +// public void setFieldName(String fieldName){ +// this.fieldName = fieldName; +// } + + public String getDateTimeFormat() { + return dateTimeFormat; + } + +// public void setDateTimeFormat(String dateTimeFormat) { +// this.dateTimeFormat = dateTimeFormat; +// } + + public int getStartIndex(){ + return startIndex; + } + +// public void setStartIndex(int startIndex){ +// this.startIndex = startIndex; +// } + + public int getFieldWidth(){ + return fieldWidth; + } + +// public void setFieldWidth(int fieldWidth){ +// this.fieldWidth = fieldWidth; +// } + +} diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java new file mode 100644 index 00000000000..a05a7bd0e4d --- /dev/null +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.drill.exec.store.fixedwidth; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonTypeName; +import org.apache.drill.common.PlanStringBuilder; +import org.apache.drill.common.logical.FormatPluginConfig; +import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList; + +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +@JsonTypeName(FixedwidthFormatPlugin.DEFAULT_NAME) +@JsonInclude(JsonInclude.Include.NON_DEFAULT) +public class FixedwidthFormatConfig implements FormatPluginConfig { + private final List extensions; + private final List fields; + + @JsonCreator + public FixedwidthFormatConfig(@JsonProperty("extensions") List extensions, + @JsonProperty("fields") List fields) { + this.extensions = extensions == null ? Collections.singletonList("sav") : ImmutableList.copyOf(extensions); + this.fields = fields; + } //Change this + + @JsonInclude(JsonInclude.Include.NON_DEFAULT) + public List getExtensions() { + return extensions; + } + + public List getFields() { + return fields; + } + +// public FixedwidthReaderConfig getReaderConfig(FixedwidthFormatPlugin plugin) { +// return new FixedwidthReaderConfig(plugin); +// } + + @Override + public int hashCode() { + return Objects.hash(extensions, fields); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + FixedwidthFormatConfig other = (FixedwidthFormatConfig) obj; + return Objects.equals(extensions, other.extensions) + && Objects.equals(fields, other.fields); + } + + @Override + public String toString() { + return new PlanStringBuilder(this) + .field("extensions", extensions) + .field("fields", fields) + .toString(); + } +} \ No newline at end of file diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java new file mode 100644 index 00000000000..86861cb5040 --- /dev/null +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.drill.exec.store.fixedwidth; + +import org.apache.drill.common.logical.StoragePluginConfig; +import org.apache.drill.common.types.TypeProtos; +import org.apache.drill.common.types.Types; +import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileReaderFactory; +import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileScanBuilder; +import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator; + +import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader; +import org.apache.drill.exec.server.DrillbitContext; +import org.apache.drill.exec.server.options.OptionManager; +import org.apache.drill.exec.store.dfs.easy.EasyFormatPlugin; +import org.apache.drill.exec.store.dfs.easy.EasySubScan; +import org.apache.hadoop.conf.Configuration; + + +public class FixedwidthFormatPlugin extends EasyFormatPlugin { + + protected static final String DEFAULT_NAME = "fixedwidth"; + + private static class FixedwidthReaderFactory extends FileReaderFactory { + + private final FixedwidthFormatConfig config; + private final int maxRecords; + + public FixedwidthReaderFactory(FixedwidthFormatConfig config, int maxRecords) { + this.config = config; + this.maxRecords = maxRecords; + } + + @Override + public ManagedReader newReader() { + return new FixedwidthBatchReader(config, maxRecords); + } + } + + public FixedwidthFormatPlugin(String name, + DrillbitContext context, + Configuration fsConf, + StoragePluginConfig storageConfig, + FixedwidthFormatConfig formatConfig) { + super(name, easyConfig(fsConf, formatConfig), context, storageConfig, formatConfig); + } //final? + + private static EasyFormatConfig easyConfig(Configuration fsConf, FixedwidthFormatConfig pluginConfig) { + return EasyFormatConfig.builder() + .readable(true) + .writable(false) + .blockSplittable(false) + .compressible(true) + .supportsProjectPushdown(true) + .extensions(pluginConfig.getExtensions()) + .fsConf(fsConf) + .defaultName(DEFAULT_NAME) + .useEnhancedScan(true) + .supportsLimitPushdown(true) + .build(); + } + + @Override + public ManagedReader newBatchReader( + EasySubScan scan, OptionManager options) { + return new FixedwidthBatchReader(getConfig(), scan.getMaxRecords()); + } + + @Override + protected FileScanBuilder frameworkBuilder(OptionManager options, EasySubScan scan) { + FileScanBuilder builder = new FileScanBuilder(); + builder.setReaderFactory(new FixedwidthReaderFactory(getConfig(), scan.getMaxRecords())); + + initScanBuilder(builder, scan); + builder.nullType(Types.optional(TypeProtos.MinorType.VARCHAR)); + return builder; + } +} \ No newline at end of file diff --git a/contrib/format-fixedwidth/src/main/resources/drill-module.conf b/contrib/format-fixedwidth/src/main/resources/drill-module.conf new file mode 100644 index 00000000000..ed3e073f8dd --- /dev/null +++ b/contrib/format-fixedwidth/src/main/resources/drill-module.conf @@ -0,0 +1,23 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This file tells Drill to consider this module when class path scanning. +# This file can also include any supplementary configuration information. +# This file is in HOCON format, see https://github.com/typesafehub/config/blob/master/HOCON.md for more information. + +drill.classpath.scanning.packages += "org.apache.drill.exec.store.fixedwidth" diff --git a/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java new file mode 100644 index 00000000000..764a37c744e --- /dev/null +++ b/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java @@ -0,0 +1,85 @@ + +import com.google.common.collect.Lists; +import org.apache.drill.categories.RowSetTests; +import org.apache.drill.common.types.TypeProtos; +import org.apache.drill.exec.physical.rowSet.RowSet; +import org.apache.drill.exec.physical.rowSet.RowSetBuilder; +import org.apache.drill.exec.record.metadata.SchemaBuilder; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.apache.drill.exec.store.fixedwidth.FixedwidthFieldConfig; +import org.apache.drill.exec.store.fixedwidth.FixedwidthFormatConfig; +import org.apache.drill.test.ClusterFixture; +import org.apache.drill.test.ClusterTest; +import org.apache.drill.test.QueryBuilder; +import org.apache.drill.test.rowSet.RowSetComparison; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import java.nio.file.Paths; + +import static org.junit.Assert.assertEquals; + +@Category(RowSetTests.class) +public class TestFixedwidthRecordReader extends ClusterTest { + +// @BeforeClass +// public static void setup() throws Exception { +// ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); +// +// // Needed for compressed file unit test +// //dirTestWatcher.copyResourceToRoot(Paths.get("spss/")); +// } + + @BeforeClass + public static void setup() throws Exception { + ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); + + FixedwidthFormatConfig formatConfig = new FixedwidthFormatConfig(Lists.newArrayList("fwf") + , Lists.newArrayList()); + cluster.defineFormat("cp", "fwf", formatConfig); + //cluster.defineFormat("dfs", "xml", formatConfig); + + // Needed for compressed file unit test + dirTestWatcher.copyResourceToRoot(Paths.get("fwf/")); + } + + @Test + public void testExplicitQuery() throws Exception { + String sql = "SELECT ID, Urban, Urban_value FROM dfs.`spss/testdata.sav` WHERE d16=4"; + + QueryBuilder q = client.queryBuilder().sql(sql); + RowSet results = q.rowSet(); + + TupleMetadata expectedSchema = new SchemaBuilder() + .addNullable("ID", TypeProtos.MinorType.FLOAT8) + .addNullable("Urban", TypeProtos.MinorType.FLOAT8) + .addNullable("Urban_value", TypeProtos.MinorType.VARCHAR) + .buildSchema(); + + + RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema) + .addRow(47.0, 1.0, "Urban").addRow(53.0, 1.0, "Urban") + .addRow(66.0, 1.0, "Urban") + .build(); + + assertEquals(3, results.rowCount()); + + new RowSetComparison(expected).verifyAndClearAll(results); + } + + @Test + public void testBatchReader() throws Exception { + FixedwidthFieldConfig testField = new FixedwidthFieldConfig(TypeProtos.MinorType.FLOAT8,"date","MM/DD/YYYY",1,10); + System.out.println(testField.getFieldName()); + System.out.println(testField.getStartIndex()); + System.out.println(testField.getFieldWidth()); + System.out.println(testField.getDateTimeFormat()); + System.out.println(testField.getDataType()); + + String sql = "SELECT * FROM cp.`fwf/test.fwf`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + } + + +} diff --git a/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf b/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf new file mode 100644 index 00000000000..3008d7b1a7a --- /dev/null +++ b/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf @@ -0,0 +1 @@ +1234 test 567 diff --git a/contrib/pom.xml b/contrib/pom.xml index 44c1e03d32d..3f384b96aaa 100644 --- a/contrib/pom.xml +++ b/contrib/pom.xml @@ -46,6 +46,7 @@ format-syslog format-ltsv format-excel + format-fixedwidth format-httpd format-esri format-pdf From ef0bc8265dea6f9b856bed4b74ead8c8e6d77e99 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Thu, 22 Jul 2021 15:27:54 -0400 Subject: [PATCH 21/41] Work in Progress. Producing Rows. Currently complains about buffer not being de-allocated. --- .../fixedwidth/FixedwidthBatchReader.java | 18 +++++++- .../test/java/TestFixedwidthRecordReader.java | 46 ++++++++++++++++++- 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index a0c474ed5b2..dee81c7be74 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -22,6 +22,8 @@ import org.apache.drill.common.exceptions.UserException; import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator; import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader; +import org.apache.drill.exec.physical.resultSet.ResultSetLoader; +import org.apache.drill.exec.physical.resultSet.RowSetLoader; import org.apache.drill.exec.record.metadata.SchemaBuilder; import org.apache.drill.exec.record.metadata.TupleMetadata; import org.apache.hadoop.mapred.FileSplit; @@ -44,6 +46,8 @@ public class FixedwidthBatchReader implements ManagedReader Date: Mon, 26 Jul 2021 11:15:52 -0400 Subject: [PATCH 22/41] First working version --- .../fixedwidth/FixedwidthBatchReader.java | 68 ++++++++++++++++--- .../test/java/TestFixedwidthRecordReader.java | 17 +++-- 2 files changed, 68 insertions(+), 17 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index dee81c7be74..59286777cd8 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -20,17 +20,21 @@ import org.apache.drill.common.exceptions.CustomErrorContext; import org.apache.drill.common.exceptions.UserException; +import org.apache.drill.common.types.TypeProtos; import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator; import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader; import org.apache.drill.exec.physical.resultSet.ResultSetLoader; import org.apache.drill.exec.physical.resultSet.RowSetLoader; import org.apache.drill.exec.record.metadata.SchemaBuilder; import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.apache.drill.shaded.guava.com.google.common.base.Charsets; import org.apache.hadoop.mapred.FileSplit; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.BufferedReader; import java.io.InputStream; +import java.io.InputStreamReader; public class FixedwidthBatchReader implements ManagedReader{ @@ -48,6 +52,8 @@ public class FixedwidthBatchReader implements ManagedReader Date: Thu, 29 Jul 2021 12:40:27 -0400 Subject: [PATCH 23/41] Added more data types, refactored code --- .../fixedwidth/FixedwidthBatchReader.java | 67 +++++++++++++------ .../test/java/TestFixedwidthRecordReader.java | 55 +++++++++++---- .../src/test/resources/fwf/test.fwf | 26 ++++++- 3 files changed, 113 insertions(+), 35 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index 59286777cd8..001f8c09987 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -35,6 +35,14 @@ import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.ZoneId; +import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; +import java.util.Locale; public class FixedwidthBatchReader implements ManagedReader{ @@ -94,17 +102,8 @@ public boolean next() { // Use loader to read data from file to turn into Drill while (!writer.isFull() && line != null) { - Object[] row = parseLine(line); - writer.start(); - - for (int i = 0; i < row.length; i++) { - if (row[i] instanceof Integer) { - writer.scalar(i).setInt((Integer) row[i]); - } else if (row[i] instanceof String) { - writer.scalar(i).setString((String) row[i]); - } - } + parseLine(line, writer); writer.save(); line = reader.readLine(); @@ -145,31 +144,57 @@ private TupleMetadata buildSchema(){ return builder.buildSchema(); } - private Object[] parseLine(String line){ - Object[] row = new Object[config.getFields().size()]; + private void parseLine(String line, RowSetLoader writer) { int i = 0; TypeProtos.MinorType dataType; String dateTimeFormat; + String value; - for (FixedwidthFieldConfig field : config.getFields()){ - row[i] = line.substring(field.getStartIndex()-1,field.getStartIndex()+field.getFieldWidth()-1); + for (FixedwidthFieldConfig field : config.getFields()) { + value = line.substring(field.getStartIndex() - 1, field.getStartIndex() + field.getFieldWidth() - 1); // Convert String to data type in field dataType = field.getDataType(); dateTimeFormat = field.getDateTimeFormat(); - if (dataType == TypeProtos.MinorType.INT){ - row[i] = Integer.parseInt((String) row[i]); - } else if (dataType == TypeProtos.MinorType.VARCHAR){ - } else if (dataType == TypeProtos.MinorType.DATE || dataType == TypeProtos.MinorType.TIME){ - // Check to ensure date time format matches input date? - } else{ + switch (dataType) { + case INT: + writer.scalar(i).setInt(Integer.parseInt(value)); + break; + case VARCHAR: + writer.scalar(i).setString(value); + break; + case DATE: + DateTimeFormatter formatDate = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); + LocalDate date = LocalDate.parse(value, formatDate); + + writer.scalar(i).setDate(date); + break; + case TIME: + DateTimeFormatter formatTime = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); + LocalTime time = LocalTime.parse(value, formatTime); + + writer.scalar(i).setTime(time); + break; + case TIMESTAMP: + DateTimeFormatter formatTS = DateTimeFormatter.ofPattern(dateTimeFormat,Locale.ENGLISH); + LocalDateTime ldt = LocalDateTime.parse(value,formatTS); + ZoneId z = ZoneId.of( "America/Toronto" ) ; + ZonedDateTime zdt = ldt.atZone( z ) ; + Instant timeStamp = zdt.toInstant(); + + writer.scalar(i).setTimestamp(timeStamp); + break; + default: + throw new RuntimeException("Unknown data type specified in fixed width. Found data type " + dataType); + + + } i++; } - return row; } } \ No newline at end of file diff --git a/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java index 1e6856c46ac..26fcf009244 100644 --- a/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java @@ -18,6 +18,10 @@ import org.junit.experimental.categories.Category; import java.nio.file.Paths; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; import static org.junit.Assert.assertEquals; @@ -40,10 +44,12 @@ public static void setup() throws Exception { , Lists.newArrayList( new FixedwidthFieldConfig(TypeProtos.MinorType.INT, "Number", "", 1, 4), new FixedwidthFieldConfig(TypeProtos.MinorType.VARCHAR, "Letter", "", 6, 4), - new FixedwidthFieldConfig(TypeProtos.MinorType.INT, "Address", "", 11, 3) - )); + new FixedwidthFieldConfig(TypeProtos.MinorType.INT,"Address","",11,3), + new FixedwidthFieldConfig(TypeProtos.MinorType.DATE,"Date","MM-dd-yyyy",15,10), + new FixedwidthFieldConfig(TypeProtos.MinorType.TIME,"Time","HH:mm:ss",26,8), + new FixedwidthFieldConfig(TypeProtos.MinorType.TIMESTAMP,"DateTime","MM-dd-yyyy'T'HH:mm:ss.SSX",35,23) + )); cluster.defineFormat("cp", "fwf", formatConfig); - //cluster.defineFormat("dfs", "xml", formatConfig); // Needed for compressed file unit test dirTestWatcher.copyResourceToRoot(Paths.get("fwf/")); @@ -75,13 +81,6 @@ public void testExplicitQuery() throws Exception { @Test public void testBatchReader() throws Exception { - FixedwidthFieldConfig testField = new FixedwidthFieldConfig(TypeProtos.MinorType.FLOAT8, "date", "MM/DD/YYYY", 1, 10); - System.out.println(testField.getFieldName()); - System.out.println(testField.getStartIndex()); - System.out.println(testField.getFieldWidth()); - System.out.println(testField.getDateTimeFormat()); - System.out.println(testField.getDataType()); - String sql = "SELECT * FROM cp.`fwf/test.fwf`"; RowSet results = client.queryBuilder().sql(sql).rowSet(); @@ -89,16 +88,46 @@ public void testBatchReader() throws Exception { .addNullable("Number", TypeProtos.MinorType.INT) .addNullable("Letter", TypeProtos.MinorType.VARCHAR) .addNullable("Address", TypeProtos.MinorType.INT) + .addNullable("Date", TypeProtos.MinorType.DATE) + .addNullable("Time",TypeProtos.MinorType.TIME) + .addNullable("DateTime",TypeProtos.MinorType.TIMESTAMP) .buildSchema(); RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema) - .addRow(1234, "test", 567) + .addRow(1234, "test", 567, LocalDate.parse("2021-02-10"), LocalTime.parse("10:30:27"), Instant.parse("2021-02-10T15:30:27.00Z")) + .addRow(5678, "TEST", 890, LocalDate.parse("2021-07-27"), LocalTime.parse("12:40:15"), Instant.parse("2021-07-27T16:40:15.00Z")) + .addRow(1111, "abcd", 111, LocalDate.parse("1111-11-11"), LocalTime.parse("11:11:11"), Instant.parse("1111-11-11T16:28:43.11Z")) + .addRow(2222, "efgh", 222, LocalDate.parse("2222-01-22"), LocalTime.parse("22:22:22"), Instant.parse("2222-01-23T03:22:22.22Z")) + .addRow(3333, "ijkl", 333, LocalDate.parse("3333-02-01"), LocalTime.parse("01:33:33"), Instant.parse("3333-02-01T06:33:33.33Z")) + .addRow(4444, "mnop", 444, LocalDate.parse("4444-03-02"), LocalTime.parse("02:44:44"), Instant.parse("4444-03-02T07:44:44.44Z")) + .addRow(5555, "qrst", 555, LocalDate.parse("5555-04-03"), LocalTime.parse("03:55:55"), Instant.parse("5555-04-03T07:55:55.55Z")) + .addRow(6666, "uvwx", 666, LocalDate.parse("6666-05-04"), LocalTime.parse("04:01:01"), Instant.parse("6666-05-04T08:01:01.01Z")) + .addRow(7777, "yzzz", 777, LocalDate.parse("7777-06-05"), LocalTime.parse("05:11:11"), Instant.parse("7777-06-05T09:11:11.11Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) .build(); - assertEquals(1, results.rowCount()); + System.out.println(expected); + assertEquals(25, results.rowCount()); + + //System.out.println(results.batchSchema()); + System.out.println(results); - System.out.println(results.batchSchema()); new RowSetComparison(expected).verifyAndClearAll(results); System.out.println("Test complete."); diff --git a/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf b/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf index 3008d7b1a7a..9eba0f61944 100644 --- a/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf +++ b/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf @@ -1 +1,25 @@ -1234 test 567 +1234 test 567 02-10-2021 10:30:27 02-10-2021T10:30:27.00Z +5678 TEST 890 07-27-2021 12:40:15 07-27-2021T12:40:15.00Z +1111 abcd 111 11-11-1111 11:11:11 11-11-1111T11:11:11.11Z +2222 efgh 222 01-22-2222 22:22:22 01-22-2222T22:22:22.22Z +3333 ijkl 333 02-01-3333 01:33:33 02-01-3333T01:33:33.33Z +4444 mnop 444 03-02-4444 02:44:44 03-02-4444T02:44:44.44Z +5555 qrst 555 04-03-5555 03:55:55 04-03-5555T03:55:55.55Z +6666 uvwx 666 05-04-6666 04:01:01 05-04-6666T04:01:01.01Z +7777 yzzz 777 06-05-7777 05:11:11 06-05-7777T05:11:11.11Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z \ No newline at end of file From 6d7a2a5eab56e6a95a7b268cd2abeeb03eb62a5a Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Thu, 29 Jul 2021 13:00:10 -0400 Subject: [PATCH 24/41] Checkstyle fixes --- .../fixedwidth/FixedwidthBatchReader.java | 4 +- .../TestFixedwidthRecordReader.java | 47 +++---------------- 2 files changed, 8 insertions(+), 43 deletions(-) rename contrib/format-fixedwidth/src/test/java/{ => org/apache/drill/exec/store/fixedwidth}/TestFixedwidthRecordReader.java (86%) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index 001f8c09987..3dbd8b9dc5f 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -179,8 +179,8 @@ private void parseLine(String line, RowSetLoader writer) { case TIMESTAMP: DateTimeFormatter formatTS = DateTimeFormatter.ofPattern(dateTimeFormat,Locale.ENGLISH); LocalDateTime ldt = LocalDateTime.parse(value,formatTS); - ZoneId z = ZoneId.of( "America/Toronto" ) ; - ZonedDateTime zdt = ldt.atZone( z ) ; + ZoneId z = ZoneId.of( "America/Toronto" ); + ZonedDateTime zdt = ldt.atZone( z ); Instant timeStamp = zdt.toInstant(); writer.scalar(i).setTimestamp(timeStamp); diff --git a/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java similarity index 86% rename from contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java rename to contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java index 26fcf009244..309837323be 100644 --- a/contrib/format-fixedwidth/src/test/java/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java @@ -1,14 +1,12 @@ +package org.apache.drill.exec.store.fixedwidth; -import com.google.common.collect.Lists; import org.apache.drill.categories.RowSetTests; import org.apache.drill.common.types.TypeProtos; import org.apache.drill.exec.physical.rowSet.RowSet; import org.apache.drill.exec.physical.rowSet.RowSetBuilder; import org.apache.drill.exec.record.metadata.SchemaBuilder; import org.apache.drill.exec.record.metadata.TupleMetadata; -import org.apache.drill.exec.store.fixedwidth.FixedwidthBatchReader; -import org.apache.drill.exec.store.fixedwidth.FixedwidthFieldConfig; -import org.apache.drill.exec.store.fixedwidth.FixedwidthFormatConfig; +import org.apache.drill.shaded.guava.com.google.common.collect.Lists; import org.apache.drill.test.ClusterFixture; import org.apache.drill.test.ClusterTest; import org.apache.drill.test.QueryBuilder; @@ -20,7 +18,6 @@ import java.nio.file.Paths; import java.time.Instant; import java.time.LocalDate; -import java.time.LocalDateTime; import java.time.LocalTime; import static org.junit.Assert.assertEquals; @@ -28,27 +25,19 @@ @Category(RowSetTests.class) public class TestFixedwidthRecordReader extends ClusterTest { -// @BeforeClass -// public static void setup() throws Exception { -// ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); -// -// // Needed for compressed file unit test -// //dirTestWatcher.copyResourceToRoot(Paths.get("spss/")); -// } - @BeforeClass public static void setup() throws Exception { ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); - FixedwidthFormatConfig formatConfig = new FixedwidthFormatConfig(Lists.newArrayList("fwf") - , Lists.newArrayList( + FixedwidthFormatConfig formatConfig = new FixedwidthFormatConfig(Lists.newArrayList("fwf"), + Lists.newArrayList( new FixedwidthFieldConfig(TypeProtos.MinorType.INT, "Number", "", 1, 4), new FixedwidthFieldConfig(TypeProtos.MinorType.VARCHAR, "Letter", "", 6, 4), new FixedwidthFieldConfig(TypeProtos.MinorType.INT,"Address","",11,3), new FixedwidthFieldConfig(TypeProtos.MinorType.DATE,"Date","MM-dd-yyyy",15,10), new FixedwidthFieldConfig(TypeProtos.MinorType.TIME,"Time","HH:mm:ss",26,8), new FixedwidthFieldConfig(TypeProtos.MinorType.TIMESTAMP,"DateTime","MM-dd-yyyy'T'HH:mm:ss.SSX",35,23) - )); + )); cluster.defineFormat("cp", "fwf", formatConfig); // Needed for compressed file unit test @@ -134,28 +123,4 @@ public void testBatchReader() throws Exception { client.close(); } - - - -/* -BatchSchema [ -fields=[ - [`Number` (INT:OPTIONAL), - children=([`$bits$` (UINT1:REQUIRED)], - [`Number` (INT:OPTIONAL)])], - [`Letter` (VARCHAR:OPTIONAL), - children=([`$bits$` (UINT1:REQUIRED)], - [`Letter` (VARCHAR:OPTIONAL), - children=([`$offsets$` (UINT4:REQUIRED)]) - ] - ) - ], - [`Address` (INT:OPTIONAL), - children=([`$bits$` (UINT1:REQUIRED)], - [`Address` (INT:OPTIONAL)]) - ] -], -selectionVector=NONE] -*/ - -} \ No newline at end of file +} From 056df138e504662d06559a4237045cd4e1d2bdd5 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Thu, 29 Jul 2021 13:54:49 -0400 Subject: [PATCH 25/41] Removed println statement from Batch Reader, Simplified logic --- .../store/fixedwidth/FixedwidthBatchReader.java | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index 3dbd8b9dc5f..30b39611de9 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -97,7 +97,6 @@ public boolean next() { // Use loader to read data from file to turn into Drill try { line = reader.readLine(); - System.out.println(line); RowSetLoader writer = loader.writer(); while (!writer.isFull() && line != null) { @@ -153,9 +152,9 @@ private void parseLine(String line, RowSetLoader writer) { for (FixedwidthFieldConfig field : config.getFields()) { value = line.substring(field.getStartIndex() - 1, field.getStartIndex() + field.getFieldWidth() - 1); - // Convert String to data type in field dataType = field.getDataType(); dateTimeFormat = field.getDateTimeFormat(); + DateTimeFormatter formatter = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); switch (dataType) { case INT: @@ -165,24 +164,18 @@ private void parseLine(String line, RowSetLoader writer) { writer.scalar(i).setString(value); break; case DATE: - DateTimeFormatter formatDate = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); - LocalDate date = LocalDate.parse(value, formatDate); - + LocalDate date = LocalDate.parse(value, formatter); writer.scalar(i).setDate(date); break; case TIME: - DateTimeFormatter formatTime = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); - LocalTime time = LocalTime.parse(value, formatTime); - + LocalTime time = LocalTime.parse(value, formatter); writer.scalar(i).setTime(time); break; case TIMESTAMP: - DateTimeFormatter formatTS = DateTimeFormatter.ofPattern(dateTimeFormat,Locale.ENGLISH); - LocalDateTime ldt = LocalDateTime.parse(value,formatTS); + LocalDateTime ldt = LocalDateTime.parse(value,formatter); ZoneId z = ZoneId.of( "America/Toronto" ); ZonedDateTime zdt = ldt.atZone( z ); Instant timeStamp = zdt.toInstant(); - writer.scalar(i).setTimestamp(timeStamp); break; default: From d2097b3eff85e900f410b8810011c47274a6de45 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Fri, 6 Aug 2021 16:14:29 -0400 Subject: [PATCH 26/41] Modified format, fixed maxRecords in next(), modified Exception handling in Batch Reader --- contrib/format-fixedwidth/pom.xml | 115 +++++++-------- .../fixedwidth/FixedwidthBatchReader.java | 139 +++++++++--------- .../fixedwidth/FixedwidthFieldConfig.java | 64 ++++---- .../fixedwidth/FixedwidthFormatConfig.java | 8 +- .../fixedwidth/FixedwidthFormatPlugin.java | 28 ++-- .../TestFixedwidthRecordReader.java | 124 +++++++++------- .../src/test/resources/fwf/test.fwf | 2 +- distribution/pom.xml | 5 + distribution/src/assemble/component.xml | 1 + 9 files changed, 237 insertions(+), 249 deletions(-) diff --git a/contrib/format-fixedwidth/pom.xml b/contrib/format-fixedwidth/pom.xml index d69c492b1dc..c30db19a536 100644 --- a/contrib/format-fixedwidth/pom.xml +++ b/contrib/format-fixedwidth/pom.xml @@ -19,66 +19,63 @@ --> - 4.0.0 + 4.0.0 + + drill-contrib-parent + org.apache.drill.contrib + 1.20.0-SNAPSHOT + + drill-format-fixedwidth + Drill : Contrib : Format : Fixedwidth - - drill-contrib-parent - org.apache.drill.contrib - 1.20.0-SNAPSHOT - + + + org.apache.drill.exec + drill-java-exec + ${project.version} + - drill-format-fixedwidth - Drill : Contrib : Format : Fixedwidth + + + org.apache.drill.exec + drill-java-exec + tests + ${project.version} + test + + + org.apache.drill + drill-common + tests + ${project.version} + test + + + + + + maven-resources-plugin + + + copy-java-sources + process-sources + + copy-resources + + + ${basedir}/target/classes/org/apache/drill/exec/store/fixedwidth + + + + src/main/java/org/apache/drill/exec/store/fixedwidth + true + + + + + + + + - - - org.apache.drill.exec - drill-java-exec - ${project.version} - - - - - - org.apache.drill.exec - drill-java-exec - tests - ${project.version} - test - - - - org.apache.drill - drill-common - tests - ${project.version} - test - - - - - - maven-resources-plugin - - - copy-java-sources - process-sources - - copy-resources - - - ${basedir}/target/classes/org/apache/drill/exec/store/fixedwidth - - - - src/main/java/org/apache/drill/exec/store/fixedwidth - true - - - - - - - - \ No newline at end of file diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index 30b39611de9..2be542d8d39 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -33,6 +33,7 @@ import org.slf4j.LoggerFactory; import java.io.BufferedReader; +import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.time.Instant; @@ -44,23 +45,18 @@ import java.time.format.DateTimeFormatter; import java.util.Locale; -public class FixedwidthBatchReader implements ManagedReader{ +public class FixedwidthBatchReader implements ManagedReader { private static final Logger logger = LoggerFactory.getLogger(FixedwidthBatchReader.class); - private FileSplit split; - private final int maxRecords; - private final FixedwidthFormatConfig config; - private CustomErrorContext errorContext; - private InputStream fsStream; - private ResultSetLoader loader; - + private RowSetLoader writer; private BufferedReader reader; + private int lineNum; public FixedwidthBatchReader(FixedwidthFormatConfig config, int maxRecords) { this.config = config; @@ -71,51 +67,47 @@ public FixedwidthBatchReader(FixedwidthFormatConfig config, int maxRecords) { public boolean open(FileSchemaNegotiator negotiator) { split = negotiator.split(); errorContext = negotiator.parentErrorContext(); + lineNum = 0; try { fsStream = negotiator.fileSystem().openPossiblyCompressedStream(split.getPath()); - negotiator.tableSchema(buildSchema(),true); + negotiator.tableSchema(buildSchema(), true); loader = negotiator.build(); } catch (Exception e) { throw UserException - .dataReadError(e) - .message("Failed to open input file: {}", split.getPath().toString()) - .addContext(errorContext) - .addContext(e.getMessage()) - .build(logger); + .dataReadError(e) + .message("Failed to open input file: {}", split.getPath().toString()) + .addContext(errorContext) + .addContext(e.getMessage()) + .build(logger); } - reader = new BufferedReader(new InputStreamReader(fsStream, Charsets.UTF_8)); - return true; - } @Override public boolean next() { // Use loader to read data from file to turn into Drill rows - String line; + RowSetLoader writer = loader.writer(); try { line = reader.readLine(); - RowSetLoader writer = loader.writer(); - while (!writer.isFull() && line != null) { - writer.start(); parseLine(line, writer); writer.save(); - line = reader.readLine(); + lineNum++; } - } catch (Exception e) { + } catch (IOException e) { throw UserException - .dataReadError(e) - .message("Failed to read input file: {}", split.getPath().toString()) - .addContext(errorContext) - .addContext(e.getMessage()) - .build(logger); + .dataReadError(e) + .message("Failed to read input file: {}", split.getPath().toString()) + .addContext(errorContext) + .addContext(e.getMessage()) + .addContext("Line Number", lineNum) + .build(logger); } - return (line != null); + return writer.limitReached(maxRecords); // returns false when maxRecords limit has been reached } @Override @@ -125,69 +117,72 @@ public void close() { loader.close(); } catch (Exception e) { throw UserException - .dataReadError(e) - .message("Failed to close input file: {}", split.getPath().toString()) - .addContext(errorContext) - .addContext(e.getMessage()) - .build(logger); + .dataReadError(e) + .message("Failed to close input file: {}", split.getPath().toString()) + .addContext(errorContext) + .addContext(e.getMessage()) + .build(logger); } } - private TupleMetadata buildSchema(){ + private TupleMetadata buildSchema() { SchemaBuilder builder = new SchemaBuilder(); - - for (FixedwidthFieldConfig field : config.getFields()){ - builder.addNullable(field.getFieldName(),field.getDataType()); + for (FixedwidthFieldConfig field : config.getFields()) { + builder.addNullable(field.getFieldName(), field.getDataType()); } - - return builder.buildSchema(); + return builder.buildSchema(); } - private void parseLine(String line, RowSetLoader writer) { + + private boolean parseLine(String line, RowSetLoader writer) throws IOException { int i = 0; TypeProtos.MinorType dataType; String dateTimeFormat; String value; - for (FixedwidthFieldConfig field : config.getFields()) { value = line.substring(field.getStartIndex() - 1, field.getStartIndex() + field.getFieldWidth() - 1); - dataType = field.getDataType(); dateTimeFormat = field.getDateTimeFormat(); DateTimeFormatter formatter = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); - - switch (dataType) { - case INT: - writer.scalar(i).setInt(Integer.parseInt(value)); - break; - case VARCHAR: - writer.scalar(i).setString(value); - break; - case DATE: - LocalDate date = LocalDate.parse(value, formatter); - writer.scalar(i).setDate(date); - break; - case TIME: - LocalTime time = LocalTime.parse(value, formatter); - writer.scalar(i).setTime(time); - break; - case TIMESTAMP: - LocalDateTime ldt = LocalDateTime.parse(value,formatter); - ZoneId z = ZoneId.of( "America/Toronto" ); - ZonedDateTime zdt = ldt.atZone( z ); - Instant timeStamp = zdt.toInstant(); - writer.scalar(i).setTimestamp(timeStamp); - break; - default: - throw new RuntimeException("Unknown data type specified in fixed width. Found data type " + dataType); - - - + try { + switch (dataType) { + case INT: + writer.scalar(i).setInt(Integer.parseInt(value)); + break; + case VARCHAR: + writer.scalar(i).setString(value); + break; + case DATE: + LocalDate date = LocalDate.parse(value, formatter); + writer.scalar(i).setDate(date); + break; + case TIME: + LocalTime time = LocalTime.parse(value, formatter); + writer.scalar(i).setTime(time); + break; + case TIMESTAMP: + LocalDateTime ldt = LocalDateTime.parse(value, formatter); + ZoneId z = ZoneId.of("America/Toronto"); + ZonedDateTime zdt = ldt.atZone(z); + Instant timeStamp = zdt.toInstant(); + writer.scalar(i).setTimestamp(timeStamp); + break; + case FLOAT4: + writer.scalar(i).setFloat(Float.parseFloat(value)); + break; + case FLOAT8: + writer.scalar(i).setDouble(Double.parseDouble(value)); + break; + default: + throw new RuntimeException("Unknown data type specified in fixed width. Found data type " + dataType); + } + } catch (RuntimeException e){ + throw new IOException("Failed to parse value: " + value + " as " + dataType); } - i++; } + return true; } } \ No newline at end of file diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java index 667ad3d098d..9a9c1260434 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java @@ -1,10 +1,27 @@ -package org.apache.drill.exec.store.fixedwidth; +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ -import com.fasterxml.jackson.annotation.JsonProperty; -import org.apache.drill.common.types.TypeProtos; +package org.apache.drill.exec.store.fixedwidth; import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonTypeName; +import org.apache.drill.common.types.TypeProtos; @JsonTypeName("fixedwidthReaderFieldDescription") @JsonInclude(JsonInclude.Include.NON_DEFAULT) @@ -28,44 +45,13 @@ public FixedwidthFieldConfig(@JsonProperty("dataType") TypeProtos.MinorType data this.fieldWidth = fieldWidth; } - public TypeProtos.MinorType getDataType(){ - return dataType; - } - -// public void setDataType(TypeProtos.MinorType dataType){ -// this.dataType = dataType; -// } + public TypeProtos.MinorType getDataType() {return dataType;} - public String getFieldName(){ - return fieldName; - } + public String getFieldName() {return fieldName;} -// public void setFieldName(String fieldName){ -// this.fieldName = fieldName; -// } - - public String getDateTimeFormat() { - return dateTimeFormat; - } - -// public void setDateTimeFormat(String dateTimeFormat) { -// this.dateTimeFormat = dateTimeFormat; -// } - - public int getStartIndex(){ - return startIndex; - } - -// public void setStartIndex(int startIndex){ -// this.startIndex = startIndex; -// } - - public int getFieldWidth(){ - return fieldWidth; - } + public String getDateTimeFormat() {return dateTimeFormat;} -// public void setFieldWidth(int fieldWidth){ -// this.fieldWidth = fieldWidth; -// } + public int getStartIndex() {return startIndex;} + public int getFieldWidth() {return fieldWidth;} } diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index a05a7bd0e4d..06f867a2d37 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -39,9 +39,9 @@ public class FixedwidthFormatConfig implements FormatPluginConfig { @JsonCreator public FixedwidthFormatConfig(@JsonProperty("extensions") List extensions, @JsonProperty("fields") List fields) { - this.extensions = extensions == null ? Collections.singletonList("sav") : ImmutableList.copyOf(extensions); + this.extensions = extensions == null ? Collections.singletonList("fwf") : ImmutableList.copyOf(extensions); this.fields = fields; - } //Change this + } @JsonInclude(JsonInclude.Include.NON_DEFAULT) public List getExtensions() { @@ -52,10 +52,6 @@ public List getFields() { return fields; } -// public FixedwidthReaderConfig getReaderConfig(FixedwidthFormatPlugin plugin) { -// return new FixedwidthReaderConfig(plugin); -// } - @Override public int hashCode() { return Objects.hash(extensions, fields); diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java index 86861cb5040..a10aad9ea11 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java @@ -24,7 +24,6 @@ import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileReaderFactory; import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileScanBuilder; import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator; - import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader; import org.apache.drill.exec.server.DrillbitContext; import org.apache.drill.exec.server.options.OptionManager; @@ -59,26 +58,26 @@ public FixedwidthFormatPlugin(String name, StoragePluginConfig storageConfig, FixedwidthFormatConfig formatConfig) { super(name, easyConfig(fsConf, formatConfig), context, storageConfig, formatConfig); - } //final? + } private static EasyFormatConfig easyConfig(Configuration fsConf, FixedwidthFormatConfig pluginConfig) { return EasyFormatConfig.builder() - .readable(true) - .writable(false) - .blockSplittable(false) - .compressible(true) - .supportsProjectPushdown(true) - .extensions(pluginConfig.getExtensions()) - .fsConf(fsConf) - .defaultName(DEFAULT_NAME) - .useEnhancedScan(true) - .supportsLimitPushdown(true) - .build(); + .readable(true) + .writable(false) + .blockSplittable(false) + .compressible(true) + .supportsProjectPushdown(true) + .extensions(pluginConfig.getExtensions()) + .fsConf(fsConf) + .defaultName(DEFAULT_NAME) + .useEnhancedScan(true) + .supportsLimitPushdown(true) + .build(); } @Override public ManagedReader newBatchReader( - EasySubScan scan, OptionManager options) { + EasySubScan scan, OptionManager options) { return new FixedwidthBatchReader(getConfig(), scan.getMaxRecords()); } @@ -86,7 +85,6 @@ public ManagedReader newBatchReader( protected FileScanBuilder frameworkBuilder(OptionManager options, EasySubScan scan) { FileScanBuilder builder = new FileScanBuilder(); builder.setReaderFactory(new FixedwidthReaderFactory(getConfig(), scan.getMaxRecords())); - initScanBuilder(builder, scan); builder.nullType(Types.optional(TypeProtos.MinorType.VARCHAR)); return builder; diff --git a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java index 309837323be..b8ce99f09c6 100644 --- a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java @@ -1,3 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.drill.exec.store.fixedwidth; import org.apache.drill.categories.RowSetTests; @@ -28,16 +46,15 @@ public class TestFixedwidthRecordReader extends ClusterTest { @BeforeClass public static void setup() throws Exception { ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); - FixedwidthFormatConfig formatConfig = new FixedwidthFormatConfig(Lists.newArrayList("fwf"), - Lists.newArrayList( - new FixedwidthFieldConfig(TypeProtos.MinorType.INT, "Number", "", 1, 4), - new FixedwidthFieldConfig(TypeProtos.MinorType.VARCHAR, "Letter", "", 6, 4), - new FixedwidthFieldConfig(TypeProtos.MinorType.INT,"Address","",11,3), - new FixedwidthFieldConfig(TypeProtos.MinorType.DATE,"Date","MM-dd-yyyy",15,10), - new FixedwidthFieldConfig(TypeProtos.MinorType.TIME,"Time","HH:mm:ss",26,8), - new FixedwidthFieldConfig(TypeProtos.MinorType.TIMESTAMP,"DateTime","MM-dd-yyyy'T'HH:mm:ss.SSX",35,23) - )); + Lists.newArrayList( + new FixedwidthFieldConfig(TypeProtos.MinorType.INT, "Number", "", 1, 4), + new FixedwidthFieldConfig(TypeProtos.MinorType.VARCHAR, "Letter", "", 6, 4), + new FixedwidthFieldConfig(TypeProtos.MinorType.INT, "Address", "", 11, 3), + new FixedwidthFieldConfig(TypeProtos.MinorType.DATE, "Date", "MM-dd-yyyy", 15, 10), + new FixedwidthFieldConfig(TypeProtos.MinorType.TIME, "Time", "HH:mm:ss", 26, 8), + new FixedwidthFieldConfig(TypeProtos.MinorType.TIMESTAMP, "DateTime", "MM-dd-yyyy'T'HH:mm:ss.SSX", 35, 23) + )); cluster.defineFormat("cp", "fwf", formatConfig); // Needed for compressed file unit test @@ -47,22 +64,18 @@ public static void setup() throws Exception { @Test public void testExplicitQuery() throws Exception { String sql = "SELECT ID, Urban, Urban_value FROM dfs.`spss/testdata.sav` WHERE d16=4"; - QueryBuilder q = client.queryBuilder().sql(sql); RowSet results = q.rowSet(); TupleMetadata expectedSchema = new SchemaBuilder() - .addNullable("ID", TypeProtos.MinorType.FLOAT8) - .addNullable("Urban", TypeProtos.MinorType.FLOAT8) - .addNullable("Urban_value", TypeProtos.MinorType.VARCHAR) - .buildSchema(); - - + .addNullable("ID", TypeProtos.MinorType.FLOAT8) + .addNullable("Urban", TypeProtos.MinorType.FLOAT8) + .addNullable("Urban_value", TypeProtos.MinorType.VARCHAR) + .buildSchema(); RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema) - .addRow(47.0, 1.0, "Urban").addRow(53.0, 1.0, "Urban") - .addRow(66.0, 1.0, "Urban") - .build(); - + .addRow(47.0, 1.0, "Urban").addRow(53.0, 1.0, "Urban") + .addRow(66.0, 1.0, "Urban") + .build(); assertEquals(3, results.rowCount()); new RowSetComparison(expected).verifyAndClearAll(results); @@ -70,46 +83,44 @@ public void testExplicitQuery() throws Exception { @Test public void testBatchReader() throws Exception { - String sql = "SELECT * FROM cp.`fwf/test.fwf`"; + String sql = "SELECT * FROM cp.`fwf/test.fwf` LIMIT 30"; RowSet results = client.queryBuilder().sql(sql).rowSet(); TupleMetadata expectedSchema = new SchemaBuilder() - .addNullable("Number", TypeProtos.MinorType.INT) - .addNullable("Letter", TypeProtos.MinorType.VARCHAR) - .addNullable("Address", TypeProtos.MinorType.INT) - .addNullable("Date", TypeProtos.MinorType.DATE) - .addNullable("Time",TypeProtos.MinorType.TIME) - .addNullable("DateTime",TypeProtos.MinorType.TIMESTAMP) - .buildSchema(); - - + .addNullable("Number", TypeProtos.MinorType.INT) + .addNullable("Letter", TypeProtos.MinorType.VARCHAR) + .addNullable("Address", TypeProtos.MinorType.INT) + .addNullable("Date", TypeProtos.MinorType.DATE) + .addNullable("Time", TypeProtos.MinorType.TIME) + .addNullable("DateTime", TypeProtos.MinorType.TIMESTAMP) + .buildSchema(); RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema) - .addRow(1234, "test", 567, LocalDate.parse("2021-02-10"), LocalTime.parse("10:30:27"), Instant.parse("2021-02-10T15:30:27.00Z")) - .addRow(5678, "TEST", 890, LocalDate.parse("2021-07-27"), LocalTime.parse("12:40:15"), Instant.parse("2021-07-27T16:40:15.00Z")) - .addRow(1111, "abcd", 111, LocalDate.parse("1111-11-11"), LocalTime.parse("11:11:11"), Instant.parse("1111-11-11T16:28:43.11Z")) - .addRow(2222, "efgh", 222, LocalDate.parse("2222-01-22"), LocalTime.parse("22:22:22"), Instant.parse("2222-01-23T03:22:22.22Z")) - .addRow(3333, "ijkl", 333, LocalDate.parse("3333-02-01"), LocalTime.parse("01:33:33"), Instant.parse("3333-02-01T06:33:33.33Z")) - .addRow(4444, "mnop", 444, LocalDate.parse("4444-03-02"), LocalTime.parse("02:44:44"), Instant.parse("4444-03-02T07:44:44.44Z")) - .addRow(5555, "qrst", 555, LocalDate.parse("5555-04-03"), LocalTime.parse("03:55:55"), Instant.parse("5555-04-03T07:55:55.55Z")) - .addRow(6666, "uvwx", 666, LocalDate.parse("6666-05-04"), LocalTime.parse("04:01:01"), Instant.parse("6666-05-04T08:01:01.01Z")) - .addRow(7777, "yzzz", 777, LocalDate.parse("7777-06-05"), LocalTime.parse("05:11:11"), Instant.parse("7777-06-05T09:11:11.11Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .build(); + .addRow(1234, "test", 567, LocalDate.parse("2021-02-10"), LocalTime.parse("10:30:27"), Instant.parse("2021-02-10T15:30:27.00Z")) + .addRow(5678, "TEST", 890, LocalDate.parse("2021-07-27"), LocalTime.parse("12:40:15"), Instant.parse("2021-07-27T16:40:15.00Z")) + .addRow(1111, "abcd", 111, LocalDate.parse("1111-11-11"), LocalTime.parse("11:11:11"), Instant.parse("1111-11-11T16:28:43.11Z")) + .addRow(2222, "efgh", 222, LocalDate.parse("2222-01-22"), LocalTime.parse("22:22:22"), Instant.parse("2222-01-23T03:22:22.22Z")) + .addRow(3333, "ijkl", 333, LocalDate.parse("3333-02-01"), LocalTime.parse("01:33:33"), Instant.parse("3333-02-01T06:33:33.33Z")) + .addRow(4444, "mnop", 444, LocalDate.parse("4444-03-02"), LocalTime.parse("02:44:44"), Instant.parse("4444-03-02T07:44:44.44Z")) + .addRow(5555, "qrst", 555, LocalDate.parse("5555-04-03"), LocalTime.parse("03:55:55"), Instant.parse("5555-04-03T07:55:55.55Z")) + .addRow(6666, "uvwx", 666, LocalDate.parse("6666-05-04"), LocalTime.parse("04:01:01"), Instant.parse("6666-05-04T08:01:01.01Z")) + .addRow(7777, "yzzz", 777, LocalDate.parse("7777-06-05"), LocalTime.parse("05:11:11"), Instant.parse("7777-06-05T09:11:11.11Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .build(); System.out.println(expected); assertEquals(25, results.rowCount()); @@ -117,7 +128,6 @@ public void testBatchReader() throws Exception { //System.out.println(results.batchSchema()); System.out.println(results); - new RowSetComparison(expected).verifyAndClearAll(results); System.out.println("Test complete."); client.close(); diff --git a/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf b/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf index 9eba0f61944..98cea6d8607 100644 --- a/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf +++ b/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf @@ -1,4 +1,4 @@ -1234 test 567 02-10-2021 10:30:27 02-10-2021T10:30:27.00Z +1T34 test 567 02-10-2021 10:30:27 02-10-2021T10:30:27.00Z 5678 TEST 890 07-27-2021 12:40:15 07-27-2021T12:40:15.00Z 1111 abcd 111 11-11-1111 11:11:11 11-11-1111T11:11:11.11Z 2222 efgh 222 01-22-2222 22:22:22 01-22-2222T22:22:22.22Z diff --git a/distribution/pom.xml b/distribution/pom.xml index f3f44747e46..f4073be8987 100644 --- a/distribution/pom.xml +++ b/distribution/pom.xml @@ -460,6 +460,11 @@ drill-format-excel ${project.version} + + org.apache.drill.contrib + drill-format-fixedwidth + ${project.version} + org.apache.drill.contrib drill-druid-storage diff --git a/distribution/src/assemble/component.xml b/distribution/src/assemble/component.xml index 853793d4d51..4752b1a4e44 100644 --- a/distribution/src/assemble/component.xml +++ b/distribution/src/assemble/component.xml @@ -55,6 +55,7 @@ org.apache.drill.contrib:drill-format-excel:jar org.apache.drill.contrib:drill-format-spss:jar org.apache.drill.contrib:drill-format-sas:jar + org.apache.drill.contrib:drill-format-fixedwidth:jar org.apache.drill.contrib:drill-jdbc-storage:jar org.apache.drill.contrib:drill-kudu-storage:jar org.apache.drill.contrib:drill-storage-phoenix:jar From f2920a07b05b9cd84f63ca9c8a646f0b8b8ffc6f Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Mon, 13 Sep 2021 15:32:54 -0400 Subject: [PATCH 27/41] Addressing Review Comments. - Simplified FieldConfig variables - Added compressed file test - Added unit test for explicit column references - Modified close() to include AutoCloseables - Added Long data type - Added Decimal data type - not fully implemented --- .../fixedwidth/FixedwidthBatchReader.java | 34 +++-- .../fixedwidth/FixedwidthFieldConfig.java | 96 +++++++++++--- .../fixedwidth/FixedwidthFormatPlugin.java | 2 +- .../TestFixedwidthRecordReader.java | 124 +++++++++++------- .../src/test/resources/fwf/test.fwf | 50 +++---- 5 files changed, 198 insertions(+), 108 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index 2be542d8d39..d389211b930 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -18,6 +18,7 @@ package org.apache.drill.exec.store.fixedwidth; +import org.apache.drill.common.AutoCloseables; import org.apache.drill.common.exceptions.CustomErrorContext; import org.apache.drill.common.exceptions.UserException; import org.apache.drill.common.types.TypeProtos; @@ -36,6 +37,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.math.BigDecimal; import java.time.Instant; import java.time.LocalDate; import java.time.LocalDateTime; @@ -63,6 +65,7 @@ public FixedwidthBatchReader(FixedwidthFormatConfig config, int maxRecords) { this.maxRecords = maxRecords; } + @Override public boolean open(FileSchemaNegotiator negotiator) { split = negotiator.split(); @@ -112,23 +115,21 @@ public boolean next() { // Use loader to read data from file to turn into Drill @Override public void close() { - try { - fsStream.close(); - loader.close(); - } catch (Exception e) { - throw UserException - .dataReadError(e) - .message("Failed to close input file: {}", split.getPath().toString()) - .addContext(errorContext) - .addContext(e.getMessage()) - .build(logger); + if (fsStream != null){ + AutoCloseables.closeSilently(fsStream); + fsStream = null; } } private TupleMetadata buildSchema() { SchemaBuilder builder = new SchemaBuilder(); for (FixedwidthFieldConfig field : config.getFields()) { - builder.addNullable(field.getFieldName(), field.getDataType()); + if (field.getType() == TypeProtos.MinorType.VARDECIMAL){ + builder.addNullable(field.getName(), TypeProtos.MinorType.VARDECIMAL,38,4); + //revisit this + } else { + builder.addNullable(field.getName(), field.getType()); + } } return builder.buildSchema(); } @@ -140,8 +141,8 @@ private boolean parseLine(String line, RowSetLoader writer) throws IOException { String dateTimeFormat; String value; for (FixedwidthFieldConfig field : config.getFields()) { - value = line.substring(field.getStartIndex() - 1, field.getStartIndex() + field.getFieldWidth() - 1); - dataType = field.getDataType(); + value = line.substring(field.getIndex() - 1, field.getIndex() + field.getWidth() - 1); + dataType = field.getType(); dateTimeFormat = field.getDateTimeFormat(); DateTimeFormatter formatter = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); try { @@ -173,6 +174,13 @@ private boolean parseLine(String line, RowSetLoader writer) throws IOException { case FLOAT8: writer.scalar(i).setDouble(Double.parseDouble(value)); break; + case BIGINT: + writer.scalar(i).setLong(Long.parseLong(value)); + break; + case VARDECIMAL: + BigDecimal bigDecimal = new BigDecimal(value); + writer.scalar(i).setDecimal(bigDecimal); + break; default: throw new RuntimeException("Unknown data type specified in fixed width. Found data type " + dataType); } diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java index 9a9c1260434..69ad9b55b6d 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java @@ -21,37 +21,97 @@ import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonTypeName; +import org.apache.drill.common.PlanStringBuilder; import org.apache.drill.common.types.TypeProtos; +import java.util.Objects; + + @JsonTypeName("fixedwidthReaderFieldDescription") @JsonInclude(JsonInclude.Include.NON_DEFAULT) public class FixedwidthFieldConfig { - private final TypeProtos.MinorType dataType; - private final String fieldName; + private final String name; + private final int index; + private final int width; + private final TypeProtos.MinorType type; private final String dateTimeFormat; - private final int startIndex; - private final int fieldWidth; - - public FixedwidthFieldConfig(@JsonProperty("dataType") TypeProtos.MinorType dataType, - @JsonProperty("fieldName") String fieldName, - @JsonProperty("dateTimeFormat") String dateTimeFormat, - @JsonProperty("startIndex") int startIndex, - @JsonProperty("fieldWidth") int fieldWidth) { - this.dataType = dataType; - this.fieldName = fieldName; + + public FixedwidthFieldConfig(@JsonProperty("name") String name, + @JsonProperty("index") int index, + @JsonProperty("width") int width, + @JsonProperty("type") TypeProtos.MinorType type, + @JsonProperty("dateTimeFormat") String dateTimeFormat) { + + this.name = name; + this.index = index; + this.width = width; + this.type = type; this.dateTimeFormat = dateTimeFormat; - this.startIndex = startIndex; - this.fieldWidth = fieldWidth; + + // Need to verify names are different - where can we access all the names of other columns +// if(name != null){ +// this.name = name; +// } else{ +// throw new IllegalArgumentException("Invalid name"); //Is this the right way to throw an exception if blank? What about if not valid SQL? +// } +// +// if (index >= 0){ +// this.index = index; +// } else { +// throw new IllegalArgumentException("Index must be 0 or greater"); +// } +// +// //Can modify this to be optional and be calculated based on start index of this field and next +// this.width = width; +// +// if (type == null){ +// this.type = TypeProtos.MinorType.VARCHAR; +// } else { +// this.type = type; +// } +// this.dateTimeFormat = dateTimeFormat; // No default required, null is allowed } - public TypeProtos.MinorType getDataType() {return dataType;} + public String getName() {return name;} + + public int getIndex() {return index;} - public String getFieldName() {return fieldName;} + public int getWidth() {return width;} + + public TypeProtos.MinorType getType() {return type;} public String getDateTimeFormat() {return dateTimeFormat;} - public int getStartIndex() {return startIndex;} + @Override + public int hashCode() { + return Objects.hash(name, index, width, type, dateTimeFormat); + } - public int getFieldWidth() {return fieldWidth;} + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + FixedwidthFieldConfig other = (FixedwidthFieldConfig) obj; + return Objects.equals(name, other.name) + && Objects.equals(index, other.index) + && Objects.equals(width, other.width) + && Objects.equals(type, other.type) + && Objects.equals(dateTimeFormat, other.dateTimeFormat); + } + + @Override + public String toString() { + return new PlanStringBuilder(this) + .field("name", name) + .field("index", index) + .field("width", width) + .field("type", type) + .field("dateTimeFormat", dateTimeFormat) + .toString(); + } } diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java index a10aad9ea11..f96e4a81f77 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java @@ -64,7 +64,7 @@ private static EasyFormatConfig easyConfig(Configuration fsConf, FixedwidthForma return EasyFormatConfig.builder() .readable(true) .writable(false) - .blockSplittable(false) + .blockSplittable(false) // Change to true .compressible(true) .supportsProjectPushdown(true) .extensions(pluginConfig.getExtensions()) diff --git a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java index b8ce99f09c6..60faf73c723 100644 --- a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java @@ -38,7 +38,7 @@ import java.time.LocalDate; import java.time.LocalTime; -import static org.junit.Assert.assertEquals; +import static org.apache.drill.test.QueryTestUtil.generateCompressedFile; @Category(RowSetTests.class) public class TestFixedwidthRecordReader extends ClusterTest { @@ -46,91 +46,113 @@ public class TestFixedwidthRecordReader extends ClusterTest { @BeforeClass public static void setup() throws Exception { ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); + FixedwidthFormatConfig formatConfig = new FixedwidthFormatConfig(Lists.newArrayList("fwf"), Lists.newArrayList( - new FixedwidthFieldConfig(TypeProtos.MinorType.INT, "Number", "", 1, 4), - new FixedwidthFieldConfig(TypeProtos.MinorType.VARCHAR, "Letter", "", 6, 4), - new FixedwidthFieldConfig(TypeProtos.MinorType.INT, "Address", "", 11, 3), - new FixedwidthFieldConfig(TypeProtos.MinorType.DATE, "Date", "MM-dd-yyyy", 15, 10), - new FixedwidthFieldConfig(TypeProtos.MinorType.TIME, "Time", "HH:mm:ss", 26, 8), - new FixedwidthFieldConfig(TypeProtos.MinorType.TIMESTAMP, "DateTime", "MM-dd-yyyy'T'HH:mm:ss.SSX", 35, 23) + new FixedwidthFieldConfig("Number", 1, 5, TypeProtos.MinorType.VARDECIMAL, ""), + new FixedwidthFieldConfig("Letter", 7,4, TypeProtos.MinorType.VARCHAR, ""), + new FixedwidthFieldConfig("Address",12, 3,TypeProtos.MinorType.INT, ""), + new FixedwidthFieldConfig("Date",16, 10,TypeProtos.MinorType.DATE, "MM-dd-yyyy"), + new FixedwidthFieldConfig( "Time", 27, 8,TypeProtos.MinorType.TIME,"HH:mm:ss" ), + new FixedwidthFieldConfig("DateTime", 36, 23,TypeProtos.MinorType.TIMESTAMP, "MM-dd-yyyy'T'HH:mm:ss.SSX" ) )); + cluster.defineFormat("dfs", "fwf", formatConfig); cluster.defineFormat("cp", "fwf", formatConfig); // Needed for compressed file unit test dirTestWatcher.copyResourceToRoot(Paths.get("fwf/")); } + @Test + public void testStarQuery() throws Exception { + String sql = "SELECT * FROM cp.`fwf/test.fwf`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + RowSet expected = setupTestData(); + new RowSetComparison(expected).verifyAndClearAll(results); + } + + @Test + public void testExplicitAllQuery() throws Exception { + String sql = "SELECT Number, Letter, Address, `Date`, `Time`, DateTime FROM cp.`fwf/test.fwf`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + RowSet expected = setupTestData(); + new RowSetComparison(expected).verifyAndClearAll(results); + } + @Test public void testExplicitQuery() throws Exception { - String sql = "SELECT ID, Urban, Urban_value FROM dfs.`spss/testdata.sav` WHERE d16=4"; + String sql = "SELECT Number, Letter, Address FROM cp.`fwf/test.fwf` WHERE Letter='yzzz'"; QueryBuilder q = client.queryBuilder().sql(sql); RowSet results = q.rowSet(); TupleMetadata expectedSchema = new SchemaBuilder() - .addNullable("ID", TypeProtos.MinorType.FLOAT8) - .addNullable("Urban", TypeProtos.MinorType.FLOAT8) - .addNullable("Urban_value", TypeProtos.MinorType.VARCHAR) + .addNullable("Number", TypeProtos.MinorType.VARDECIMAL,38,4) + .addNullable("Letter", TypeProtos.MinorType.VARCHAR) + .addNullable("Address", TypeProtos.MinorType.INT) .buildSchema(); RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema) - .addRow(47.0, 1.0, "Urban").addRow(53.0, 1.0, "Urban") - .addRow(66.0, 1.0, "Urban") + .addRow(77.77, "yzzz", 777) .build(); - assertEquals(3, results.rowCount()); new RowSetComparison(expected).verifyAndClearAll(results); } + //Test Serialization/Deserialization + + //Test Compressed File @Test - public void testBatchReader() throws Exception { - String sql = "SELECT * FROM cp.`fwf/test.fwf` LIMIT 30"; + public void testStarQueryWithCompressedFile() throws Exception { + generateCompressedFile("fwf/test.fwf", "zip", "fwf/test.fwf.zip" ); + + String sql = "SELECT * FROM dfs.`fwf/test.fwf.zip`"; + System.out.println("Compressed file generated"); RowSet results = client.queryBuilder().sql(sql).rowSet(); + RowSet expected = setupTestData(); + new RowSetComparison(expected).verifyAndClearAll(results); + } + + // Test Entering invalid schemata - incorrect limits + private RowSet setupTestData(){ TupleMetadata expectedSchema = new SchemaBuilder() - .addNullable("Number", TypeProtos.MinorType.INT) + .addNullable("Number", TypeProtos.MinorType.VARDECIMAL,38,4) .addNullable("Letter", TypeProtos.MinorType.VARCHAR) .addNullable("Address", TypeProtos.MinorType.INT) .addNullable("Date", TypeProtos.MinorType.DATE) .addNullable("Time", TypeProtos.MinorType.TIME) .addNullable("DateTime", TypeProtos.MinorType.TIMESTAMP) .buildSchema(); + RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema) - .addRow(1234, "test", 567, LocalDate.parse("2021-02-10"), LocalTime.parse("10:30:27"), Instant.parse("2021-02-10T15:30:27.00Z")) - .addRow(5678, "TEST", 890, LocalDate.parse("2021-07-27"), LocalTime.parse("12:40:15"), Instant.parse("2021-07-27T16:40:15.00Z")) - .addRow(1111, "abcd", 111, LocalDate.parse("1111-11-11"), LocalTime.parse("11:11:11"), Instant.parse("1111-11-11T16:28:43.11Z")) - .addRow(2222, "efgh", 222, LocalDate.parse("2222-01-22"), LocalTime.parse("22:22:22"), Instant.parse("2222-01-23T03:22:22.22Z")) - .addRow(3333, "ijkl", 333, LocalDate.parse("3333-02-01"), LocalTime.parse("01:33:33"), Instant.parse("3333-02-01T06:33:33.33Z")) - .addRow(4444, "mnop", 444, LocalDate.parse("4444-03-02"), LocalTime.parse("02:44:44"), Instant.parse("4444-03-02T07:44:44.44Z")) - .addRow(5555, "qrst", 555, LocalDate.parse("5555-04-03"), LocalTime.parse("03:55:55"), Instant.parse("5555-04-03T07:55:55.55Z")) - .addRow(6666, "uvwx", 666, LocalDate.parse("6666-05-04"), LocalTime.parse("04:01:01"), Instant.parse("6666-05-04T08:01:01.01Z")) - .addRow(7777, "yzzz", 777, LocalDate.parse("7777-06-05"), LocalTime.parse("05:11:11"), Instant.parse("7777-06-05T09:11:11.11Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) - .addRow(8888, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(12.34, "test", 567, LocalDate.parse("2021-02-10"), LocalTime.parse("10:30:27"), Instant.parse("2021-02-10T15:30:27.00Z")) + .addRow(56.78, "TEST", 890, LocalDate.parse("2021-07-27"), LocalTime.parse("12:40:15"), Instant.parse("2021-07-27T16:40:15.00Z")) + .addRow(11.11, "abcd", 111, LocalDate.parse("1111-11-11"), LocalTime.parse("11:11:11"), Instant.parse("1111-11-11T16:28:43.11Z")) + .addRow(22.22, "efgh", 222, LocalDate.parse("2222-01-22"), LocalTime.parse("22:22:22"), Instant.parse("2222-01-23T03:22:22.22Z")) + .addRow(33.33, "ijkl", 333, LocalDate.parse("3333-02-01"), LocalTime.parse("01:33:33"), Instant.parse("3333-02-01T06:33:33.33Z")) + .addRow(44.44, "mnop", 444, LocalDate.parse("4444-03-02"), LocalTime.parse("02:44:44"), Instant.parse("4444-03-02T07:44:44.44Z")) + .addRow(55.55, "qrst", 555, LocalDate.parse("5555-04-03"), LocalTime.parse("03:55:55"), Instant.parse("5555-04-03T07:55:55.55Z")) + .addRow(66.66, "uvwx", 666, LocalDate.parse("6666-05-04"), LocalTime.parse("04:01:01"), Instant.parse("6666-05-04T08:01:01.01Z")) + .addRow(77.77, "yzzz", 777, LocalDate.parse("7777-06-05"), LocalTime.parse("05:11:11"), Instant.parse("7777-06-05T09:11:11.11Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) + .addRow(88.88, "aabb", 888, LocalDate.parse("8888-07-06"), LocalTime.parse("06:22:22"), Instant.parse("8888-07-07T10:22:22.22Z")) .build(); - System.out.println(expected); - assertEquals(25, results.rowCount()); - - //System.out.println(results.batchSchema()); - System.out.println(results); - - new RowSetComparison(expected).verifyAndClearAll(results); - System.out.println("Test complete."); - client.close(); + return expected; } } + diff --git a/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf b/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf index 98cea6d8607..71be3669fec 100644 --- a/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf +++ b/contrib/format-fixedwidth/src/test/resources/fwf/test.fwf @@ -1,25 +1,25 @@ -1T34 test 567 02-10-2021 10:30:27 02-10-2021T10:30:27.00Z -5678 TEST 890 07-27-2021 12:40:15 07-27-2021T12:40:15.00Z -1111 abcd 111 11-11-1111 11:11:11 11-11-1111T11:11:11.11Z -2222 efgh 222 01-22-2222 22:22:22 01-22-2222T22:22:22.22Z -3333 ijkl 333 02-01-3333 01:33:33 02-01-3333T01:33:33.33Z -4444 mnop 444 03-02-4444 02:44:44 03-02-4444T02:44:44.44Z -5555 qrst 555 04-03-5555 03:55:55 04-03-5555T03:55:55.55Z -6666 uvwx 666 05-04-6666 04:01:01 05-04-6666T04:01:01.01Z -7777 yzzz 777 06-05-7777 05:11:11 06-05-7777T05:11:11.11Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z -8888 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z \ No newline at end of file +12.34 test 567 02-10-2021 10:30:27 02-10-2021T10:30:27.00Z +56.78 TEST 890 07-27-2021 12:40:15 07-27-2021T12:40:15.00Z +11.11 abcd 111 11-11-1111 11:11:11 11-11-1111T11:11:11.11Z +22.22 efgh 222 01-22-2222 22:22:22 01-22-2222T22:22:22.22Z +33.33 ijkl 333 02-01-3333 01:33:33 02-01-3333T01:33:33.33Z +44.44 mnop 444 03-02-4444 02:44:44 03-02-4444T02:44:44.44Z +55.55 qrst 555 04-03-5555 03:55:55 04-03-5555T03:55:55.55Z +66.66 uvwx 666 05-04-6666 04:01:01 05-04-6666T04:01:01.01Z +77.77 yzzz 777 06-05-7777 05:11:11 06-05-7777T05:11:11.11Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z From 7a68da5f92a47ba2b433b0a097494569d7450529 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Fri, 15 Oct 2021 10:44:29 -0400 Subject: [PATCH 28/41] Added Serialization/Deserialization test, added blank row test file, cleaned up compressed file test --- .../TestFixedwidthRecordReader.java | 67 ++++++++++++++++++- .../src/test/resources/fwf/test_blankrow.fwf | 26 +++++++ 2 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 contrib/format-fixedwidth/src/test/resources/fwf/test_blankrow.fwf diff --git a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java index 60faf73c723..a24d3498297 100644 --- a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java @@ -39,6 +39,7 @@ import java.time.LocalTime; import static org.apache.drill.test.QueryTestUtil.generateCompressedFile; +import static org.junit.Assert.assertEquals; @Category(RowSetTests.class) public class TestFixedwidthRecordReader extends ClusterTest { @@ -98,20 +99,82 @@ public void testExplicitQuery() throws Exception { } //Test Serialization/Deserialization + @Test + public void testSerDe() throws Exception { + String sql = "SELECT COUNT(*) FROM dfs.`fwf/test.fwf`"; + String plan = queryBuilder().sql(sql).explainJson(); + long cnt = queryBuilder().physical(plan).singletonLong(); + assertEquals(5L, cnt); + } - //Test Compressed File @Test public void testStarQueryWithCompressedFile() throws Exception { generateCompressedFile("fwf/test.fwf", "zip", "fwf/test.fwf.zip" ); String sql = "SELECT * FROM dfs.`fwf/test.fwf.zip`"; - System.out.println("Compressed file generated"); RowSet results = client.queryBuilder().sql(sql).rowSet(); RowSet expected = setupTestData(); new RowSetComparison(expected).verifyAndClearAll(results); } // Test Entering invalid schemata - incorrect limits + // Undefined field, what happens + // Parse invalid file, make sure correct error + + + @Test + public void testOutOfOrder() throws Exception{ + String sql = "SELECT Address, DateTime, `Date`, Letter FROM cp.`fwf/test.fwf`"; + QueryBuilder q = client.queryBuilder().sql(sql); + RowSet results = q.rowSet(); + + TupleMetadata expectedSchema = new SchemaBuilder() + .addNullable("Address", TypeProtos.MinorType.INT) + .addNullable("DateTime", TypeProtos.MinorType.TIMESTAMP) + .addNullable("Date", TypeProtos.MinorType.DATE) + .addNullable("Letter", TypeProtos.MinorType.VARCHAR) + .buildSchema(); + RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema) + .addRow(567, Instant.parse("2021-02-10T15:30:27.00Z"), LocalDate.parse("2021-02-10"), "test") + .addRow(890, Instant.parse("2021-07-27T16:40:15.00Z"), LocalDate.parse("2021-07-27"), "TEST") + .addRow(111, Instant.parse("1111-11-11T16:28:43.11Z"), LocalDate.parse("1111-11-11"), "abcd") + .addRow(222, Instant.parse("2222-01-23T03:22:22.22Z"), LocalDate.parse("2222-01-22"), "efgh") + .addRow(333, Instant.parse("3333-02-01T06:33:33.33Z"), LocalDate.parse("3333-02-01"), "ijkl") + .addRow(444, Instant.parse("4444-03-02T07:44:44.44Z"), LocalDate.parse("4444-03-02"), "mnop") + .addRow(555, Instant.parse("5555-04-03T07:55:55.55Z"), LocalDate.parse("5555-04-03"), "qrst") + .addRow(666, Instant.parse("6666-05-04T08:01:01.01Z"), LocalDate.parse("6666-05-04"), "uvwx") + .addRow(777, Instant.parse("7777-06-05T09:11:11.11Z"), LocalDate.parse("7777-06-05"), "yzzz") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .addRow(888, Instant.parse("8888-07-07T10:22:22.22Z"), LocalDate.parse("8888-07-06"), "aabb") + .build(); + + new RowSetComparison(expected).verifyAndClearAll(results); + } + + // How should we be handling an empty/blank row? + @Test + public void testEmptyRow() throws Exception { + String sql = "SELECT * FROM cp.`fwf/test_blankrow.fwf`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + RowSet expected = setupTestData(); + new RowSetComparison(expected).verifyAndClearAll(results); + } + + // private RowSet setupTestData(){ TupleMetadata expectedSchema = new SchemaBuilder() diff --git a/contrib/format-fixedwidth/src/test/resources/fwf/test_blankrow.fwf b/contrib/format-fixedwidth/src/test/resources/fwf/test_blankrow.fwf new file mode 100644 index 00000000000..6c582f8d615 --- /dev/null +++ b/contrib/format-fixedwidth/src/test/resources/fwf/test_blankrow.fwf @@ -0,0 +1,26 @@ +12.34 test 567 02-10-2021 10:30:27 02-10-2021T10:30:27.00Z +56.78 TEST 890 07-27-2021 12:40:15 07-27-2021T12:40:15.00Z +11.11 abcd 111 11-11-1111 11:11:11 11-11-1111T11:11:11.11Z +22.22 efgh 222 01-22-2222 22:22:22 01-22-2222T22:22:22.22Z +33.33 ijkl 333 02-01-3333 01:33:33 02-01-3333T01:33:33.33Z +44.44 mnop 444 03-02-4444 02:44:44 03-02-4444T02:44:44.44Z +55.55 qrst 555 04-03-5555 03:55:55 04-03-5555T03:55:55.55Z +66.66 uvwx 666 05-04-6666 04:01:01 05-04-6666T04:01:01.01Z +77.77 yzzz 777 06-05-7777 05:11:11 06-05-7777T05:11:11.11Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z + +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z +88.88 aabb 888 07-06-8888 06:22:22 07-07-8888T06:22:22.22Z From e0110da4975263a70b989e94cd4c22ce2cf788f5 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Wed, 20 Oct 2021 15:44:09 -0400 Subject: [PATCH 29/41] Fixed Serialization/Deserialization test --- .../exec/store/fixedwidth/FixedwidthBatchReader.java | 12 ++++++++---- .../store/fixedwidth/TestFixedwidthRecordReader.java | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index d389211b930..f63eac3b528 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -143,8 +143,6 @@ private boolean parseLine(String line, RowSetLoader writer) throws IOException { for (FixedwidthFieldConfig field : config.getFields()) { value = line.substring(field.getIndex() - 1, field.getIndex() + field.getWidth() - 1); dataType = field.getType(); - dateTimeFormat = field.getDateTimeFormat(); - DateTimeFormatter formatter = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); try { switch (dataType) { case INT: @@ -154,15 +152,21 @@ private boolean parseLine(String line, RowSetLoader writer) throws IOException { writer.scalar(i).setString(value); break; case DATE: + dateTimeFormat = field.getDateTimeFormat(); + DateTimeFormatter formatter = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); LocalDate date = LocalDate.parse(value, formatter); writer.scalar(i).setDate(date); break; case TIME: - LocalTime time = LocalTime.parse(value, formatter); + dateTimeFormat = field.getDateTimeFormat(); + DateTimeFormatter formatter2 = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); + LocalTime time = LocalTime.parse(value, formatter2); writer.scalar(i).setTime(time); break; case TIMESTAMP: - LocalDateTime ldt = LocalDateTime.parse(value, formatter); + dateTimeFormat = field.getDateTimeFormat(); + DateTimeFormatter formatter3 = DateTimeFormatter.ofPattern(dateTimeFormat, Locale.ENGLISH); + LocalDateTime ldt = LocalDateTime.parse(value, formatter3); ZoneId z = ZoneId.of("America/Toronto"); ZonedDateTime zdt = ldt.atZone(z); Instant timeStamp = zdt.toInstant(); diff --git a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java index a24d3498297..f29219e6bd7 100644 --- a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java @@ -101,10 +101,10 @@ public void testExplicitQuery() throws Exception { //Test Serialization/Deserialization @Test public void testSerDe() throws Exception { - String sql = "SELECT COUNT(*) FROM dfs.`fwf/test.fwf`"; + String sql = "SELECT COUNT(*) FROM cp.`fwf/test.fwf`"; String plan = queryBuilder().sql(sql).explainJson(); long cnt = queryBuilder().physical(plan).singletonLong(); - assertEquals(5L, cnt); + assertEquals(25L, cnt); } @Test From f01f1aa101fffd59023cef227eb0c80b6453ab19 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Thu, 4 Nov 2021 14:30:23 -0400 Subject: [PATCH 30/41] Added another constructor to enable user to not have to enter dateTimeFormat when not appropriate, started adding methods to perform field name verification (not complete). --- .../fixedwidth/FixedwidthFieldConfig.java | 9 +++++ .../fixedwidth/FixedwidthFormatConfig.java | 36 ++++++++++++++++++- .../TestFixedwidthRecordReader.java | 2 +- 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java index 69ad9b55b6d..ae2c7c0f095 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java @@ -37,6 +37,13 @@ public class FixedwidthFieldConfig { private final TypeProtos.MinorType type; private final String dateTimeFormat; + public FixedwidthFieldConfig(@JsonProperty("name") String name, + @JsonProperty("index") int index, + @JsonProperty("width") int width, + @JsonProperty("type") TypeProtos.MinorType type) { + this(name, index, width, type, null); + } + public FixedwidthFieldConfig(@JsonProperty("name") String name, @JsonProperty("index") int index, @JsonProperty("width") int width, @@ -49,6 +56,8 @@ public FixedwidthFieldConfig(@JsonProperty("name") String name, this.type = type; this.dateTimeFormat = dateTimeFormat; + + // Need to verify names are different - where can we access all the names of other columns // if(name != null){ // this.name = name; diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index 06f867a2d37..053d83676e3 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -19,13 +19,16 @@ package org.apache.drill.exec.store.fixedwidth; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonTypeName; import org.apache.drill.common.PlanStringBuilder; import org.apache.drill.common.logical.FormatPluginConfig; +import org.apache.drill.exec.store.log.LogFormatField; import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList; +import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Objects; @@ -77,4 +80,35 @@ public String toString() { .field("fields", fields) .toString(); } -} \ No newline at end of file + + + @JsonIgnore + public boolean hasFields() { + return fields != null && ! fields.isEmpty(); + } + + @JsonIgnore + public List getFieldNames() { + List result = new ArrayList<>(); + if (! hasFields()) { + return result; + } + + for (FixedwidthFieldConfig field : fields) { + result.add(field.getName()); + } + return result; + } + + @JsonIgnore + public boolean validateFieldNames(String fieldName){ + boolean result = false; + List names = this.getFieldNames(); + if (names.contains(fieldName)){ + result = true; + } + return result; + } + + +} diff --git a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java index f29219e6bd7..9aa471488f9 100644 --- a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java @@ -174,7 +174,7 @@ public void testEmptyRow() throws Exception { new RowSetComparison(expected).verifyAndClearAll(results); } - // + // Create unit test for overloaded constructor private RowSet setupTestData(){ TupleMetadata expectedSchema = new SchemaBuilder() From 5da7a773285a0d37b653b8dd7d6698c8876b981f Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Fri, 5 Nov 2021 11:13:58 -0400 Subject: [PATCH 31/41] Added method to validate field name input and verify there are no duplicates. Modified tests to enable testing of new constructor. --- .../fixedwidth/FixedwidthFieldConfig.java | 1 - .../fixedwidth/FixedwidthFormatConfig.java | 29 +++++++++++++++---- .../TestFixedwidthRecordReader.java | 2 +- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java index ae2c7c0f095..d0ab5bcda84 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java @@ -57,7 +57,6 @@ public FixedwidthFieldConfig(@JsonProperty("name") String name, this.dateTimeFormat = dateTimeFormat; - // Need to verify names are different - where can we access all the names of other columns // if(name != null){ // this.name = name; diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index 053d83676e3..00abe0a834e 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -24,18 +24,25 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonTypeName; import org.apache.drill.common.PlanStringBuilder; +import org.apache.drill.common.exceptions.UserException; import org.apache.drill.common.logical.FormatPluginConfig; import org.apache.drill.exec.store.log.LogFormatField; +import org.apache.drill.exec.store.log.LogFormatPlugin; import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Objects; +import java.util.Set; @JsonTypeName(FixedwidthFormatPlugin.DEFAULT_NAME) @JsonInclude(JsonInclude.Include.NON_DEFAULT) public class FixedwidthFormatConfig implements FormatPluginConfig { + private static final Logger logger = LoggerFactory.getLogger(FixedwidthFormatConfig.class); private final List extensions; private final List fields; @@ -44,6 +51,8 @@ public FixedwidthFormatConfig(@JsonProperty("extensions") List extension @JsonProperty("fields") List fields) { this.extensions = extensions == null ? Collections.singletonList("fwf") : ImmutableList.copyOf(extensions); this.fields = fields; + + validateFieldInput(); } @JsonInclude(JsonInclude.Include.NON_DEFAULT) @@ -101,13 +110,21 @@ public List getFieldNames() { } @JsonIgnore - public boolean validateFieldNames(String fieldName){ - boolean result = false; - List names = this.getFieldNames(); - if (names.contains(fieldName)){ - result = true; + public void validateFieldInput(){ + Set uniqueNames = new HashSet<>(); + for (String name : this.getFieldNames()){ + if (name.length() == 0){ + + } + if (uniqueNames.contains(name)){ + throw UserException + .validationError() + .message("Duplicate column name: " + name) + .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .build(logger); + } + uniqueNames.add(name); } - return result; } diff --git a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java index 9aa471488f9..a6f14e7aaf9 100644 --- a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java @@ -50,7 +50,7 @@ public static void setup() throws Exception { FixedwidthFormatConfig formatConfig = new FixedwidthFormatConfig(Lists.newArrayList("fwf"), Lists.newArrayList( - new FixedwidthFieldConfig("Number", 1, 5, TypeProtos.MinorType.VARDECIMAL, ""), + new FixedwidthFieldConfig("Number", 1, 5, TypeProtos.MinorType.VARDECIMAL), new FixedwidthFieldConfig("Letter", 7,4, TypeProtos.MinorType.VARCHAR, ""), new FixedwidthFieldConfig("Address",12, 3,TypeProtos.MinorType.INT, ""), new FixedwidthFieldConfig("Date",16, 10,TypeProtos.MinorType.DATE, "MM-dd-yyyy"), From 22ccbcbbb506b8ff6ec7dc8d3031b725ae978916 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Tue, 16 Nov 2021 15:35:15 -0500 Subject: [PATCH 32/41] Added two getters to FixedwidthFormatConfig to prep for offset verification --- .../fixedwidth/FixedwidthFormatConfig.java | 32 +++++++++++++++++-- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index 00abe0a834e..10a7494bd8b 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -109,13 +109,39 @@ public List getFieldNames() { return result; } + @JsonIgnore + public List getFieldIndices() { + List result = new ArrayList<>(); + if (! hasFields()) { + return result; + } + + for (FixedwidthFieldConfig field : fields) { + result.add(field.getIndex()); + } + return result; + } + + @JsonIgnore + public List getFieldWidths() { + List result = new ArrayList<>(); + if (! hasFields()) { + return result; + } + + for (FixedwidthFieldConfig field : fields) { + result.add(field.getWidth()); + } + return result; + } + @JsonIgnore public void validateFieldInput(){ Set uniqueNames = new HashSet<>(); for (String name : this.getFieldNames()){ - if (name.length() == 0){ - - } + /*if (name.length() == 0){ + + }*/ if (uniqueNames.contains(name)){ throw UserException .validationError() From ecf6fb8e6284e39df36ea7917fb22abd949e733d Mon Sep 17 00:00:00 2001 From: Esther Buchwalter Date: Wed, 17 Nov 2021 15:57:15 -0500 Subject: [PATCH 33/41] Added a check for overlapping fields --- .../fixedwidth/FixedwidthFormatConfig.java | 33 +++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index 10a7494bd8b..78a718f972d 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -26,8 +26,6 @@ import org.apache.drill.common.PlanStringBuilder; import org.apache.drill.common.exceptions.UserException; import org.apache.drill.common.logical.FormatPluginConfig; -import org.apache.drill.exec.store.log.LogFormatField; -import org.apache.drill.exec.store.log.LogFormatPlugin; import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -151,6 +149,37 @@ public void validateFieldInput(){ } uniqueNames.add(name); } + List fieldIndices = this.getFieldIndices(); + List fieldWidths = this.getFieldWidths(); + int prevIndexAndWidth = -1; + + //assuming that fieldIndices is the same size as fieldWidths, width is required + for (int i = 0; i= 0.") + .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .build(logger); + } + /* + else if (fieldWidths.get(i) == null || fieldWidths.get(i) < 1) { + if (i == fieldIndices.size()-1) { + Integer width = + } + Integer width = fieldIndices.get(i+1) - fieldIndices.get(i); + fieldWidths.set(i, width); + } + */ + else if (fieldIndices.get(i) <= prevIndexAndWidth) { + throw UserException + .validationError() + .message("Overlapping fields at indices " + fieldIndices.get(i-1) + "and" + fieldIndices.get(i) + ".") + .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .build(logger); + } + prevIndexAndWidth = fieldIndices.get(i) + fieldWidths.get(i); + } } From 1978e14cf4da19080cdf0c26b80a3b3dfd48776e Mon Sep 17 00:00:00 2001 From: Esther Buchwalter Date: Thu, 18 Nov 2021 14:21:43 -0500 Subject: [PATCH 34/41] Updated check for overlapping fields --- .../exec/store/fixedwidth/FixedwidthFormatConfig.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index 78a718f972d..f6ca6c7a018 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -151,14 +151,15 @@ public void validateFieldInput(){ } List fieldIndices = this.getFieldIndices(); List fieldWidths = this.getFieldWidths(); + List fieldNames = this.getFieldNames(); int prevIndexAndWidth = -1; //assuming that fieldIndices is the same size as fieldWidths, width is required for (int i = 0; i= 0.") + .message("Invalid index for field '" + fieldNames.get(i) + "' at index: " + fieldIndices.get(i) + ". Index must be > 0.") .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) .build(logger); } @@ -174,7 +175,7 @@ else if (fieldWidths.get(i) == null || fieldWidths.get(i) < 1) { else if (fieldIndices.get(i) <= prevIndexAndWidth) { throw UserException .validationError() - .message("Overlapping fields at indices " + fieldIndices.get(i-1) + "and" + fieldIndices.get(i) + ".") + .message("Overlapping fields: " + fieldNames.get(i-1) + " and " + fieldNames.get(i)) .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) .build(logger); } @@ -182,5 +183,4 @@ else if (fieldIndices.get(i) <= prevIndexAndWidth) { } } - } From a79f8a5536541f0856f6346fbe45b2a8937362a4 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Tue, 23 Nov 2021 16:14:24 -0500 Subject: [PATCH 35/41] Added field validation for data types, indices, width. Includes creating two setters in field config to set default value for data types and calculate/set width based on indices. --- .../fixedwidth/FixedwidthBatchReader.java | 3 +- .../fixedwidth/FixedwidthFieldConfig.java | 46 ++++---- .../fixedwidth/FixedwidthFormatConfig.java | 100 ++++++++++++++---- .../TestFixedwidthRecordReader.java | 2 +- 4 files changed, 98 insertions(+), 53 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index f63eac3b528..cd487a7e590 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -65,7 +65,6 @@ public FixedwidthBatchReader(FixedwidthFormatConfig config, int maxRecords) { this.maxRecords = maxRecords; } - @Override public boolean open(FileSchemaNegotiator negotiator) { split = negotiator.split(); @@ -197,4 +196,4 @@ private boolean parseLine(String line, RowSetLoader writer) throws IOException { return true; } -} \ No newline at end of file +} diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java index d0ab5bcda84..e214c239fa4 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java @@ -18,6 +18,7 @@ package org.apache.drill.exec.store.fixedwidth; +import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonTypeName; @@ -29,12 +30,12 @@ @JsonTypeName("fixedwidthReaderFieldDescription") @JsonInclude(JsonInclude.Include.NON_DEFAULT) -public class FixedwidthFieldConfig { +public class FixedwidthFieldConfig implements Comparable { private final String name; private final int index; - private final int width; - private final TypeProtos.MinorType type; + private int width; + private TypeProtos.MinorType type; private final String dateTimeFormat; public FixedwidthFieldConfig(@JsonProperty("name") String name, @@ -44,41 +45,17 @@ public FixedwidthFieldConfig(@JsonProperty("name") String name, this(name, index, width, type, null); } + @JsonCreator public FixedwidthFieldConfig(@JsonProperty("name") String name, @JsonProperty("index") int index, @JsonProperty("width") int width, @JsonProperty("type") TypeProtos.MinorType type, @JsonProperty("dateTimeFormat") String dateTimeFormat) { - this.name = name; this.index = index; this.width = width; this.type = type; this.dateTimeFormat = dateTimeFormat; - - - // Need to verify names are different - where can we access all the names of other columns -// if(name != null){ -// this.name = name; -// } else{ -// throw new IllegalArgumentException("Invalid name"); //Is this the right way to throw an exception if blank? What about if not valid SQL? -// } -// -// if (index >= 0){ -// this.index = index; -// } else { -// throw new IllegalArgumentException("Index must be 0 or greater"); -// } -// -// //Can modify this to be optional and be calculated based on start index of this field and next -// this.width = width; -// -// if (type == null){ -// this.type = TypeProtos.MinorType.VARCHAR; -// } else { -// this.type = type; -// } -// this.dateTimeFormat = dateTimeFormat; // No default required, null is allowed } public String getName() {return name;} @@ -87,8 +64,16 @@ public FixedwidthFieldConfig(@JsonProperty("name") String name, public int getWidth() {return width;} + public void setWidth(int value) { + this.width = value; + } + public TypeProtos.MinorType getType() {return type;} + public void setType() { + this.type = TypeProtos.MinorType.VARCHAR; + } + public String getDateTimeFormat() {return dateTimeFormat;} @Override @@ -122,4 +107,9 @@ public String toString() { .field("dateTimeFormat", dateTimeFormat) .toString(); } + + @Override + public int compareTo(FixedwidthFieldConfig o) { + return new Integer(this.getIndex()).compareTo(o.getIndex()); + } } diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index f6ca6c7a018..1af93ce8ed7 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -26,11 +26,13 @@ import org.apache.drill.common.PlanStringBuilder; import org.apache.drill.common.exceptions.UserException; import org.apache.drill.common.logical.FormatPluginConfig; +import org.apache.drill.common.types.TypeProtos; import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.List; @@ -43,11 +45,15 @@ public class FixedwidthFormatConfig implements FormatPluginConfig { private static final Logger logger = LoggerFactory.getLogger(FixedwidthFormatConfig.class); private final List extensions; private final List fields; + private final List validDataTypes = Arrays.asList(new TypeProtos.MinorType[]{TypeProtos.MinorType.INT, TypeProtos.MinorType.VARCHAR, + TypeProtos.MinorType.DATE, TypeProtos.MinorType.TIME, TypeProtos.MinorType.TIMESTAMP, TypeProtos.MinorType.FLOAT4, + TypeProtos.MinorType.FLOAT8, TypeProtos.MinorType.BIGINT, TypeProtos.MinorType.VARDECIMAL}); @JsonCreator public FixedwidthFormatConfig(@JsonProperty("extensions") List extensions, @JsonProperty("fields") List fields) { this.extensions = extensions == null ? Collections.singletonList("fwf") : ImmutableList.copyOf(extensions); + Collections.sort(fields); this.fields = fields; validateFieldInput(); @@ -133,13 +139,56 @@ public List getFieldWidths() { return result; } + @JsonIgnore + public void setFieldWidths(int i, int value) { + for (FixedwidthFieldConfig field : fields) { + if (field.getIndex() == i) { + field.setWidth(value); + } + } + } + + @JsonIgnore + public List getFieldTypes() { + List result = new ArrayList<>(); + if (! hasFields()) { + return result; + } + + for (FixedwidthFieldConfig field : fields) { + result.add(field.getType()); + } + return result; + } + + @JsonIgnore + public void setFieldTypes(int i) { + for (FixedwidthFieldConfig field : fields) { + if (field.getIndex() == i) { + field.setType(); + } + } + } + @JsonIgnore public void validateFieldInput(){ Set uniqueNames = new HashSet<>(); - for (String name : this.getFieldNames()){ - /*if (name.length() == 0){ + List fieldIndices = this.getFieldIndices(); + List fieldWidths = this.getFieldWidths(); + List fieldNames = this.getFieldNames(); + List fieldTypes = this.getFieldTypes(); + int width = 0; + int prevIndexAndWidth = -1; - }*/ + // Ensure no two fields have the same name + for (String name : this.getFieldNames()){ + if (name.length() == 0){ + throw UserException + .validationError() + .message("Blank field name detected.") + .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .build(logger); + } if (uniqueNames.contains(name)){ throw UserException .validationError() @@ -149,10 +198,6 @@ public void validateFieldInput(){ } uniqueNames.add(name); } - List fieldIndices = this.getFieldIndices(); - List fieldWidths = this.getFieldWidths(); - List fieldNames = this.getFieldNames(); - int prevIndexAndWidth = -1; //assuming that fieldIndices is the same size as fieldWidths, width is required for (int i = 0; i Date: Wed, 24 Nov 2021 11:04:57 -0500 Subject: [PATCH 36/41] Modified validation for field width and field index. Added comments to code. --- .../fixedwidth/FixedwidthFieldConfig.java | 4 --- .../fixedwidth/FixedwidthFormatConfig.java | 33 +++++++------------ 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java index e214c239fa4..615a04b775f 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java @@ -64,10 +64,6 @@ public FixedwidthFieldConfig(@JsonProperty("name") String name, public int getWidth() {return width;} - public void setWidth(int value) { - this.width = value; - } - public TypeProtos.MinorType getType() {return type;} public void setType() { diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index 1af93ce8ed7..b9386749a57 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -45,9 +45,9 @@ public class FixedwidthFormatConfig implements FormatPluginConfig { private static final Logger logger = LoggerFactory.getLogger(FixedwidthFormatConfig.class); private final List extensions; private final List fields; - private final List validDataTypes = Arrays.asList(new TypeProtos.MinorType[]{TypeProtos.MinorType.INT, TypeProtos.MinorType.VARCHAR, + private final List validDataTypes = Arrays.asList(TypeProtos.MinorType.INT, TypeProtos.MinorType.VARCHAR, TypeProtos.MinorType.DATE, TypeProtos.MinorType.TIME, TypeProtos.MinorType.TIMESTAMP, TypeProtos.MinorType.FLOAT4, - TypeProtos.MinorType.FLOAT8, TypeProtos.MinorType.BIGINT, TypeProtos.MinorType.VARDECIMAL}); + TypeProtos.MinorType.FLOAT8, TypeProtos.MinorType.BIGINT, TypeProtos.MinorType.VARDECIMAL); @JsonCreator public FixedwidthFormatConfig(@JsonProperty("extensions") List extensions, @@ -139,15 +139,6 @@ public List getFieldWidths() { return result; } - @JsonIgnore - public void setFieldWidths(int i, int value) { - for (FixedwidthFieldConfig field : fields) { - if (field.getIndex() == i) { - field.setWidth(value); - } - } - } - @JsonIgnore public List getFieldTypes() { List result = new ArrayList<>(); @@ -180,7 +171,7 @@ public void validateFieldInput(){ int width = 0; int prevIndexAndWidth = -1; - // Ensure no two fields have the same name + // Validate Field Name - Ensure field is not empty, no two fields have the same name, and field is valid SQL syntax for (String name : this.getFieldNames()){ if (name.length() == 0){ throw UserException @@ -199,7 +190,7 @@ public void validateFieldInput(){ uniqueNames.add(name); } - //assuming that fieldIndices is the same size as fieldWidths, width is required + // Validate Field Index - Must be greater than 0, and must not overlap with other fields for (int i = 0; i Date: Wed, 24 Nov 2021 17:35:37 -0500 Subject: [PATCH 37/41] Added to field validation for field names. Checks for valid length and valid SQL syntax. --- .../fixedwidth/FixedwidthFormatConfig.java | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java index b9386749a57..2d0e5caa4f2 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java @@ -38,6 +38,7 @@ import java.util.List; import java.util.Objects; import java.util.Set; +import java.util.regex.Pattern; @JsonTypeName(FixedwidthFormatPlugin.DEFAULT_NAME) @JsonInclude(JsonInclude.Include.NON_DEFAULT) @@ -168,10 +169,11 @@ public void validateFieldInput(){ List fieldWidths = this.getFieldWidths(); List fieldNames = this.getFieldNames(); List fieldTypes = this.getFieldTypes(); - int width = 0; int prevIndexAndWidth = -1; - // Validate Field Name - Ensure field is not empty, no two fields have the same name, and field is valid SQL syntax + /* Validate Field Name - Ensure field is not empty, does not exceed maximum length, + is valid SQL syntax, and no two fields have the same name + */ for (String name : this.getFieldNames()){ if (name.length() == 0){ throw UserException @@ -180,6 +182,20 @@ public void validateFieldInput(){ .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) .build(logger); } + if (name.length() > 1024) { + throw UserException + .validationError() + .message("Exceeds maximum length of 1024 characters: " + name.substring(0, 1024)) + .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .build(logger); + } + if (!Pattern.matches("[a-zA-Z]\\w*", name)) { + throw UserException + .validationError() + .message("Invalid input: " + name) + .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .build(logger); + } if (uniqueNames.contains(name)){ throw UserException .validationError() From aa74ec53dffbbca65e1e896826fd26c2be0ae230 Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Fri, 10 Dec 2021 09:31:12 -0500 Subject: [PATCH 38/41] WIP converting to EVF v2. Pushing to repo for troubleshooting purposes. --- .../fixedwidth/FixedwidthBatchReader.java | 3 +- .../fixedwidth/FixedwidthBatchReaderImpl.java | 89 +++++++++++++++++++ .../fixedwidth/FixedwidthFormatConfig.java | 3 +- .../fixedwidth/FixedwidthFormatPlugin.java | 2 +- .../TestFixedwidthRecordReader.java | 11 ++- .../impl/scan/v3/SchemaNegotiator.java | 2 + 6 files changed, 99 insertions(+), 11 deletions(-) create mode 100644 contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReaderImpl.java diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java index cd487a7e590..79eaef2c45b 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java @@ -56,12 +56,11 @@ public class FixedwidthBatchReader implements ManagedReader uniqueNames = new HashSet<>(); List fieldIndices = this.getFieldIndices(); @@ -192,7 +191,7 @@ public void validateFieldInput(){ if (!Pattern.matches("[a-zA-Z]\\w*", name)) { throw UserException .validationError() - .message("Invalid input: " + name) + .message("Column Name '" + name + "' is not valid. Must contain letters, numbers, and underscores only.") .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) .build(logger); } diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java index f96e4a81f77..2f64e23d8e6 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java @@ -89,4 +89,4 @@ protected FileScanBuilder frameworkBuilder(OptionManager options, EasySubScan sc builder.nullType(Types.optional(TypeProtos.MinorType.VARCHAR)); return builder; } -} \ No newline at end of file +} diff --git a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java index 6f7927b8576..72ccd23004b 100644 --- a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java @@ -51,11 +51,11 @@ public static void setup() throws Exception { FixedwidthFormatConfig formatConfig = new FixedwidthFormatConfig(Lists.newArrayList("fwf"), Lists.newArrayList( new FixedwidthFieldConfig("Number", 1, 5, TypeProtos.MinorType.VARDECIMAL), - new FixedwidthFieldConfig("Address",12, 3,TypeProtos.MinorType.INT, ""), - new FixedwidthFieldConfig("Letter", 7,4, TypeProtos.MinorType.VARCHAR, ""), - new FixedwidthFieldConfig("Date",16, 10,TypeProtos.MinorType.DATE, "MM-dd-yyyy"), - new FixedwidthFieldConfig( "Time", 27, 8,TypeProtos.MinorType.TIME,"HH:mm:ss" ), - new FixedwidthFieldConfig("DateTime", 36, 23,TypeProtos.MinorType.TIMESTAMP, "MM-dd-yyyy'T'HH:mm:ss.SSX" ) + new FixedwidthFieldConfig("Address", 12, 3, TypeProtos.MinorType.INT), + new FixedwidthFieldConfig("Letter", 7, 4, TypeProtos.MinorType.VARCHAR), + new FixedwidthFieldConfig("Date", 16, 10, TypeProtos.MinorType.DATE, "MM-dd-yyyy"), + new FixedwidthFieldConfig("Time", 27, 8, TypeProtos.MinorType.TIME,"HH:mm:ss"), + new FixedwidthFieldConfig("DateTime", 36, 23, TypeProtos.MinorType.TIMESTAMP, "MM-dd-yyyy'T'HH:mm:ss.SSX") )); cluster.defineFormat("dfs", "fwf", formatConfig); cluster.defineFormat("cp", "fwf", formatConfig); @@ -218,4 +218,3 @@ private RowSet setupTestData(){ } } - diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/SchemaNegotiator.java b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/SchemaNegotiator.java index 9dee1d78df5..4455ad588cf 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/SchemaNegotiator.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/SchemaNegotiator.java @@ -224,4 +224,6 @@ public interface SchemaNegotiator { * schema order */ ResultSetLoader build(); + + Object split(); } From 1972fb9babd1199e99e6cc8c7042a74193bcd53f Mon Sep 17 00:00:00 2001 From: Megan Foss Date: Tue, 22 Mar 2022 12:15:37 -0400 Subject: [PATCH 39/41] Updating pom.xml with new drill snapshot version --- contrib/format-fixedwidth/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/format-fixedwidth/pom.xml b/contrib/format-fixedwidth/pom.xml index c30db19a536..6287db3629f 100644 --- a/contrib/format-fixedwidth/pom.xml +++ b/contrib/format-fixedwidth/pom.xml @@ -23,7 +23,7 @@ drill-contrib-parent org.apache.drill.contrib - 1.20.0-SNAPSHOT + 1.21.0-SNAPSHOT drill-format-fixedwidth Drill : Contrib : Format : Fixedwidth @@ -78,4 +78,4 @@ - \ No newline at end of file + From 1e75757d88614ca629dc4aa045fef9ddb1ad53eb Mon Sep 17 00:00:00 2001 From: tswagger Date: Tue, 22 Mar 2022 11:24:10 -0500 Subject: [PATCH 40/41] Renamed classes --- contrib/format-fixedwidth/pom.xml | 14 ++- .../fixedwidth/FixedWidthBatchReader.java | 100 ++++++++++++++++++ ...Config.java => FixedWidthFieldConfig.java} | 12 +-- ...onfig.java => FixedWidthFormatConfig.java} | 42 ++++---- ...lugin.java => FixedWidthFormatPlugin.java} | 47 ++++---- ...der.java => FixedwidthBatchReader.javaOLD} | 0 .../fixedwidth/FixedwidthBatchReaderImpl.java | 89 ---------------- ...r.java => TestFixedWidthRecordReader.java} | 16 +-- .../impl/scan/v3/SchemaNegotiator.java | 2 - 9 files changed, 165 insertions(+), 157 deletions(-) create mode 100644 contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedWidthBatchReader.java rename contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/{FixedwidthFieldConfig.java => FixedWidthFieldConfig.java} (89%) rename contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/{FixedwidthFormatConfig.java => FixedWidthFormatConfig.java} (87%) rename contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/{FixedwidthFormatPlugin.java => FixedWidthFormatPlugin.java} (60%) rename contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/{FixedwidthBatchReader.java => FixedwidthBatchReader.javaOLD} (100%) delete mode 100644 contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReaderImpl.java rename contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/{TestFixedwidthRecordReader.java => TestFixedWidthRecordReader.java} (95%) diff --git a/contrib/format-fixedwidth/pom.xml b/contrib/format-fixedwidth/pom.xml index c30db19a536..27f652aba69 100644 --- a/contrib/format-fixedwidth/pom.xml +++ b/contrib/format-fixedwidth/pom.xml @@ -23,10 +23,10 @@ drill-contrib-parent org.apache.drill.contrib - 1.20.0-SNAPSHOT + 1.21.0-SNAPSHOT drill-format-fixedwidth - Drill : Contrib : Format : Fixedwidth + Drill : Contrib : Format : FixedWidth @@ -34,6 +34,11 @@ drill-java-exec ${project.version} + + + + + @@ -63,8 +68,7 @@ copy-resources - ${basedir}/target/classes/org/apache/drill/exec/store/fixedwidth - + ${basedir}/target/classes/org/apache/drill/exec/store/fixedwidth src/main/java/org/apache/drill/exec/store/fixedwidth @@ -78,4 +82,4 @@ - \ No newline at end of file + diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedWidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedWidthBatchReader.java new file mode 100644 index 00000000000..39ea624e8f5 --- /dev/null +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedWidthBatchReader.java @@ -0,0 +1,100 @@ +package org.apache.drill.exec.store.fixedwidth; + +import org.apache.drill.common.AutoCloseables; +import org.apache.drill.common.exceptions.CustomErrorContext; +import org.apache.drill.common.exceptions.UserException; +import org.apache.drill.common.types.TypeProtos; +import org.apache.drill.exec.physical.impl.scan.v3.ManagedReader; +import org.apache.drill.exec.physical.impl.scan.v3.file.FileSchemaNegotiator; +import org.apache.drill.exec.physical.resultSet.ResultSetLoader; +import org.apache.drill.exec.record.metadata.SchemaBuilder; +import org.apache.drill.exec.record.metadata.TupleMetadata; +import org.apache.drill.shaded.guava.com.google.common.base.Charsets; +import org.apache.hadoop.mapred.FileSplit; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import com.epam.parso.impl; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; + +public class FixedWidthBatchReader implements ManagedReader { + + private final int maxRecords; // Do we need this? + private final FixedWidthFormatConfig config; + private InputStream fsStream; + private ResultSetLoader loader; + private FileSplit split; + private CustomErrorContext errorContext; + private static final Logger logger = LoggerFactory.getLogger(FixedWidthBatchReader.class); + private BufferedReader reader; + + public FixedWidthBatchReader(FileSchemaNegotiator negotiator, FixedWidthFormatConfig config, int maxRecords) { + this.loader = open(negotiator); + this.config = config; + this.maxRecords = maxRecords; + } + + @Override + public boolean next() { + return true; + } + + @Override + public void close() { + if (fsStream != null){ + AutoCloseables.closeSilently(fsStream); + fsStream = null; + } + } + + private ResultSetLoader open(FileSchemaNegotiator negotiator) { + this.split = (FileSplit) negotiator.split(); + this.errorContext = negotiator.parentErrorContext(); + openFile(negotiator); + + try { + negotiator.tableSchema(buildSchema(), true); + this.loader = negotiator.build(); + } catch (Exception e) { + throw UserException + .dataReadError(e) + .message("Failed to open input file: {}", this.split.getPath().toString()) + .addContext(this.errorContext) + .addContext(e.getMessage()) + .build(FixedWidthBatchReader.logger); + } + this.reader = new BufferedReader(new InputStreamReader(this.fsStream, Charsets.UTF_8)); + return this.loader; + } + + private void openFile(FileSchemaNegotiator negotiator) { + try { + this.fsStream = negotiator.file().fileSystem().openPossiblyCompressedStream(this.split.getPath()); + sasFileReader = new SasFileReaderImpl(this.fsStream); + firstRow = sasFileReader.readNext(); + } catch (IOException e) { + throw UserException + .dataReadError(e) + .message("Unable to open Fixed Width File %s", this.split.getPath()) + .addContext(e.getMessage()) + .addContext(this.errorContext) + .build(FixedWidthBatchReader.logger); + } + } + + private TupleMetadata buildSchema() { + SchemaBuilder builder = new SchemaBuilder(); + for (FixedWidthFieldConfig field : config.getFields()) { + if (field.getType() == TypeProtos.MinorType.VARDECIMAL){ + builder.addNullable(field.getName(), TypeProtos.MinorType.VARDECIMAL,38,4); + //revisit this + } else { + builder.addNullable(field.getName(), field.getType()); + } + } + return builder.buildSchema(); + } +} diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedWidthFieldConfig.java similarity index 89% rename from contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java rename to contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedWidthFieldConfig.java index 615a04b775f..bb65ccba76b 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFieldConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedWidthFieldConfig.java @@ -30,7 +30,7 @@ @JsonTypeName("fixedwidthReaderFieldDescription") @JsonInclude(JsonInclude.Include.NON_DEFAULT) -public class FixedwidthFieldConfig implements Comparable { +public class FixedWidthFieldConfig implements Comparable { private final String name; private final int index; @@ -38,7 +38,7 @@ public class FixedwidthFieldConfig implements Comparable private TypeProtos.MinorType type; private final String dateTimeFormat; - public FixedwidthFieldConfig(@JsonProperty("name") String name, + public FixedWidthFieldConfig(@JsonProperty("name") String name, @JsonProperty("index") int index, @JsonProperty("width") int width, @JsonProperty("type") TypeProtos.MinorType type) { @@ -46,7 +46,7 @@ public FixedwidthFieldConfig(@JsonProperty("name") String name, } @JsonCreator - public FixedwidthFieldConfig(@JsonProperty("name") String name, + public FixedWidthFieldConfig(@JsonProperty("name") String name, @JsonProperty("index") int index, @JsonProperty("width") int width, @JsonProperty("type") TypeProtos.MinorType type, @@ -85,7 +85,7 @@ public boolean equals(Object obj) { if (obj == null || getClass() != obj.getClass()) { return false; } - FixedwidthFieldConfig other = (FixedwidthFieldConfig) obj; + FixedWidthFieldConfig other = (FixedWidthFieldConfig) obj; return Objects.equals(name, other.name) && Objects.equals(index, other.index) && Objects.equals(width, other.width) @@ -105,7 +105,7 @@ public String toString() { } @Override - public int compareTo(FixedwidthFieldConfig o) { - return new Integer(this.getIndex()).compareTo(o.getIndex()); + public int compareTo(FixedWidthFieldConfig o) { + return Integer.compare(this.getIndex(), o.getIndex()); } } diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedWidthFormatConfig.java similarity index 87% rename from contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java rename to contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedWidthFormatConfig.java index 81b476e6121..e8b575c8629 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatConfig.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedWidthFormatConfig.java @@ -40,19 +40,19 @@ import java.util.Set; import java.util.regex.Pattern; -@JsonTypeName(FixedwidthFormatPlugin.DEFAULT_NAME) +@JsonTypeName(FixedWidthFormatPlugin.DEFAULT_NAME) @JsonInclude(JsonInclude.Include.NON_DEFAULT) -public class FixedwidthFormatConfig implements FormatPluginConfig { - private static final Logger logger = LoggerFactory.getLogger(FixedwidthFormatConfig.class); +public class FixedWidthFormatConfig implements FormatPluginConfig { + private static final Logger logger = LoggerFactory.getLogger(FixedWidthFormatConfig.class); private final List extensions; - private final List fields; + private final List fields; private final List validDataTypes = Arrays.asList(TypeProtos.MinorType.INT, TypeProtos.MinorType.VARCHAR, TypeProtos.MinorType.DATE, TypeProtos.MinorType.TIME, TypeProtos.MinorType.TIMESTAMP, TypeProtos.MinorType.FLOAT4, TypeProtos.MinorType.FLOAT8, TypeProtos.MinorType.BIGINT, TypeProtos.MinorType.VARDECIMAL); @JsonCreator - public FixedwidthFormatConfig(@JsonProperty("extensions") List extensions, - @JsonProperty("fields") List fields) { + public FixedWidthFormatConfig(@JsonProperty("extensions") List extensions, + @JsonProperty("fields") List fields) { this.extensions = extensions == null ? Collections.singletonList("fwf") : ImmutableList.copyOf(extensions); Collections.sort(fields); this.fields = fields; @@ -65,7 +65,7 @@ public List getExtensions() { return extensions; } - public List getFields() { + public List getFields() { return fields; } @@ -82,7 +82,7 @@ public boolean equals(Object obj) { if (obj == null || getClass() != obj.getClass()) { return false; } - FixedwidthFormatConfig other = (FixedwidthFormatConfig) obj; + FixedWidthFormatConfig other = (FixedWidthFormatConfig) obj; return Objects.equals(extensions, other.extensions) && Objects.equals(fields, other.fields); } @@ -108,7 +108,7 @@ public List getFieldNames() { return result; } - for (FixedwidthFieldConfig field : fields) { + for (FixedWidthFieldConfig field : fields) { result.add(field.getName()); } return result; @@ -121,7 +121,7 @@ public List getFieldIndices() { return result; } - for (FixedwidthFieldConfig field : fields) { + for (FixedWidthFieldConfig field : fields) { result.add(field.getIndex()); } return result; @@ -134,7 +134,7 @@ public List getFieldWidths() { return result; } - for (FixedwidthFieldConfig field : fields) { + for (FixedWidthFieldConfig field : fields) { result.add(field.getWidth()); } return result; @@ -147,7 +147,7 @@ public List getFieldTypes() { return result; } - for (FixedwidthFieldConfig field : fields) { + for (FixedWidthFieldConfig field : fields) { result.add(field.getType()); } return result; @@ -155,7 +155,7 @@ public List getFieldTypes() { @JsonIgnore public void setFieldTypes(int i) { - for (FixedwidthFieldConfig field : fields) { + for (FixedWidthFieldConfig field : fields) { if (field.getIndex() == i) { field.setType(); } @@ -178,28 +178,28 @@ public void validateFieldInput(){ throw UserException .validationError() .message("Blank field name detected.") - .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .addContext("Plugin", FixedWidthFormatPlugin.DEFAULT_NAME) .build(logger); } if (name.length() > 1024) { throw UserException .validationError() .message("Exceeds maximum length of 1024 characters: " + name.substring(0, 1024)) - .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .addContext("Plugin", FixedWidthFormatPlugin.DEFAULT_NAME) .build(logger); } if (!Pattern.matches("[a-zA-Z]\\w*", name)) { throw UserException .validationError() .message("Column Name '" + name + "' is not valid. Must contain letters, numbers, and underscores only.") - .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .addContext("Plugin", FixedWidthFormatPlugin.DEFAULT_NAME) .build(logger); } if (uniqueNames.contains(name)){ throw UserException .validationError() .message("Duplicate column name: " + name) - .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .addContext("Plugin", FixedWidthFormatPlugin.DEFAULT_NAME) .build(logger); } uniqueNames.add(name); @@ -211,14 +211,14 @@ public void validateFieldInput(){ throw UserException .validationError() .message("Invalid index for field '" + fieldNames.get(i) + "' at index: " + fieldIndices.get(i) + ". Index must be > 0.") - .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .addContext("Plugin", FixedWidthFormatPlugin.DEFAULT_NAME) .build(logger); } else if (fieldIndices.get(i) <= prevIndexAndWidth) { throw UserException .validationError() .message("Overlapping fields: " + fieldNames.get(i-1) + " and " + fieldNames.get(i)) - .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .addContext("Plugin", FixedWidthFormatPlugin.DEFAULT_NAME) .build(logger); } @@ -227,7 +227,7 @@ else if (fieldIndices.get(i) <= prevIndexAndWidth) { throw UserException .validationError() .message("Width for field '" + fieldNames.get(i) + "' is invalid. Widths must be greater than 0.") - .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .addContext("Plugin", FixedWidthFormatPlugin.DEFAULT_NAME) .build(logger); } prevIndexAndWidth = fieldIndices.get(i) + fieldWidths.get(i); @@ -240,7 +240,7 @@ else if (!validDataTypes.contains(fieldTypes.get(i))){ throw UserException .validationError() .message("Field type " + fieldTypes.get(i) + " is not valid. Please check for typos and ensure the required data type is included in the Fixed Width Format Plugin.") - .addContext("Plugin", FixedwidthFormatPlugin.DEFAULT_NAME) + .addContext("Plugin", FixedWidthFormatPlugin.DEFAULT_NAME) .build(logger); } } diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedWidthFormatPlugin.java similarity index 60% rename from contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java rename to contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedWidthFormatPlugin.java index 2f64e23d8e6..a3a0fef4aac 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthFormatPlugin.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedWidthFormatPlugin.java @@ -21,46 +21,48 @@ import org.apache.drill.common.logical.StoragePluginConfig; import org.apache.drill.common.types.TypeProtos; import org.apache.drill.common.types.Types; -import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileReaderFactory; -import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileScanBuilder; -import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator; -import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader; +import org.apache.drill.exec.physical.impl.scan.v3.file.FileReaderFactory; +import org.apache.drill.exec.physical.impl.scan.v3.file.FileSchemaNegotiator; +import org.apache.drill.exec.physical.impl.scan.v3.ManagedReader; +import org.apache.drill.exec.physical.impl.scan.v3.ManagedReader.EarlyEofException; +import org.apache.drill.exec.physical.impl.scan.v3.file.FileScanLifecycleBuilder; import org.apache.drill.exec.server.DrillbitContext; -import org.apache.drill.exec.server.options.OptionManager; import org.apache.drill.exec.store.dfs.easy.EasyFormatPlugin; +import org.apache.drill.exec.store.dfs.easy.EasyFormatPlugin.ScanFrameworkVersion; import org.apache.drill.exec.store.dfs.easy.EasySubScan; + import org.apache.hadoop.conf.Configuration; -public class FixedwidthFormatPlugin extends EasyFormatPlugin { +public class FixedWidthFormatPlugin extends EasyFormatPlugin { protected static final String DEFAULT_NAME = "fixedwidth"; - private static class FixedwidthReaderFactory extends FileReaderFactory { + private static class FixedWidthReaderFactory extends FileReaderFactory { - private final FixedwidthFormatConfig config; + private final FixedWidthFormatConfig config; private final int maxRecords; - public FixedwidthReaderFactory(FixedwidthFormatConfig config, int maxRecords) { + public FixedWidthReaderFactory(FixedWidthFormatConfig config, int maxRecords) { this.config = config; this.maxRecords = maxRecords; } @Override - public ManagedReader newReader() { - return new FixedwidthBatchReader(config, maxRecords); + public ManagedReader newReader(FileSchemaNegotiator negotiator) throws EarlyEofException { + return new FixedWidthBatchReader(negotiator, config, maxRecords); } } - public FixedwidthFormatPlugin(String name, + public FixedWidthFormatPlugin(String name, DrillbitContext context, Configuration fsConf, StoragePluginConfig storageConfig, - FixedwidthFormatConfig formatConfig) { + FixedWidthFormatConfig formatConfig) { super(name, easyConfig(fsConf, formatConfig), context, storageConfig, formatConfig); } - private static EasyFormatConfig easyConfig(Configuration fsConf, FixedwidthFormatConfig pluginConfig) { + private static EasyFormatConfig easyConfig(Configuration fsConf, FixedWidthFormatConfig pluginConfig) { return EasyFormatConfig.builder() .readable(true) .writable(false) @@ -70,23 +72,16 @@ private static EasyFormatConfig easyConfig(Configuration fsConf, FixedwidthForma .extensions(pluginConfig.getExtensions()) .fsConf(fsConf) .defaultName(DEFAULT_NAME) - .useEnhancedScan(true) +// .useEnhancedScan(true) + .scanVersion(ScanFrameworkVersion.EVF_V2) .supportsLimitPushdown(true) .build(); } @Override - public ManagedReader newBatchReader( - EasySubScan scan, OptionManager options) { - return new FixedwidthBatchReader(getConfig(), scan.getMaxRecords()); - } - - @Override - protected FileScanBuilder frameworkBuilder(OptionManager options, EasySubScan scan) { - FileScanBuilder builder = new FileScanBuilder(); - builder.setReaderFactory(new FixedwidthReaderFactory(getConfig(), scan.getMaxRecords())); - initScanBuilder(builder, scan); + protected void configureScan(FileScanLifecycleBuilder builder, EasySubScan scan) { builder.nullType(Types.optional(TypeProtos.MinorType.VARCHAR)); - return builder; + builder.readerFactory(new FixedWidthReaderFactory(formatConfig, scan.getMaxRecords())); } + } diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.javaOLD similarity index 100% rename from contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.java rename to contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReader.javaOLD diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReaderImpl.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReaderImpl.java deleted file mode 100644 index a7d53bdf8b8..00000000000 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedwidthBatchReaderImpl.java +++ /dev/null @@ -1,89 +0,0 @@ -package org.apache.drill.exec.store.fixedwidth; - -import org.apache.drill.common.AutoCloseables; -import org.apache.drill.common.exceptions.CustomErrorContext; -import org.apache.drill.common.exceptions.UserException; -import org.apache.drill.exec.ops.OperatorContext; -import org.apache.drill.exec.physical.impl.scan.v3.ManagedReader; -import org.apache.drill.exec.physical.impl.scan.v3.SchemaNegotiator; -import org.apache.drill.exec.physical.impl.scan.v3.file.FileScanLifecycle; -import org.apache.drill.exec.physical.impl.scan.v3.file.FileSchemaNegotiator; -import org.apache.drill.exec.physical.impl.scan.v3.lifecycle.ReaderLifecycle; -import org.apache.drill.exec.physical.impl.scan.v3.lifecycle.SchemaNegotiatorImpl; -import org.apache.drill.exec.physical.impl.scan.v3.schema.ProjectedColumn; -import org.apache.drill.exec.physical.resultSet.ResultSetLoader; -import org.apache.drill.exec.record.metadata.TupleMetadata; -import org.apache.drill.shaded.guava.com.google.common.base.Charsets; -import org.apache.hadoop.mapred.FileSplit; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; - -public class FixedwidthBatchReaderImpl implements ManagedReader { - - private final int maxRecords; - private final FixedwidthFormatConfig config; - private InputStream fsStream; - private ResultSetLoader loader; - private FileSplit split; - private CustomErrorContext errorContext; - private static final Logger logger = LoggerFactory.getLogger(FixedwidthBatchReader.class); - - public FixedwidthBatchReaderImpl (SchemaNegotiator negotiator, FixedwidthFormatConfig config, int maxRecords) { - this.loader = open(negotiator); - this.config = config; - this.maxRecords = maxRecords; - } - - @Override - public boolean next() { - - } - - @Override - public void close() { - if (fsStream != null){ - AutoCloseables.closeSilently(fsStream); - fsStream = null; - } - } - - private ResultSetLoader open(SchemaNegotiator negotiator) { - split = (FileSplit) negotiator.split(); - errorContext = negotiator.parentErrorContext(); - openFile(negotiator); - - try { - negotiator.tableSchema(buildSchema(), true); - loader = negotiator.build(); - } catch (Exception e) { - throw UserException - .dataReadError(e) - .message("Failed to open input file: {}", split.getPath().toString()) - .addContext(errorContext) - .addContext(e.getMessage()) - .build(logger); - } - reader = new BufferedReader(new InputStreamReader(fsStream, Charsets.UTF_8)); - return loader; - } - - private void openFile(FileSchemaNegotiator negotiator) { - try { - fsStream = negotiator.file().fileSystem().openPossiblyCompressedStream(split.getPath()); - sasFileReader = new SasFileReaderImpl(fsStream); - firstRow = sasFileReader.readNext(); - } catch (IOException e) { - throw UserException - .dataReadError(e) - .message("Unable to open Fixed Width File %s", split.getPath()) - .addContext(e.getMessage()) - .addContext(errorContext) - .build(logger); - } - } -} diff --git a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedWidthRecordReader.java similarity index 95% rename from contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java rename to contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedWidthRecordReader.java index 72ccd23004b..2d04498cb93 100644 --- a/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedwidthRecordReader.java +++ b/contrib/format-fixedwidth/src/test/java/org/apache/drill/exec/store/fixedwidth/TestFixedWidthRecordReader.java @@ -42,20 +42,20 @@ import static org.junit.Assert.assertEquals; @Category(RowSetTests.class) -public class TestFixedwidthRecordReader extends ClusterTest { +public class TestFixedWidthRecordReader extends ClusterTest { @BeforeClass public static void setup() throws Exception { ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); - FixedwidthFormatConfig formatConfig = new FixedwidthFormatConfig(Lists.newArrayList("fwf"), + FixedWidthFormatConfig formatConfig = new FixedWidthFormatConfig(Lists.newArrayList("fwf"), Lists.newArrayList( - new FixedwidthFieldConfig("Number", 1, 5, TypeProtos.MinorType.VARDECIMAL), - new FixedwidthFieldConfig("Address", 12, 3, TypeProtos.MinorType.INT), - new FixedwidthFieldConfig("Letter", 7, 4, TypeProtos.MinorType.VARCHAR), - new FixedwidthFieldConfig("Date", 16, 10, TypeProtos.MinorType.DATE, "MM-dd-yyyy"), - new FixedwidthFieldConfig("Time", 27, 8, TypeProtos.MinorType.TIME,"HH:mm:ss"), - new FixedwidthFieldConfig("DateTime", 36, 23, TypeProtos.MinorType.TIMESTAMP, "MM-dd-yyyy'T'HH:mm:ss.SSX") + new FixedWidthFieldConfig("Number", 1, 5, TypeProtos.MinorType.VARDECIMAL), + new FixedWidthFieldConfig("Address", 12, 3, TypeProtos.MinorType.INT), + new FixedWidthFieldConfig("Letter", 7, 4, TypeProtos.MinorType.VARCHAR), + new FixedWidthFieldConfig("Date", 16, 10, TypeProtos.MinorType.DATE, "MM-dd-yyyy"), + new FixedWidthFieldConfig("Time", 27, 8, TypeProtos.MinorType.TIME,"HH:mm:ss"), + new FixedWidthFieldConfig("DateTime", 36, 23, TypeProtos.MinorType.TIMESTAMP, "MM-dd-yyyy'T'HH:mm:ss.SSX") )); cluster.defineFormat("dfs", "fwf", formatConfig); cluster.defineFormat("cp", "fwf", formatConfig); diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/SchemaNegotiator.java b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/SchemaNegotiator.java index 4455ad588cf..9dee1d78df5 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/SchemaNegotiator.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/scan/v3/SchemaNegotiator.java @@ -224,6 +224,4 @@ public interface SchemaNegotiator { * schema order */ ResultSetLoader build(); - - Object split(); } From bf6a16c3b0e5ccb41b4356e284135e8358baa25f Mon Sep 17 00:00:00 2001 From: tswagger Date: Tue, 22 Mar 2022 13:54:07 -0500 Subject: [PATCH 41/41] Updated pom.xml Commented out a few files --- contrib/format-fixedwidth/pom.xml | 2 +- .../fixedwidth/FixedWidthBatchReader.java | 35 +++++++++---------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/contrib/format-fixedwidth/pom.xml b/contrib/format-fixedwidth/pom.xml index 27f652aba69..adb841ffaeb 100644 --- a/contrib/format-fixedwidth/pom.xml +++ b/contrib/format-fixedwidth/pom.xml @@ -23,7 +23,7 @@ drill-contrib-parent org.apache.drill.contrib - 1.21.0-SNAPSHOT + 2.0.0-SNAPSHOT drill-format-fixedwidth Drill : Contrib : Format : FixedWidth diff --git a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedWidthBatchReader.java b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedWidthBatchReader.java index 39ea624e8f5..7367cc670f0 100644 --- a/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedWidthBatchReader.java +++ b/contrib/format-fixedwidth/src/main/java/org/apache/drill/exec/store/fixedwidth/FixedWidthBatchReader.java @@ -13,10 +13,9 @@ import org.apache.hadoop.mapred.FileSplit; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.epam.parso.impl; import java.io.BufferedReader; -import java.io.IOException; +//import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; @@ -51,9 +50,9 @@ public void close() { } private ResultSetLoader open(FileSchemaNegotiator negotiator) { - this.split = (FileSplit) negotiator.split(); +// this.split = (FileSplit) negotiator.split(); this.errorContext = negotiator.parentErrorContext(); - openFile(negotiator); +// openFile(negotiator); try { negotiator.tableSchema(buildSchema(), true); @@ -70,20 +69,20 @@ private ResultSetLoader open(FileSchemaNegotiator negotiator) { return this.loader; } - private void openFile(FileSchemaNegotiator negotiator) { - try { - this.fsStream = negotiator.file().fileSystem().openPossiblyCompressedStream(this.split.getPath()); - sasFileReader = new SasFileReaderImpl(this.fsStream); - firstRow = sasFileReader.readNext(); - } catch (IOException e) { - throw UserException - .dataReadError(e) - .message("Unable to open Fixed Width File %s", this.split.getPath()) - .addContext(e.getMessage()) - .addContext(this.errorContext) - .build(FixedWidthBatchReader.logger); - } - } +// private void openFile(FileSchemaNegotiator negotiator) { +// try { +// this.fsStream = negotiator.file().fileSystem().openPossiblyCompressedStream(this.split.getPath()); +// sasFileReader = new SasFileReaderImpl(this.fsStream); +// firstRow = sasFileReader.readNext(); +// } catch (IOException e) { +// throw UserException +// .dataReadError(e) +// .message("Unable to open Fixed Width File %s", this.split.getPath()) +// .addContext(e.getMessage()) +// .addContext(this.errorContext) +// .build(FixedWidthBatchReader.logger); +// } +// } private TupleMetadata buildSchema() { SchemaBuilder builder = new SchemaBuilder();