Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
cgivre committed Oct 31, 2023
1 parent 097da74 commit b3f5e29
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 8 deletions.
4 changes: 4 additions & 0 deletions contrib/format-xml/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ Aside from the file extension, the following configuration options are available:

* `dataLevel`: XML data often contains a considerable amount of nesting which is not necessarily useful for data analysis. This parameter allows you to set the nesting level
where the data actually starts. The levels start at `1`.
* `allTextMode`: When set to true, Drill will not attempt to infer data types. Defaults to `true`.
* `useXSD`: When set to `true`, if the XML file has an associated XSD schema file, Drill will
download that file and use it as the schema. Defaults to `false`.

The default configuration is shown below:

Expand All @@ -16,6 +19,7 @@ The default configuration is shown below:
"xml"
],
"allTextMode": true,
"useXSD": false,
"dataLevel": 2
}
```
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,13 @@ static class XMLReaderConfig {
final XMLFormatPlugin plugin;
final int dataLevel;
final boolean allTextMode;
final boolean useXSD;

/**
 * Captures the reader settings from the plugin's format configuration so the
 * batch reader can consult them without going back to the plugin.
 *
 * @param plugin the XML format plugin; its config supplies {@code dataLevel},
 *               {@code allTextMode} and {@code useXSD}
 */
XMLReaderConfig(XMLFormatPlugin plugin) {
this.plugin = plugin;
// dataLevel is read from the config's public field; the two flags go through accessors.
dataLevel = plugin.getConfig().dataLevel;
allTextMode = plugin.getConfig().allTextMode();
useXSD = plugin.getConfig().useXSD();
}
}

Expand All @@ -63,7 +65,11 @@ public XMLBatchReader(XMLReaderConfig readerConfig, EasySubScan scan, FileSchema
this.readerConfig = readerConfig;
file = negotiator.file();

// Add schema if provided
// We need to set an order of precedence for schemata.
// The order implemented here is:
// 1. Provided schema
// 2. Schema from XSD
// 3. Inferred schema from data
if (negotiator.providedSchema() != null) {
TupleMetadata schema = negotiator.providedSchema();
negotiator.tableSchema(schema, false);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,22 @@ public class XMLFormatConfig implements FormatPluginConfig {

public final List<String> extensions;
public final int dataLevel;

@JsonProperty
public final boolean allTextMode;
public final boolean useXSD;

public XMLFormatConfig(@JsonProperty("extensions") List<String> extensions,
@JsonProperty("dataLevel") int dataLevel,
@JsonProperty("allTextMode") Boolean allTextMode
@JsonProperty("allTextMode") Boolean allTextMode,
@JsonProperty("useXSD") Boolean useXSD
) {
this.extensions = extensions == null ? Collections.singletonList("xml") : ImmutableList.copyOf(extensions);
this.dataLevel = Math.max(dataLevel, 1);

// Default to true
this.allTextMode = allTextMode == null || allTextMode;

// Default to false
this.useXSD = useXSD != null && useXSD;
}

@JsonInclude(JsonInclude.Include.NON_DEFAULT)
Expand All @@ -56,13 +59,20 @@ public List<String> getExtensions() {
}

/**
 * Returns the {@code allTextMode} flag, exposed to Jackson as the
 * {@code "allTextMode"} property.
 *
 * NOTE(review): the constructor treats a missing value as {@code true}, while
 * a property-level {@code NON_DEFAULT} inclusion on a primitive boolean
 * presumably omits {@code false} when serializing. If so, an explicit
 * {@code false} would not survive a serialize/deserialize round trip.
 * Confirm against the Jackson documentation.
 *
 * @return the configured {@code allTextMode} value
 */
@JsonProperty("allTextMode")
@JsonInclude(JsonInclude.Include.NON_DEFAULT)
public boolean allTextMode() {
return allTextMode;
}

/**
 * Returns the {@code useXSD} flag, exposed to Jackson as the {@code "useXSD"}
 * property. The constructor treats a missing value as {@code false}.
 *
 * @return the configured {@code useXSD} value
 */
@JsonProperty("useXSD")
@JsonInclude(JsonInclude.Include.NON_DEFAULT)
public boolean useXSD() {
return useXSD;
}

@Override
public int hashCode() {
return Objects.hash(extensions, dataLevel, allTextMode);
return Objects.hash(extensions, dataLevel, allTextMode, useXSD);
}

public XMLBatchReader.XMLReaderConfig getReaderConfig(XMLFormatPlugin plugin) {
Expand All @@ -80,7 +90,8 @@ public boolean equals(Object obj) {
XMLFormatConfig other = (XMLFormatConfig) obj;
return Objects.equals(extensions, other.extensions)
&& Objects.equals(dataLevel, other.dataLevel)
&& Objects.equals(allTextMode, other.allTextMode);
&& Objects.equals(allTextMode, other.allTextMode)
&& Objects.equals(useXSD, other.useXSD);
}

@Override
Expand All @@ -89,6 +100,7 @@ public String toString() {
.field("extensions", extensions)
.field("dataLevel", dataLevel)
.field("allTextMode", allTextMode)
.field("useXSD", useXSD)
.toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ public class XMLReader implements Closeable {

/**
* This field indicates the various states in which the reader operates. The names should be self-explanatory,
* but they are used as the reader iterates over the XML tags to know what to do.
* and they are used as the reader iterates over the XML tags to know what to do.
*/
private enum xmlState {
ROW_STARTED,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ public class TestXMLReader extends ClusterTest {
public static void setup() throws Exception {
ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher));

XMLFormatConfig formatConfig = new XMLFormatConfig(null, 2, true);
XMLFormatConfig formatConfig = new XMLFormatConfig(null, 2, true, false);
cluster.defineFormat("cp", "xml", formatConfig);
cluster.defineFormat("dfs", "xml", formatConfig);

Expand Down

0 comments on commit b3f5e29

Please sign in to comment.