Skip to content

Commit

Permalink
DRILL-8493: Drill Unable to Read XML Files with Namespaces (#2908)
Browse files Browse the repository at this point in the history
  • Loading branch information
cgivre authored Apr 28, 2024
1 parent c4c0b28 commit dda5bc4
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ public class XMLReader implements Closeable {
private XMLEventReader reader;
private ImplicitColumns metadata;
private boolean isSelfClosingEvent;
private Iterator<Attribute> rootAttributeIterator;

/**
* This field indicates the various states in which the reader operates. The names should be self-explanatory,
Expand All @@ -103,6 +104,11 @@ public XMLReader(InputStream fsStream, int dataLevel, boolean allTextMode) throw

// This property prevents XXE attacks by disallowing DTD.
inputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, false);

// When reading some documents with XML Namespaces, Drill seems to ignore the rest of the
// document. Setting this parameter to false solves this issue. However, when we introduce
// XSD support, it will likely be necessary to make this a configurable parameter.
inputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false);
reader = inputFactory.createXMLEventReader(fsStream);
fieldNameStack = new Stack<>();
rowWriterStack = new Stack<>();
Expand Down Expand Up @@ -340,7 +346,6 @@ private void processEvent(XMLEvent currentEvent,
// Get the field value
fieldValue = currentEvent.asCharacters().getData().trim();
changeState(xmlState.GETTING_DATA);
changeState(xmlState.GETTING_DATA);
break;

case XMLStreamConstants.END_ELEMENT:
Expand All @@ -367,11 +372,11 @@ private void processEvent(XMLEvent currentEvent,
} else if (currentState == xmlState.FIELD_ENDED && currentNestingLevel >= dataLevel) {
// Case to end nested maps
// Pop tupleWriter off stack
if (rowWriterStack.size() > 0) {
if (!rowWriterStack.isEmpty()) {
currentTupleWriter = rowWriterStack.pop();
}
// Pop field name
if (fieldNameStack.size() > 0) {
if (!fieldNameStack.isEmpty()) {
fieldNameStack.pop();
}

Expand All @@ -385,7 +390,7 @@ private void processEvent(XMLEvent currentEvent,
attributePrefix = XMLUtils.removeField(attributePrefix, fieldName);

// Pop field name
if (fieldNameStack.size() > 0) {
if (!fieldNameStack.isEmpty()) {
fieldNameStack.pop();
}
fieldName = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,30 @@ public void testAttributesOnRoot() throws Exception {
new RowSetComparison(expected).verifyAndClearAll(results);
}

/**
 * Reads {@code xml/sitemap.xml}, whose root {@code <urlset>} element carries XML namespace
 * declarations, at {@code dataLevel => 2} and checks that all three {@code <url>} entries come
 * back as rows.
 *
 * <p>Each expected row holds an empty {@code attributes} map followed by the {@code loc},
 * {@code lastmod}, {@code changefreq} and {@code priority} values as nullable VARCHARs.
 */
@Test
public void testAttributesOnRootWithNamespace() throws Exception {
  final String query = "SELECT * FROM table(cp.`xml/sitemap.xml` (type => 'xml', dataLevel => 2))";
  final RowSet actualRows = client.queryBuilder().sql(query).rowSet();

  // One required map for attributes, then one nullable VARCHAR per child element of <url>.
  final TupleMetadata sitemapSchema = new SchemaBuilder()
    .add("attributes", MinorType.MAP, DataMode.REQUIRED)
    .addNullable("loc", MinorType.VARCHAR)
    .addNullable("lastmod", MinorType.VARCHAR)
    .addNullable("changefreq", MinorType.VARCHAR)
    .addNullable("priority", MinorType.VARCHAR)
    .build();

  // The <loc> text sits on its own line in the fixture; the expected values carry no
  // surrounding whitespace.
  final RowSet expectedRows = client.rowSetBuilder(sitemapSchema)
    .addRow(
      mapArray(),
      "https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ1.xml",
      "2024-03-28T00:10:00.074Z",
      "monthly",
      "1.0")
    .addRow(
      mapArray(),
      "https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ2.xml",
      "2023-06-20T23:44:00.215Z",
      "monthly",
      "1.0")
    .addRow(
      mapArray(),
      "https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ3.xml",
      "2023-07-03T14:32:01.529Z",
      "monthly",
      "1.0")
    .build();

  // Check the row count first: the comparison below also clears the result set.
  assertEquals(3, actualRows.rowCount());
  new RowSetComparison(expectedRows).verifyAndClearAll(actualRows);
}


@Test
public void testXXE() throws Exception {
String sql = "SELECT * FROM cp.`xml/bad.xml`";
Expand Down
45 changes: 45 additions & 0 deletions contrib/format-xml/src/test/resources/xml/sitemap.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
<url>
<loc>
https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ1.xml
</loc>
<lastmod>2024-03-28T00:10:00.074Z</lastmod>
<changefreq>monthly</changefreq>
<priority>1.0</priority>
</url>
<url>
<loc>
https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ2.xml
</loc>
<lastmod>2023-06-20T23:44:00.215Z</lastmod>
<changefreq>monthly</changefreq>
<priority>1.0</priority>
</url>
<url>
<loc>
https://www.govinfo.gov/bulkdata/PLAW/118/public/PLAW-118publ3.xml
</loc>
<lastmod>2023-07-03T14:32:01.529Z</lastmod>
<changefreq>monthly</changefreq>
<priority>1.0</priority>
</url>
</urlset>

0 comments on commit dda5bc4

Please sign in to comment.