Skip to content

Commit

Permalink
support decoding legacy lists with no inner element (tensorflow#286)
Browse files Browse the repository at this point in the history
  • Loading branch information
aloneguid committed Apr 17, 2023
1 parent 2fd609f commit 3765aef
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 28 deletions.
12 changes: 12 additions & 0 deletions src/Parquet.Test/ParquetReaderOnTestFilesTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -133,5 +133,17 @@ public async Task Read_col_names_with_trailing_dots() {
Table tbl = await ParquetReader.ReadTableFromStreamAsync(s);
Assert.NotNull(tbl);
}

[Fact]
public async Task Read_legacy_list() {
    // Reads the whole row group of the "legacy-list" sample file and checks
    // the number of columns and the flattened data of each column.
    using Stream input = OpenTestFile("special/legacy-list.parquet");
    using ParquetReader reader = await ParquetReader.CreateAsync(input);
    DataColumn[] columns = await reader.ReadEntireRowGroupAsync();

    // expected flattened values, one array per column
    string[] expectedFirst = new[] { "1_0", "1_0" };
    double[] expectedSecond = new[] { 2004d, 2004d };
    // the sequence 0..167 repeated twice back to back
    int[] expectedThird = Enumerable
        .Repeat(Enumerable.Range(0, 168), 2)
        .SelectMany(run => run)
        .ToArray();

    Assert.Equal(3, columns.Length);
    Assert.Equal(expectedFirst, columns[0].Data);
    Assert.Equal(expectedSecond, columns[1].Data);
    Assert.Equal(expectedThird, columns[2].Data);
}
}
}
50 changes: 50 additions & 0 deletions src/Parquet.Test/Schema/SchemaTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
using CT = Parquet.Thrift.ConvertedType;
using System.Numerics;
using Parquet.Encodings;
using Parquet.File;

namespace Parquet.Test.Schema {
public class SchemaTest : TestBase {
Expand Down Expand Up @@ -336,5 +337,54 @@ public void SystemTypeToThriftMapping(Type t, TT expectedTT, CT? expectedCT) {
Assert.Equal(expectedTT, foundTT);
Assert.Equal(expectedCT, foundCT);
}

[Fact]
public void Decode_list_normal() {
    // Three-level layout: outer group annotated as LIST -> repeated group "list"
    // -> required INT32 "element".
    var outer = new Thrift.SchemaElement("my_list") {
        Converted_type = CT.LIST,
        Num_children = 1
    };
    var repeated = new Thrift.SchemaElement("list") {
        Repetition_type = Thrift.FieldRepetitionType.REPEATED,
        Num_children = 1
    };
    var item = new Thrift.SchemaElement("element") {
        Repetition_type = Thrift.FieldRepetitionType.REQUIRED,
        Type = TT.INT32
    };

    ParquetSchema schema = ThriftFooter.Parse(outer, repeated, item);

    // the first schema field must come back as a list
    Field first = schema[0];
    if(!(first is ListField list)) {
        Assert.Fail("list expected");
        return;
    }

    Assert.Equal("my_list", list.Name);
    Assert.Equal("element", list.Item.Name);
}

[Fact]
public void Decode_list_legacy_no_mid_group() {
    // Same elements as the regular list layout, except that the outer LIST group
    // does not set Num_children.
    // NOTE(review): the "list" element below still declares Num_children = 1, so
    // this schema does contain a repeated middle group - confirm that this input
    // really exercises the legacy (no mid group) decoding path the name refers to.
    var outer = new Thrift.SchemaElement("my_list") {
        Converted_type = CT.LIST
    };
    var repeated = new Thrift.SchemaElement("list") {
        Repetition_type = Thrift.FieldRepetitionType.REPEATED,
        Num_children = 1
    };
    var item = new Thrift.SchemaElement("element") {
        Repetition_type = Thrift.FieldRepetitionType.REQUIRED,
        Type = TT.INT32
    };

    ParquetSchema schema = ThriftFooter.Parse(outer, repeated, item);

    // the first schema field must come back as a list
    Field first = schema[0];
    if(!(first is ListField list)) {
        Assert.Fail("list expected");
        return;
    }

    Assert.Equal("my_list", list.Name);
    Assert.Equal("element", list.Item.Name);
}
}
}
Binary file added src/Parquet.Test/data/special/legacy-list.parquet
Binary file not shown.
35 changes: 16 additions & 19 deletions src/Parquet/Encodings/SchemaEncoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -169,22 +169,28 @@ static bool TryBuildList(List<Thrift.SchemaElement> schema,
ref int index, out int ownedChildren,
out ListField? field) {

Thrift.SchemaElement se = schema[index];
Thrift.SchemaElement outerGroup = schema[index];

if(!(se.__isset.converted_type && se.Converted_type == Thrift.ConvertedType.LIST)) {
if(!(outerGroup.__isset.converted_type && outerGroup.Converted_type == Thrift.ConvertedType.LIST)) {
ownedChildren = 0;
field = null;
return false;
}

Thrift.SchemaElement tseList = schema[index];
field = ListField.CreateWithNoItem(tseList.Name, tseList.Repetition_type != FieldRepetitionType.REQUIRED);
field = ListField.CreateWithNoItem(outerGroup.Name, outerGroup.Repetition_type != FieldRepetitionType.REQUIRED);

//https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules
Thrift.SchemaElement tseRepeated = schema[index + 1];
Thrift.SchemaElement midGroup = schema[index + 1];
bool midIsGroup = midGroup.Num_children > 0;

// Rule 1. If the repeated field is not a group, then its type is the element type and elements are required.
// todo: not implemented
if(!midIsGroup) {
field.Path = new FieldPath(outerGroup.Name);
field.ThriftSchemaElement = outerGroup;
index += 1; //only skip this element
ownedChildren = 1; // next element is list's item
return true;
}

// Rule 2. If the repeated field is a group with multiple fields, then its type is the element type and elements are required.
// todo: not implemented
Expand All @@ -194,20 +200,11 @@ static bool TryBuildList(List<Thrift.SchemaElement> schema,
// type and elements are required.
// todo: not implemented fully, only "array"

// "group with one field and is named either array":
if(tseList.Num_children == 1 && tseRepeated.Name == "array") {
field.Path = new FieldPath(tseList.Name);
index += 1; //only skip this element
ownedChildren = 1;
return true;
}

// Normal "modern" LIST:
//as we are skipping elements set path hint
Thrift.SchemaElement tseRepeatedGroup = schema[index + 1];
field.Path = new FieldPath(tseList.Name, tseRepeatedGroup.Name);
field.ThriftSchemaElement = se;
field.GroupSchemaElement = tseRepeatedGroup;
// as we are skipping elements set path hint
field.Path = new FieldPath(outerGroup.Name, midGroup.Name);
field.ThriftSchemaElement = outerGroup;
field.GroupSchemaElement = midGroup;
index += 2; //skip this element and child container
ownedChildren = 1; //we should get this element assigned back
return true;
Expand Down
12 changes: 12 additions & 0 deletions src/Parquet/File/ThriftFooter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,18 @@ public ThriftFooter(Thrift.FileMetaData fileMeta) {
_tree = new ThriftSchemaTree(_fileMeta.Schema);
}

/// <summary>
/// Builds a <see cref="ParquetSchema"/> from raw Thrift schema elements by wrapping
/// them in file metadata and running the regular footer-to-model conversion.
/// </summary>
/// <param name="elements">
/// Schema elements in the order they should appear after the root element.
/// </param>
/// <returns>The schema produced by <c>CreateModelSchema</c>.</returns>
internal static ParquetSchema Parse(params Thrift.SchemaElement[] elements) {

    // a "root" element declaring a single child is always placed first
    var root = new Thrift.SchemaElement("root") { Num_children = 1 };

    var schemaElements = new List<Thrift.SchemaElement>();
    schemaElements.Add(root);
    schemaElements.AddRange(elements);

    var fileMeta = new Thrift.FileMetaData {
        Schema = schemaElements
    };

    var footer = new ThriftFooter(fileMeta);
    return footer.CreateModelSchema(null);
}

public ThriftFooter(ParquetSchema schema, long totalRowCount) {
if(schema == null) {
throw new ArgumentNullException(nameof(schema));
Expand Down
29 changes: 20 additions & 9 deletions src/Parquet/Schema/ListField.cs
Original file line number Diff line number Diff line change
Expand Up @@ -96,16 +96,27 @@ internal override FieldPath? PathPrefix {

internal override void PropagateLevels(int parentRepetitionLevel, int parentDefinitionLevel) {

// both get
MaxDefinitionLevel = parentDefinitionLevel;
MaxRepetitionLevel = parentRepetitionLevel + 1; // because it's repeated ;)

if(IsNullable) {
MaxDefinitionLevel++;
}

if(GroupSchemaElement == null || GroupSchemaElement.Repetition_type != Thrift.FieldRepetitionType.REQUIRED) {
MaxDefinitionLevel++;
MaxRepetitionLevel = parentRepetitionLevel;

if(ThriftSchemaElement != null) {
// building from file
if(IsNullable)
MaxDefinitionLevel += 1;

if(GroupSchemaElement != null) {
if(GroupSchemaElement.Repetition_type != Thrift.FieldRepetitionType.REQUIRED)
MaxDefinitionLevel += 1;

MaxRepetitionLevel += 1;
}
} else {
// probably building manually
if(IsNullable)
MaxDefinitionLevel += 1;

MaxDefinitionLevel += 1; // assuming optional group
MaxRepetitionLevel += 1; // assuming non-legacy lists, which have repeated group
}

//push to child item
Expand Down

0 comments on commit 3765aef

Please sign in to comment.