From 3765aefada5b79accc54a65683a932c8497b4c8d Mon Sep 17 00:00:00 2001 From: IG Date: Mon, 17 Apr 2023 11:05:25 +0100 Subject: [PATCH] support decoding legacy lists with no inner element (#286) --- .../ParquetReaderOnTestFilesTest.cs | 12 +++++ src/Parquet.Test/Schema/SchemaTest.cs | 50 ++++++++++++++++++ .../data/special/legacy-list.parquet | Bin 0 -> 1872 bytes src/Parquet/Encodings/SchemaEncoder.cs | 35 ++++++------ src/Parquet/File/ThriftFooter.cs | 12 +++++ src/Parquet/Schema/ListField.cs | 29 ++++++---- 6 files changed, 110 insertions(+), 28 deletions(-) create mode 100644 src/Parquet.Test/data/special/legacy-list.parquet diff --git a/src/Parquet.Test/ParquetReaderOnTestFilesTest.cs b/src/Parquet.Test/ParquetReaderOnTestFilesTest.cs index e5e1cc4729..abdb88d7a5 100644 --- a/src/Parquet.Test/ParquetReaderOnTestFilesTest.cs +++ b/src/Parquet.Test/ParquetReaderOnTestFilesTest.cs @@ -133,5 +133,17 @@ public async Task Read_col_names_with_trailing_dots() { Table tbl = await ParquetReader.ReadTableFromStreamAsync(s); Assert.NotNull(tbl); } + + [Fact] + public async Task Read_legacy_list() { + using Stream s = OpenTestFile("special/legacy-list.parquet"); + using ParquetReader r = await ParquetReader.CreateAsync(s); + DataColumn[] cols = await r.ReadEntireRowGroupAsync(); + + Assert.Equal(3, cols.Length); + Assert.Equal(new string[] { "1_0", "1_0" }, cols[0].Data); + Assert.Equal(new double[] { 2004, 2004 }, cols[1].Data); + Assert.Equal(Enumerable.Range(0, 168).Concat(Enumerable.Range(0, 168)).ToArray(), cols[2].Data); + } } } \ No newline at end of file diff --git a/src/Parquet.Test/Schema/SchemaTest.cs b/src/Parquet.Test/Schema/SchemaTest.cs index 4ec7fd2db9..e794b1f119 100644 --- a/src/Parquet.Test/Schema/SchemaTest.cs +++ b/src/Parquet.Test/Schema/SchemaTest.cs @@ -10,6 +10,7 @@ using CT = Parquet.Thrift.ConvertedType; using System.Numerics; using Parquet.Encodings; +using Parquet.File; namespace Parquet.Test.Schema { public class SchemaTest : TestBase { @@ -336,5 +337,54 @@ public void SystemTypeToThriftMapping(Type t, TT expectedTT, CT? expectedCT) { Assert.Equal(expectedTT, foundTT); Assert.Equal(expectedCT, foundCT); } + + [Fact] + public void Decode_list_normal() { + ParquetSchema schema = ThriftFooter.Parse( + new Thrift.SchemaElement("my_list") { + Converted_type = CT.LIST, + Num_children = 1 + }, + new Thrift.SchemaElement("list") { + Repetition_type = Thrift.FieldRepetitionType.REPEATED, + Num_children = 1 + }, + new Thrift.SchemaElement("element") { + Repetition_type = Thrift.FieldRepetitionType.REQUIRED, + Type = TT.INT32 + }); + + Field f = schema[0]; + if(f is ListField lf) { + Assert.Equal("my_list", lf.Name); + Assert.Equal("element", lf.Item.Name); + } else { + Assert.Fail("list expected"); + } + } + + [Fact] + public void Decode_list_legacy_no_mid_group() { + ParquetSchema schema = ThriftFooter.Parse( + new Thrift.SchemaElement("my_list") { + Converted_type = CT.LIST + }, + new Thrift.SchemaElement("list") { + Repetition_type = Thrift.FieldRepetitionType.REPEATED, + Num_children = 1 + }, + new Thrift.SchemaElement("element") { + Repetition_type = Thrift.FieldRepetitionType.REQUIRED, + Type = TT.INT32 + }); + + Field f = schema[0]; + if(f is ListField lf) { + Assert.Equal("my_list", lf.Name); + Assert.Equal("element", lf.Item.Name); + } else { + Assert.Fail("list expected"); + } + } } } \ No newline at end of file diff --git a/src/Parquet.Test/data/special/legacy-list.parquet b/src/Parquet.Test/data/special/legacy-list.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5d1af3153300dfc143972f9d8ef91289c291bacf GIT binary patch literal 1872 zcmajg$x{8>4pt|4T-rLx?BQs56e>v1{hID$M2%u%$NS_7g1Mi`142F z!v>wHMbL?Q$zUup6`QA&Of8)@y=+GLOxZF^vCf`jn>%m5eSu@)B4@>7SLKqW?q#0k zUSC!9ik1GF+Cbf^;OaGN>l@as->`Ah<}F(rw>53wv9o#C?mc^3TKDZgaIo#r;Un!w zL*YnN?dUufJANXbNT#|@YUxb2`_$>4Grec~&Yiz-@zUihSNpGBzj5={?K=Z^@7;g! z@X_PJCr_U}fAR9w>!IN{Z{NND@bQzN`#K`<85|L0(Ryf0Z6uZO#FARu7Z2SX6DTl8u5~;uI8fdMw@^i#nN3GhO;nG{Z}x z6pd9(b6AlqIwi#r(zH;IV3DjgYp}v(lS`~l$z=Vuh%8zp%Q;KGuEG?Y4>}^jiIZxgHOWc#FQq?=HfG8x8Lh7I0gVlVn+zK) z17()qN-I8#!T;Dc-&eK#RBn+3t6M4W3TY>^YQ_`l)>58yi(p{lQF6;jmmGq978jq$Ob#1-lr6(4WaQ^n`j226FI{Ms& zEM|8i98G1zarN79e#r|dg@KH&Jl(O3n&7sMSTf`2>pR>hC}pQKI<@?E%Z*c!a schema, ref int index, out int ownedChildren, out ListField? field) { - Thrift.SchemaElement se = schema[index]; + Thrift.SchemaElement outerGroup = schema[index]; - if(!(se.__isset.converted_type && se.Converted_type == Thrift.ConvertedType.LIST)) { + if(!(outerGroup.__isset.converted_type && outerGroup.Converted_type == Thrift.ConvertedType.LIST)) { ownedChildren = 0; field = null; return false; } - Thrift.SchemaElement tseList = schema[index]; - field = ListField.CreateWithNoItem(tseList.Name, tseList.Repetition_type != FieldRepetitionType.REQUIRED); + field = ListField.CreateWithNoItem(outerGroup.Name, outerGroup.Repetition_type != FieldRepetitionType.REQUIRED); //https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules - Thrift.SchemaElement tseRepeated = schema[index + 1]; + Thrift.SchemaElement midGroup = schema[index + 1]; + bool midIsGroup = midGroup.Num_children > 0; // Rule 1. If the repeated field is not a group, then its type is the element type and elements are required. - // todo: not implemented + if(!midIsGroup) { + field.Path = new FieldPath(outerGroup.Name); + field.ThriftSchemaElement = outerGroup; + index += 1; //only skip this element + ownedChildren = 1; // next element is list's item + return true; + } // Rule 2. If the repeated field is a group with multiple fields, then its type is the element type and elements are required. // todo: not implemented @@ -194,20 +200,11 @@ static bool TryBuildList(List schema, // type and elements are required. // todo: not implemented fully, only "array" - // "group with one field and is named either array": - if(tseList.Num_children == 1 && tseRepeated.Name == "array") { - field.Path = new FieldPath(tseList.Name); - index += 1; //only skip this element - ownedChildren = 1; - return true; - } - // Normal "modern" LIST: - //as we are skipping elements set path hint - Thrift.SchemaElement tseRepeatedGroup = schema[index + 1]; - field.Path = new FieldPath(tseList.Name, tseRepeatedGroup.Name); - field.ThriftSchemaElement = se; - field.GroupSchemaElement = tseRepeatedGroup; + // as we are skipping elements set path hint + field.Path = new FieldPath(outerGroup.Name, midGroup.Name); + field.ThriftSchemaElement = outerGroup; + field.GroupSchemaElement = midGroup; index += 2; //skip this element and child container ownedChildren = 1; //we should get this element assigned back return true; diff --git a/src/Parquet/File/ThriftFooter.cs b/src/Parquet/File/ThriftFooter.cs index b22be1acf7..10fb1b9407 100644 --- a/src/Parquet/File/ThriftFooter.cs +++ b/src/Parquet/File/ThriftFooter.cs @@ -24,6 +24,18 @@ public ThriftFooter(Thrift.FileMetaData fileMeta) { _tree = new ThriftSchemaTree(_fileMeta.Schema); } + internal static ParquetSchema Parse(params Thrift.SchemaElement[] elements) { + + var slst = new List { + new Thrift.SchemaElement("root") { Num_children = 1 }, + }; + slst.AddRange(elements); + + return new ThriftFooter(new Thrift.FileMetaData { + Schema = slst + }).CreateModelSchema(null); + } + public ThriftFooter(ParquetSchema schema, long totalRowCount) { if(schema == null) { throw new ArgumentNullException(nameof(schema)); diff --git a/src/Parquet/Schema/ListField.cs b/src/Parquet/Schema/ListField.cs index bdf6e7c2fe..54ccd4778a 100644 --- a/src/Parquet/Schema/ListField.cs +++ b/src/Parquet/Schema/ListField.cs @@ -96,16 +96,27 @@ internal override FieldPath? PathPrefix { internal override void PropagateLevels(int parentRepetitionLevel, int parentDefinitionLevel) { - // both get MaxDefinitionLevel = parentDefinitionLevel; - MaxRepetitionLevel = parentRepetitionLevel + 1; // because it's repeated ;) - - if(IsNullable) { - MaxDefinitionLevel++; - } - - if(GroupSchemaElement == null || GroupSchemaElement.Repetition_type != Thrift.FieldRepetitionType.REQUIRED) { - MaxDefinitionLevel++; + MaxRepetitionLevel = parentRepetitionLevel; + + if(ThriftSchemaElement != null) { + // building from file + if(IsNullable) + MaxDefinitionLevel += 1; + + if(GroupSchemaElement != null) { + if(GroupSchemaElement.Repetition_type != Thrift.FieldRepetitionType.REQUIRED) + MaxDefinitionLevel += 1; + + MaxRepetitionLevel += 1; + } + } else { + // probably building manually + if(IsNullable) + MaxDefinitionLevel += 1; + + MaxDefinitionLevel += 1; // assuming optional group + MaxRepetitionLevel += 1; // assuming non-legacy lists, which have repeated group } //push to child item