From 876df3433b7d4135c056ae0e3f1c69390f77b5d8 Mon Sep 17 00:00:00 2001 From: IG Date: Wed, 4 Dec 2024 11:16:28 +0000 Subject: [PATCH] stricter schema validation on deserialization (#573) --- docs/rn/5.0.3.md | 1 + .../Serialisation/ParquetSerializerTest.cs | 43 ++++++++++++------ .../data/special/no-logical-type.parquet | Bin 0 -> 505 bytes .../Serialization/ParquetSerializer.cs | 17 +++++++ 4 files changed, 48 insertions(+), 13 deletions(-) create mode 100644 src/Parquet.Test/data/special/no-logical-type.parquet diff --git a/docs/rn/5.0.3.md b/docs/rn/5.0.3.md index 52ec3778..0d38ef49 100644 --- a/docs/rn/5.0.3.md +++ b/docs/rn/5.0.3.md @@ -2,6 +2,7 @@ - Class-reflected schemas for `map` complex types will generate key/value properties with "key" and "value" names, respectively. This is required in order to deserialise externally generated parquet files with dictionaries. - Updated dependent packages. +- Class deserializer will check for type compatibility when deserialising incompatible types, which will prevent accidental data loss or narrowing. Thanks to @dkotov in #573. # Floor diff --git a/src/Parquet.Test/Serialisation/ParquetSerializerTest.cs b/src/Parquet.Test/Serialisation/ParquetSerializerTest.cs index 431a99f0..85656583 100644 --- a/src/Parquet.Test/Serialisation/ParquetSerializerTest.cs +++ b/src/Parquet.Test/Serialisation/ParquetSerializerTest.cs @@ -1147,23 +1147,40 @@ class EdgeCaseInt32 { public int Id { get; set; } } + class EdgeCaseInt32Optional { + public int? Id { get; set; } + } + + class EdgeCaseInt64Optional { + public long? Id { get; set; } + } + + /// + /// This shoudl throw InvalidCastException as the raw Int64 is being cast to Int32 + /// + /// [Fact] - public async Task EdgeCase_rawint64_to_classInt32() { - var schema = new ParquetSchema(new DataField("Id")); - using var ms = new MemoryStream(); - using(ParquetWriter writer = await ParquetWriter.CreateAsync(schema, ms)) { - using(ParquetRowGroupWriter rg = writer.CreateRowGroup()) { - await rg.WriteColumnAsync(new DataColumn(schema.DataFields[0], new long[] { 1, 2, 3 })); - } - } - ms.Position = 0; + public async Task EdgeCase_RawInt64_to_Int32() { + + using Stream testFile = OpenTestFile("special/no-logical-type.parquet"); + + await Assert.ThrowsAsync(async () => { + await ParquetSerializer.DeserializeAsync(testFile); + }); + } - IList data = await ParquetSerializer.DeserializeAsync(ms); + [Fact] + public async Task EdgeCase_Int64() { - Assert.Equal(1, data[0].Id); - Assert.Equal(2, data[1].Id); - Assert.Equal(3, data[2].Id); + IList r = await ParquetSerializer.DeserializeAsync( + OpenTestFile("special/no-logical-type.parquet")); + Assert.NotNull(r); + Assert.Equal(3, r.Count); + Assert.Equal(1, r[0].Id); + Assert.Equal(2, r[1].Id); + Assert.Equal(3, r[2].Id); } + } } diff --git a/src/Parquet.Test/data/special/no-logical-type.parquet b/src/Parquet.Test/data/special/no-logical-type.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a7b499adf1e687a37de93ad92997f3d2f6699fc8 GIT binary patch literal 505 zcmZWmO-sW-5S^qoMi4=8mjr^81xpL*lB8dz6v3Ol6%QU1kxh52)ubOusz~Xb+q++>40l?b8(DIV#T-jhHL+p(Rp>5!t` z1jDpfEYYb!@2M=C-Ob*O8HLK9+lEC;jE!~x^g|;DG$tp2Ta2z(XLFVFbn?U#l?y3yVO=a({ybHz!?HY`4|3+R z;Yh_YW1Xu78;fKm^(4~kws _schemaToStriper = new(); private static readonly ConcurrentDictionary _typeToAssembler = new(); private static readonly ConcurrentDictionary _schemaToAssembler = new(); + private static readonly Dictionary> AllowedDeserializerConversions = new() { +#if NET8_0_OR_GREATER + { typeof(DateOnly), new HashSet{ typeof(DateTime) } }, + { typeof(TimeOnly), new HashSet{ typeof(TimeSpan) } }, +#endif + }; private static async Task SerializeRowGroupAsync(ParquetWriter writer, Striper striper, IEnumerable objectInstances, @@ -513,6 +519,17 @@ private static async Task DeserializeRowGroupAsync(ParquetReader reader, int rgi if(fileField.MaxRepetitionLevel != assemblerField.MaxRepetitionLevel) throw new InvalidDataException($"class repetition level ({assemblerField.MaxRepetitionLevel}) does not match file's repetition level ({fileField.MaxRepetitionLevel}) in field '{assemblerField.Path}'. This usually means collection in class definition is incompatible."); + if(fileField.ClrType != assemblerField.ClrType) { + + // check if this is one of the allowed conversions + bool isStillAllowed = + AllowedDeserializerConversions.TryGetValue(assemblerField.ClrType, out HashSet? allowedConversions) && + allowedConversions.Contains(fileField.ClrType); + + if(!isStillAllowed) + throw new InvalidCastException($"class type ({assemblerField.ClrType}) does not match file's type ({fileField.ClrType}) in field '{assemblerField.Path}'"); + } + // make final result DataField r = (DataField)assemblerField.Clone();