Skip to content

Commit

Permalink
[Kernel] Add type widening to supported table features (#3656)
Browse files Browse the repository at this point in the history
## Description
Allow reading and writing to tables that have the type widening table
features enabled (both preview and stable table feature).

Reading:
- The default kernel parquet reader has supported widening conversions since
#3541. Engines may also choose to
implement type widening natively in their parquet reader if they wish.
Writing:
- Nothing to do, type widening doesn't impact the write path - writing
data always uses the latest data schema.

## How was this patch tested?
Added read integration tests.

Tests are based on golden tables. Generating the tables requires Spark
4.0; because Spark master cross-compilation is currently broken, the table
generation code is not included here.
The following steps were used to generate the tables.
1. Create a table with initial data types and insert initial data
2. Enable type widening and schema evolution
3. Insert data with wider type for each column. Column types are
automatically widened during schema evolution.

`type-widening` table:
| Column | Initial type | Widened Type |
| - | - | - |
| byte_long | byte | long |
| int_long | int | long |
| float_double | float | double |
| byte_double | byte | double |
| short_double | short | double |
| int_double | int | double |
| decimal_decimal_same_scale | decimal(10, 2) | decimal(20, 2) |
| decimal_decimal_greater_scale | decimal(10, 2) | decimal(20, 5) |
| byte_decimal | byte | decimal(11, 1) |
| short_decimal | short | decimal(11, 1) |
| int_decimal | int | decimal(11, 1) |
| long_decimal | long | decimal(21, 1) |
| date_timestamp_ntz | date | timestamp_ntz |

`type-widening-nested` table:
| Column | Initial type | Widened Type |
| - | - | - |
| struct | struct<a: int> | struct<a: long> |
| map | map<int, int> | map<long, long> |
| array | array<int> | array<long> |


## Does this PR introduce _any_ user-facing changes?
Yes, it's now possible to read from and write to delta tables with type
widening enabled using kernel.
  • Loading branch information
johanl-db authored Sep 26, 2024
1 parent 6871379 commit 67934f8
Show file tree
Hide file tree
Showing 25 changed files with 130 additions and 4 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"commitInfo":{"timestamp":1727266119620,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"1416"},"engineInfo":"Apache-Spark/3.5.2 Delta-Lake/3.3.0-SNAPSHOT","txnId":"034f1fec-b6d9-4957-93c1-09a19c323fc2"}}
{"metaData":{"id":"43c8feba-0140-4d91-8c16-52f627a79cfe","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"struct\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"map\",\"type\":{\"type\":\"map\",\"keyType\":\"integer\",\"valueType\":\"integer\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"array\",\"type\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1727266118466}}
{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
{"add":{"path":"part-00000-138244f1-b939-40db-a4bd-d57cf3d214d2-c000.snappy.parquet","partitionValues":{},"size":1416,"modificationTime":1727266119587,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"struct\":{\"a\":1}},\"maxValues\":{\"struct\":{\"a\":1}},\"nullCount\":{\"struct\":{\"a\":0},\"map\":0,\"array\":0}}"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"commitInfo":{"timestamp":1727266120320,"operation":"SET TBLPROPERTIES","operationParameters":{"properties":"{\"delta.enableTypeWidening\":\"true\"}"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/3.5.2 Delta-Lake/3.3.0-SNAPSHOT","txnId":"f4db4fbf-a05b-41b4-9049-e1f28a08ec5b"}}
{"metaData":{"id":"43c8feba-0140-4d91-8c16-52f627a79cfe","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"struct\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"map\",\"type\":{\"type\":\"map\",\"keyType\":\"integer\",\"valueType\":\"integer\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"array\",\"type\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableTypeWidening":"true"},"createdTime":1727266118466}}
{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["typeWidening-preview"],"writerFeatures":["typeWidening-preview","appendOnly","invariants"]}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"commitInfo":{"timestamp":1727266121897,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":1,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"1519"},"engineInfo":"Apache-Spark/3.5.2 Delta-Lake/3.3.0-SNAPSHOT","txnId":"55211a5a-9d2b-4367-929a-ac5850f91b78"}}
{"metaData":{"id":"43c8feba-0140-4d91-8c16-52f627a79cfe","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"struct\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"a\",\"type\":\"long\",\"nullable\":true,\"metadata\":{\"delta.typeChanges\":[{\"toType\":\"long\",\"fromType\":\"integer\",\"tableVersion\":2}]}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"map\",\"type\":{\"type\":\"map\",\"keyType\":\"long\",\"valueType\":\"long\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{\"delta.typeChanges\":[{\"toType\":\"long\",\"fromType\":\"integer\",\"tableVersion\":2,\"fieldPath\":\"key\"},{\"toType\":\"long\",\"fromType\":\"integer\",\"tableVersion\":2,\"fieldPath\":\"value\"}]}},{\"name\":\"array\",\"type\":{\"type\":\"array\",\"elementType\":\"long\",\"containsNull\":true},\"nullable\":true,\"metadata\":{\"delta.typeChanges\":[{\"toType\":\"long\",\"fromType\":\"integer\",\"tableVersion\":2,\"fieldPath\":\"element\"}]}}]}","partitionColumns":[],"configuration":{"delta.enableTypeWidening":"true"},"createdTime":1727266118466}}
{"add":{"path":"part-00000-1f777f86-350c-4181-b7ef-73df70847eac-c000.snappy.parquet","partitionValues":{},"size":1519,"modificationTime":1727266121853,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"struct\":{\"a\":9223372036854775807}},\"maxValues\":{\"struct\":{\"a\":9223372036854775807}},\"nullCount\":{\"struct\":{\"a\":0},\"map\":0,\"array\":0}}","defaultRowCommitVersion":2}}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"commitInfo":{"timestamp":1727266110116,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"3694"},"engineInfo":"Apache-Spark/3.5.2 Delta-Lake/3.3.0-SNAPSHOT","txnId":"80c33fca-d936-40cf-81fb-7ef52b67e25b"}}
{"metaData":{"id":"db0018ee-037b-41f7-8266-85058ceafb06","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"byte_long\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int_long\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"float_double\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}},{\"name\":\"byte_double\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}},{\"name\":\"short_double\",\"type\":\"short\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int_double\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_decimal_same_scale\",\"type\":\"decimal(10,2)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_decimal_greater_scale\",\"type\":\"decimal(10,2)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"byte_decimal\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}},{\"name\":\"short_decimal\",\"type\":\"short\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int_decimal\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"long_decimal\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"date_timestamp_ntz\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1727266102938}}
{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
{"add":{"path":"part-00000-1045efe0-45bb-4b99-9f83-5ffa04a63ab2-c000.snappy.parquet","partitionValues":{},"size":3694,"modificationTime":1727266109760,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"byte_long\":1,\"int_long\":2,\"float_double\":3.4,\"byte_double\":5,\"short_double\":6,\"int_double\":7,\"decimal_decimal_same_scale\":123.45,\"decimal_decimal_greater_scale\":67.89,\"byte_decimal\":1,\"short_decimal\":2,\"int_decimal\":3,\"long_decimal\":4,\"date_timestamp_ntz\":\"2024-09-09\"},\"maxValues\":{\"byte_long\":1,\"int_long\":2,\"float_double\":3.4,\"byte_double\":5,\"short_double\":6,\"int_double\":7,\"decimal_decimal_same_scale\":123.45,\"decimal_decimal_greater_scale\":67.89,\"byte_decimal\":1,\"short_decimal\":2,\"int_decimal\":3,\"long_decimal\":4,\"date_timestamp_ntz\":\"2024-09-09\"},\"nullCount\":{\"byte_long\":0,\"int_long\":0,\"float_double\":0,\"byte_double\":0,\"short_double\":0,\"int_double\":0,\"decimal_decimal_same_scale\":0,\"decimal_decimal_greater_scale\":0,\"byte_decimal\":0,\"short_decimal\":0,\"int_decimal\":0,\"long_decimal\":0,\"date_timestamp_ntz\":0}}"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"commitInfo":{"timestamp":1727266114275,"operation":"SET TBLPROPERTIES","operationParameters":{"properties":"{\"delta.enableTypeWidening\":\"true\",\"delta.feature.timestampntz\":\"supported\"}"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/3.5.2 Delta-Lake/3.3.0-SNAPSHOT","txnId":"7b869171-851a-4a8d-96e6-baee5496b98f"}}
{"metaData":{"id":"db0018ee-037b-41f7-8266-85058ceafb06","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"byte_long\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int_long\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"float_double\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}},{\"name\":\"byte_double\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}},{\"name\":\"short_double\",\"type\":\"short\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int_double\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_decimal_same_scale\",\"type\":\"decimal(10,2)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal_decimal_greater_scale\",\"type\":\"decimal(10,2)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"byte_decimal\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}},{\"name\":\"short_decimal\",\"type\":\"short\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int_decimal\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"long_decimal\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"date_timestamp_ntz\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableTypeWidening":"true"},"createdTime":1727266102938}}
{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["timestampNtz","typeWidening-preview"],"writerFeatures":["timestampNtz","typeWidening-preview","appendOnly","invariants"]}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"commitInfo":{"timestamp":1727266116833,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":1,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"4059"},"engineInfo":"Apache-Spark/3.5.2 Delta-Lake/3.3.0-SNAPSHOT","txnId":"2e63edee-6d96-4d12-90af-85b90f4fa9e5"}}
{"metaData":{"id":"db0018ee-037b-41f7-8266-85058ceafb06","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"byte_long\",\"type\":\"long\",\"nullable\":true,\"metadata\":{\"delta.typeChanges\":[{\"toType\":\"long\",\"fromType\":\"byte\",\"tableVersion\":2}]}},{\"name\":\"int_long\",\"type\":\"long\",\"nullable\":true,\"metadata\":{\"delta.typeChanges\":[{\"toType\":\"long\",\"fromType\":\"integer\",\"tableVersion\":2}]}},{\"name\":\"float_double\",\"type\":\"double\",\"nullable\":true,\"metadata\":{\"delta.typeChanges\":[{\"toType\":\"double\",\"fromType\":\"float\",\"tableVersion\":2}]}},{\"name\":\"byte_double\",\"type\":\"double\",\"nullable\":true,\"metadata\":{\"delta.typeChanges\":[{\"toType\":\"double\",\"fromType\":\"byte\",\"tableVersion\":2}]}},{\"name\":\"short_double\",\"type\":\"double\",\"nullable\":true,\"metadata\":{\"delta.typeChanges\":[{\"toType\":\"double\",\"fromType\":\"short\",\"tableVersion\":2}]}},{\"name\":\"int_double\",\"type\":\"double\",\"nullable\":true,\"metadata\":{\"delta.typeChanges\":[{\"toType\":\"double\",\"fromType\":\"integer\",\"tableVersion\":2}]}},{\"name\":\"decimal_decimal_same_scale\",\"type\":\"decimal(20,2)\",\"nullable\":true,\"metadata\":{\"delta.typeChanges\":[{\"toType\":\"decimal(20,2)\",\"fromType\":\"decimal(10,2)\",\"tableVersion\":2}]}},{\"name\":\"decimal_decimal_greater_scale\",\"type\":\"decimal(20,5)\",\"nullable\":true,\"metadata\":{\"delta.typeChanges\":[{\"toType\":\"decimal(20,5)\",\"fromType\":\"decimal(10,2)\",\"tableVersion\":2}]}},{\"name\":\"byte_decimal\",\"type\":\"decimal(11,1)\",\"nullable\":true,\"metadata\":{\"delta.typeChanges\":[{\"toType\":\"decimal(11,1)\",\"fromType\":\"byte\",\"tableVersion\":2}]}},{\"name\":\"short_decimal\",\"type\":\"decimal(11,1)\",\"nullable\":true,\"metadata\":{\"delta.typeChanges\":[{\"toType\":\"decimal(11,1)\",\"fromType\":\"short\",\"tableVersion\":2}]}},{\"name\":\"int_decimal\",\"type\":\"decimal(11,1)\"
,\"nullable\":true,\"metadata\":{\"delta.typeChanges\":[{\"toType\":\"decimal(11,1)\",\"fromType\":\"integer\",\"tableVersion\":2}]}},{\"name\":\"long_decimal\",\"type\":\"decimal(21,1)\",\"nullable\":true,\"metadata\":{\"delta.typeChanges\":[{\"toType\":\"decimal(21,1)\",\"fromType\":\"long\",\"tableVersion\":2}]}},{\"name\":\"date_timestamp_ntz\",\"type\":\"timestamp_ntz\",\"nullable\":true,\"metadata\":{\"delta.typeChanges\":[{\"toType\":\"timestamp_ntz\",\"fromType\":\"date\",\"tableVersion\":2}]}}]}","partitionColumns":[],"configuration":{"delta.enableTypeWidening":"true"},"createdTime":1727266102938}}
{"add":{"path":"part-00000-cd317895-4ae0-4292-b918-62d4ca832bd7-c000.snappy.parquet","partitionValues":{},"size":4059,"modificationTime":1727266116789,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"byte_long\":9223372036854775807,\"int_long\":9223372036854775807,\"float_double\":1.234567890123,\"byte_double\":1.234567890123,\"short_double\":1.234567890123,\"int_double\":1.234567890123,\"decimal_decimal_same_scale\":12345678901234.56,\"decimal_decimal_greater_scale\":12345678901.23456,\"byte_decimal\":123.4,\"short_decimal\":12345.6,\"int_decimal\":1234567890.1,\"long_decimal\":123456789012345678.9,\"date_timestamp_ntz\":\"2024-09-09T12:34:56.123\"},\"maxValues\":{\"byte_long\":9223372036854775807,\"int_long\":9223372036854775807,\"float_double\":1.234567890123,\"byte_double\":1.234567890123,\"short_double\":1.234567890123,\"int_double\":1.234567890123,\"decimal_decimal_same_scale\":12345678901234.56,\"decimal_decimal_greater_scale\":12345678901.23456,\"byte_decimal\":123.4,\"short_decimal\":12345.6,\"int_decimal\":1234567890.1,\"long_decimal\":123456789012345678.9,\"date_timestamp_ntz\":\"2024-09-09T12:34:56.123\"},\"nullCount\":{\"byte_long\":0,\"int_long\":0,\"float_double\":0,\"byte_double\":0,\"short_double\":0,\"int_double\":0,\"decimal_decimal_same_scale\":0,\"decimal_decimal_greater_scale\":0,\"byte_decimal\":0,\"short_decimal\":0,\"int_decimal\":0,\"long_decimal\":0,\"date_timestamp_ntz\":0}}","defaultRowCommitVersion":2}}
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,9 @@ private static String getValue(Row row, int columnOrdinal) {
return LocalDate.ofEpochDay(daysSinceEpochUTC).toString();
} else if (dataType instanceof LongType) {
return Long.toString(row.getLong(columnOrdinal));
} else if (dataType instanceof TimestampType) {
// TimestampType data is stored internally as the number of microseconds since epoch
} else if (dataType instanceof TimestampType || dataType instanceof TimestampNTZType) {
// Timestamps are stored internally as the number of microseconds since epoch.
// TODO: TimestampType should use the session timezone to display values.
long microSecsSinceEpochUTC = row.getLong(columnOrdinal);
LocalDateTime dateTime = LocalDateTime.ofEpochSecond(
microSecsSinceEpochUTC / 1_000_000 /* epochSecond */,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,22 @@ public void runTests() throws Exception {
asList(new Column("as_int"), Literal.ofInt(1))
)),
0 /* expected row count */);

// Type widening: table with various type changes.
runAndVerifyRowCount(
"type_widening",
"type-widening",
Optional.empty(), /* read schema - read all columns */
Optional.empty(), /* predicate */
2 /* expected row count */);

// Type widening: table with type changes inside nested struct/array/map.
runAndVerifyRowCount(
"type_widening_nested",
"type-widening-nested",
Optional.empty(), /* read schema - read all columns */
Optional.empty(), /* predicate */
2 /* expected row count */);
}

private void runAndVerifyRowCount(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ public class TableFeatures {
add("appendOnly");
add("inCommitTimestamp");
add("columnMapping");
add("typeWidening-preview");
add("typeWidening");
}
});

Expand All @@ -47,6 +49,8 @@ public class TableFeatures {
add("columnMapping");
add("deletionVectors");
add("timestampNtz");
add("typeWidening-preview");
add("typeWidening");
add("vacuumProtocolCheck");
add("variantType-preview");
add("v2Checkpoint");
Expand Down Expand Up @@ -89,7 +93,7 @@ public static void validateReadSupportedTable(
* <li>protocol writer version 1.
* <li>protocol writer version 2 only with appendOnly feature enabled.
* <li>protocol writer version 7 with {@code appendOnly}, {@code inCommitTimestamp}, {@code
* columnMapping} feature enabled.
* columnMapping}, {@code typeWidening} feature enabled.
* </ul>
*
* @param protocol Table protocol
Expand Down Expand Up @@ -129,6 +133,10 @@ public static void validateWriteSupportedTable(
break;
case "columnMapping":
break;
case "typeWidening-preview":
break;
case "typeWidening":
break;
default:
throw unsupportedWriterFeature(tablePath, writerFeature);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ class TableFeaturesSuite extends AnyFunSuite {
checkSupported(createTestProtocol(minWriterVersion = 7))
}

Seq("appendOnly", "inCommitTimestamp", "columnMapping")
Seq("appendOnly", "inCommitTimestamp", "columnMapping", "typeWidening-preview", "typeWidening")
.foreach { supportedWriterFeature =>
test(s"validateWriteSupported: protocol 7 with $supportedWriterFeature") {
checkSupported(createTestProtocol(minWriterVersion = 7, supportedWriterFeature))
Expand Down
Loading

0 comments on commit 67934f8

Please sign in to comment.