From 9dcece4719c17490d606bc2088188a7faa706800 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Thu, 27 Mar 2025 13:19:38 -0700
Subject: [PATCH 1/6] fix small type

---
 pyiceberg/io/pyarrow.py          |  6 +--
 tests/io/test_pyarrow.py         | 90 ++++++++++++++++----------------
 tests/io/test_pyarrow_visitor.py |  4 +-
 tests/test_schema.py             |  2 +-
 4 files changed, 51 insertions(+), 51 deletions(-)

diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 522af0f344..5f88a37218 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -625,7 +625,7 @@ def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field:
 
     def list(self, list_type: ListType, element_result: pa.DataType) -> pa.DataType:
         element_field = self.field(list_type.element_field, element_result)
-        return pa.large_list(value_type=element_field)
+        return pa.list_(value_type=element_field)
 
     def map(self, map_type: MapType, key_result: pa.DataType, value_result: pa.DataType) -> pa.DataType:
         key_field = self.field(map_type.key_field, key_result)
@@ -675,7 +675,7 @@ def visit_timestamptz_ns(self, _: TimestamptzNanoType) -> pa.DataType:
         return pa.timestamp(unit="ns", tz="UTC")
 
     def visit_string(self, _: StringType) -> pa.DataType:
-        return pa.large_string()
+        return pa.string()
 
     def visit_uuid(self, _: UUIDType) -> pa.DataType:
         return pa.binary(16)
@@ -684,7 +684,7 @@ def visit_unknown(self, _: UnknownType) -> pa.DataType:
         return pa.null()
 
     def visit_binary(self, _: BinaryType) -> pa.DataType:
-        return pa.large_binary()
+        return pa.binary()
 
 
 def _convert_scalar(value: Any, iceberg_type: IcebergType) -> pa.scalar:
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
index e90f3a46fc..d63043b13c 100644
--- a/tests/io/test_pyarrow.py
+++ b/tests/io/test_pyarrow.py
@@ -406,7 +406,7 @@ def test_pyarrow_unified_session_properties() -> None:
 
 def test_schema_to_pyarrow_schema_include_field_ids(table_schema_nested: Schema) -> None:
     actual = schema_to_pyarrow(table_schema_nested)
-    expected = """foo: large_string
+    expected = """foo: string
   -- field metadata --
   PARQUET:field_id: '1'
 bar: int32 not null
@@ -415,20 +415,20 @@ def test_schema_to_pyarrow_schema_include_field_ids(table_schema_nested: Schema)
 baz: bool
   -- field metadata --
   PARQUET:field_id: '3'
-qux: large_list<element: large_string not null> not null
-  child 0, element: large_string not null
+qux: list<element: string not null> not null
+  child 0, element: string not null
     -- field metadata --
     PARQUET:field_id: '5'
   -- field metadata --
   PARQUET:field_id: '4'
-quux: map<large_string, map<large_string, int32>> not null
-  child 0, entries: struct<key: large_string not null, value: map<large_string, int32> not null> not null
-      child 0, key: large_string not null
+quux: map<string, map<string, int32>> not null
+  child 0, entries: struct<key: string not null, value: map<string, int32> not null> not null
+      child 0, key: string not null
       -- field metadata --
       PARQUET:field_id: '7'
-      child 1, value: map<large_string, int32> not null
-          child 0, entries: struct<key: large_string not null, value: int32 not null> not null
-              child 0, key: large_string not null
+      child 1, value: map<string, int32> not null
+          child 0, entries: struct<key: string not null, value: int32 not null> not null
+              child 0, key: string not null
               -- field metadata --
               PARQUET:field_id: '9'
               child 1, value: int32 not null
               -- field metadata --
               PARQUET:field_id: '10'
           -- field metadata --
           PARQUET:field_id: '8'
   -- field metadata --
   PARQUET:field_id: '6'
-location: large_list<element: struct<latitude: float, longitude: float> not null> not null
+location: list<element: struct<latitude: float, longitude: float> not null> not null
   child 0, element: struct<latitude: float, longitude: float> not null
       child 0, latitude: float
       -- field metadata --
       PARQUET:field_id: '13'
       child 1, longitude: float
       -- field metadata --
       PARQUET:field_id: '14'
   -- field metadata --
   PARQUET:field_id: '12'
   -- field metadata --
   PARQUET:field_id: '11'
-person: struct<name: large_string, age: int32 not null>
-  child 0, name: large_string
+person: struct<name: string, age: int32 not null>
+  child 0, name: string
      -- field metadata --
      PARQUET:field_id: '16'
   child 1, age: int32 not null
      -- field metadata --
      PARQUET:field_id: '17'
   -- field metadata --
   PARQUET:field_id: '15'"""
@@ -464,24 +464,24 @@ def test_schema_to_pyarrow_schema_exclude_field_ids(table_schema_nested: Schema)
     actual = schema_to_pyarrow(table_schema_nested, include_field_ids=False)
-    expected = """foo: large_string
+    expected = """foo: string
 bar: int32 not null
 baz: bool
-qux: large_list<element: large_string not null> not null
-  child 0, element: large_string not null
-quux: map<large_string, map<large_string, int32>> not null
-  child 0, entries: struct<key: large_string not null, value: map<large_string, int32> not null> not null
-      child 0, key: large_string not null
-      child 1, value: map<large_string, int32> not null
-          child 0, entries: struct<key: large_string not null, value: int32 not null> not null
-              child 0, key: large_string not null
+qux: list<element: string not null> not null
+  child 0, element: string not null
+quux: map<string, map<string, int32>> not null
+  child 0, entries: struct<key: string not null, value: map<string, int32> not null> not null
+      child 0, key: string not null
+      child 1, value: map<string, int32> not null
+          child 0, entries: struct<key: string not null, value: int32 not null> not null
+              child 0, key: string not null
               child 1, value: int32 not null
-location: large_list<element: struct<latitude: float, longitude: float> not null> not null
+location: list<element: struct<latitude: float, longitude: float> not null> not null
   child 0, element: struct<latitude: float, longitude: float> not null
       child 0, latitude: float
       child 1, longitude: float
-person: struct<name: large_string, age: int32 not null>
-  child 0, name: large_string
+person: struct<name: string, age: int32 not null>
+  child 0, name: string
   child 1, age: int32 not null"""
 
     assert repr(actual) == expected
@@ -546,18 +546,18 @@ def test_timestamptz_type_to_pyarrow() -> None:
 
 def test_string_type_to_pyarrow() -> None:
     iceberg_type = StringType()
-    assert visit(iceberg_type, _ConvertToArrowSchema()) == pa.large_string()
+    assert visit(iceberg_type, _ConvertToArrowSchema()) == pa.string()
 
 
 def test_binary_type_to_pyarrow() -> None:
     iceberg_type = BinaryType()
-    assert visit(iceberg_type, _ConvertToArrowSchema()) == pa.large_binary()
+    assert visit(iceberg_type, _ConvertToArrowSchema()) == pa.binary()
 
 
 def test_struct_type_to_pyarrow(table_schema_simple: Schema) -> None:
     expected = pa.struct(
         [
-            pa.field("foo", pa.large_string(), nullable=True, metadata={"field_id": "1"}),
+            pa.field("foo", pa.string(), nullable=True, metadata={"field_id": "1"}),
             pa.field("bar", pa.int32(), nullable=False, metadata={"field_id": "2"}),
             pa.field("baz", pa.bool_(), nullable=True, metadata={"field_id": "3"}),
         ]
@@ -575,7 +575,7 @@ def test_map_type_to_pyarrow() -> None:
     )
     assert visit(iceberg_map, _ConvertToArrowSchema()) == pa.map_(
         pa.field("key", pa.int32(), nullable=False, metadata={"field_id": "1"}),
-        pa.field("value", pa.large_string(), nullable=False, metadata={"field_id": "2"}),
+        pa.field("value", pa.string(), nullable=False, metadata={"field_id": "2"}),
     )
@@ -585,7 +585,7 @@ def test_list_type_to_pyarrow() -> None:
         element_type=IntegerType(),
         element_required=True,
     )
-    assert visit(iceberg_map, _ConvertToArrowSchema()) == pa.large_list(
+    assert visit(iceberg_map, _ConvertToArrowSchema()) == pa.list_(
         pa.field("element", pa.int32(), nullable=False, metadata={"field_id": "1"})
     )
@@ -668,11 +668,11 @@ def test_expr_less_than_or_equal_to_pyarrow(bound_reference: BoundReference[str]
 
 def test_expr_in_to_pyarrow(bound_reference: BoundReference[str]) -> None:
     assert repr(expression_to_pyarrow(BoundIn(bound_reference, {literal("hello"), literal("world")}))) in (
-        """<pyarrow.compute.Expression is_in(foo, {value_set=large_string:[
+        """<pyarrow.compute.Expression is_in(foo, {value_set=string:[
   "hello",
   "world"
 ], null_matching_behavior=MATCH})>""",
-        """<pyarrow.compute.Expression is_in(foo, {value_set=large_string:[
+        """<pyarrow.compute.Expression is_in(foo, {value_set=string:[
   "world",
   "hello"
 ], null_matching_behavior=MATCH})>""",
@@ -681,11 +681,11 @@ def test_expr_not_in_to_pyarrow(bound_reference: BoundReference[str]) -> None:
     assert repr(expression_to_pyarrow(BoundNotIn(bound_reference, {literal("hello"), literal("world")}))) in (
-        """<pyarrow.compute.Expression invert(is_in(foo, {value_set=large_string:[
+        """<pyarrow.compute.Expression invert(is_in(foo, {value_set=string:[
   "hello",
   "world"
 ], null_matching_behavior=MATCH}))>""",
-        """<pyarrow.compute.Expression invert(is_in(foo, {value_set=large_string:[
+        """<pyarrow.compute.Expression invert(is_in(foo, {value_set=string:[
   "world",
   "hello"
 ], null_matching_behavior=MATCH}))>""",
@@ -1030,12 +1030,12 @@ def test_projection_add_column(file_int: str) -> None:
     assert (
         repr(result_table.schema)
         == """id: int32
-list: large_list<element: int32>
+list: list<element: int32>
   child 0, element: int32
-map: map<int32, large_string>
-  child 0, entries: struct<key: int32 not null, value: large_string> not null
+map: map<int32, string>
+  child 0, entries: struct<key: int32 not null, value: string> not null
     child 0, key: int32 not null
-    child 1, value: large_string
+    child 1, value: string
 location: struct<lat: double, lon: double>
   child 0, lat: double
   child 1, lon: double"""
     )
@@ -1051,7 +1051,7 @@ def test_read_list(schema_list: Schema, file_list: str) -> None:
 
     assert (
         repr(result_table.schema)
-        == """ids: large_list<element: int32>
+        == """ids: list<element: int32>
   child 0, element: int32"""
     )
@@ -1088,10 +1088,10 @@ def test_projection_add_column_struct(schema_int: Schema, file_int: str) -> None
         assert r.as_py() is None
     assert (
         repr(result_table.schema)
-        == """id: map<int32, large_string>
-  child 0, entries: struct<key: int32 not null, value: large_string> not null
+        == """id: map<int32, string>
+  child 0, entries: struct<key: int32 not null, value: string> not null
     child 0, key: int32 not null
-    child 1, value: large_string"""
+    child 1, value: string"""
     )
@@ -1422,7 +1422,7 @@ def test_projection_list_of_structs(schema_list_of_structs: Schema, file_list_of
     ]
     assert (
         repr(result_table.schema)
-        == """locations: large_list<element: struct<latitude: double not null, longitude: double not null>>
+        == """locations: list<element: struct<latitude: double not null, longitude: double not null>>
   child 0, element: struct<latitude: double not null, longitude: double not null>
     child 0, latitude: double not null
     child 1, longitude: double not null
@@ -1569,7 +1569,7 @@ def test_delete(deletes_file: str, example_task: FileScanTask, table_schema_simp
     assert (
         str(with_deletes)
         == """pyarrow.Table
-foo: large_string
+foo: string
 bar: int32 not null
 baz: bool
 ----
@@ -1606,7 +1606,7 @@ def test_delete_duplicates(deletes_file: str, example_task: FileScanTask, table_
     assert (
         str(with_deletes)
         == """pyarrow.Table
-foo: large_string
+foo: string
 bar: int32 not null
 baz: bool
 ----
@@ -1637,7 +1637,7 @@ def test_pyarrow_wrap_fsspec(example_task: FileScanTask, table_schema_simple: Sc
     assert (
         str(projection)
         == """pyarrow.Table
-foo: large_string
+foo: string
 bar: int32 not null
 baz: bool
 ----
diff --git a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py
index 9d5772d01c..6da7219c44 100644
--- a/tests/io/test_pyarrow_visitor.py
+++ b/tests/io/test_pyarrow_visitor.py
@@ -229,14 +229,14 @@ def test_pyarrow_timestamp_tz_invalid_tz() -> None:
 def test_pyarrow_string_to_iceberg(pyarrow_type: pa.DataType) -> None:
     converted_iceberg_type = visit_pyarrow(pyarrow_type, _ConvertToIceberg())
     assert converted_iceberg_type == StringType()
-    assert visit(converted_iceberg_type, _ConvertToArrowSchema()) == pa.large_string()
+    assert visit(converted_iceberg_type, _ConvertToArrowSchema()) == pa.string()
 
 
 @pytest.mark.parametrize("pyarrow_type", [pa.binary(), pa.large_binary(), pa.binary_view()])
 def test_pyarrow_variable_binary_to_iceberg(pyarrow_type: pa.DataType) -> None:
     converted_iceberg_type = visit_pyarrow(pyarrow_type, _ConvertToIceberg())
     assert converted_iceberg_type == BinaryType()
-    assert visit(converted_iceberg_type, _ConvertToArrowSchema()) == pa.large_binary()
+    assert visit(converted_iceberg_type, _ConvertToArrowSchema()) == pa.binary()
 
 
 def test_pyarrow_struct_to_iceberg() -> None:
diff --git a/tests/test_schema.py b/tests/test_schema.py
index 3ca74c4027..a7fab18478 100644
--- a/tests/test_schema.py
+++ b/tests/test_schema.py
@@ -1648,7 +1648,7 @@ def test_arrow_schema() -> None:
 
     expected_schema = pa.schema(
         [
-            pa.field("foo", pa.large_string(), nullable=False),
+            pa.field("foo", pa.string(), nullable=False),
             pa.field("bar", pa.int32(), nullable=True),
             pa.field("baz", pa.bool_(), nullable=True),
         ]
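Patch 1 above is the core change: `_ConvertToArrowSchema` now maps Iceberg `string`, `binary`, and `list` to the standard 32-bit-offset Arrow types instead of the `large_*` variants, which is what every expected-repr update in the tests reflects. A minimal sketch of the observable effect, assuming a pyiceberg build that includes this patch (`schema_to_pyarrow` is the same public helper the tests exercise):

```python
import pyarrow as pa

from pyiceberg.io.pyarrow import schema_to_pyarrow
from pyiceberg.schema import Schema
from pyiceberg.types import BinaryType, NestedField, StringType

schema = Schema(
    NestedField(field_id=1, name="foo", field_type=StringType(), required=False),
    NestedField(field_id=2, name="blob", field_type=BinaryType(), required=False),
)

arrow_schema = schema_to_pyarrow(schema, include_field_ids=False)
# With this patch applied, the conversion yields the small types:
assert arrow_schema.field("foo").type == pa.string()   # previously pa.large_string()
assert arrow_schema.field("blob").type == pa.binary()  # previously pa.large_binary()
```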
From 4380c3ea8b97176287f133ded470667cbecbc20c Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Thu, 27 Mar 2025 13:22:25 -0700
Subject: [PATCH 2/6] update doc

---
 mkdocs/docs/configuration.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md
index 1e364a11fe..9e1b84a3b3 100644
--- a/mkdocs/docs/configuration.md
+++ b/mkdocs/docs/configuration.md
@@ -199,7 +199,7 @@ PyIceberg uses [S3FileSystem](https://arrow.apache.org/docs/python/generated/pya
 
 | Key                             | Example | Description                                                                                                                                                                                                                                                                                                                                                       |
 | ------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| pyarrow.use-large-types-on-read | True    | Use large PyArrow types i.e. [large_string](https://arrow.apache.org/docs/python/generated/pyarrow.large_string.html), [large_binary](https://arrow.apache.org/docs/python/generated/pyarrow.large_binary.html) and [large_list](https://arrow.apache.org/docs/python/generated/pyarrow.large_list.html) field types on table scans. The default value is True.  |
+| pyarrow.use-large-types-on-read | False   | Use large PyArrow types i.e. [large_string](https://arrow.apache.org/docs/python/generated/pyarrow.large_string.html), [large_binary](https://arrow.apache.org/docs/python/generated/pyarrow.large_binary.html) and [large_list](https://arrow.apache.org/docs/python/generated/pyarrow.large_list.html) field types on table scans. The default value is False. |

From 191ff95287fb1b629f0da201438abcc4a798e116 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Thu, 27 Mar 2025 13:54:01 -0700
Subject: [PATCH 3/6] make PYARROW_USE_LARGE_TYPES_ON_READ work

---
 pyiceberg/io/pyarrow.py         |  2 +-
 tests/integration/test_reads.py | 15 +++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 5f88a37218..18dfd47dc5 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -1612,7 +1612,7 @@ def _table_from_scan_task(task: FileScanTask) -> pa.Table:
                     removed_in="0.11.0",
                     help_message=f"Property `{PYARROW_USE_LARGE_TYPES_ON_READ}` will be removed.",
                 )
-                result = result.cast(arrow_schema)
+                result = result.cast(_pyarrow_schema_ensure_large_types(arrow_schema))
 
         if self._limit is not None:
             return result.slice(0, self._limit)
diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py
index 5ac5162f8e..44718215b8 100644
--- a/tests/integration/test_reads.py
+++ b/tests/integration/test_reads.py
@@ -872,9 +872,12 @@ def test_table_scan_keep_types(catalog: Catalog) -> None:
 
 
 @pytest.mark.integration
+@pytest.mark.filterwarnings(
+    "ignore:Deprecated in 0.10.0, will be removed in 0.11.0. Property `pyarrow.use-large-types-on-read` will be removed.:DeprecationWarning"
+)
 @pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
-def test_table_scan_override_with_small_types(catalog: Catalog) -> None:
-    identifier = "default.test_table_scan_override_with_small_types"
+def test_table_scan_override_with_large_types(catalog: Catalog) -> None:
+    identifier = "default.test_table_scan_override_with_large_types"
     arrow_table = pa.Table.from_arrays(
         [
             pa.array(["a", "b", "c"]),
@@ -900,15 +903,15 @@ def test_table_scan_override_with_small_types(catalog: Catalog) -> None:
     with tbl.update_schema() as update_schema:
         update_schema.update_column("string-to-binary", BinaryType())
 
-    tbl.io.properties[PYARROW_USE_LARGE_TYPES_ON_READ] = "False"
+    tbl.io.properties[PYARROW_USE_LARGE_TYPES_ON_READ] = "True"
     result_table = tbl.scan().to_arrow()
 
     expected_schema = pa.schema(
         [
-            pa.field("string", pa.string()),
+            pa.field("string", pa.large_string()),
             pa.field("string-to-binary", pa.large_binary()),
-            pa.field("binary", pa.binary()),
-            pa.field("list", pa.list_(pa.string())),
+            pa.field("binary", pa.large_binary()),
+            pa.field("list", pa.large_list(pa.large_string())),
         ]
     )
     assert result_table.schema.equals(expected_schema)
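Patch 3 keeps the deprecated `pyarrow.use-large-types-on-read` property working as an explicit opt-in: when it is set, the scan result is cast through `_pyarrow_schema_ensure_large_types` rather than the plain requested schema. A sketch of the opt-in path the renamed integration test exercises; the catalog name and table identifier here are hypothetical:

```python
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")             # hypothetical catalog config
tbl = catalog.load_table("default.my_table")  # hypothetical table

# Scans now default to small Arrow types. Setting the deprecated property
# (the PYARROW_USE_LARGE_TYPES_ON_READ constant in the diff) back to "True"
# forces large_string/large_binary/large_list on the result and emits a
# DeprecationWarning, matching the filterwarnings marker above.
tbl.io.properties["pyarrow.use-large-types-on-read"] = "True"
result = tbl.scan().to_arrow()  # string columns come back as large_string
```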
From 06f785784afb5347b1d6fc185f0a25f84242a405 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Thu, 27 Mar 2025 13:58:43 -0700
Subject: [PATCH 4/6] ensure large type

---
 pyiceberg/io/pyarrow.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 18dfd47dc5..2ec875fef5 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -1718,8 +1718,8 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
             target_schema = schema_to_pyarrow(
                 promote(file_field.field_type, field.field_type), include_field_ids=self._include_field_ids
             )
-            if self._use_large_types is False:
-                target_schema = _pyarrow_schema_ensure_small_types(target_schema)
+            if self._use_large_types is True:
+                target_schema = _pyarrow_schema_ensure_large_types(target_schema)
             return values.cast(target_schema)
         elif (target_type := schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)) != values.type:
             if field.field_type == TimestampType():

From b84d6e46a4681a8a1cf1dd94442e5538eec7b9b2 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Mon, 31 Mar 2025 16:07:43 -0400
Subject: [PATCH 5/6] Update mkdocs/docs/configuration.md

Co-authored-by: Fokko Driesprong
---
 mkdocs/docs/configuration.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md
index 9e1b84a3b3..ce95c091d1 100644
--- a/mkdocs/docs/configuration.md
+++ b/mkdocs/docs/configuration.md
@@ -199,7 +199,7 @@ PyIceberg uses [S3FileSystem](https://arrow.apache.org/docs/python/generated/pya
 
 | Key                             | Example | Description                                                                                                                                                                                                                                                                                                                                                       |
 | ------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| pyarrow.use-large-types-on-read | False   | Use large PyArrow types i.e. [large_string](https://arrow.apache.org/docs/python/generated/pyarrow.large_string.html), [large_binary](https://arrow.apache.org/docs/python/generated/pyarrow.large_binary.html) and [large_list](https://arrow.apache.org/docs/python/generated/pyarrow.large_list.html) field types on table scans. The default value is False. |
+| pyarrow.use-large-types-on-read | False   | Force large PyArrow types i.e. [large_string](https://arrow.apache.org/docs/python/generated/pyarrow.large_string.html), [large_binary](https://arrow.apache.org/docs/python/generated/pyarrow.large_binary.html) and [large_list](https://arrow.apache.org/docs/python/generated/pyarrow.large_list.html) field types on table scans. The default value is False. |
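Patch 4 makes type promotion honor the same opt-in: the promoted target schema is widened to large types only when `_use_large_types` is explicitly `True`, instead of being shrunk when it is `False`. Conceptually, the ensure-large-types step just widens offset widths; the sketch below is an illustrative re-implementation (`ensure_large` is a made-up name — the real `_pyarrow_schema_ensure_large_types` in `pyiceberg.io.pyarrow` is visitor-based and also covers maps and structs):

```python
import pyarrow as pa

def ensure_large(t: pa.DataType) -> pa.DataType:
    # Widen 32-bit-offset types to their 64-bit "large" counterparts.
    if pa.types.is_string(t):
        return pa.large_string()
    if pa.types.is_binary(t):
        return pa.large_binary()
    if pa.types.is_list(t):
        return pa.large_list(ensure_large(t.value_type))
    return t

schema = pa.schema([("s", pa.string()), ("l", pa.list_(pa.string()))])
large = pa.schema([(f.name, ensure_large(f.type)) for f in schema])
assert large.field("s").type == pa.large_string()
assert large.field("l").type == pa.large_list(pa.large_string())
```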
From 79a80c25111223c37c7cd8502f6de59e0f5e5542 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Fri, 25 Apr 2025 09:20:45 -0700
Subject: [PATCH 6/6] a few more

---
 mkdocs/docs/api.md                          | 16 ++++++++--------
 tests/catalog/test_sql.py                   | 10 +++++-----
 tests/conftest.py                           |  6 +++---
 .../test_writes/test_partitioned_writes.py  |  2 +-
 4 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md
index d84c82ec2a..9fb5947059 100644
--- a/mkdocs/docs/api.md
+++ b/mkdocs/docs/api.md
@@ -418,7 +418,7 @@ This produces the following result with `tbl.scan().to_arrow()`:
 
 ```python
 pyarrow.Table
-city: large_string
+city: string
 lat: double
 long: double
 ----
@@ -476,7 +476,7 @@ This produces the following result with `tbl.scan().to_arrow()`:
 
 ```python
 pyarrow.Table
-city: large_string
+city: string
 lat: double
 long: double
 ----
@@ -957,14 +957,14 @@ split_offsets: list<item: int64>
 equality_ids: list<item: int32>
   child 0, item: int32
 sort_order_id: int32
-readable_metrics: struct<city: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: large_string, upper_bound: large_string> not null, lat: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null, long: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null>
+readable_metrics: struct<city: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: string, upper_bound: string> not null, lat: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null, long: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null>
   child 0, city: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: large_string, upper_bound: large_string> not null
       child 0, column_size: int64
       child 1, value_count: int64
       child 2, null_value_count: int64
       child 3, nan_value_count: int64
-      child 4, lower_bound: large_string
-      child 5, upper_bound: large_string
+      child 4, lower_bound: string
+      child 5, upper_bound: string
   child 1, lat: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null
       child 0, column_size: int64
       child 1, value_count: int64
       child 2, null_value_count: int64
       child 3, nan_value_count: int64
@@ -998,19 +998,19 @@ equality_ids:[[[],[]]]
 sort_order_id:[[[],[]]]
 readable_metrics: [
   -- is_valid: all not null
-  -- child 0 type: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: large_string, upper_bound: large_string>
+  -- child 0 type: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: string, upper_bound: string>
     -- is_valid: all not null
     -- child 0 type: int64
 [140]
     -- child 1 type: int64
 [4]
     -- child 2 type: int64
 [0]
     -- child 3 type: int64
 [null]
-    -- child 4 type: large_string
+    -- child 4 type: string
 ["Amsterdam"]
-    -- child 5 type: large_string
+    -- child 5 type: string
 ["San Francisco"]
   -- child 1 type: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double>
     -- is_valid: all not null
diff --git a/tests/catalog/test_sql.py b/tests/catalog/test_sql.py
index 8c3047b2ca..3482302c90 100644
--- a/tests/catalog/test_sql.py
+++ b/tests/catalog/test_sql.py
@@ -404,7 +404,7 @@ def test_write_pyarrow_schema(catalog: SqlCatalog, table_identifier: Identifier)
         ],
         schema=pa.schema(
             [
-                pa.field("foo", pa.large_string(), nullable=True),
+                pa.field("foo", pa.string(), nullable=True),
                 pa.field("bar", pa.int32(), nullable=False),
                 pa.field("baz", pa.bool_(), nullable=True),
                 pa.field("large", pa.large_string(), nullable=True),
@@ -1462,7 +1462,7 @@ def test_write_and_evolve(catalog: SqlCatalog, format_version: int) -> None:
         {
             "foo": ["a", None, "z"],
         },
-        schema=pa.schema([pa.field("foo", pa.large_string(), nullable=True)]),
+        schema=pa.schema([pa.field("foo", pa.string(), nullable=True)]),
     )
 
     tbl = catalog.create_table(identifier=identifier, schema=pa_table.schema, properties={"format-version": str(format_version)})
@@ -1474,7 +1474,7 @@ def test_write_and_evolve(catalog: SqlCatalog, format_version: int) -> None:
         },
         schema=pa.schema(
             [
-                pa.field("foo", pa.large_string(), nullable=True),
+                pa.field("foo", pa.string(), nullable=True),
                 pa.field("bar", pa.int32(), nullable=True),
             ]
         ),
@@ -1514,7 +1514,7 @@ def test_create_table_transaction(catalog: SqlCatalog, format_version: int) -> N
         {
             "foo": ["a", None, "z"],
         },
-        schema=pa.schema([pa.field("foo", pa.large_string(), nullable=True)]),
+        schema=pa.schema([pa.field("foo", pa.string(), nullable=True)]),
     )
 
     pa_table_with_column = pa.Table.from_pydict(
@@ -1524,7 +1524,7 @@ def test_create_table_transaction(catalog: SqlCatalog, format_version: int) -> N
         },
         schema=pa.schema(
             [
-                pa.field("foo", pa.large_string(), nullable=True),
+                pa.field("foo", pa.string(), nullable=True),
                 pa.field("bar", pa.int32(), nullable=True),
             ]
         ),
diff --git a/tests/conftest.py b/tests/conftest.py
index 09f3a15d56..b142612553 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2510,8 +2510,8 @@ def pa_schema() -> "pa.Schema":
     return pa.schema(
         [
             ("bool", pa.bool_()),
-            ("string", pa.large_string()),
-            ("string_long", pa.large_string()),
+            ("string", pa.string()),
+            ("string_long", pa.string()),
             ("int", pa.int32()),
             ("long", pa.int64()),
             ("float", pa.float32()),
@@ -2525,7 +2525,7 @@ def pa_schema() -> "pa.Schema":
             # ("time", pa.time64("us")),
             # Not natively supported by Arrow
             # ("uuid", pa.fixed(16)),
-            ("binary", pa.large_binary()),
+            ("binary", pa.binary()),
             ("fixed", pa.binary(16)),
         ]
     )
diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py
index a299036e6b..268591ab9d 100644
--- a/tests/integration/test_writes/test_partitioned_writes.py
+++ b/tests/integration/test_writes/test_partitioned_writes.py
@@ -891,7 +891,7 @@ def test_unsupported_transform(
 
     with pytest.raises(
         ValueError,
-        match="FeatureUnsupported => Unsupported data type for truncate transform: LargeBinary",
+        match="FeatureUnsupported => Unsupported data type for truncate transform: Binary",
     ):
         tbl.append(arrow_table_with_null)
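A closing note on why flipping the default is safe for typical workloads: the small and large Arrow types describe the same values and differ only in offset width (32-bit vs. 64-bit), so converting between them is lossless as long as each array stays under the 2 GB offset limit — for example:

```python
import pyarrow as pa

small = pa.array(["hello", "world"], type=pa.string())
large = small.cast(pa.large_string())

assert small.to_pylist() == large.to_pylist()  # identical values either way
assert large.type == pa.large_string()
```

Tables that genuinely need 64-bit offsets can keep opting in through `pyarrow.use-large-types-on-read` until the property is removed in 0.11.0.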