Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(python): Fix Array data type initialization #11907

Merged: 9 commits merged on Oct 21, 2023
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion docs/src/python/user-guide/expressions/lists.py
Original file line number Diff line number Diff line change
@@ -97,7 +97,10 @@
pl.Series("Array_1", [[1, 3], [2, 5]]),
pl.Series("Array_2", [[1, 7, 3], [8, 1, 0]]),
],
schema={"Array_1": pl.Array(2, pl.Int64), "Array_2": pl.Array(3, pl.Int64)},
schema={
"Array_1": pl.Array(inner=pl.Int64, width=2),
"Array_2": pl.Array(inner=pl.Int64, width=3),
},
)
print(array_df)
# --8<-- [end:array_df]
53 changes: 42 additions & 11 deletions py-polars/polars/datatypes/classes.py
Original file line number Diff line number Diff line change
@@ -456,18 +456,18 @@ class Unknown(DataType):


class List(NestedType):
"""Nested list/array type with variable length of inner lists."""
"""Variable length list type."""

inner: PolarsDataType | None = None

def __init__(self, inner: PolarsDataType | PythonDataType):
"""
Nested list/array type with variable length of inner lists.
Variable length list type.

Parameters
----------
inner
The `DataType` of values within the list
The ``DataType`` of the values within each list.

Examples
--------
@@ -518,26 +518,31 @@ def __repr__(self) -> str:


class Array(NestedType):
"""Nested list/array type with fixed length of inner arrays."""
"""Fixed length list type."""

inner: PolarsDataType | None = None
width: int

def __init__(self, width: int, inner: PolarsDataType | PythonDataType = Null):
def __init__( # noqa: D417
self,
*args: Any,
width: int | None = None,
inner: PolarsDataType | PythonDataType | None = None,
):
"""
Nested list/array type with fixed length of inner arrays.
Fixed length list type.

Parameters
----------
width
The fixed size length of the inner arrays.
The length of the arrays.
inner
The `DataType` of values within the inner arrays
The ``DataType`` of the values within each array.

Examples
--------
>>> s = pl.Series(
... "a", [[1, 2], [4, 3]], dtype=pl.Array(width=2, inner=pl.Int64)
... "a", [[1, 2], [4, 3]], dtype=pl.Array(inner=pl.Int64, width=2)
... )
>>> s
shape: (2,)
@@ -548,6 +553,32 @@ def __init__(self, width: int, inner: PolarsDataType | PythonDataType = Null):
]

"""
from polars.utils.deprecation import issue_deprecation_warning

if args:
# TODO: When removing this deprecation, update the `to_object`
# implementation in py-polars/src/conversion.rs to use `call1` instead of
# `call`
issue_deprecation_warning(
"Parameters `inner` and `width` will change positions in the next breaking release."
" Use keyword arguments to keep current behavior and silence this warning.",
version="0.19.11",
)
if len(args) == 1:
width = args[0]
else:
width, inner = args[:2]
if width is None:
raise TypeError("`width` must be specified when initializing an `Array`")

if inner is None:
issue_deprecation_warning(
"The default value for the `inner` parameter of `Array` will be removed in the next breaking release."
" Pass `inner=pl.Null`to keep current behavior and silence this warning.",
version="0.19.11",
)
inner = Null

self.width = width
self.inner = polars.datatypes.py_type_to_dtype(inner)

@@ -570,11 +601,11 @@ def __eq__(self, other: PolarsDataType) -> bool: # type: ignore[override]
return False

def __hash__(self) -> int:
return hash((self.__class__, self.inner))
return hash((self.__class__, self.inner, self.width))

def __repr__(self) -> str:
class_name = self.__class__.__name__
return f"{class_name}({self.inner!r})"
return f"{class_name}({self.inner!r}, {self.width})"


class Field:
3 changes: 2 additions & 1 deletion py-polars/polars/datatypes/constants.py
Original file line number Diff line number Diff line change
@@ -3,6 +3,7 @@
from typing import TYPE_CHECKING

from polars.datatypes import (
Array,
DataTypeGroup,
Date,
Datetime,
@@ -75,7 +76,7 @@
FLOAT_DTYPES | INTEGER_DTYPES | frozenset([Decimal])
)

NESTED_DTYPES: frozenset[PolarsDataType] = DataTypeGroup([List, Struct])
NESTED_DTYPES: frozenset[PolarsDataType] = DataTypeGroup([List, Struct, Array])

# number of rows to scan by default when inferring datatypes
N_INFER_DEFAULT = 100
3 changes: 2 additions & 1 deletion py-polars/polars/datatypes/convert.py
Original file line number Diff line number Diff line change
@@ -21,6 +21,7 @@
)

from polars.datatypes import (
Array,
Binary,
Boolean,
Categorical,
@@ -203,7 +204,7 @@ def unpack_dtypes(

unpacked: set[PolarsDataType] = set()
for tp in dtypes:
if isinstance(tp, List):
if isinstance(tp, (List, Array)):
if include_compound:
unpacked.add(tp)
unpacked.update(unpack_dtypes(tp.inner, include_compound=include_compound))
8 changes: 4 additions & 4 deletions py-polars/polars/expr/array.py
Original file line number Diff line number Diff line change
@@ -24,7 +24,7 @@ def min(self) -> Expr:
--------
>>> df = pl.DataFrame(
... data={"a": [[1, 2], [4, 3]]},
... schema={"a": pl.Array(width=2, inner=pl.Int64)},
... schema={"a": pl.Array(inner=pl.Int64, width=2)},
... )
>>> df.select(pl.col("a").arr.min())
shape: (2, 1)
@@ -48,7 +48,7 @@ def max(self) -> Expr:
--------
>>> df = pl.DataFrame(
... data={"a": [[1, 2], [4, 3]]},
... schema={"a": pl.Array(width=2, inner=pl.Int64)},
... schema={"a": pl.Array(inner=pl.Int64, width=2)},
... )
>>> df.select(pl.col("a").arr.max())
shape: (2, 1)
@@ -72,7 +72,7 @@ def sum(self) -> Expr:
--------
>>> df = pl.DataFrame(
... data={"a": [[1, 2], [4, 3]]},
... schema={"a": pl.Array(width=2, inner=pl.Int64)},
... schema={"a": pl.Array(inner=pl.Int64, width=2)},
... )
>>> df.select(pl.col("a").arr.sum())
shape: (2, 1)
@@ -103,7 +103,7 @@ def unique(self, *, maintain_order: bool = False) -> Expr:
... {
... "a": [[1, 1, 2]],
... },
... schema_overrides={"a": pl.Array(width=3, inner=pl.Int64)},
... schema_overrides={"a": pl.Array(inner=pl.Int64, width=3)},
... )
>>> df.select(pl.col("a").arr.unique())
shape: (1, 1)
8 changes: 4 additions & 4 deletions py-polars/polars/series/array.py
Original file line number Diff line number Diff line change
@@ -25,7 +25,7 @@ def min(self) -> Series:
Examples
--------
>>> s = pl.Series(
... "a", [[1, 2], [4, 3]], dtype=pl.Array(width=2, inner=pl.Int64)
... "a", [[1, 2], [4, 3]], dtype=pl.Array(inner=pl.Int64, width=2)
... )
>>> s.arr.min()
shape: (2,)
@@ -44,7 +44,7 @@ def max(self) -> Series:
Examples
--------
>>> s = pl.Series(
... "a", [[1, 2], [4, 3]], dtype=pl.Array(width=2, inner=pl.Int64)
... "a", [[1, 2], [4, 3]], dtype=pl.Array(inner=pl.Int64, width=2)
... )
>>> s.arr.max()
shape: (2,)
@@ -64,7 +64,7 @@ def sum(self) -> Series:
--------
>>> df = pl.DataFrame(
... data={"a": [[1, 2], [4, 3]]},
... schema={"a": pl.Array(width=2, inner=pl.Int64)},
... schema={"a": pl.Array(inner=pl.Int64, width=2)},
... )
>>> df.select(pl.col("a").arr.sum())
shape: (2, 1)
@@ -94,7 +94,7 @@ def unique(self, *, maintain_order: bool = False) -> Series:
... {
... "a": [[1, 1, 2]],
... },
... schema_overrides={"a": pl.Array(width=3, inner=pl.Int64)},
... schema_overrides={"a": pl.Array(inner=pl.Int64, width=3)},
... )
>>> df.select(pl.col("a").arr.unique())
shape: (1, 1)
5 changes: 4 additions & 1 deletion py-polars/src/conversion.rs
Original file line number Diff line number Diff line change
@@ -312,7 +312,10 @@ impl ToPyObject for Wrap<DataType> {
DataType::Array(inner, size) => {
let inner = Wrap(*inner.clone()).to_object(py);
let list_class = pl.getattr(intern!(py, "Array")).unwrap();
list_class.call1((*size, inner)).unwrap().into()
let kwargs = PyDict::new(py);
kwargs.set_item("inner", inner).unwrap();
kwargs.set_item("width", size).unwrap();
list_class.call((), Some(kwargs)).unwrap().into()
},
DataType::List(inner) => {
let inner = Wrap(*inner.clone()).to_object(py);
37 changes: 25 additions & 12 deletions py-polars/tests/unit/datatypes/test_array.py
Original file line number Diff line number Diff line change
@@ -9,7 +9,7 @@ def test_cast_list_array() -> None:
payload = [[1, 2, 3], [4, 2, 3]]
s = pl.Series(payload)

dtype = pl.Array(width=3, inner=pl.Int64)
dtype = pl.Array(inner=pl.Int64, width=3)
out = s.cast(dtype)
assert out.dtype == dtype
assert out.to_list() == payload
@@ -20,19 +20,19 @@ def test_cast_list_array() -> None:
pl.ComputeError,
match=r"incompatible offsets in source list",
):
s.cast(pl.Array(width=2, inner=pl.Int64))
s.cast(pl.Array(inner=pl.Int64, width=2))


def test_array_construction() -> None:
payload = [[1, 2, 3], [4, 2, 3]]

dtype = pl.Array(width=3, inner=pl.Int64)
dtype = pl.Array(inner=pl.Int64, width=3)
s = pl.Series(payload, dtype=dtype)
assert s.dtype == dtype
assert s.to_list() == payload

# inner type
dtype = pl.Array(2, pl.UInt8)
dtype = pl.Array(inner=pl.UInt8, width=2)
payload = [[1, 2], [3, 4]]
s = pl.Series(payload, dtype=dtype)
assert s.dtype == dtype
@@ -41,13 +41,13 @@ def test_array_construction() -> None:
# create using schema
df = pl.DataFrame(
schema={
"a": pl.Array(width=3, inner=pl.Float32),
"b": pl.Array(width=5, inner=pl.Datetime("ms")),
"a": pl.Array(inner=pl.Float32, width=3),
"b": pl.Array(inner=pl.Datetime("ms"), width=5),
}
)
assert df.dtypes == [
pl.Array(width=3, inner=pl.Float32),
pl.Array(width=5, inner=pl.Datetime("ms")),
pl.Array(inner=pl.Float32, width=3),
pl.Array(inner=pl.Datetime("ms"), width=5),
]
assert df.rows() == []

@@ -56,7 +56,9 @@ def test_array_in_group_by() -> None:
df = pl.DataFrame(
[
pl.Series("id", [1, 2]),
pl.Series("list", [[1, 2], [5, 5]], dtype=pl.Array(2, pl.UInt8)),
pl.Series(
"list", [[1, 2], [5, 5]], dtype=pl.Array(inner=pl.UInt8, width=2)
),
]
)

@@ -83,7 +85,7 @@ def test_array_in_group_by() -> None:
def test_array_invalid_operation() -> None:
s = pl.Series(
[[1, 2], [8, 9]],
dtype=pl.Array(width=2, inner=pl.Int32),
dtype=pl.Array(inner=pl.Int32, width=2),
)
with pytest.raises(
InvalidOperationError,
@@ -94,11 +96,22 @@ def test_array_invalid_operation() -> None:

def test_array_concat() -> None:
a_df = pl.DataFrame({"a": [[0, 1], [1, 0]]}).select(
pl.col("a").cast(pl.Array(width=2, inner=pl.Int32))
pl.col("a").cast(pl.Array(inner=pl.Int32, width=2))
)
b_df = pl.DataFrame({"a": [[1, 1], [0, 0]]}).select(
pl.col("a").cast(pl.Array(width=2, inner=pl.Int32))
pl.col("a").cast(pl.Array(inner=pl.Int32, width=2))
)
assert pl.concat([a_df, b_df]).to_dict(False) == {
"a": [[0, 1], [1, 0], [1, 1], [0, 0]]
}


def test_array_init_deprecation() -> None:
    """Check that each legacy way of constructing ``pl.Array`` emits a deprecation warning."""
    legacy_constructions = (
        lambda: pl.Array(2),  # positional width, default inner
        lambda: pl.Array(2, pl.Utf8),  # positional width and inner
        lambda: pl.Array(2, inner=pl.Utf8),  # positional width, keyword inner
        lambda: pl.Array(width=2),  # keyword width, default inner
    )
    for construct in legacy_constructions:
        # Each construction is checked on its own so one warning cannot mask another.
        with pytest.deprecated_call():
            construct()
4 changes: 2 additions & 2 deletions py-polars/tests/unit/datatypes/test_struct.py
Original file line number Diff line number Diff line change
@@ -645,8 +645,8 @@ def test_empty_struct() -> None:
pl.List,
pl.List(pl.Null),
pl.List(pl.Utf8),
pl.Array(32),
pl.Array(16, inner=pl.UInt8),
pl.Array(inner=pl.Null, width=32),
pl.Array(inner=pl.UInt8, width=16),
pl.Struct,
pl.Struct([pl.Field("", pl.Null)]),
pl.Struct([pl.Field("x", pl.UInt32), pl.Field("y", pl.Float64)]),
8 changes: 4 additions & 4 deletions py-polars/tests/unit/namespaces/test_array.py
Original file line number Diff line number Diff line change
@@ -5,19 +5,19 @@


def test_arr_min_max() -> None:
s = pl.Series("a", [[1, 2], [4, 3]], dtype=pl.Array(width=2, inner=pl.Int64))
s = pl.Series("a", [[1, 2], [4, 3]], dtype=pl.Array(inner=pl.Int64, width=2))
assert s.arr.max().to_list() == [2, 4]
assert s.arr.min().to_list() == [1, 3]


def test_arr_sum() -> None:
s = pl.Series("a", [[1, 2], [4, 3]], dtype=pl.Array(width=2, inner=pl.Int64))
s = pl.Series("a", [[1, 2], [4, 3]], dtype=pl.Array(inner=pl.Int64, width=2))
assert s.arr.sum().to_list() == [3, 7]


def test_arr_unique() -> None:
df = pl.DataFrame(
{"a": pl.Series("a", [[1, 1], [4, 3]], dtype=pl.Array(width=2, inner=pl.Int64))}
{"a": pl.Series("a", [[1, 1], [4, 3]], dtype=pl.Array(inner=pl.Int64, width=2))}
)

out = df.select(pl.col("a").arr.unique(maintain_order=True))
@@ -26,5 +26,5 @@ def test_arr_unique() -> None:


def test_array_to_numpy() -> None:
s = pl.Series([[1, 2], [3, 4], [5, 6]], dtype=pl.Array(width=2, inner=pl.Int64))
s = pl.Series([[1, 2], [3, 4], [5, 6]], dtype=pl.Array(inner=pl.Int64, width=2))
assert (s.to_numpy() == np.array([[1, 2], [3, 4], [5, 6]])).all()
2 changes: 1 addition & 1 deletion py-polars/tests/unit/operations/test_explode.py
Original file line number Diff line number Diff line change
@@ -309,7 +309,7 @@ def test_explode_inner_null() -> None:
def test_explode_array() -> None:
df = pl.LazyFrame(
{"a": [[1, 2], [2, 3]], "b": [1, 2]},
schema_overrides={"a": pl.Array(2, inner=pl.Int64)},
schema_overrides={"a": pl.Array(inner=pl.Int64, width=2)},
)
expected = pl.DataFrame({"a": [1, 2, 2, 3], "b": [1, 1, 2, 2]})
for ex in ("a", ~cs.integer()):