From e1bba0704b58bd85ff40523994d314e9dbf68c18 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Thu, 28 Mar 2024 16:11:03 +0800 Subject: [PATCH] feat: Implements `arr.n_unique` (#15296) --- .../src/chunked_array/array/iterator.rs | 10 ++++++++ .../src/chunked_array/array/namespace.rs | 8 ++++++ crates/polars-plan/src/dsl/array.rs | 5 ++++ .../src/dsl/function_expr/array.rs | 8 ++++++ .../source/reference/expressions/array.rst | 1 + .../docs/source/reference/series/array.rst | 1 + py-polars/polars/expr/array.py | 25 +++++++++++++++++++ py-polars/polars/series/array.py | 16 ++++++++++++ py-polars/src/expr/array.rs | 4 +++ .../tests/unit/namespaces/array/test_array.py | 15 +++++++++++ 10 files changed, 93 insertions(+) diff --git a/crates/polars-core/src/chunked_array/array/iterator.rs b/crates/polars-core/src/chunked_array/array/iterator.rs index a9ecbf43ffb8..589b8996dc62 100644 --- a/crates/polars-core/src/chunked_array/array/iterator.rs +++ b/crates/polars-core/src/chunked_array/array/iterator.rs @@ -161,6 +161,16 @@ impl ArrayChunked { self.amortized_iter().map(f).collect_ca(self.name()) } + /// Try apply a closure `F` elementwise. 
+ pub fn try_apply_amortized_generic<'a, F, K, V>(&'a self, f: F) -> PolarsResult> + where + V: PolarsDataType, + F: FnMut(Option>) -> PolarsResult> + Copy, + V::Array: ArrayFromIter>, + { + self.amortized_iter().map(f).try_collect_ca(self.name()) + } + pub fn for_each_amortized<'a, F>(&'a self, f: F) where F: FnMut(Option>), diff --git a/crates/polars-ops/src/chunked_array/array/namespace.rs b/crates/polars-ops/src/chunked_array/array/namespace.rs index 49c30cd00e0a..42e402f25066 100644 --- a/crates/polars-ops/src/chunked_array/array/namespace.rs +++ b/crates/polars-ops/src/chunked_array/array/namespace.rs @@ -76,6 +76,14 @@ pub trait ArrayNameSpace: AsArray { ca.try_apply_amortized_to_list(|s| s.as_ref().unique_stable()) } + fn array_n_unique(&self) -> PolarsResult { + let ca = self.as_array(); + ca.try_apply_amortized_generic(|opt_s| { + let opt_v = opt_s.map(|s| s.as_ref().n_unique()).transpose()?; + Ok(opt_v.map(|idx| idx as IdxSize)) + }) + } + #[cfg(feature = "array_any_all")] fn array_any(&self) -> PolarsResult { let ca = self.as_array(); diff --git a/crates/polars-plan/src/dsl/array.rs b/crates/polars-plan/src/dsl/array.rs index b00347ba8007..e8d66b9c71a1 100644 --- a/crates/polars-plan/src/dsl/array.rs +++ b/crates/polars-plan/src/dsl/array.rs @@ -59,6 +59,11 @@ impl ArrayNameSpace { .map_private(FunctionExpr::ArrayExpr(ArrayFunction::Unique(true))) } + pub fn n_unique(self) -> Expr { + self.0 + .map_private(FunctionExpr::ArrayExpr(ArrayFunction::NUnique)) + } + /// Cast the Array column to List column with the same inner data type. 
pub fn to_list(self) -> Expr { self.0 diff --git a/crates/polars-plan/src/dsl/function_expr/array.rs b/crates/polars-plan/src/dsl/function_expr/array.rs index 77b8ac2f68e3..a731a8e0c70a 100644 --- a/crates/polars-plan/src/dsl/function_expr/array.rs +++ b/crates/polars-plan/src/dsl/function_expr/array.rs @@ -11,6 +11,7 @@ pub enum ArrayFunction { Sum, ToList, Unique(bool), + NUnique, Std(u8), Var(u8), Median, @@ -39,6 +40,7 @@ impl ArrayFunction { Sum => mapper.nested_sum_type(), ToList => mapper.try_map_dtype(map_array_dtype_to_list_dtype), Unique(_) => mapper.try_map_dtype(map_array_dtype_to_list_dtype), + NUnique => mapper.with_dtype(IDX_DTYPE), Std(_) => mapper.map_to_float_dtype(), Var(_) => mapper.map_to_float_dtype(), Median => mapper.map_to_float_dtype(), @@ -75,6 +77,7 @@ impl Display for ArrayFunction { Sum => "sum", ToList => "to_list", Unique(_) => "unique", + NUnique => "n_unique", Std(_) => "std", Var(_) => "var", Median => "median", @@ -107,6 +110,7 @@ impl From for SpecialEq> { Sum => map!(sum), ToList => map!(to_list), Unique(stable) => map!(unique, stable), + NUnique => map!(n_unique), Std(ddof) => map!(std, ddof), Var(ddof) => map!(var, ddof), Median => map!(median), @@ -162,6 +166,10 @@ pub(super) fn unique(s: &Series, stable: bool) -> PolarsResult { out.map(|ca| ca.into_series()) } +pub(super) fn n_unique(s: &Series) -> PolarsResult { + Ok(s.array()?.array_n_unique()?.into_series()) +} + pub(super) fn to_list(s: &Series) -> PolarsResult { let list_dtype = map_array_dtype_to_list_dtype(s.dtype())?; s.cast(&list_dtype) diff --git a/py-polars/docs/source/reference/expressions/array.rst b/py-polars/docs/source/reference/expressions/array.rst index dd3d7be45d98..f25f2a30bbfd 100644 --- a/py-polars/docs/source/reference/expressions/array.rst +++ b/py-polars/docs/source/reference/expressions/array.rst @@ -16,6 +16,7 @@ The following methods are available under the `expr.arr` attribute. 
Expr.arr.std Expr.arr.to_list Expr.arr.unique + Expr.arr.n_unique Expr.arr.var Expr.arr.all Expr.arr.any diff --git a/py-polars/docs/source/reference/series/array.rst b/py-polars/docs/source/reference/series/array.rst index 13f2da759833..28976e1cab7d 100644 --- a/py-polars/docs/source/reference/series/array.rst +++ b/py-polars/docs/source/reference/series/array.rst @@ -16,6 +16,7 @@ The following methods are available under the `Series.arr` attribute. Series.arr.std Series.arr.to_list Series.arr.unique + Series.arr.n_unique Series.arr.var Series.arr.all Series.arr.any diff --git a/py-polars/polars/expr/array.py b/py-polars/polars/expr/array.py index 6972d5e1f062..1b19275fa961 100644 --- a/py-polars/polars/expr/array.py +++ b/py-polars/polars/expr/array.py @@ -187,6 +187,31 @@ def unique(self, *, maintain_order: bool = False) -> Expr: """ return wrap_expr(self._pyexpr.arr_unique(maintain_order)) + def n_unique(self) -> Expr: + """ + Count the number of unique values in every sub-array. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [[1, 1, 2], [2, 3, 4]], + ... }, + ... schema={"a": pl.Array(pl.Int64, 3)}, + ... ) + >>> df.with_columns(n_unique=pl.col("a").arr.n_unique()) + shape: (2, 2) + ┌───────────────┬──────────┐ + │ a ┆ n_unique │ + │ --- ┆ --- │ + │ array[i64, 3] ┆ u32 │ + ╞═══════════════╪══════════╡ + │ [1, 1, 2] ┆ 2 │ + │ [2, 3, 4] ┆ 3 │ + └───────────────┴──────────┘ + """ + return wrap_expr(self._pyexpr.arr_n_unique()) + def to_list(self) -> Expr: """ Convert an Array column into a List column with the same inner data type. diff --git a/py-polars/polars/series/array.py b/py-polars/polars/series/array.py index 04c88f701575..793ae0507404 100644 --- a/py-polars/polars/series/array.py +++ b/py-polars/polars/series/array.py @@ -153,6 +153,22 @@ def unique(self, *, maintain_order: bool = False) -> Series: └───────────┘ """ + def n_unique(self) -> Series: + """ + Count the number of unique values in every sub-array. 
+ + Examples + -------- + >>> s = pl.Series("a", [[1, 2], [4, 4]], dtype=pl.Array(pl.Int64, 2)) + >>> s.arr.n_unique() + shape: (2,) + Series: 'a' [u32] + [ + 2 + 1 + ] + """ + def to_list(self) -> Series: """ Convert an Array column into a List column with the same inner data type. diff --git a/py-polars/src/expr/array.rs b/py-polars/src/expr/array.rs index 5b0cb2bf365b..fdeb99e39384 100644 --- a/py-polars/src/expr/array.rs +++ b/py-polars/src/expr/array.rs @@ -40,6 +40,10 @@ impl PyExpr { } } + fn arr_n_unique(&self) -> Self { + self.inner.clone().arr().n_unique().into() + } + fn arr_to_list(&self) -> Self { self.inner.clone().arr().to_list().into() } diff --git a/py-polars/tests/unit/namespaces/array/test_array.py b/py-polars/tests/unit/namespaces/array/test_array.py index 48e3133d37e4..b3f907452150 100644 --- a/py-polars/tests/unit/namespaces/array/test_array.py +++ b/py-polars/tests/unit/namespaces/array/test_array.py @@ -382,3 +382,18 @@ def test_array_shift() -> None: schema={"lit": pl.Array(pl.Int64, 3), "expr": pl.Array(pl.Int64, 3)}, ) assert_frame_equal(out, expected) + + +def test_array_n_unique() -> None: + df = pl.DataFrame( + { + "a": [[1, 1, 2], [3, 3, 3], [None, None, None], None], + }, + schema={"a": pl.Array(pl.Int64, 3)}, + ) + + out = df.select(n_unique=pl.col("a").arr.n_unique()) + expected = pl.DataFrame( + {"n_unique": [2, 1, 1, None]}, schema={"n_unique": pl.UInt32} + ) + assert_frame_equal(out, expected)