From 7a71d7bd54ecce45a0e2b534f5bfbb24e0b7c1c8 Mon Sep 17 00:00:00 2001
From: Itamar Turner-Trauring
Date: Wed, 20 Mar 2024 14:26:11 -0400
Subject: [PATCH] Focus the user-defined-functions page on Python only.

---
NOTE(review): this copy differs from the commit in one place. In the code
added to docs/src/rust/user-guide/expressions/structs.rs the second argument
of `count_matches` is `false` instead of `true`. That argument is the
`literal` flag: with `true` the pattern "." is matched as a literal dot, so
`solution_expr` would not equal `solution_map_elements` (len(keys) + values)
for these keys. The lines removed from user-defined-functions.rs are left
exactly as they were so that hunk still matches the pre-image. The `index`
post-image hash for structs.rs predates this change.

 .../python/user-guide/expressions/structs.py  |  3 ++
 .../expressions/user-defined-functions.py     | 25 ++++++++--
 .../rust/user-guide/expressions/structs.rs    | 49 +++++++++++++++++++
 .../expressions/user-defined-functions.rs     | 41 ----------------
 docs/user-guide/expressions/structs.md        |  6 ++-
 5 files changed, 77 insertions(+), 47 deletions(-)

diff --git a/docs/src/python/user-guide/expressions/structs.py b/docs/src/python/user-guide/expressions/structs.py
index ee034a362bc6..3c0c61055166 100644
--- a/docs/src/python/user-guide/expressions/structs.py
+++ b/docs/src/python/user-guide/expressions/structs.py
@@ -64,3 +64,6 @@
 ).filter(pl.struct("Movie", "Theatre").is_duplicated())
 print(out)
 # --8<-- [end:struct_ranking]
+
+# --8<-- [start:multi_column_apply]
+# --8<-- [end:multi_column_apply]
diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py
index f6a4ec98326b..2c3e1f06ac52 100644
--- a/docs/src/python/user-guide/expressions/user-defined-functions.py
+++ b/docs/src/python/user-guide/expressions/user-defined-functions.py
@@ -94,12 +94,27 @@ def diff_from_mean_numba(arr, result):
 
 # --8<-- [end:missing_data]
 
+
 # --8<-- [start:combine]
-out = df.select(
-    pl.struct(["keys", "values"])
-    .map_elements(lambda x: len(x["keys"]) + x["values"])
-    .alias("solution_map_elements"),
-    (pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"),
+# Add two arrays together:
+@guvectorize([(int64[:], int64[:], float64[:])], "(n),(n)->(n)")
+def add(arr, arr2, result):
+    for i in range(len(arr)):
+        result[i] = arr[i] + arr2[i]
+
+
+df3 = pl.DataFrame({"values1": [1, 2, 3], "values2": [10, 20, 30]})
+
+out = df3.select(
+    # Create a struct that has two columns in it:
+    pl.struct(["values1", "values2"])
+    # Pass the struct to a lambda that then passes the individual columns to
+    # the add() function:
+    .map_batches(
+        lambda combined: add(
+            combined.struct.field("values1"), combined.struct.field("values2")
+        )
+    ).alias("add_columns")
 )
 print(out)
 # --8<-- [end:combine]
diff --git a/docs/src/rust/user-guide/expressions/structs.rs b/docs/src/rust/user-guide/expressions/structs.rs
index 01c08eaf3d7f..d882830106a5 100644
--- a/docs/src/rust/user-guide/expressions/structs.rs
+++ b/docs/src/rust/user-guide/expressions/structs.rs
@@ -95,5 +95,54 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     println!("{}", &out);
     // --8<-- [end:struct_ranking]
 
+    // --8<-- [start:multi_column_apply]
+    let df = df!(
+        "keys" => &["a", "a", "b"],
+        "values" => &[10, 7, 1],
+    )?;
+
+    let out = df
+        .lazy()
+        .select([
+            // pack to struct to get access to multiple fields in a custom `apply/map`
+            as_struct(vec![col("keys"), col("values")])
+                // we will compute the len(a) + b
+                .apply(
+                    |s| {
+                        // downcast to struct
+                        let ca = s.struct_()?;
+
+                        // get the fields as Series
+                        let s_a = &ca.fields()[0];
+                        let s_b = &ca.fields()[1];
+
+                        // downcast the `Series` to their known type
+                        let ca_a = s_a.str()?;
+                        let ca_b = s_b.i32()?;
+
+                        // iterate both `ChunkedArrays`
+                        let out: Int32Chunked = ca_a
+                            .into_iter()
+                            .zip(ca_b)
+                            .map(|(opt_a, opt_b)| match (opt_a, opt_b) {
+                                (Some(a), Some(b)) => Some(a.len() as i32 + b),
+                                _ => None,
+                            })
+                            .collect();
+
+                        Ok(Some(out.into_series()))
+                    },
+                    GetOutput::from_type(DataType::Int32),
+                )
+                // note: the `'solution_map_elements'` alias is just there to show how you
+                // get the same output as in the Python API example.
+                .alias("solution_map_elements"),
+            (col("keys").str().count_matches(lit("."), false) + col("values"))
+                .alias("solution_expr"),
+        ])
+        .collect()?;
+    println!("{}", out);
+
+    // --8<-- [end:multi_column_apply]
     Ok(())
 }
diff --git a/docs/src/rust/user-guide/expressions/user-defined-functions.rs b/docs/src/rust/user-guide/expressions/user-defined-functions.rs
index b2ffb40a17ef..42d61ed4f1c1 100644
--- a/docs/src/rust/user-guide/expressions/user-defined-functions.rs
+++ b/docs/src/rust/user-guide/expressions/user-defined-functions.rs
@@ -25,47 +25,6 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     // --8<-- [end:missing_data]
 
     // --8<-- [start:combine]
-    let out = df
-        .lazy()
-        .select([
-            // pack to struct to get access to multiple fields in a custom `apply/map`
-            as_struct(vec![col("keys"), col("values")])
-                // we will compute the len(a) + b
-                .apply(
-                    |s| {
-                        // downcast to struct
-                        let ca = s.struct_()?;
-
-                        // get the fields as Series
-                        let s_a = &ca.fields()[0];
-                        let s_b = &ca.fields()[1];
-
-                        // downcast the `Series` to their known type
-                        let ca_a = s_a.str()?;
-                        let ca_b = s_b.i32()?;
-
-                        // iterate both `ChunkedArrays`
-                        let out: Int32Chunked = ca_a
-                            .into_iter()
-                            .zip(ca_b)
-                            .map(|(opt_a, opt_b)| match (opt_a, opt_b) {
-                                (Some(a), Some(b)) => Some(a.len() as i32 + b),
-                                _ => None,
-                            })
-                            .collect();
-
-                        Ok(Some(out.into_series()))
-                    },
-                    GetOutput::from_type(DataType::Int32),
-                )
-                // note: the `'solution_map_elements'` alias is just there to show how you
-                // get the same output as in the Python API example.
-                .alias("solution_map_elements"),
-            (col("keys").str().count_matches(lit("."), true) + col("values"))
-                .alias("solution_expr"),
-        ])
-        .collect()?;
-    println!("{}", out);
     // --8<-- [end:combine]
     Ok(())
 }
diff --git a/docs/user-guide/expressions/structs.md b/docs/user-guide/expressions/structs.md
index 056c1b2e21b7..4ccaca925613 100644
--- a/docs/user-guide/expressions/structs.md
+++ b/docs/user-guide/expressions/structs.md
@@ -96,4 +96,8 @@ That's a pretty complex set of requirements done very elegantly in Polars!
 
 ### Using multi-column apply
 
-This was discussed in the previous section on _User Defined Functions_.
+This was discussed in the previous section on _User Defined Functions_ for the Python case.
+Here's an example of doing so with Rust:
+
+
+{{code_block('user-guide/expressions/structs','multi_column_apply',[])}}