From 4a077d3346a32e533599139824378333320fa39e Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Mon, 26 Feb 2024 14:08:37 -0500 Subject: [PATCH 01/20] More accurate statement. --- docs/user-guide/expressions/user-defined-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md index 882cc11c6ac1..dbcf9987e7fe 100644 --- a/docs/user-guide/expressions/user-defined-functions.md +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -50,7 +50,7 @@ Use cases for `map_batches` in the `group_by` context are slim. They are only us --8<-- "python/user-guide/expressions/user-defined-functions.py:dataframe" ``` -In the snippet above we group by the `"keys"` column. That means we have the following groups: +We would like to group by the `"keys"` column. That means we will have the following groups: ```c "a" -> [10, 7] From c1b52c3edda8f0ef9866ffacacf1668d200943a1 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 28 Feb 2024 13:00:39 -0500 Subject: [PATCH 02/20] Start of rewrite of UDF docs. --- docs/requirements.txt | 1 + .../expressions/user-defined-functions.py | 47 ++++++++++++++ .../expressions/user-defined-functions.rs | 6 ++ .../expressions/user-defined-functions.md | 62 +++++++++++++++++-- py-polars/requirements-dev.txt | 1 + 5 files changed, 112 insertions(+), 5 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index dccf92dd62d1..1a0fddbf0ef0 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,6 +5,7 @@ matplotlib seaborn plotly altair +numba mkdocs-material==9.5.2 mkdocs-macros-plugin==1.0.4 diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index e0658b2d36a4..88326d27dccb 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -14,6 +14,53 @@ print(df) # --8<-- [end:dataframe] + +# --8<-- [start:custom_sum] +def custom_sum(series): + # This will be very slow for non-triviial Series, since it's all Python + # code: + total = 0 + for value in series: + total += value + return total + + +# Apply our custom function a full Series with map_batches(): +out = df.select(pl.col("values").map_batches(custom_sum)) +assert out["values"].item() == 18 +print("== select() with UDF ==") +print(out) + +# Apply our custom function per group: +print("== group_by() with UDF ==") +out = df.group_by("keys").agg(pl.col("values").map_batches(custom_sum)) +print(out) +# --8<-- [end:custom_sum] + +# --8<-- [start:custom_sum_numba] +from numba import guvectorize, int64 + + +# This will be compiled to machine code, so it will be fast. The Series is +# converted to a NumPy array before being passed to the function: +@guvectorize([(int64[:], int64[:])], "(n)->()") +def custom_sum_numba(arr, result): + total = 0 + for value in series: + total += value + result[0] = total + + +out = df.select(pl.col("values").map_batches(custom_sum_numba)) +print("== select() with UDF ==") +assert out["values"].item() == 18 +print(out) + +out = df.group_by("keys").agg(pl.col("values").map_batches(custom_sum_numba)) +print("== group_by() with UDF ==") +print(out) +# --8<-- [end:custom_sum_numba] + # --8<-- [start:shift_map_batches] out = df.group_by("keys", maintain_order=True).agg( pl.col("values").map_batches(lambda s: s.shift()).alias("shift_map_batches"), diff --git a/docs/src/rust/user-guide/expressions/user-defined-functions.rs b/docs/src/rust/user-guide/expressions/user-defined-functions.rs index 56661fcabc84..060a1d64fe25 100644 --- a/docs/src/rust/user-guide/expressions/user-defined-functions.rs +++ b/docs/src/rust/user-guide/expressions/user-defined-functions.rs @@ -9,6 +9,12 @@ fn main() -> Result<(), Box> { println!("{}", df); // --8<-- [end:dataframe] + // --8<-- [start:custom_sum] + // --8<-- [end:custom_sum] + + // --8<-- [start:custom_sum_numba] + // --8<-- [end:custom_sum_numba] + // --8<-- [start:shift_map_batches] let out = df .clone() diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md index dbcf9987e7fe..b4c9e50719f2 100644 --- a/docs/user-guide/expressions/user-defined-functions.md +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -8,8 +8,60 @@ over data in Polars. For this we provide the following expressions: -- `map_batches` -- `map_elements` +- [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html): Always passes the full `Series` to the function. +- [:material-api: `map_elements`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_elements.html): Passes the smallest logical for an operation; in a `select()` context this will be individual items, in a `group_by()` context this will be group-specific `Series`. + +## Example: A slow, custom sum function written in Python + +For demonstration purposes, let's say we want to sum the values in a `Series` using a function we write in Python. +Here is our data: + +{{code_block('user-guide/expressions/user-defined-functions','dataframe',[])}} + +```python exec="on" result="text" session="user-guide/udf" +--8<-- "python/user-guide/expressions/user-defined-functions.py:setup" +--8<-- "python/user-guide/expressions/user-defined-functions.py:dataframe" +``` + +We can use `map_batches()` to run this function on either the full `Series` or individual groups in a `group_by()`. +Since the result of the latter is a `Series`, we can also extract it into a single value if we want. + +{{code_block('user-guide/expressions/user-defined-functions','custom_sum',[])}} + +```python exec="on" result="text" session="user-guide/udf" +--8<-- "python/user-guide/expressions/user-defined-functions.py:custom_sum" +``` + +The problem with this implementation is that it's slow. +In general, you want to minimize how much Python code you call if you want fast results. +Calling a Python function for every `Series` isn't usually a problem, unless the `group_by()` produces a very large number of groups. +However, running the `for` loop in Python, and then summing the values in Python, will be very slow. + +## Fast operations with user-defined functions + +In general, user-defined functions will run most quickly when two conditions are met: + +1. **You're operating on a whole `Series`.** + That means you'll want to use `map_batches()` in `select()` contexts, and `map_elements()` in `group_by()` contexts so your function gets called per group. + See below for more details about the difference between the two APIs. +2. **You're using a function written in a compiled language.** + For numeric calculations Polars supports a pair of interfaces defined by NumPy called ["ufuncs"](https://numpy.org/doc/stable/reference/ufuncs.html) and ["generalized ufuncs"](https://numpy.org/neps/nep-0005-generalized-ufuncs.html). + The latter runs on each item individually, but the latter accepts a whole array. + The easiest way to write these in Python is to use [Numba](https://numba.readthedocs.io/en/stable/), which allows you to write custom functions in (a subset) of Python while still getting the benefit of compiled code. + +## Example: A fast custom sum function in Python using Numba + +Numba provides a decorator called [`@guvectorize`](https://numba.readthedocs.io/en/stable/user/vectorize.html#the-guvectorize-decorator) that takes a Python function and compiles it to fast machine code, in a way that allows it to be used by Polars. + +In the following example the `custom_sum_numba()` will be compiled to fast machine code at import time, which will take a little time. +After that all calls to the function will run quickly. +The `Series` will be converted to a NumPy array before being passed to the function: + +{{code_block('user-guide/expressions/user-defined-functions','custom_sum_numba',[])}} + +```python exec="on" result="text" session="user-guide/udf" +--8<-- "python/user-guide/expressions/user-defined-functions.py:custom_sum" +``` ## To `map_batches` or to `map_elements`. @@ -28,7 +80,7 @@ we could use `map_batches` to pass an expression column to a neural network mode [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) ```python -df.with_columns([ +features_df.with_columns([ pl.col("features").map_batches(lambda s: MyNeuralNetwork.forward(s.to_numpy())).alias("activations") ]) ``` @@ -36,12 +88,12 @@ df.with_columns([ === ":fontawesome-brands-rust: Rust" ```rust -df.with_columns([ +features_df.with_columns([ col("features").map(|s| Ok(my_nn.forward(s))).alias("activations") ]) ``` -Use cases for `map_batches` in the `group_by` context are slim. They are only used for performance reasons, but can quite easily lead to incorrect results. Let me explain why. +Use cases for `map_batches` in the `group_by` context are slim. They are only used for performance reasons, but can quite easily lead to incorrect results. Let me explain why, returning to our original code example. {{code_block('user-guide/expressions/user-defined-functions','dataframe',[])}} diff --git a/py-polars/requirements-dev.txt b/py-polars/requirements-dev.txt index 4a3785e77f15..20760b7c049f 100644 --- a/py-polars/requirements-dev.txt +++ b/py-polars/requirements-dev.txt @@ -18,6 +18,7 @@ numpy pandas pyarrow pydantic>=2.0.0 +numba # Datetime / time zones backports.zoneinfo; python_version < '3.9' tzdata; platform_system == 'Windows' From ff7ef91c3d50bfa1cb31822cfd2bae8dcac66873 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 28 Feb 2024 14:07:17 -0500 Subject: [PATCH 03/20] Start on missing data docs. --- .../expressions/user-defined-functions.py | 56 ++++----- .../expressions/user-defined-functions.rs | 41 +----- .../expressions/user-defined-functions.md | 118 ++---------------- 3 files changed, 43 insertions(+), 172 deletions(-) diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index 88326d27dccb..84c686244e16 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -38,7 +38,7 @@ def custom_sum(series): # --8<-- [end:custom_sum] # --8<-- [start:custom_sum_numba] -from numba import guvectorize, int64 +from numba import guvectorize, int64, float64 # This will be compiled to machine code, so it will be fast. The Series is @@ -46,14 +46,14 @@ def custom_sum(series): @guvectorize([(int64[:], int64[:])], "(n)->()") def custom_sum_numba(arr, result): total = 0 - for value in series: + for value in arr: total += value result[0] = total out = df.select(pl.col("values").map_batches(custom_sum_numba)) print("== select() with UDF ==") -assert out["values"].item() == 18 +# assert out["values"].item() == 18 print(out) out = df.group_by("keys").agg(pl.col("values").map_batches(custom_sum_numba)) @@ -61,39 +61,35 @@ def custom_sum_numba(arr, result): print(out) # --8<-- [end:custom_sum_numba] -# --8<-- [start:shift_map_batches] -out = df.group_by("keys", maintain_order=True).agg( - pl.col("values").map_batches(lambda s: s.shift()).alias("shift_map_batches"), - pl.col("values").shift().alias("shift_expression"), -) -print(out) -# --8<-- [end:shift_map_batches] - - -# --8<-- [start:map_elements] -out = df.group_by("keys", maintain_order=True).agg( - pl.col("values").map_elements(lambda s: s.shift()).alias("shift_map_elements"), - pl.col("values").shift().alias("shift_expression"), +# --8<-- [start:dataframe2] +df2 = pl.DataFrame( + { + "values": [1, 2, 3, None, 4], + } ) -print(out) -# --8<-- [end:map_elements] - -# --8<-- [start:counter] -counter = 0 +print(df2) +# --8<-- [end:dataframe2] +# --8<-- [start:custom_mean_numba] +@guvectorize([(int64[:], float64[:])], "(n)->()") +def custom_mean_numba(arr, result): + total = 0 + for value in arr: + total += value + result[0] = total / len(arr) -def add_counter(val: int) -> int: - global counter - counter += 1 - return counter + val +out = df2.select(pl.col("values").mean()) +print("== built-in mean() knows to skip empty values ==") +# assert out["values"][0] == 2.5 +print(out) -out = df.select( - pl.col("values").map_elements(add_counter).alias("solution_map_elements"), - (pl.col("values") + pl.int_range(1, pl.len() + 1)).alias("solution_expr"), -) +out = df2.select(pl.col("values").map_batches(custom_mean_numba)) +print("== custom mean() gets the wrong answer because of missing data ==") +# assert out["values"][0] != 2.5 print(out) -# --8<-- [end:counter] + +# --8<-- [end:custom_mean_numba] # --8<-- [start:combine] out = df.select( diff --git a/docs/src/rust/user-guide/expressions/user-defined-functions.rs b/docs/src/rust/user-guide/expressions/user-defined-functions.rs index 060a1d64fe25..be0ca0582a00 100644 --- a/docs/src/rust/user-guide/expressions/user-defined-functions.rs +++ b/docs/src/rust/user-guide/expressions/user-defined-functions.rs @@ -15,44 +15,11 @@ fn main() -> Result<(), Box> { // --8<-- [start:custom_sum_numba] // --8<-- [end:custom_sum_numba] - // --8<-- [start:shift_map_batches] - let out = df - .clone() - .lazy() - .group_by(["keys"]) - .agg([ - col("values") - .map(|s| Ok(Some(s.shift(1))), GetOutput::default()) - // note: the `'shift_map_batches'` alias is just there to show how you - // get the same output as in the Python API example. - .alias("shift_map_batches"), - col("values").shift(lit(1)).alias("shift_expression"), - ]) - .collect()?; - - println!("{}", out); - // --8<-- [end:shift_map_batches] - - // --8<-- [start:map_elements] - let out = df - .clone() - .lazy() - .group_by([col("keys")]) - .agg([ - col("values") - .apply(|s| Ok(Some(s.shift(1))), GetOutput::default()) - // note: the `'shift_map_elements'` alias is just there to show how you - // get the same output as in the Python API example. - .alias("shift_map_elements"), - col("values").shift(lit(1)).alias("shift_expression"), - ]) - .collect()?; - println!("{}", out); - // --8<-- [end:map_elements] - - // --8<-- [start:counter] + // --8<-- [start:dataframe2] + // --8<-- [end:dataframe2] - // --8<-- [end:counter] + // --8<-- [start:custom_mean_numba] + // --8<-- [end:custom_mean_numba] // --8<-- [start:combine] let out = df diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md index b4c9e50719f2..ae8cc8ccae0b 100644 --- a/docs/user-guide/expressions/user-defined-functions.md +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -60,122 +60,30 @@ The `Series` will be converted to a NumPy array before being passed to the funct {{code_block('user-guide/expressions/user-defined-functions','custom_sum_numba',[])}} ```python exec="on" result="text" session="user-guide/udf" ---8<-- "python/user-guide/expressions/user-defined-functions.py:custom_sum" -``` - -## To `map_batches` or to `map_elements`. - -These functions have an important distinction in how they operate and consequently what data they will pass to the user. - -A `map_batches` passes the `Series` backed by the `expression` as is. - -`map_batches` follows the same rules in both the `select` and the `group_by` context, this will -mean that the `Series` represents a column in a `DataFrame`. Note that in the `group_by` context, that column is not yet -aggregated! - -Use cases for `map_batches` are for instance passing the `Series` in an expression to a third party library. Below we show how -we could use `map_batches` to pass an expression column to a neural network model. - -=== ":fontawesome-brands-python: Python" -[:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) - -```python -features_df.with_columns([ - pl.col("features").map_batches(lambda s: MyNeuralNetwork.forward(s.to_numpy())).alias("activations") -]) +--8<-- "python/user-guide/expressions/user-defined-functions.py:custom_sum_numba" ``` -=== ":fontawesome-brands-rust: Rust" +## Missing data can break your calculation -```rust -features_df.with_columns([ - col("features").map(|s| Ok(my_nn.forward(s))).alias("activations") -]) -``` +As mentioned above, before being passed to a generalized `ufunc` like our Numba function a `Series` will be converted to a NumPy array. +Unfortunately, NumPy arrays don't have a concept of missing data, which means the array won't actually match the `Series`. -Use cases for `map_batches` in the `group_by` context are slim. They are only used for performance reasons, but can quite easily lead to incorrect results. Let me explain why, returning to our original code example. +If you're calculating results item by item, this doesn't matter. +But if the result depends on more than one value in the `Series`, the result will be wrong: -{{code_block('user-guide/expressions/user-defined-functions','dataframe',[])}} +{{code_block('user-guide/expressions/user-defined-functions','dataframe2',[])}} +{{code_block('user-guide/expressions/user-defined-functions','custom_mean_numba',[])}} ```python exec="on" result="text" session="user-guide/udf" ---8<-- "python/user-guide/expressions/user-defined-functions.py:setup" ---8<-- "python/user-guide/expressions/user-defined-functions.py:dataframe" -``` - -We would like to group by the `"keys"` column. That means we will have the following groups: - -```c -"a" -> [10, 7] -"b" -> [1] -``` - -If we would then apply a `shift` operation to the right, we'd expect: - -```c -"a" -> [null, 10] -"b" -> [null] +--8<-- "python/user-guide/expressions/user-defined-functions.py:dataframe2" +--8<-- "python/user-guide/expressions/user-defined-functions.py:custom_mean_numba" ``` -Let's try that out and see what we get: - -{{code_block('user-guide/expressions/user-defined-functions','shift_map_batches',[])}} - -```python exec="on" result="text" session="user-guide/udf" ---8<-- "python/user-guide/expressions/user-defined-functions.py:shift_map_batches" -``` +## Pre-written fast functions -Ouch.. we clearly get the wrong results here. Group `"b"` even got a value from group `"a"` 😵. -This went horribly wrong, because the `map_batches` applies the function before we aggregate! So that means the whole column `[10, 7, 1`\] got shifted to `[null, 10, 7]` and was then aggregated. - -So my advice is to never use `map_batches` in the `group_by` context unless you know you need it and know what you are doing. - -## To `map_elements` - -Luckily we can fix previous example with `map_elements`. `map_elements` works on the smallest logical elements for that operation. - -That is: - -- `select context` -> single elements -- `group by context` -> single groups - -So with `map_elements` we should be able to fix our example: - -=== ":fontawesome-brands-python: Python" -[:material-api: `map_elements`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_elements.html) - -{{code_block('user-guide/expressions/user-defined-functions','map_elements',[])}} - -```python exec="on" result="text" session="user-guide/udf" ---8<-- "python/user-guide/expressions/user-defined-functions.py:map_elements" -``` - -And observe, a valid result! 🎉 - -## `map_elements` in the `select` context - -In the `select` context, the `map_elements` expression passes elements of the column to the Python function. - -_Note that you are now running Python, this will be slow._ - -Let's go through some examples to see what to expect. We will continue with the `DataFrame` we defined at the start of -this section and show an example with the `map_elements` function and a counter example where we use the expression API to -achieve the same goals. - -### Adding a counter - -In this example we create a global `counter` and then add the integer `1` to the global state at every element processed. -Every iteration the result of the increment will be added to the element value. - -> Note, this example isn't provided in Rust. The reason is that the global `counter` value would lead to data races when this `apply` is evaluated in parallel. It would be possible to wrap it in a `Mutex` to protect the variable, but that would be obscuring the point of the example. This is a case where the Python Global Interpreter Lock's performance tradeoff provides some safety guarantees. - -{{code_block('user-guide/expressions/user-defined-functions','counter',[])}} - -```python exec="on" result="text" session="user-guide/udf" ---8<-- "python/user-guide/expressions/user-defined-functions.py:counter" -``` -### Combining multiple column values +## Combining multiple column values If we want to have access to values of different columns in a single `map_elements` function call, we can create `struct` data type. This data type collects those columns as fields in the `struct`. So if we'd create a struct from the columns @@ -199,7 +107,7 @@ In Python, those would be passed as `dict` to the calling Python function and ca `Structs` are covered in detail in the next section. -### Return types? +## Return types Custom Python functions are black boxes for Polars. We really don't know what kind of black arts you are doing, so we have to infer and try our best to understand what you meant. From e305e237d97c7fc6a78586ca58560cb70d991162 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Thu, 29 Feb 2024 14:17:58 -0500 Subject: [PATCH 04/20] Continue adding documentation. --- .../expressions/user-defined-functions.py | 7 +++ .../expressions/user-defined-functions.rs | 3 + .../expressions/user-defined-functions.md | 63 ++++++++++++------- 3 files changed, 51 insertions(+), 22 deletions(-) diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index 84c686244e16..8e0d93b37d1e 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -37,6 +37,13 @@ def custom_sum(series): print(out) # --8<-- [end:custom_sum] +# --8<-- [start:np_log] +import numpy as np + +out = df.select(pl.col("values").map_batches(np.log)) +print(out) +# --8<-- [end:np_log] + # --8<-- [start:custom_sum_numba] from numba import guvectorize, int64, float64 diff --git a/docs/src/rust/user-guide/expressions/user-defined-functions.rs b/docs/src/rust/user-guide/expressions/user-defined-functions.rs index be0ca0582a00..49b3ff7c837d 100644 --- a/docs/src/rust/user-guide/expressions/user-defined-functions.rs +++ b/docs/src/rust/user-guide/expressions/user-defined-functions.rs @@ -12,6 +12,9 @@ fn main() -> Result<(), Box> { // --8<-- [start:custom_sum] // --8<-- [end:custom_sum] + // --8<-- [start:np_log] + // --8<-- [end:np_log] + // --8<-- [start:custom_sum_numba] // --8<-- [end:custom_sum_numba] diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md index ae8cc8ccae0b..c9ed5d1b6310 100644 --- a/docs/user-guide/expressions/user-defined-functions.md +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -6,10 +6,11 @@ than in other libraries. Still, you need to have the power to be able to pass an expression's state to a third party library or apply your black box function over data in Polars. -For this we provide the following expressions: +In this part of the documentation we'll be using one specific API: - [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html): Always passes the full `Series` to the function. -- [:material-api: `map_elements`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_elements.html): Passes the smallest logical for an operation; in a `select()` context this will be individual items, in a `group_by()` context this will be group-specific `Series`. + +A later section will explain other available APIs for applying user-defined functions. ## Example: A slow, custom sum function written in Python @@ -23,8 +24,7 @@ Here is our data: --8<-- "python/user-guide/expressions/user-defined-functions.py:dataframe" ``` -We can use `map_batches()` to run this function on either the full `Series` or individual groups in a `group_by()`. -Since the result of the latter is a `Series`, we can also extract it into a single value if we want. +We can use `map_batches()` to run this function on either the full `Series` or individual groups in a `group_by()`: {{code_block('user-guide/expressions/user-defined-functions','custom_sum',[])}} @@ -32,26 +32,33 @@ Since the result of the latter is a `Series`, we can also extract it into a sing --8<-- "python/user-guide/expressions/user-defined-functions.py:custom_sum" ``` -The problem with this implementation is that it's slow. +## Fast operations with user-defined functions + +The problem with a pure-Python implementation is that it's slow. In general, you want to minimize how much Python code you call if you want fast results. Calling a Python function for every `Series` isn't usually a problem, unless the `group_by()` produces a very large number of groups. However, running the `for` loop in Python, and then summing the values in Python, will be very slow. -## Fast operations with user-defined functions +To maximize speed, you'll want to make sure that you're using a function written in a compiled language. +For numeric calculations Polars supports a pair of interfaces defined by NumPy called ["ufuncs"](https://numpy.org/doc/stable/reference/ufuncs.html) and ["generalized ufuncs"](https://numpy.org/neps/nep-0005-generalized-ufuncs.html). +The former runs on each item individually, and the latter accepts a whole NumPy array, so allows for more flexible operations. + +[NumPy](https://numpy.org/doc/stable/reference/ufuncs.html) and other libraries like [SciPy](https://docs.scipy.org/doc/scipy/reference/special.html#module-scipy.special) come with pre-written ufuncs you can use with Polars. +For example: -In general, user-defined functions will run most quickly when two conditions are met: +{{code_block('user-guide/expressions/user-defined-functions','np_log',[])}} -1. **You're operating on a whole `Series`.** - That means you'll want to use `map_batches()` in `select()` contexts, and `map_elements()` in `group_by()` contexts so your function gets called per group. - See below for more details about the difference between the two APIs. -2. **You're using a function written in a compiled language.** - For numeric calculations Polars supports a pair of interfaces defined by NumPy called ["ufuncs"](https://numpy.org/doc/stable/reference/ufuncs.html) and ["generalized ufuncs"](https://numpy.org/neps/nep-0005-generalized-ufuncs.html). - The latter runs on each item individually, but the latter accepts a whole array. - The easiest way to write these in Python is to use [Numba](https://numba.readthedocs.io/en/stable/), which allows you to write custom functions in (a subset) of Python while still getting the benefit of compiled code. +```python exec="on" result="text" session="user-guide/udf" +--8<-- "python/user-guide/expressions/user-defined-functions.py:np_log" +``` ## Example: A fast custom sum function in Python using Numba -Numba provides a decorator called [`@guvectorize`](https://numba.readthedocs.io/en/stable/user/vectorize.html#the-guvectorize-decorator) that takes a Python function and compiles it to fast machine code, in a way that allows it to be used by Polars. +The pre-written functions are helpful, but our goal is to write our own functions. +For example, let's say we want a fast version of our `custum_sum()` example above. +The easiest way to write this in Python is to use [Numba](https://numba.readthedocs.io/en/stable/), which allows you to write custom functions in (a subset) of Python while still getting the benefit of compiled code. + +In particular, Numba provides a decorator called [`@guvectorize`](https://numba.readthedocs.io/en/stable/user/vectorize.html#the-guvectorize-decorator) that compiles a Python function to fast machine code, in a way that allows it to be used by Polars. In the following example the `custom_sum_numba()` will be compiled to fast machine code at import time, which will take a little time. After that all calls to the function will run quickly. @@ -65,26 +72,30 @@ The `Series` will be converted to a NumPy array before being passed to the funct ## Missing data can break your calculation -As mentioned above, before being passed to a generalized `ufunc` like our Numba function a `Series` will be converted to a NumPy array. -Unfortunately, NumPy arrays don't have a concept of missing data, which means the array won't actually match the `Series`. +Before being passed to a user-defined function like `custom_sum_numba()`, a `Series` will be converted to a NumPy array. +Unfortunately, NumPy arrays don't have a concept of missing data. +If there is missing data in the original `Series`, this means the resulting array won't actually match the `Series`. If you're calculating results item by item, this doesn't matter. -But if the result depends on more than one value in the `Series`, the result will be wrong: +For example, `numpy.log()` gets called on each individual value separately, so those missing values don't change the calculation. +But if the result of a user-defined function depend on multiple values in the `Series`, the result may be wrong: {{code_block('user-guide/expressions/user-defined-functions','dataframe2',[])}} -{{code_block('user-guide/expressions/user-defined-functions','custom_mean_numba',[])}} ```python exec="on" result="text" session="user-guide/udf" --8<-- "python/user-guide/expressions/user-defined-functions.py:dataframe2" ---8<-- "python/user-guide/expressions/user-defined-functions.py:custom_mean_numba" ``` -## Pre-written fast functions - +{{code_block('user-guide/expressions/user-defined-functions','custom_mean_numba',[])}} +```python exec="on" result="text" session="user-guide/udf" +--8<-- "python/user-guide/expressions/user-defined-functions.py:custom_mean_numba" +``` ## Combining multiple column values +TODO + If we want to have access to values of different columns in a single `map_elements` function call, we can create `struct` data type. This data type collects those columns as fields in the `struct`. So if we'd create a struct from the columns `"keys"` and `"values"`, we would get the following struct elements: @@ -107,8 +118,16 @@ In Python, those would be passed as `dict` to the calling Python function and ca `Structs` are covered in detail in the next section. +## Streaming calculations + +Passing the full `Series` to the user-defined function has a cost: it will use a lot of memory. + +TODO + ## Return types +TODO + Custom Python functions are black boxes for Polars. We really don't know what kind of black arts you are doing, so we have to infer and try our best to understand what you meant. From f631513ff45582cb0587843bbd4d3421af2fe1e0 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 20 Mar 2024 13:35:05 -0400 Subject: [PATCH 05/20] Give up on scalar results. --- .../expressions/user-defined-functions.py | 54 +++++++++---------- .../expressions/user-defined-functions.rs | 12 ++--- .../expressions/user-defined-functions.md | 25 +++++---- 3 files changed, 45 insertions(+), 46 deletions(-) diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index 8e0d93b37d1e..f6a4ec98326b 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -7,35 +7,35 @@ # --8<-- [start:dataframe] df = pl.DataFrame( { - "keys": ["a", "a", "b"], - "values": [10, 7, 1], + "keys": ["a", "a", "b", "b"], + "values": [10, 7, 1, 23], } ) print(df) # --8<-- [end:dataframe] -# --8<-- [start:custom_sum] -def custom_sum(series): - # This will be very slow for non-triviial Series, since it's all Python +# --8<-- [start:diff_from_mean] +def diff_from_mean(series): + # This will be very slow for non-trivial Series, since it's all Python # code: total = 0 for value in series: total += value - return total + mean = total / len(series) + return pl.Series([value - mean for value in series]) # Apply our custom function a full Series with map_batches(): -out = df.select(pl.col("values").map_batches(custom_sum)) -assert out["values"].item() == 18 +out = df.select(pl.col("values").map_batches(diff_from_mean)) print("== select() with UDF ==") print(out) # Apply our custom function per group: print("== group_by() with UDF ==") -out = df.group_by("keys").agg(pl.col("values").map_batches(custom_sum)) +out = df.group_by("keys").agg(pl.col("values").map_batches(diff_from_mean)) print(out) -# --8<-- [end:custom_sum] +# --8<-- [end:diff_from_mean] # --8<-- [start:np_log] import numpy as np @@ -44,29 +44,31 @@ def custom_sum(series): print(out) # --8<-- [end:np_log] -# --8<-- [start:custom_sum_numba] +# --8<-- [start:diff_from_mean_numba] from numba import guvectorize, int64, float64 # This will be compiled to machine code, so it will be fast. The Series is # converted to a NumPy array before being passed to the function: -@guvectorize([(int64[:], int64[:])], "(n)->()") -def custom_sum_numba(arr, result): +@guvectorize([(int64[:], float64[:])], "(n)->(n)") +def diff_from_mean_numba(arr, result): total = 0 for value in arr: total += value - result[0] = total + mean = total / len(arr) + for i, value in enumerate(arr): + result[i] = value - mean -out = df.select(pl.col("values").map_batches(custom_sum_numba)) +out = df.select(pl.col("values").map_batches(diff_from_mean_numba)) print("== select() with UDF ==") # assert out["values"].item() == 18 print(out) -out = df.group_by("keys").agg(pl.col("values").map_batches(custom_sum_numba)) +out = df.group_by("keys").agg(pl.col("values").map_batches(diff_from_mean_numba)) print("== group_by() with UDF ==") print(out) -# --8<-- [end:custom_sum_numba] +# --8<-- [end:diff_from_mean_numba] # --8<-- [start:dataframe2] df2 = pl.DataFrame( @@ -77,26 +79,20 @@ def custom_sum_numba(arr, result): print(df2) # --8<-- [end:dataframe2] -# --8<-- [start:custom_mean_numba] -@guvectorize([(int64[:], float64[:])], "(n)->()") -def custom_mean_numba(arr, result): - total = 0 - for value in arr: - total += value - result[0] = total / len(arr) - -out = df2.select(pl.col("values").mean()) +# --8<-- [start:missing_data] +# Implement equivalent of diff_from_mean_numba() using Polars APIs: +out = df2.select(pl.col("values") - pl.col("values").mean()) print("== built-in mean() knows to skip empty values ==") # assert out["values"][0] == 2.5 print(out) -out = df2.select(pl.col("values").map_batches(custom_mean_numba)) -print("== custom mean() gets the wrong answer because of missing data ==") +out = df2.select(pl.col("values").map_batches(diff_from_mean_numba)) +print("== custom mean gets the wrong answer because of missing data ==") # assert out["values"][0] != 2.5 print(out) -# --8<-- [end:custom_mean_numba] +# --8<-- [end:missing_data] # --8<-- [start:combine] out = df.select( diff --git a/docs/src/rust/user-guide/expressions/user-defined-functions.rs b/docs/src/rust/user-guide/expressions/user-defined-functions.rs index 49b3ff7c837d..b2ffb40a17ef 100644 --- a/docs/src/rust/user-guide/expressions/user-defined-functions.rs +++ b/docs/src/rust/user-guide/expressions/user-defined-functions.rs @@ -9,20 +9,20 @@ fn main() -> Result<(), Box> { println!("{}", df); // --8<-- [end:dataframe] - // --8<-- [start:custom_sum] - // --8<-- [end:custom_sum] + // --8<-- [start:diff_from_mean] + // --8<-- [end:diff_from_mean] // --8<-- [start:np_log] // --8<-- [end:np_log] - // --8<-- [start:custom_sum_numba] - // --8<-- [end:custom_sum_numba] + // --8<-- [start:diff_from_mean_numba] + // --8<-- [end:diff_from_mean_numba] // --8<-- [start:dataframe2] // --8<-- [end:dataframe2] - // --8<-- [start:custom_mean_numba] - // --8<-- [end:custom_mean_numba] + // --8<-- [start:missing_data] + // --8<-- [end:missing_data] // --8<-- [start:combine] let out = df diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md index c9ed5d1b6310..c1a5cfe1e58e 100644 --- a/docs/user-guide/expressions/user-defined-functions.md +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -14,7 +14,7 @@ A later section will explain other available APIs for applying user-defined func ## Example: A slow, custom sum function written in Python -For demonstration purposes, let's say we want to sum the values in a `Series` using a function we write in Python. +For demonstration purposes, let's say we want to calculate the difference between the mean of a `Series` and each value. Here is our data: {{code_block('user-guide/expressions/user-defined-functions','dataframe',[])}} @@ -26,10 +26,10 @@ Here is our data: We can use `map_batches()` to run this function on either the full `Series` or individual groups in a `group_by()`: -{{code_block('user-guide/expressions/user-defined-functions','custom_sum',[])}} +{{code_block('user-guide/expressions/user-defined-functions','diff_from_mean',[])}} ```python exec="on" result="text" session="user-guide/udf" ---8<-- "python/user-guide/expressions/user-defined-functions.py:custom_sum" +--8<-- "python/user-guide/expressions/user-defined-functions.py:diff_from_mean" ``` ## Fast operations with user-defined functions @@ -52,27 +52,27 @@ For example: --8<-- "python/user-guide/expressions/user-defined-functions.py:np_log" ``` -## Example: A fast custom sum function in Python using Numba +## Example: A fast custom function using Numba The pre-written functions are helpful, but our goal is to write our own functions. -For example, let's say we want a fast version of our `custum_sum()` example above. +For example, let's say we want a fast version of our `diff_from_mean()` example above. The easiest way to write this in Python is to use [Numba](https://numba.readthedocs.io/en/stable/), which allows you to write custom functions in (a subset) of Python while still getting the benefit of compiled code. In particular, Numba provides a decorator called [`@guvectorize`](https://numba.readthedocs.io/en/stable/user/vectorize.html#the-guvectorize-decorator) that compiles a Python function to fast machine code, in a way that allows it to be used by Polars. -In the following example the `custom_sum_numba()` will be compiled to fast machine code at import time, which will take a little time. +In the following example the `diff_from_mean_numba()` will be compiled to fast machine code at import time, which will take a little time. After that all calls to the function will run quickly. The `Series` will be converted to a NumPy array before being passed to the function: -{{code_block('user-guide/expressions/user-defined-functions','custom_sum_numba',[])}} +{{code_block('user-guide/expressions/user-defined-functions','diff_from_mean_numba',[])}} ```python exec="on" result="text" session="user-guide/udf" ---8<-- "python/user-guide/expressions/user-defined-functions.py:custom_sum_numba" +--8<-- "python/user-guide/expressions/user-defined-functions.py:diff_from_mean_numba" ``` ## Missing data can break your calculation -Before being passed to a user-defined function like `custom_sum_numba()`, a `Series` will be converted to a NumPy array. +Before being passed to a user-defined function like `diff_from_mean_numba()`, a `Series` will be converted to a NumPy array. Unfortunately, NumPy arrays don't have a concept of missing data. If there is missing data in the original `Series`, this means the resulting array won't actually match the `Series`. @@ -86,12 +86,15 @@ But if the result of a user-defined function depend on multiple values in the `S --8<-- "python/user-guide/expressions/user-defined-functions.py:dataframe2" ``` -{{code_block('user-guide/expressions/user-defined-functions','custom_mean_numba',[])}} +{{code_block('user-guide/expressions/user-defined-functions','missing_data',[])}} ```python exec="on" result="text" session="user-guide/udf" ---8<-- "python/user-guide/expressions/user-defined-functions.py:custom_mean_numba" +--8<-- "python/user-guide/expressions/user-defined-functions.py:missing_data" ``` +So how do you deal with missing data? +Either [fill it in](missing-data.md) or [drop it](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.drop_nulls.html) before calling the customer user function. + ## Combining multiple column values TODO From b530e99266f5562a7ed00d2c1938443b3db4c076 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 20 Mar 2024 14:11:02 -0400 Subject: [PATCH 06/20] A couple more sections. --- .../expressions/user-defined-functions.md | 33 +++++-------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md index c1a5cfe1e58e..86b2b18724ee 100644 --- a/docs/user-guide/expressions/user-defined-functions.md +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -97,21 +97,8 @@ Either [fill it in](missing-data.md) or [drop it](https://docs.pola.rs/py-polars ## Combining multiple column values -TODO - -If we want to have access to values of different columns in a single `map_elements` function call, we can create `struct` data -type. This data type collects those columns as fields in the `struct`. So if we'd create a struct from the columns -`"keys"` and `"values"`, we would get the following struct elements: - -```python -[ - {"keys": "a", "values": 10}, - {"keys": "a", "values": 7}, - {"keys": "b", "values": 1}, -] -``` - -In Python, those would be passed as `dict` to the calling Python function and can thus be indexed by `field: str`. In Rust, you'll get a `Series` with the `Struct` type. The fields of the struct can then be indexed and downcast. +If you want to pass multiple columns to a user-defined function, you can use `Struct`s, which are [covered in detail in a different section](structs.md). +The basic idea is to combine multiple columns into a `Struct`, and then the function can extract the columns back out: {{code_block('user-guide/expressions/user-defined-functions','combine',[])}} @@ -119,23 +106,17 @@ In Python, those would be passed as `dict` to the calling Python function and ca --8<-- "python/user-guide/expressions/user-defined-functions.py:combine" ``` -`Structs` are covered in detail in the next section. - ## Streaming calculations Passing the full `Series` to the user-defined function has a cost: it will use a lot of memory. +You can use a `is_elementwise=True` argument to [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) to stream results into the function, which means it might not get all values. -TODO +For a function like `numpy.log()`, this works fine, because `numpy.log()` effectively calculates each individual value separately anyway. +However, for our example `diff_from_mean()` function above, this would result in incorrect results, since it would calculate the mean on only part of the `Series`. ## Return types -TODO - -Custom Python functions are black boxes for Polars. We really don't know what kind of black arts you are doing, so we have -to infer and try our best to understand what you meant. - -As a user it helps to understand what we do to better utilize custom functions. - +Custom Python functions are often black boxes; Polars doesn't know what your function is doing or what it will return. The data type is automatically inferred. We do that by waiting for the first non-null value. That value will then be used to determine the type of the `Series`. @@ -156,3 +137,5 @@ Rust types map as follows: - `bool` -> `Boolean` - `String` or `str` -> `String` - `Vec` -> `List[tp]` (where the inner type is inferred with the same rules) + +You can pass a `return_dtype` argument to [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) if you want to override the inferred type. From 7a71d7bd54ecce45a0e2b534f5bfbb24e0b7c1c8 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 20 Mar 2024 14:26:11 -0400 Subject: [PATCH 07/20] Focus the user-defined-functions page on Python only. --- .../python/user-guide/expressions/structs.py | 3 ++ .../expressions/user-defined-functions.py | 25 ++++++++-- .../rust/user-guide/expressions/structs.rs | 49 +++++++++++++++++++ .../expressions/user-defined-functions.rs | 41 ---------------- docs/user-guide/expressions/structs.md | 6 ++- 5 files changed, 77 insertions(+), 47 deletions(-) diff --git a/docs/src/python/user-guide/expressions/structs.py b/docs/src/python/user-guide/expressions/structs.py index ee034a362bc6..3c0c61055166 100644 --- a/docs/src/python/user-guide/expressions/structs.py +++ b/docs/src/python/user-guide/expressions/structs.py @@ -64,3 +64,6 @@ ).filter(pl.struct("Movie", "Theatre").is_duplicated()) print(out) # --8<-- [end:struct_ranking] + +# --8<-- [start:multi_column_apply] +# --8<-- [end:multi_column_apply] diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index f6a4ec98326b..2c3e1f06ac52 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -94,12 +94,27 @@ def diff_from_mean_numba(arr, result): # --8<-- [end:missing_data] + # --8<-- [start:combine] -out = df.select( - pl.struct(["keys", "values"]) - .map_elements(lambda x: len(x["keys"]) + x["values"]) - .alias("solution_map_elements"), - (pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"), +# Add two arrays together: +@guvectorize([(int64[:], int64[:], float64[:])], "(n),(n)->(n)") +def add(arr, arr2, result): + for i in range(len(arr)): + result[i] = arr[i] + arr2[i] + + +df3 = pl.DataFrame({"values1": [1, 2, 3], "values2": [10, 20, 30]}) + +out = df3.select( + # Create a struct that has two columns in it: + pl.struct(["values1", "values2"]) + # Pass the struct to a lambda that then passes the individual columns to + # the add() function: + .map_batches( + lambda combined: add( + combined.struct.field("values1"), combined.struct.field("values2") + ) + ).alias("add_columns") ) print(out) # --8<-- [end:combine] diff --git a/docs/src/rust/user-guide/expressions/structs.rs b/docs/src/rust/user-guide/expressions/structs.rs index 01c08eaf3d7f..d882830106a5 100644 --- a/docs/src/rust/user-guide/expressions/structs.rs +++ b/docs/src/rust/user-guide/expressions/structs.rs @@ -95,5 +95,54 @@ fn main() -> Result<(), Box> { println!("{}", &out); // --8<-- [end:struct_ranking] + // --8<-- [start:multi_column_apply] + let df = df!( + "keys" => &["a", "a", "b"], + "values" => &[10, 7, 1], + )?; + + let out = df + .lazy() + .select([ + // pack to struct to get access to multiple fields in a custom `apply/map` + as_struct(vec![col("keys"), col("values")]) + // we will compute the len(a) + b + .apply( + |s| { + // downcast to struct + let ca = s.struct_()?; + + // get the fields as Series + let s_a = &ca.fields()[0]; + let s_b = &ca.fields()[1]; + + // downcast the `Series` to their known type + let ca_a = s_a.str()?; + let ca_b = s_b.i32()?; + + // iterate both `ChunkedArrays` + let out: Int32Chunked = ca_a + .into_iter() + .zip(ca_b) + .map(|(opt_a, opt_b)| match (opt_a, opt_b) { + (Some(a), Some(b)) => Some(a.len() as i32 + b), + _ => None, + }) + .collect(); + + Ok(Some(out.into_series())) + }, + GetOutput::from_type(DataType::Int32), + ) + // note: the `'solution_map_elements'` alias is just there to show how you + // get the same output as in the Python API example. + .alias("solution_map_elements"), + (col("keys").str().count_matches(lit("."), true) + col("values")) + .alias("solution_expr"), + ]) + .collect()?; + println!("{}", out); + + // --8<-- [end:multi_column_apply] Ok(()) } diff --git a/docs/src/rust/user-guide/expressions/user-defined-functions.rs b/docs/src/rust/user-guide/expressions/user-defined-functions.rs index b2ffb40a17ef..42d61ed4f1c1 100644 --- a/docs/src/rust/user-guide/expressions/user-defined-functions.rs +++ b/docs/src/rust/user-guide/expressions/user-defined-functions.rs @@ -25,47 +25,6 @@ fn main() -> Result<(), Box> { // --8<-- [end:missing_data] // --8<-- [start:combine] - let out = df - .lazy() - .select([ - // pack to struct to get access to multiple fields in a custom `apply/map` - as_struct(vec![col("keys"), col("values")]) - // we will compute the len(a) + b - .apply( - |s| { - // downcast to struct - let ca = s.struct_()?; - - // get the fields as Series - let s_a = &ca.fields()[0]; - let s_b = &ca.fields()[1]; - - // downcast the `Series` to their known type - let ca_a = s_a.str()?; - let ca_b = s_b.i32()?; - - // iterate both `ChunkedArrays` - let out: Int32Chunked = ca_a - .into_iter() - .zip(ca_b) - .map(|(opt_a, opt_b)| match (opt_a, opt_b) { - (Some(a), Some(b)) => Some(a.len() as i32 + b), - _ => None, - }) - .collect(); - - Ok(Some(out.into_series())) - }, - GetOutput::from_type(DataType::Int32), - ) - // note: the `'solution_map_elements'` alias is just there to show how you - // get the same output as in the Python API example. - .alias("solution_map_elements"), - (col("keys").str().count_matches(lit("."), true) + col("values")) - .alias("solution_expr"), - ]) - .collect()?; - println!("{}", out); // --8<-- [end:combine] Ok(()) } diff --git a/docs/user-guide/expressions/structs.md b/docs/user-guide/expressions/structs.md index 056c1b2e21b7..4ccaca925613 100644 --- a/docs/user-guide/expressions/structs.md +++ b/docs/user-guide/expressions/structs.md @@ -96,4 +96,8 @@ That's a pretty complex set of requirements done very elegantly in Polars! ### Using multi-column apply -This was discussed in the previous section on _User Defined Functions_. +This was discussed in the previous section on _User Defined Functions_ for the Python case. +Here's an example of doing so with Rust: + + +{{code_block('user-guide/expressions/structs','multi_column_apply',[])}} From 153268563f215eb03b67ba7826834cd119f68517 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 20 Mar 2024 14:34:54 -0400 Subject: [PATCH 08/20] A variety of edits. --- .../expressions/user-defined-functions.py | 7 ++--- .../expressions/user-defined-functions.md | 29 ++++++++----------- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index 2c3e1f06ac52..ea8f09cab8a1 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -49,7 +49,9 @@ def diff_from_mean(series): # This will be compiled to machine code, so it will be fast. The Series is -# converted to a NumPy array before being passed to the function: +# converted to a NumPy array before being passed to the function. See the +# Numba documentation for more details: +# https://numba.readthedocs.io/en/stable/user/vectorize.html @guvectorize([(int64[:], float64[:])], "(n)->(n)") def diff_from_mean_numba(arr, result): total = 0 @@ -62,7 +64,6 @@ def diff_from_mean_numba(arr, result): out = df.select(pl.col("values").map_batches(diff_from_mean_numba)) print("== select() with UDF ==") -# assert out["values"].item() == 18 print(out) out = df.group_by("keys").agg(pl.col("values").map_batches(diff_from_mean_numba)) @@ -84,12 +85,10 @@ def diff_from_mean_numba(arr, result): # Implement equivalent of diff_from_mean_numba() using Polars APIs: out = df2.select(pl.col("values") - pl.col("values").mean()) print("== built-in mean() knows to skip empty values ==") -# assert out["values"][0] == 2.5 print(out) out = df2.select(pl.col("values").map_batches(diff_from_mean_numba)) print("== custom mean gets the wrong answer because of missing data ==") -# assert out["values"][0] != 2.5 print(out) # --8<-- [end:missing_data] diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md index 86b2b18724ee..d27809c914dd 100644 --- a/docs/user-guide/expressions/user-defined-functions.md +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -1,18 +1,13 @@ # User-defined functions (Python) -You should be convinced by now that Polars expressions are so powerful and flexible that there is much less need for custom Python functions -than in other libraries. +Polars expressions are quite powerful and flexible, so there is much less need for custom Python functions compared to other libraries. +Still, you may need to pass an expression's state to a third party library or apply your black box function to data in Polars. -Still, you need to have the power to be able to pass an expression's state to a third party library or apply your black box function -over data in Polars. - -In this part of the documentation we'll be using one specific API: +In this part of the documentation we'll be using one specific API that allows you to do this: - [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html): Always passes the full `Series` to the function. -A later section will explain other available APIs for applying user-defined functions. - -## Example: A slow, custom sum function written in Python +## Example: A slow, custom function written in Python For demonstration purposes, let's say we want to calculate the difference between the mean of a `Series` and each value. Here is our data: @@ -41,7 +36,7 @@ However, running the `for` loop in Python, and then summing the values in Python To maximize speed, you'll want to make sure that you're using a function written in a compiled language. For numeric calculations Polars supports a pair of interfaces defined by NumPy called ["ufuncs"](https://numpy.org/doc/stable/reference/ufuncs.html) and ["generalized ufuncs"](https://numpy.org/neps/nep-0005-generalized-ufuncs.html). -The former runs on each item individually, and the latter accepts a whole NumPy array, so allows for more flexible operations. +The former runs on each item individually, and the latter accepts a whole NumPy array, which allows for more flexible operations. [NumPy](https://numpy.org/doc/stable/reference/ufuncs.html) and other libraries like [SciPy](https://docs.scipy.org/doc/scipy/reference/special.html#module-scipy.special) come with pre-written ufuncs you can use with Polars. For example: @@ -92,8 +87,8 @@ But if the result of a user-defined function depend on multiple values in the `S --8<-- "python/user-guide/expressions/user-defined-functions.py:missing_data" ``` -So how do you deal with missing data? -Either [fill it in](missing-data.md) or [drop it](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.drop_nulls.html) before calling the customer user function. +How do you deal with missing data? +Either [fill it in](missing-data.md) or [drop it](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.drop_nulls.html) before calling your custom function. ## Combining multiple column values @@ -108,17 +103,17 @@ The basic idea is to combine multiple columns into a `Struct`, and then the func ## Streaming calculations -Passing the full `Series` to the user-defined function has a cost: it will use a lot of memory. -You can use a `is_elementwise=True` argument to [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) to stream results into the function, which means it might not get all values. +Passing the full `Series` to the user-defined function has a cost: it may use a lot of memory, as its contents are copied into a NumPy array. +You can use a `is_elementwise=True` argument to [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) to stream results into the function, which means it might not get all values at once. -For a function like `numpy.log()`, this works fine, because `numpy.log()` effectively calculates each individual value separately anyway. +For a function like `numpy.log()` this works fine, because `numpy.log()` effectively calculates each individual value separately anyway. However, for our example `diff_from_mean()` function above, this would result in incorrect results, since it would calculate the mean on only part of the `Series`. ## Return types Custom Python functions are often black boxes; Polars doesn't know what your function is doing or what it will return. -The data type is automatically inferred. We do that by waiting for the first non-null value. That value will then be used -to determine the type of the `Series`. +The return data type is therefore automatically inferred. We do that by waiting for the first non-null value. That value will then be used +to determine the type of the resulting `Series`. The mapping of Python types to Polars data types is as follows: From 4c7362a45411421871759e05aa321383b4e51ad0 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 20 Mar 2024 14:40:04 -0400 Subject: [PATCH 09/20] Extraneous line break. --- docs/user-guide/expressions/structs.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/user-guide/expressions/structs.md b/docs/user-guide/expressions/structs.md index 4ccaca925613..563d8879ef3b 100644 --- a/docs/user-guide/expressions/structs.md +++ b/docs/user-guide/expressions/structs.md @@ -99,5 +99,4 @@ That's a pretty complex set of requirements done very elegantly in Polars! This was discussed in the previous section on _User Defined Functions_ for the Python case. Here's an example of doing so with Rust: - {{code_block('user-guide/expressions/structs','multi_column_apply',[])}} From 68f160d82f3984cc5dd70b779c9bb1f8c6e4a1eb Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 20 Mar 2024 14:43:12 -0400 Subject: [PATCH 10/20] Reformat. --- .../python/user-guide/expressions/user-defined-functions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index ea8f09cab8a1..bfdb2c34a3fe 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -113,7 +113,8 @@ def add(arr, arr2, result): lambda combined: add( combined.struct.field("values1"), combined.struct.field("values2") ) - ).alias("add_columns") + ) + .alias("add_columns") ) print(out) # --8<-- [end:combine] From 428c8c5cd6b740b61e160cb9d4dc8dd7068cb433 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Tue, 14 May 2024 09:21:52 -0400 Subject: [PATCH 11/20] Update missing data to the fact we now disallow it. --- .../expressions/user-defined-functions.py | 25 +------------------ .../expressions/user-defined-functions.rs | 6 ----- .../expressions/user-defined-functions.md | 22 +++++----------- 3 files changed, 7 insertions(+), 46 deletions(-) diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index bfdb2c34a3fe..18cb5bbd05e0 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -71,28 +71,6 @@ def diff_from_mean_numba(arr, result): print(out) # --8<-- [end:diff_from_mean_numba] -# --8<-- [start:dataframe2] -df2 = pl.DataFrame( - { - "values": [1, 2, 3, None, 4], - } -) -print(df2) -# --8<-- [end:dataframe2] - - -# --8<-- [start:missing_data] -# Implement equivalent of diff_from_mean_numba() using Polars APIs: -out = df2.select(pl.col("values") - pl.col("values").mean()) -print("== built-in mean() knows to skip empty values ==") -print(out) - -out = df2.select(pl.col("values").map_batches(diff_from_mean_numba)) -print("== custom mean gets the wrong answer because of missing data ==") -print(out) - -# --8<-- [end:missing_data] - # --8<-- [start:combine] # Add two arrays together: @@ -113,8 +91,7 @@ def add(arr, arr2, result): lambda combined: add( combined.struct.field("values1"), combined.struct.field("values2") ) - ) - .alias("add_columns") + ).alias("add_columns") ) print(out) # --8<-- [end:combine] diff --git a/docs/src/rust/user-guide/expressions/user-defined-functions.rs b/docs/src/rust/user-guide/expressions/user-defined-functions.rs index 42d61ed4f1c1..c392563398c4 100644 --- a/docs/src/rust/user-guide/expressions/user-defined-functions.rs +++ b/docs/src/rust/user-guide/expressions/user-defined-functions.rs @@ -18,12 +18,6 @@ fn main() -> Result<(), Box> { // --8<-- [start:diff_from_mean_numba] // --8<-- [end:diff_from_mean_numba] - // --8<-- [start:dataframe2] - // --8<-- [end:dataframe2] - - // --8<-- [start:missing_data] - // --8<-- [end:missing_data] - // --8<-- [start:combine] // --8<-- [end:combine] Ok(()) diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md index d27809c914dd..98b926656e5c 100644 --- a/docs/user-guide/expressions/user-defined-functions.md +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -53,7 +53,8 @@ The pre-written functions are helpful, but our goal is to write our own function For example, let's say we want a fast version of our `diff_from_mean()` example above. The easiest way to write this in Python is to use [Numba](https://numba.readthedocs.io/en/stable/), which allows you to write custom functions in (a subset) of Python while still getting the benefit of compiled code. -In particular, Numba provides a decorator called [`@guvectorize`](https://numba.readthedocs.io/en/stable/user/vectorize.html#the-guvectorize-decorator) that compiles a Python function to fast machine code, in a way that allows it to be used by Polars. +In particular, Numba provides a decorator called [`@guvectorize`](https://numba.readthedocs.io/en/stable/user/vectorize.html#the-guvectorize-decorator). +This creates a generalized ufunc by compiling a Python function to fast machine code, in a way that allows it to be used by Polars. In the following example the `diff_from_mean_numba()` will be compiled to fast machine code at import time, which will take a little time. After that all calls to the function will run quickly. @@ -65,7 +66,7 @@ The `Series` will be converted to a NumPy array before being passed to the funct --8<-- "python/user-guide/expressions/user-defined-functions.py:diff_from_mean_numba" ``` -## Missing data can break your calculation +## Missing data is not allowed when calling generalized ufuncs Before being passed to a user-defined function like `diff_from_mean_numba()`, a `Series` will be converted to a NumPy array. Unfortunately, NumPy arrays don't have a concept of missing data. @@ -73,21 +74,10 @@ If there is missing data in the original `Series`, this means the resulting arra If you're calculating results item by item, this doesn't matter. For example, `numpy.log()` gets called on each individual value separately, so those missing values don't change the calculation. -But if the result of a user-defined function depend on multiple values in the `Series`, the result may be wrong: +But if the result of a user-defined function depend on multiple values in the `Series`, it's not clear what exactly should happen with the missing values. -{{code_block('user-guide/expressions/user-defined-functions','dataframe2',[])}} - -```python exec="on" result="text" session="user-guide/udf" ---8<-- "python/user-guide/expressions/user-defined-functions.py:dataframe2" -``` - -{{code_block('user-guide/expressions/user-defined-functions','missing_data',[])}} - -```python exec="on" result="text" session="user-guide/udf" ---8<-- "python/user-guide/expressions/user-defined-functions.py:missing_data" -``` - -How do you deal with missing data? +Therefore, when calling generalized ufuncs such as Numba functions decorated with `@guvectorize`, Polars will raise an error if you try to pass in a `Series` with missing data. +How do you get rid of missing data? Either [fill it in](missing-data.md) or [drop it](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.drop_nulls.html) before calling your custom function. ## Combining multiple column values From b06f7e889fd77847b281d8b96643d95e691ef32d Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 29 May 2024 10:18:50 -0400 Subject: [PATCH 12/20] Reformat with ruff. --- .../python/user-guide/expressions/user-defined-functions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index 18cb5bbd05e0..4e96b5ffed33 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -91,7 +91,8 @@ def add(arr, arr2, result): lambda combined: add( combined.struct.field("values1"), combined.struct.field("values2") ) - ).alias("add_columns") + ) + .alias("add_columns") ) print(out) # --8<-- [end:combine] From 2a28b1399edc738d5f0cee2d7bdab855269f4311 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 29 May 2024 10:32:14 -0400 Subject: [PATCH 13/20] Finish moving over from user-defined-functions --- docs/src/python/user-guide/expressions/structs.py | 9 +++++++++ docs/user-guide/expressions/structs.md | 6 +++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/src/python/user-guide/expressions/structs.py b/docs/src/python/user-guide/expressions/structs.py index 3c0c61055166..01e21cca25b5 100644 --- a/docs/src/python/user-guide/expressions/structs.py +++ b/docs/src/python/user-guide/expressions/structs.py @@ -66,4 +66,13 @@ # --8<-- [end:struct_ranking] # --8<-- [start:multi_column_apply] +df = pl.DataFrame({"keys": ["a", "a", "b"], "values": [10, 7, 1]}) + +out = df.select( + pl.struct(["keys", "values"]) + .map_elements(lambda x: len(x["keys"]) + x["values"]) + .alias("solution_map_elements"), + (pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"), +) +print(out) # --8<-- [end:multi_column_apply] diff --git a/docs/user-guide/expressions/structs.md b/docs/user-guide/expressions/structs.md index 563d8879ef3b..d692c05ad0a1 100644 --- a/docs/user-guide/expressions/structs.md +++ b/docs/user-guide/expressions/structs.md @@ -97,6 +97,10 @@ That's a pretty complex set of requirements done very elegantly in Polars! ### Using multi-column apply This was discussed in the previous section on _User Defined Functions_ for the Python case. -Here's an example of doing so with Rust: +Here's an example of doing so with both Python and Rust: {{code_block('user-guide/expressions/structs','multi_column_apply',[])}} + +```python exec="on" result="text" session="user-guide/structs" +--8<-- "python/user-guide/expressions/structs.py:multi_column_apply" +``` From 3739d0a9d035d4f59a13674026999b7eb1002069 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 29 May 2024 10:33:48 -0400 Subject: [PATCH 14/20] Update --- docs/user-guide/expressions/numpy.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/user-guide/expressions/numpy.md b/docs/user-guide/expressions/numpy.md index 6500e87b5207..4a5a46978b57 100644 --- a/docs/user-guide/expressions/numpy.md +++ b/docs/user-guide/expressions/numpy.md @@ -15,8 +15,8 @@ This means that if a function is not provided by Polars, we can use NumPy and we ### Interoperability -Polars `Series` have support for NumPy universal functions (ufuncs). Element-wise functions such as `np.exp()`, `np.cos()`, `np.div()`, etc. all work with almost zero overhead. +Polars `Series` have support for NumPy universal functions (ufuncs) and generalized ufuncs. Element-wise functions such as `np.exp()`, `np.cos()`, `np.div()`, etc. all work with almost zero overhead. -However, as a Polars-specific remark: missing values are a separate bitmask and are not visible by NumPy. This can lead to a window function or a `np.convolve()` giving flawed or incomplete results. +However, as a Polars-specific remark: missing values are a separate bitmask and are not visible by NumPy. This can lead to a window function or a `np.convolve()` giving flawed or incomplete results, so an error will be raised if you pass a `Series` with missing data to a generalized ufunc. Convert a Polars `Series` to a NumPy array with the `.to_numpy()` method. Missing values will be replaced by `np.nan` during the conversion. From 01c7067c211c8f8d571610ac885dab3d3e3e3c88 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 5 Jun 2024 10:02:12 -0400 Subject: [PATCH 15/20] Add back map_elements(). --- .../expressions/user-defined-functions.py | 15 ++++++-- .../expressions/user-defined-functions.rs | 3 ++ .../expressions/user-defined-functions.md | 34 ++++++++++++++++--- 3 files changed, 45 insertions(+), 7 deletions(-) diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index 4e96b5ffed33..71bc0280b873 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -14,6 +14,18 @@ print(df) # --8<-- [end:dataframe] +# --8<-- [start:individual_log] +import math + + +def my_log(value): + return math.log(value) + + +out = df.select(pl.col("values").map_elements(my_log, return_dtype=pl.Float64)) +print(out) +# --8<-- [end:individual_log] + # --8<-- [start:diff_from_mean] def diff_from_mean(series): @@ -91,8 +103,7 @@ def add(arr, arr2, result): lambda combined: add( combined.struct.field("values1"), combined.struct.field("values2") ) - ) - .alias("add_columns") + ).alias("add_columns") ) print(out) # --8<-- [end:combine] diff --git a/docs/src/rust/user-guide/expressions/user-defined-functions.rs b/docs/src/rust/user-guide/expressions/user-defined-functions.rs index c392563398c4..41cce71169b0 100644 --- a/docs/src/rust/user-guide/expressions/user-defined-functions.rs +++ b/docs/src/rust/user-guide/expressions/user-defined-functions.rs @@ -9,6 +9,9 @@ fn main() -> Result<(), Box> { println!("{}", df); // --8<-- [end:dataframe] + // --8<-- [start:individual_log] + // --8<-- [end:individual_log] + // --8<-- [start:diff_from_mean] // --8<-- [end:diff_from_mean] diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md index 98b926656e5c..84bf299dfeaf 100644 --- a/docs/user-guide/expressions/user-defined-functions.md +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -3,13 +3,14 @@ Polars expressions are quite powerful and flexible, so there is much less need for custom Python functions compared to other libraries. Still, you may need to pass an expression's state to a third party library or apply your black box function to data in Polars. -In this part of the documentation we'll be using one specific API that allows you to do this: +In this part of the documentation we'll be using two APIs that allows you to do this: +- [:material-api: `map_elements`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html): Call a function separately on each value in the `Series`. - [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html): Always passes the full `Series` to the function. -## Example: A slow, custom function written in Python +## Processing individual values with `map_elements()` -For demonstration purposes, let's say we want to calculate the difference between the mean of a `Series` and each value. +Let's start with the simplest case: we want to process each value in a `Series` individually. Here is our data: {{code_block('user-guide/expressions/user-defined-functions','dataframe',[])}} @@ -19,7 +20,27 @@ Here is our data: --8<-- "python/user-guide/expressions/user-defined-functions.py:dataframe" ``` -We can use `map_batches()` to run this function on either the full `Series` or individual groups in a `group_by()`: +We'll call `math.log()` on each individual value: + +{{code_block('user-guide/expressions/user-defined-functions','individual_log',[])}} + +```python exec="on" result="text" session="user-guide/udf" +--8<-- "python/user-guide/expressions/user-defined-functions.py:individual_log" +``` + +While this works, `map_elements()` has two problems: + +1. **Limited to individual items:** Often you'll want to have a calculation that needs to operate on the whole `Series`, rather than individual items one by one. +2. **Performance overhead:** Even if you do want to process each item individually, calling a function for each individual item is slow; all those extra function calls add a lot of overhead. + +Let's start by solving the first problem, and then we'll see how to solve the second problem. + +## Processing a whole `Series` with `map_batches()` + +We want to run a custom on the contents of a whole `Series`. +For demonstration purposes, let's say we want to calculate the difference between the mean of a `Series` and each value. + +We can use the `map_batches()` API to run this function on either the full `Series` or individual groups in a `group_by()`: {{code_block('user-guide/expressions/user-defined-functions','diff_from_mean',[])}} @@ -47,9 +68,12 @@ For example: --8<-- "python/user-guide/expressions/user-defined-functions.py:np_log" ``` +Notice that we can use `map_batches()`, because `numpy.log()` is able to run on both individual items and on whole NumPy arrays. +This means it will run much faster than our original example, since we only have a single Python call and then all processing happens in a fast low-level language. + ## Example: A fast custom function using Numba -The pre-written functions are helpful, but our goal is to write our own functions. +The pre-written functions NumPy provides are helpful, but our goal is to write our own functions. For example, let's say we want a fast version of our `diff_from_mean()` example above. The easiest way to write this in Python is to use [Numba](https://numba.readthedocs.io/en/stable/), which allows you to write custom functions in (a subset) of Python while still getting the benefit of compiled code. From f53a9b5f983a9472a981256d7a0848619316a693 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 5 Jun 2024 10:03:56 -0400 Subject: [PATCH 16/20] Match Python --- .../src/rust/user-guide/expressions/user-defined-functions.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/rust/user-guide/expressions/user-defined-functions.rs b/docs/src/rust/user-guide/expressions/user-defined-functions.rs index 41cce71169b0..b83898ef6c7c 100644 --- a/docs/src/rust/user-guide/expressions/user-defined-functions.rs +++ b/docs/src/rust/user-guide/expressions/user-defined-functions.rs @@ -3,8 +3,8 @@ use polars::prelude::*; fn main() -> Result<(), Box> { // --8<-- [start:dataframe] let df = df!( - "keys" => &["a", "a", "b"], - "values" => &[10, 7, 1], + "keys" => &["a", "a", "b", "b"], + "values" => &[10, 7, 1, 23], )?; println!("{}", df); // --8<-- [end:dataframe] From 9e75eca65ce8dbc877791bc4f5853803f614a670 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Wed, 5 Jun 2024 10:04:28 -0400 Subject: [PATCH 17/20] typo. --- .../python/user-guide/expressions/user-defined-functions.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index 71bc0280b873..a436a6d8241e 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -38,7 +38,7 @@ def diff_from_mean(series): return pl.Series([value - mean for value in series]) -# Apply our custom function a full Series with map_batches(): +# Apply our custom function to a full Series with map_batches(): out = df.select(pl.col("values").map_batches(diff_from_mean)) print("== select() with UDF ==") print(out) @@ -103,7 +103,8 @@ def add(arr, arr2, result): lambda combined: add( combined.struct.field("values1"), combined.struct.field("values2") ) - ).alias("add_columns") + ) + .alias("add_columns") ) print(out) # --8<-- [end:combine] From dce8a04e7366024777f94aea54aabb120711d1e6 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 6 Jun 2024 15:44:25 +0100 Subject: [PATCH 18/20] correct link, article, add missing word --- .../expressions/user-defined-functions.md | 15 ++++++++------- py-polars/tests/docs/test_user_guide.py | 3 +++ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md index 84bf299dfeaf..5c018e9fe88c 100644 --- a/docs/user-guide/expressions/user-defined-functions.md +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -5,7 +5,7 @@ Still, you may need to pass an expression's state to a third party library or ap In this part of the documentation we'll be using two APIs that allows you to do this: -- [:material-api: `map_elements`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html): Call a function separately on each value in the `Series`. +- [:material-api: `map_elements`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_elements.html): Call a function separately on each value in the `Series`. - [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html): Always passes the full `Series` to the function. ## Processing individual values with `map_elements()` @@ -37,7 +37,7 @@ Let's start by solving the first problem, and then we'll see how to solve the se ## Processing a whole `Series` with `map_batches()` -We want to run a custom on the contents of a whole `Series`. +We want to run a custom function on the contents of a whole `Series`. For demonstration purposes, let's say we want to calculate the difference between the mean of a `Series` and each value. We can use the `map_batches()` API to run this function on either the full `Series` or individual groups in a `group_by()`: @@ -52,8 +52,6 @@ We can use the `map_batches()` API to run this function on either the full `Seri The problem with a pure-Python implementation is that it's slow. In general, you want to minimize how much Python code you call if you want fast results. -Calling a Python function for every `Series` isn't usually a problem, unless the `group_by()` produces a very large number of groups. -However, running the `for` loop in Python, and then summing the values in Python, will be very slow. To maximize speed, you'll want to make sure that you're using a function written in a compiled language. For numeric calculations Polars supports a pair of interfaces defined by NumPy called ["ufuncs"](https://numpy.org/doc/stable/reference/ufuncs.html) and ["generalized ufuncs"](https://numpy.org/neps/nep-0005-generalized-ufuncs.html). @@ -118,10 +116,13 @@ The basic idea is to combine multiple columns into a `Struct`, and then the func ## Streaming calculations Passing the full `Series` to the user-defined function has a cost: it may use a lot of memory, as its contents are copied into a NumPy array. -You can use a `is_elementwise=True` argument to [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) to stream results into the function, which means it might not get all values at once. +You can use the `is_elementwise=True` argument to [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) to stream results into the function, which means it might not get all values at once. -For a function like `numpy.log()` this works fine, because `numpy.log()` effectively calculates each individual value separately anyway. -However, for our example `diff_from_mean()` function above, this would result in incorrect results, since it would calculate the mean on only part of the `Series`. +!!! note + The `is_elementwise` argument can lead to incorrect results if set incorrectly. + If you set `is_elementwise=True`, make sure that your function actually operates + element-by-element (e.g. "calculate the logarithm of each value") - our example function `diff_from_mean()`, + for instance, does not. ## Return types diff --git a/py-polars/tests/docs/test_user_guide.py b/py-polars/tests/docs/test_user_guide.py index 08be6fe9dfbf..a513f4b5f0c1 100644 --- a/py-polars/tests/docs/test_user_guide.py +++ b/py-polars/tests/docs/test_user_guide.py @@ -32,5 +32,8 @@ def _change_test_dir() -> Iterator[None]: @pytest.mark.docs() @pytest.mark.parametrize("path", snippet_paths) @pytest.mark.usefixtures("_change_test_dir") +@pytest.mark.filterwarnings( + r"ignore:\nExpr\.map_elements:polars.exceptions.PolarsInefficientMapWarning" +) def test_run_python_snippets(path: Path) -> None: runpy.run_path(str(path)) From 2e5f5c871f324264f175cbf52cf3dc8909645fd8 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 6 Jun 2024 16:00:26 +0100 Subject: [PATCH 19/20] lint --- docs/user-guide/expressions/user-defined-functions.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md index 5c018e9fe88c..dc994148c63b 100644 --- a/docs/user-guide/expressions/user-defined-functions.md +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -119,10 +119,10 @@ Passing the full `Series` to the user-defined function has a cost: it may use a You can use the `is_elementwise=True` argument to [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) to stream results into the function, which means it might not get all values at once. !!! note - The `is_elementwise` argument can lead to incorrect results if set incorrectly. - If you set `is_elementwise=True`, make sure that your function actually operates - element-by-element (e.g. "calculate the logarithm of each value") - our example function `diff_from_mean()`, - for instance, does not. +The `is_elementwise` argument can lead to incorrect results if set incorrectly. +If you set `is_elementwise=True`, make sure that your function actually operates +element-by-element (e.g. "calculate the logarithm of each value") - our example function `diff_from_mean()`, +for instance, does not. ## Return types From 0778a443fc4341e8ef3b60ac75550afaf383309a Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Tue, 25 Jun 2024 13:21:24 -0400 Subject: [PATCH 20/20] Try to pacify linter --- docs/src/python/user-guide/expressions/user-defined-functions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index 00ea19c5d54b..a436a6d8241e 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -59,6 +59,7 @@ def diff_from_mean(series): # --8<-- [start:diff_from_mean_numba] from numba import guvectorize, int64, float64 + # This will be compiled to machine code, so it will be fast. The Series is # converted to a NumPy array before being passed to the function. See the # Numba documentation for more details: