From a53ab1123c43a9b2486c6897c87e2e2033f98346 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Fri, 9 Feb 2024 21:14:43 +0100 Subject: [PATCH] docs(python): Define what a 'character' means in `slice` / `len_chars` (#14395) --- py-polars/polars/expr/string.py | 67 ++++++++++++++++++------------- py-polars/polars/series/string.py | 26 +++++++++--- 2 files changed, 59 insertions(+), 34 deletions(-) diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index 2be102167774..e046fa14eea3 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -428,6 +428,12 @@ def len_chars(self) -> Expr: equivalent output with much better performance: :func:`len_bytes` runs in _O(1)_, while :func:`len_chars` runs in (_O(n)_). + A character is defined as a `Unicode scalar value`_. A single character is + represented by a single byte when working with ASCII text, and a maximum of + 4 bytes otherwise. + + .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value + Examples -------- >>> df = pl.DataFrame({"a": ["Café", "345", "東京", None]}) @@ -2115,7 +2121,7 @@ def slice( self, offset: int | IntoExprColumn, length: int | IntoExprColumn | None = None ) -> Expr: """ - Create subslices of the string values of a String Series. + Extract a substring from each string value. Parameters ---------- @@ -2130,40 +2136,45 @@ def slice( Expr Expression of data type :class:`String`. + Notes + ----- + Both the `offset` and `length` inputs are defined in terms of the number + of characters in the (UTF8) string. A character is defined as a + `Unicode scalar value`_. A single character is represented by a single byte + when working with ASCII text, and a maximum of 4 bytes otherwise. + + .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value + Examples -------- >>> df = pl.DataFrame({"s": ["pear", None, "papaya", "dragonfruit"]}) - >>> df.with_columns( - ... pl.col("s").str.slice(-3).alias("s_sliced"), - ... ) + >>> df.with_columns(pl.col("s").str.slice(-3).alias("slice")) shape: (4, 2) - ┌─────────────┬──────────┐ - │ s ┆ s_sliced │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞═════════════╪══════════╡ - │ pear ┆ ear │ - │ null ┆ null │ - │ papaya ┆ aya │ - │ dragonfruit ┆ uit │ - └─────────────┴──────────┘ + ┌─────────────┬───────┐ + │ s ┆ slice │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞═════════════╪═══════╡ + │ pear ┆ ear │ + │ null ┆ null │ + │ papaya ┆ aya │ + │ dragonfruit ┆ uit │ + └─────────────┴───────┘ Using the optional `length` parameter - >>> df.with_columns( - ... pl.col("s").str.slice(4, length=3).alias("s_sliced"), - ... ) + >>> df.with_columns(pl.col("s").str.slice(4, length=3).alias("slice")) shape: (4, 2) - ┌─────────────┬──────────┐ - │ s ┆ s_sliced │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞═════════════╪══════════╡ - │ pear ┆ │ - │ null ┆ null │ - │ papaya ┆ ya │ - │ dragonfruit ┆ onf │ - └─────────────┴──────────┘ + ┌─────────────┬───────┐ + │ s ┆ slice │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞═════════════╪═══════╡ + │ pear ┆ │ + │ null ┆ null │ + │ papaya ┆ ya │ + │ dragonfruit ┆ onf │ + └─────────────┴───────┘ """ offset = parse_as_expression(offset) length = parse_as_expression(length) @@ -2200,7 +2211,7 @@ def explode(self) -> Expr: def to_integer(self, *, base: int = 10, strict: bool = True) -> Expr: """ - Convert an String column into an Int64 column with base radix. + Convert a String column into an Int64 column with base radix. Parameters ---------- diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index cbcb929e0842..881d3ce2b5c5 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -371,6 +371,12 @@ def len_chars(self) -> Series: equivalent output with much better performance: :func:`len_bytes` runs in _O(1)_, while :func:`len_chars` runs in (_O(n)_). + A character is defined as a `Unicode scalar value`_. A single character is + represented by a single byte when working with ASCII text, and a maximum of + 4 bytes otherwise. + + .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value + Examples -------- >>> s = pl.Series(["Café", "345", "東京", None]) @@ -1580,7 +1586,7 @@ def slice( self, offset: int | IntoExprColumn, length: int | IntoExprColumn | None = None ) -> Series: """ - Create subslices of the string values of a String Series. + Extract a substring from each string value. Parameters ---------- @@ -1593,15 +1599,23 @@ def slice( Returns ------- Series - Series of data type :class:`Struct` with fields of data type - :class:`String`. + Series of data type :class:`String`. + + Notes + ----- + Both the `offset` and `length` inputs are defined in terms of the number + of characters in the (UTF8) string. A character is defined as a + `Unicode scalar value`_. A single character is represented by a single byte + when working with ASCII text, and a maximum of 4 bytes otherwise. + + .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value Examples -------- - >>> s = pl.Series("s", ["pear", None, "papaya", "dragonfruit"]) + >>> s = pl.Series(["pear", None, "papaya", "dragonfruit"]) >>> s.str.slice(-3) shape: (4,) - Series: 's' [str] + Series: '' [str] [ "ear" null @@ -1613,7 +1627,7 @@ def slice( >>> s.str.slice(4, length=3) shape: (4,) - Series: 's' [str] + Series: '' [str] [ "" null