docs(python): Define what a 'character' means in slice / len_chars (#14395)
pola-rs · Feb 9, 2024 · a53ab11 · a53ab11
1 parent c14e87f
commit a53ab11
Show file tree

Hide file tree

Showing 2 changed files with 59 additions and 34 deletions.
diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py
@@ -428,6 +428,12 @@ def len_chars(self) -> Expr:
         equivalent output with much better performance:
         :func:`len_bytes` runs in _O(1)_, while :func:`len_chars` runs in (_O(n)_).
 
+        A character is defined as a `Unicode scalar value`_. A single character is
+        represented by a single byte when working with ASCII text, and a maximum of
+        4 bytes otherwise.
+
+        .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
+
         Examples
         --------
         >>> df = pl.DataFrame({"a": ["Café", "345", "東京", None]})
@@ -2115,7 +2121,7 @@ def slice(
         self, offset: int | IntoExprColumn, length: int | IntoExprColumn | None = None
     ) -> Expr:
         """
-        Create subslices of the string values of a String Series.
+        Extract a substring from each string value.
 
         Parameters
         ----------
@@ -2130,40 +2136,45 @@ def slice(
         Expr
             Expression of data type :class:`String`.
 
+        Notes
+        -----
+        Both the `offset` and `length` inputs are defined in terms of the number
+        of characters in the (UTF-8) string. A character is defined as a
+        `Unicode scalar value`_. A single character is represented by a single byte
+        when working with ASCII text, and a maximum of 4 bytes otherwise.
+
+        .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
+
         Examples
         --------
         >>> df = pl.DataFrame({"s": ["pear", None, "papaya", "dragonfruit"]})
-        >>> df.with_columns(
-        ...     pl.col("s").str.slice(-3).alias("s_sliced"),
-        ... )
+        >>> df.with_columns(pl.col("s").str.slice(-3).alias("slice"))
         shape: (4, 2)
-        ┌─────────────┬──────────┐
-        │ s           ┆ s_sliced │
-        │ ---         ┆ ---      │
-        │ str         ┆ str      │
-        ╞═════════════╪══════════╡
-        │ pear        ┆ ear      │
-        │ null        ┆ null     │
-        │ papaya      ┆ aya      │
-        │ dragonfruit ┆ uit      │
-        └─────────────┴──────────┘
+        ┌─────────────┬───────┐
+        │ s           ┆ slice │
+        │ ---         ┆ ---   │
+        │ str         ┆ str   │
+        ╞═════════════╪═══════╡
+        │ pear        ┆ ear   │
+        │ null        ┆ null  │
+        │ papaya      ┆ aya   │
+        │ dragonfruit ┆ uit   │
+        └─────────────┴───────┘
 
         Using the optional `length` parameter
 
-        >>> df.with_columns(
-        ...     pl.col("s").str.slice(4, length=3).alias("s_sliced"),
-        ... )
+        >>> df.with_columns(pl.col("s").str.slice(4, length=3).alias("slice"))
         shape: (4, 2)
-        ┌─────────────┬──────────┐
-        │ s           ┆ s_sliced │
-        │ ---         ┆ ---      │
-        │ str         ┆ str      │
-        ╞═════════════╪══════════╡
-        │ pear        ┆          │
-        │ null        ┆ null     │
-        │ papaya      ┆ ya       │
-        │ dragonfruit ┆ onf      │
-        └─────────────┴──────────┘
+        ┌─────────────┬───────┐
+        │ s           ┆ slice │
+        │ ---         ┆ ---   │
+        │ str         ┆ str   │
+        ╞═════════════╪═══════╡
+        │ pear        ┆       │
+        │ null        ┆ null  │
+        │ papaya      ┆ ya    │
+        │ dragonfruit ┆ onf   │
+        └─────────────┴───────┘
         """
         offset = parse_as_expression(offset)
         length = parse_as_expression(length)
@@ -2200,7 +2211,7 @@ def explode(self) -> Expr:
 
     def to_integer(self, *, base: int = 10, strict: bool = True) -> Expr:
         """
-        Convert an String column into an Int64 column with base radix.
+        Convert a String column into an Int64 column with base radix.
 
         Parameters
         ----------

diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py
@@ -371,6 +371,12 @@ def len_chars(self) -> Series:
         equivalent output with much better performance:
         :func:`len_bytes` runs in _O(1)_, while :func:`len_chars` runs in (_O(n)_).
 
+        A character is defined as a `Unicode scalar value`_. A single character is
+        represented by a single byte when working with ASCII text, and a maximum of
+        4 bytes otherwise.
+
+        .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
+
         Examples
         --------
         >>> s = pl.Series(["Café", "345", "東京", None])
@@ -1580,7 +1586,7 @@ def slice(
         self, offset: int | IntoExprColumn, length: int | IntoExprColumn | None = None
     ) -> Series:
         """
-        Create subslices of the string values of a String Series.
+        Extract a substring from each string value.
 
         Parameters
         ----------
@@ -1593,15 +1599,23 @@ def slice(
         Returns
         -------
         Series
-            Series of data type :class:`Struct` with fields of data type
-            :class:`String`.
+            Series of data type :class:`String`.
+
+        Notes
+        -----
+        Both the `offset` and `length` inputs are defined in terms of the number
+        of characters in the (UTF-8) string. A character is defined as a
+        `Unicode scalar value`_. A single character is represented by a single byte
+        when working with ASCII text, and a maximum of 4 bytes otherwise.
+
+        .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
 
         Examples
         --------
-        >>> s = pl.Series("s", ["pear", None, "papaya", "dragonfruit"])
+        >>> s = pl.Series(["pear", None, "papaya", "dragonfruit"])
         >>> s.str.slice(-3)
         shape: (4,)
-        Series: 's' [str]
+        Series: '' [str]
         [
             "ear"
             null
@@ -1613,7 +1627,7 @@ def slice(
 
         >>> s.str.slice(4, length=3)
         shape: (4,)
-        Series: 's' [str]
+        Series: '' [str]
         [
             ""
             null