Skip to content

Commit

Permalink
docs(python): Define what a 'character' means in slice / len_chars (
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored Feb 9, 2024
1 parent c14e87f commit a53ab11
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 34 deletions.
67 changes: 39 additions & 28 deletions py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,12 @@ def len_chars(self) -> Expr:
equivalent output with much better performance:
:func:`len_bytes` runs in _O(1)_, while :func:`len_chars` runs in _O(n)_.
A character is defined as a `Unicode scalar value`_. A single character is
represented by a single byte when working with ASCII text, and a maximum of
4 bytes otherwise.
.. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
Examples
--------
>>> df = pl.DataFrame({"a": ["Café", "345", "東京", None]})
Expand Down Expand Up @@ -2115,7 +2121,7 @@ def slice(
self, offset: int | IntoExprColumn, length: int | IntoExprColumn | None = None
) -> Expr:
"""
Create subslices of the string values of a String Series.
Extract a substring from each string value.
Parameters
----------
Expand All @@ -2130,40 +2136,45 @@ def slice(
Expr
Expression of data type :class:`String`.
Notes
-----
Both the `offset` and `length` inputs are defined in terms of the number
of characters in the (UTF-8) string. A character is defined as a
`Unicode scalar value`_. A single character is represented by a single byte
when working with ASCII text, and a maximum of 4 bytes otherwise.
.. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
Examples
--------
>>> df = pl.DataFrame({"s": ["pear", None, "papaya", "dragonfruit"]})
>>> df.with_columns(
... pl.col("s").str.slice(-3).alias("s_sliced"),
... )
>>> df.with_columns(pl.col("s").str.slice(-3).alias("slice"))
shape: (4, 2)
┌─────────────┬──────────
│ s ┆ s_sliced
│ --- ┆ ---
│ str ┆ str
╞═════════════╪══════════
│ pear ┆ ear
│ null ┆ null
│ papaya ┆ aya
│ dragonfruit ┆ uit
└─────────────┴──────────
┌─────────────┬───────┐
│ s           ┆ slice │
│ ---         ┆ ---   │
│ str         ┆ str   │
╞═════════════╪═══════╡
│ pear        ┆ ear   │
│ null        ┆ null  │
│ papaya      ┆ aya   │
│ dragonfruit ┆ uit   │
└─────────────┴───────┘
Using the optional `length` parameter
>>> df.with_columns(
... pl.col("s").str.slice(4, length=3).alias("s_sliced"),
... )
>>> df.with_columns(pl.col("s").str.slice(4, length=3).alias("slice"))
shape: (4, 2)
┌─────────────┬──────────
│ s ┆ s_sliced
│ --- ┆ ---
│ str ┆ str
╞═════════════╪══════════
│ pear ┆
│ null ┆ null
│ papaya ┆ ya
│ dragonfruit ┆ onf
└─────────────┴──────────
┌─────────────┬───────┐
│ s           ┆ slice │
│ ---         ┆ ---   │
│ str         ┆ str   │
╞═════════════╪═══════╡
│ pear        ┆       │
│ null        ┆ null  │
│ papaya      ┆ ya    │
│ dragonfruit ┆ onf   │
└─────────────┴───────┘
"""
offset = parse_as_expression(offset)
length = parse_as_expression(length)
Expand Down Expand Up @@ -2200,7 +2211,7 @@ def explode(self) -> Expr:

def to_integer(self, *, base: int = 10, strict: bool = True) -> Expr:
"""
Convert an String column into an Int64 column with base radix.
Convert a String column into an Int64 column with base radix.
Parameters
----------
Expand Down
26 changes: 20 additions & 6 deletions py-polars/polars/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,12 @@ def len_chars(self) -> Series:
equivalent output with much better performance:
:func:`len_bytes` runs in _O(1)_, while :func:`len_chars` runs in _O(n)_.
A character is defined as a `Unicode scalar value`_. A single character is
represented by a single byte when working with ASCII text, and a maximum of
4 bytes otherwise.
.. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
Examples
--------
>>> s = pl.Series(["Café", "345", "東京", None])
Expand Down Expand Up @@ -1580,7 +1586,7 @@ def slice(
self, offset: int | IntoExprColumn, length: int | IntoExprColumn | None = None
) -> Series:
"""
Create subslices of the string values of a String Series.
Extract a substring from each string value.
Parameters
----------
Expand All @@ -1593,15 +1599,23 @@ def slice(
Returns
-------
Series
Series of data type :class:`Struct` with fields of data type
:class:`String`.
Series of data type :class:`String`.
Notes
-----
Both the `offset` and `length` inputs are defined in terms of the number
of characters in the (UTF-8) string. A character is defined as a
`Unicode scalar value`_. A single character is represented by a single byte
when working with ASCII text, and a maximum of 4 bytes otherwise.
.. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
Examples
--------
>>> s = pl.Series("s", ["pear", None, "papaya", "dragonfruit"])
>>> s = pl.Series(["pear", None, "papaya", "dragonfruit"])
>>> s.str.slice(-3)
shape: (4,)
Series: 's' [str]
Series: '' [str]
[
"ear"
null
Expand All @@ -1613,7 +1627,7 @@ def slice(
>>> s.str.slice(4, length=3)
shape: (4,)
Series: 's' [str]
Series: '' [str]
[
""
null
Expand Down

0 comments on commit a53ab11

Please sign in to comment.