Skip to content

Commit

Permalink
Update docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
mcrumiller committed Feb 12, 2024
1 parent dfd6b66 commit 939d6c5
Show file tree
Hide file tree
Showing 2 changed files with 174 additions and 58 deletions.
130 changes: 98 additions & 32 deletions py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -2182,76 +2182,142 @@ def slice(

def head(self, n: int | IntoExprColumn = 10) -> Expr:
"""
Return the first n characters of each string in a Utf8 Series.
Return the first n characters of each string in a String Series.
Parameters
----------
n
Length of the slice. Negative indexing supported.
Length of the slice (integer or expression). Negative indexing is supported;
see note (2) below.
Returns
-------
Expr
Expression of data type :class:`Utf8`.
Expression of data type :class:`String`.
Notes
-----
A "character" is a valid (non-surrogate) UTF-8 codepoint, which is a single byte
when working with ASCII text, and a maximum of 4 bytes otherwise.
1) The `n` input is defined in terms of the number of characters in the (UTF-8)
string. A character is defined as a `Unicode scalar value`_. A single
character is represented by a single byte when working with ASCII text, and a
maximum of 4 bytes otherwise.
.. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
2) When the `n` input is negative, `head` returns characters up to the `n`th
from the end of the string. For example, if `n = -3`, then all characters
except the last three are returned.
3) If the string has fewer than `n` characters, the full string is
returned.
Examples
--------
Return up to the first 5 characters:
>>> df = pl.DataFrame({"s": ["pear", None, "papaya", "dragonfruit"]})
>>> df.with_columns(pl.col("s").str.head(3).alias("s_head3"))
>>> df.with_columns(pl.col("s").str.head(5).alias("s_head_5"))
shape: (4, 2)
┌─────────────┬─────────┐
│ s ┆ s_head3 │
│ --- ┆ --- │
│ str ┆ str │
╞═════════════╪═════════╡
│ pear ┆ pea │
│ null ┆ null │
│ papaya ┆ pap │
│ dragonfruit ┆ dra │
└─────────────┴─────────┘
┌─────────────┬──────────┐
│ s ┆ s_head_5 │
│ --- ┆ --- │
│ str ┆ str │
╞═════════════╪══════════╡
│ pear ┆ pear │
│ null ┆ null │
│ papaya ┆ papay │
│ dragonfruit ┆ drago │
└─────────────┴──────────┘
Return characters determined by column `n`:
>>> df = pl.DataFrame(
... {
... "s": ["pear", None, "papaya", "dragonfruit"],
... "n": [3, 4, -2, -5],
... }
... )
>>> df.with_columns(pl.col("s").str.head("n").alias("s_head_n"))
shape: (4, 3)
┌─────────────┬─────┬──────────┐
│ s ┆ n ┆ s_head_n │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ str │
╞═════════════╪═════╪══════════╡
│ pear ┆ 3 ┆ pea │
│ null ┆ 4 ┆ null │
│ papaya ┆ -2 ┆ papa │
│ dragonfruit ┆ -5 ┆ dragon │
└─────────────┴─────┴──────────┘
"""
n = parse_as_expression(n)
return wrap_expr(self._pyexpr.str_head(n))

def tail(self, n: int | IntoExprColumn = 10) -> Expr:
"""
Return the last n characters of each string in a Utf8 Series.
Return the last n characters of each string in a String Series.
Parameters
----------
n
Length of the slice. Negative indexing is supported.
Length of the slice (integer or expression). Negative indexing is supported;
see note (2) below.
Returns
-------
Expr
Expression of data type :class:`Utf8`.
Expression of data type :class:`String`.
Notes
-----
A "character" is a valid (non-surrogate) UTF-8 codepoint, which is a single byte
when working with ASCII text, and a maximum of 4 bytes otherwise.
1) The `n` input is defined in terms of the number of characters in the (UTF-8)
string. A character is defined as a `Unicode scalar value`_. A single
character is represented by a single byte when working with ASCII text, and a
maximum of 4 bytes otherwise.
.. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
2) When the `n` input is negative, `tail` returns characters starting from the
`n`th from the beginning of the string. For example, if `n = -3`, then all
characters except the first three are returned.
3) If the string has fewer than `n` characters, the full string is
returned.
Examples
--------
Return up to the last 5 characters:
>>> df = pl.DataFrame({"s": ["pear", None, "papaya", "dragonfruit"]})
>>> df.with_columns(pl.col("s").str.tail(3).alias("s_tail3"))
>>> df.with_columns(pl.col("s").str.tail(5).alias("s_tail_5"))
shape: (4, 2)
┌─────────────┬─────────┐
│ s ┆ s_tail3 │
│ --- ┆ --- │
│ str ┆ str │
╞═════════════╪═════════╡
│ pear ┆ ear │
│ null ┆ null │
│ papaya ┆ aya │
│ dragonfruit ┆ uit │
└─────────────┴─────────┘
┌─────────────┬──────────┐
│ s ┆ s_tail_5 │
│ --- ┆ --- │
│ str ┆ str │
╞═════════════╪══════════╡
│ pear ┆ pear │
│ null ┆ null │
│ papaya ┆ apaya │
│ dragonfruit ┆ fruit │
└─────────────┴──────────┘
Return characters determined by column `n`:
>>> df = pl.DataFrame(
... {
... "s": ["pear", None, "papaya", "dragonfruit"],
... "n": [3, 4, -2, -5],
... }
... )
>>> df.with_columns(pl.col("s").str.tail("n").alias("s_tail_n"))
shape: (4, 3)
┌─────────────┬─────┬──────────┐
│ s ┆ n ┆ s_tail_n │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ str │
╞═════════════╪═════╪══════════╡
│ pear ┆ 3 ┆ ear │
│ null ┆ 4 ┆ null │
│ papaya ┆ -2 ┆ paya │
│ dragonfruit ┆ -5 ┆ nfruit │
└─────────────┴─────┴──────────┘
"""
n = parse_as_expression(n)
return wrap_expr(self._pyexpr.str_tail(n))
Expand Down
102 changes: 76 additions & 26 deletions py-polars/polars/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1638,67 +1638,117 @@ def slice(

def head(self, n: int | IntoExprColumn = 10) -> Series:
"""
Return the first n characters of each string in a Utf8 Series.
Return the first n characters of each string in a String Series.
Parameters
----------
n
Length of the slice
Length of the slice (integer or expression). Negative indexing is supported;
see note (2) below.
Returns
-------
Series
Series of data type :class:`Struct` with fields of data type :class:`Utf8`.
Series
Series of data type :class:`String`.
Notes
-----
A "character" is a valid (non-surrogate) UTF-8 codepoint, which is a single byte
when working with ASCII text, and a maximum of 4 bytes otherwise.
1) The `n` input is defined in terms of the number of characters in the (UTF-8)
string. A character is defined as a `Unicode scalar value`_. A single
character is represented by a single byte when working with ASCII text, and a
maximum of 4 bytes otherwise.
.. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
2) When `n` is negative, `head` returns characters up to the `n`th from the end
of the string. For example, if `n = -3`, then all characters except the last
three are returned.
3) If the string has fewer than `n` characters, the full string is
returned.
Examples
--------
>>> s = pl.Series("s", ["pear", None, "papaya", "dragonfruit"])
>>> s.str.head(3)
Return up to the first 5 characters.
>>> s = pl.Series(["pear", None, "papaya", "dragonfruit"])
>>> s.str.head(5)
shape: (4,)
Series: 's' [str]
Series: '' [str]
[
"pea"
null
"pap"
"dra"
"pear"
null
"papay"
"drago"
]
Return all characters except the last 3.
>>> s = pl.Series(["pear", None, "papaya", "dragonfruit"])
>>> s.str.head(-3)
shape: (4,)
Series: '' [str]
[
"p"
null
"pap"
"dragonfr"
]
"""

def tail(self, n: int | IntoExprColumn = 10) -> Series:
"""
Return the last n characters of each string in a Utf8 Series.
Return the last n characters of each string in a String Series.
Parameters
----------
n
Length of the slice
Length of the slice (integer or expression). Negative indexing is supported;
see note (2) below.
Returns
-------
Series
Series of data type :class:`Struct` with fields of data type :class:`Utf8`.
Series
Series of data type :class:`String`.
Notes
-----
A "character" is a valid (non-surrogate) UTF-8 codepoint, which is a single byte
when working with ASCII text, and a maximum of 4 bytes otherwise.
1) The `n` input is defined in terms of the number of characters in the (UTF-8)
string. A character is defined as a `Unicode scalar value`_. A single
character is represented by a single byte when working with ASCII text, and a
maximum of 4 bytes otherwise.
.. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
2) When `n` is negative, `tail` returns characters starting from the `n`th from
the beginning of the string. For example, if `n = -3`, then all characters
except the first three are returned.
3) If the string has fewer than `n` characters, the full string is
returned.
Examples
--------
>>> s = pl.Series("s", ["pear", None, "papaya", "dragonfruit"])
>>> s.str.tail(3)
Return up to the last 5 characters.
>>> s = pl.Series(["pear", None, "papaya", "dragonfruit"])
>>> s.str.tail(5)
shape: (4,)
Series: 's' [str]
Series: '' [str]
[
"ear"
null
"aya"
"uit"
"pear"
null
"apaya"
"fruit"
]
Return all characters except the first 3 (i.e. from the 4th character to the end).
>>> s = pl.Series(["pear", None, "papaya", "dragonfruit"])
>>> s.str.tail(-3)
shape: (4,)
Series: '' [str]
[
"r"
null
"aya"
"gonfruit"
]
"""

Expand Down

0 comments on commit 939d6c5

Please sign in to comment.