From 9d856fd51ca9703842b06e39957a5e3148c85d2d Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Wed, 30 Oct 2024 15:13:34 +0400 Subject: [PATCH] docs: Improve explanation of the `$` character with reference to capture groups (vs use as a literal) --- py-polars/polars/expr/string.py | 89 +++++++++++++++++++------ py-polars/polars/series/string.py | 107 ++++++++++++++++++++++-------- 2 files changed, 148 insertions(+), 48 deletions(-) diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index e94f995ee700..37481a925e68 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -1908,13 +1908,36 @@ def replace( Notes ----- - The dollar sign (`$`) is a special character related to capture groups. - To refer to a literal dollar sign, use `$$` instead or set `literal` to `True`. - - To modify regular expression behaviour (such as case-sensitivity) with flags, - use the inline `(?iLmsuxU)` syntax. See the regex crate's section on - `grouping and flags <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ - for additional information about the use of inline expression modifiers. + * To modify regular expression behaviour (such as case-sensitivity) with flags, + use the inline `(?iLmsuxU)` syntax. See the regex crate's section on + `grouping and flags <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ + for additional information about the use of inline expression modifiers. + + * The dollar sign (`$`) is a special character related to capture groups; if you + want to replace some target pattern with characters that include a literal `$` + you should escape it by doubling it up as `$$`, or set `literal=True` if you + do not need a full regular expression pattern match. Otherwise, you will be + referencing a (potentially non-existent) capture group. + + In the example below we need to double up `$` (to represent a literal dollar + sign), and then refer to the capture group using `$n` or `${n}`, hence the + three consecutive `$` characters in the replacement value: + + .. 
code-block:: python + + >>> df = pl.DataFrame({"cost": ["#12.34", "#56.78"]}) + >>> df.with_columns( + ... cost_usd=pl.col("cost").str.replace(r"#(\d+)", "$$${1}") + ... ) + shape: (2, 2) + ┌────────┬──────────┐ + │ cost ┆ cost_usd │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞════════╪══════════╡ + │ #12.34 ┆ $12.34 │ + │ #56.78 ┆ $56.78 │ + └────────┴──────────┘ Examples -------- @@ -1930,9 +1953,9 @@ def replace( │ 2 ┆ abc456 │ └─────┴────────┘ - Capture groups are supported. Use `${1}` in the `value` string to refer to the - first capture group in the `pattern`, `${2}` to refer to the second capture - group, and so on. You can also use named capture groups. + Capture groups are supported. Use `$1` or `${1}` in the `value` string to refer + to the first capture group in the `pattern`, `$2` or `${2}` to refer to the + second capture group, and so on. You can also use *named* capture groups. >>> df = pl.DataFrame({"word": ["hat", "hut"]}) >>> df.with_columns( @@ -1999,13 +2022,39 @@ def replace_all( Notes ----- - The dollar sign (`$`) is a special character related to capture groups. - To refer to a literal dollar sign, use `$$` instead or set `literal` to `True`. - - To modify regular expression behaviour (such as case-sensitivity) with flags, - use the inline `(?iLmsuxU)` syntax. See the regex crate's section on - `grouping and flags <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ - for additional information about the use of inline expression modifiers. + * To modify regular expression behaviour (such as case-sensitivity) with flags, + use the inline `(?iLmsuxU)` syntax. See the regex crate's section on + `grouping and flags <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ + for additional information about the use of inline expression modifiers. + + * The dollar sign (`$`) is a special character related to capture groups; if you + want to replace some target pattern with characters that include a literal `$` + you should escape it by doubling it up as `$$`, or set `literal=True` if you + do not need a full regular expression pattern match. 
Otherwise, you will be + referencing a (potentially non-existent) capture group. + + In the example below we need to double up `$` to represent a literal dollar + sign, otherwise we are referring to a capture group (which may or may not + exist): + + .. code-block:: python + + >>> df = pl.DataFrame({"text": ["ab12cd34ef", "gh45ij67kl"]}) + >>> df.with_columns( + ... # the replacement pattern refers back to the capture group + ... text1=pl.col("text").str.replace_all(r"(?<N>\d{2,})", "$N$"), + ... # doubling-up the `$` results in it appearing as a literal value + ... text2=pl.col("text").str.replace_all(r"(?<N>\d{2,})", "$$N$$"), + ... ) + shape: (2, 3) + ┌────────────┬──────────────┬──────────────┐ + │ text ┆ text1 ┆ text2 │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str │ + ╞════════════╪══════════════╪══════════════╡ + │ ab12cd34ef ┆ ab12$cd34$ef ┆ ab$N$cd$N$ef │ + │ gh45ij67kl ┆ gh45$ij67$kl ┆ gh$N$ij$N$kl │ + └────────────┴──────────────┴──────────────┘ Examples -------- @@ -2021,9 +2070,9 @@ def replace_all( │ 2 ┆ 123-123 │ └─────┴─────────┘ - Capture groups are supported. Use `${1}` in the `value` string to refer to the - first capture group in the `pattern`, `${2}` to refer to the second capture - group, and so on. You can also use named capture groups. + Capture groups are supported. Use `$1` or `${1}` in the `value` string to refer + to the first capture group in the `pattern`, `$2` or `${2}` to refer to the + second capture group, and so on. You can also use *named* capture groups. >>> df = pl.DataFrame({"word": ["hat", "hut"]}) >>> df.with_columns( diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index 97f3e373e98f..064cfc580b21 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -1109,13 +1109,46 @@ def replace( Notes ----- - The dollar sign (`$`) is a special character related to capture groups. - To refer to a literal dollar sign, use `$$` instead or set `literal` to `True`. 
- - To modify regular expression behaviour (such as case-sensitivity) with flags, - use the inline `(?iLmsuxU)` syntax. See the regex crate's section on - `grouping and flags <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ - for additional information about the use of inline expression modifiers. + * To modify regular expression behaviour (such as case-sensitivity) with flags, + use the inline `(?iLmsuxU)` syntax. (See the regex crate's section on + `grouping and flags <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ + for additional information about the use of inline expression modifiers). + + * The dollar sign (`$`) is a special character related to capture groups; if you + want to replace some target pattern with characters that include a literal `$` + you should escape it by doubling it up as `$$`, or set `literal=True` if you + do not need a full regular expression pattern match. Otherwise, you will be + referencing a (potentially non-existent) capture group. + + If not escaped, the `$0` in the replacement value (below) represents a capture + group: + + .. code-block:: python + + >>> s = pl.Series("cents", ["000.25", "00.50", "0.75"]) + >>> s.str.replace(r"^(0+)\.", "$0.") + shape: (3,) + Series: 'cents' [str] + [ + "000..25" + "00..50" + "0..75" + ] + + To have `$` represent a literal value, it should be doubled-up as `$$` + (or, for simpler find/replace operations, set `literal=True` if you do + not require a full regular expression match): + + .. code-block:: python + + >>> s.str.replace(r"^(0+)\.", "$$0.") + shape: (3,) + Series: 'cents' [str] + [ + "$0.25" + "$0.50" + "$0.75" + ] Examples -------- @@ -1128,24 +1161,24 @@ def replace( "abc456" ] - Capture groups are supported. Use `${1}` in the `value` string to refer to the - first capture group in the `pattern`, `${2}` to refer to the second capture - group, and so on. You can also use named capture groups. + Capture groups are supported. 
Use `$1` or `${1}` in the `value` string to refer + to the first capture group in the `pattern`, `$2` or `${2}` to refer to the + second capture group, and so on. You can also use *named* capture groups. >>> s = pl.Series(["hat", "hut"]) >>> s.str.replace("h(.)t", "b${1}d") shape: (2,) Series: '' [str] [ - "bad" - "bud" + "bad" + "bud" ] >>> s.str.replace("h(?<vowel>.)t", "b${vowel}d") shape: (2,) Series: '' [str] [ - "bad" - "bud" + "bad" + "bud" ] Apply case-insensitive string replacement using the `(?i)` flag. @@ -1181,13 +1214,31 @@ def replace_all(self, pattern: str, value: str, *, literal: bool = False) -> Ser Notes ----- - The dollar sign (`$`) is a special character related to capture groups. - To refer to a literal dollar sign, use `$$` instead or set `literal` to `True`. - - To modify regular expression behaviour (such as case-sensitivity) with flags, - use the inline `(?iLmsuxU)` syntax. See the regex crate's section on - `grouping and flags <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ - for additional information about the use of inline expression modifiers. + * To modify regular expression behaviour (such as case-sensitivity) with flags, + use the inline `(?iLmsuxU)` syntax. (See the regex crate's section on + `grouping and flags <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ + for additional information about the use of inline expression modifiers). + + * The dollar sign (`$`) is a special character related to capture groups; if you + want to replace some target pattern with characters that include a literal `$` + you should escape it by doubling it up as `$$`, or set `literal=True` if you + do not need a full regular expression pattern match. Otherwise, you will be + referencing a (potentially non-existent) capture group. + + In the example below we need to double up `$` (to represent a literal dollar + sign), and then refer to the capture group using `$n` or `${n}`, hence the + three consecutive `$` characters in the replacement value: + + .. 
code-block:: python + + >>> s = pl.Series("cost", ["#12.34", "#56.78"]) + >>> s.str.replace_all(r"#(\d+)", "$$${1}").alias("cost_usd") + shape: (2,) + Series: 'cost_usd' [str] + [ + "$12.34" + "$56.78" + ] Examples -------- @@ -1200,24 +1251,24 @@ def replace_all(self, pattern: str, value: str, *, literal: bool = False) -> Ser "abc456" ] - Capture groups are supported. Use `${1}` in the `value` string to refer to the - first capture group in the `pattern`, `${2}` to refer to the second capture - group, and so on. You can also use named capture groups. + Capture groups are supported. Use `$1` or `${1}` in the `value` string to refer + to the first capture group in the `pattern`, `$2` or `${2}` to refer to the + second capture group, and so on. You can also use *named* capture groups. >>> s = pl.Series(["hat", "hut"]) >>> s.str.replace_all("h(.)t", "b${1}d") shape: (2,) Series: '' [str] [ - "bad" - "bud" + "bad" + "bud" ] >>> s.str.replace_all("h(?<vowel>.)t", "b${vowel}d") shape: (2,) Series: '' [str] [ - "bad" - "bud" + "bad" + "bud" ] Apply case-insensitive string replacement using the `(?i)` flag.