From 063811da3936eafec78f348c4c710d9707496c25 Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Wed, 23 Oct 2024 13:43:32 -0700 Subject: [PATCH 1/2] Fix str.replace function so that when literal=True, capture groups are ignored. --- .../src/dsl/function_expr/strings.rs | 15 ++++++--- .../namespaces/string/test_string.py | 31 ++++++++++++++++++- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs index 12e8c6c6e53e..e6a4c496c87d 100644 --- a/crates/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -844,20 +844,27 @@ fn replace_n<'a>( "replacement value length ({}) does not match string column length ({})", len_val, ca.len(), ); - let literal = literal || is_literal_pat(&pat); + let lit = is_literal_pat(&pat); + let literal_pat = literal || lit; - if literal { + if literal_pat { pat = escape(&pat) } let reg = Regex::new(&pat)?; - let lit = pat.chars().all(|c| !c.is_ascii_punctuation()); let f = |s: &'a str, val: &'a str| { if lit && (s.len() <= 32) { Cow::Owned(s.replacen(&pat, val, 1)) } else { - reg.replace(s, val) + // According to the docs for replace_all + // when literal = True then capture groups are ignored. + if literal { + reg.replace(s, NoExpand(val)) + } else { + reg.replace(s, val) + } + } }; Ok(iter_and_replace(ca, val, f)) diff --git a/py-polars/tests/unit/operations/namespaces/string/test_string.py b/py-polars/tests/unit/operations/namespaces/string/test_string.py index 3b2637a0f334..cdfc21280e2d 100644 --- a/py-polars/tests/unit/operations/namespaces/string/test_string.py +++ b/py-polars/tests/unit/operations/namespaces/string/test_string.py @@ -1006,7 +1006,7 @@ def test_replace_all() -> None: ) -def test_replace_literal_no_caputures() -> None: +def test_replace_all_literal_no_caputures() -> None: # When using literal = True, capture groups should be disabled # Single row code path in Rust @@ -1034,6 +1034,35 @@ def test_replace_literal_no_caputures() -> None: assert df2.get_column("text2")[1] == "I lost $2 yesterday." +def test_replace_literal_no_caputures() -> None: + # When using literal = True, capture groups should be disabled + + # Single row code path in Rust + df = pl.DataFrame({"text": ["I found yesterday."], "amt": ["$1"]}) + df = df.with_columns( + pl.col("text") + .str.replace("", pl.col("amt"), literal=True) + .alias("text2") + ) + assert df.get_column("text2")[0] == "I found $1 yesterday." + + # Multi-row code path in Rust + # A string shorter than 32 chars, and one longer than 32 chars to test both sub-paths + df2 = pl.DataFrame( + { + "text": ["I found yesterday.", "I lost yesterday and this string is longer than 32 characters."], + "amt": ["$1", "$2"], + } + ) + df2 = df2.with_columns( + pl.col("text") + .str.replace("", pl.col("amt"), literal=True) + .alias("text2") + ) + assert df2.get_column("text2")[0] == "I found $1 yesterday." + assert df2.get_column("text2")[1] == "I lost $2 yesterday and this string is longer than 32 characters." + + def test_replace_expressions() -> None: df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"], "value": ["A", "B"]}) out = df.select([pl.col("foo").str.replace(pl.col("foo").first(), pl.col("value"))]) From 43f5d92691d42b689bf22c1fc2e46dca60a30dfe Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Wed, 23 Oct 2024 16:38:55 -0700 Subject: [PATCH 2/2] Apply formatting and linting changes. --- .../src/dsl/function_expr/strings.rs | 3 +-- .../namespaces/string/test_string.py | 21 +++++++++++-------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs index e6a4c496c87d..039c995557be 100644 --- a/crates/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -857,14 +857,13 @@ fn replace_n<'a>( if lit && (s.len() <= 32) { Cow::Owned(s.replacen(&pat, val, 1)) } else { - // According to the docs for replace_all + // According to the docs for replace // when literal = True then capture groups are ignored. if literal { reg.replace(s, NoExpand(val)) } else { reg.replace(s, val) } - } }; Ok(iter_and_replace(ca, val, f)) diff --git a/py-polars/tests/unit/operations/namespaces/string/test_string.py b/py-polars/tests/unit/operations/namespaces/string/test_string.py index cdfc21280e2d..050a581ff406 100644 --- a/py-polars/tests/unit/operations/namespaces/string/test_string.py +++ b/py-polars/tests/unit/operations/namespaces/string/test_string.py @@ -1040,27 +1040,30 @@ def test_replace_literal_no_caputures() -> None: # Single row code path in Rust df = pl.DataFrame({"text": ["I found yesterday."], "amt": ["$1"]}) df = df.with_columns( - pl.col("text") - .str.replace("", pl.col("amt"), literal=True) - .alias("text2") + pl.col("text").str.replace("", pl.col("amt"), literal=True).alias("text2") ) assert df.get_column("text2")[0] == "I found $1 yesterday." # Multi-row code path in Rust - # A string shorter than 32 chars, and one longer than 32 chars to test both sub-paths + # A string shorter than 32 chars, + # and one longer than 32 chars to test both sub-paths df2 = pl.DataFrame( { - "text": ["I found yesterday.", "I lost yesterday and this string is longer than 32 characters."], + "text": [ + "I found yesterday.", + "I lost yesterday and this string is longer than 32 characters.", + ], "amt": ["$1", "$2"], } ) df2 = df2.with_columns( - pl.col("text") - .str.replace("", pl.col("amt"), literal=True) - .alias("text2") + pl.col("text").str.replace("", pl.col("amt"), literal=True).alias("text2") ) assert df2.get_column("text2")[0] == "I found $1 yesterday." - assert df2.get_column("text2")[1] == "I lost $2 yesterday and this string is longer than 32 characters." + assert ( + df2.get_column("text2")[1] + == "I lost $2 yesterday and this string is longer than 32 characters." + ) def test_replace_expressions() -> None: