From c731d11f0db45c197c8ae327a5d9d76c230cb78b Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Tue, 22 Oct 2024 00:23:27 -0700 Subject: [PATCH] fix: Capture groups should be ignored in replace_all when literal=True (#19366) --- .../src/dsl/function_expr/strings.rs | 18 +++++++++--- .../namespaces/string/test_string.py | 28 +++++++++++++++++++ 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs index ba06dc00e67c..66bd3c5c6e73 100644 --- a/crates/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -9,7 +9,7 @@ use polars_core::utils::handle_casting_failures; #[cfg(feature = "dtype-struct")] use polars_utils::format_pl_smallstr; #[cfg(feature = "regex")] -use regex::{escape, Regex}; +use regex::{escape, NoExpand, Regex}; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -888,15 +888,25 @@ fn replace_all<'a>( "replacement value length ({}) does not match string column length ({})", len_val, ca.len(), ); - let literal = literal || is_literal_pat(&pat); - if literal { + let literal_pat = literal || is_literal_pat(&pat); + + if literal_pat { pat = escape(&pat) } let reg = Regex::new(&pat)?; - let f = |s: &'a str, val: &'a str| reg.replace_all(s, val); + let f = |s: &'a str, val: &'a str| { + // According to the docs for replace_all + // when literal = True then capture groups are ignored. 
+ if literal { + reg.replace_all(s, NoExpand(val)) + } else { + reg.replace_all(s, val) + } + }; + Ok(iter_and_replace(ca, val, f)) }, _ => polars_bail!( diff --git a/py-polars/tests/unit/operations/namespaces/string/test_string.py b/py-polars/tests/unit/operations/namespaces/string/test_string.py index 842b0fd141a5..6b221c1706e8 100644 --- a/py-polars/tests/unit/operations/namespaces/string/test_string.py +++ b/py-polars/tests/unit/operations/namespaces/string/test_string.py @@ -1006,6 +1006,34 @@ def test_replace_all() -> None: ) +def test_replace_literal_no_caputures() -> None: + # When using literal = True, capture groups should be disabled + + # Single row code path in Rust + df = pl.DataFrame({"text": ["I found <amt> yesterday."], "amt": ["$1"]}) + df = df.with_columns( + pl.col("text") + .str.replace_all("<amt>", pl.col("amt"), literal=True) + .alias("text2") + ) + assert df.get_column("text2")[0] == "I found $1 yesterday." + + # Multi-row code path in Rust + df2 = pl.DataFrame( + { + "text": ["I found <amt> yesterday.", "I lost <amt> yesterday."], + "amt": ["$1", "$2"], + } + ) + df2 = df2.with_columns( + pl.col("text") + .str.replace_all("<amt>", pl.col("amt"), literal=True) + .alias("text2") + ) + assert df2.get_column("text2")[0] == "I found $1 yesterday." + assert df2.get_column("text2")[1] == "I lost $2 yesterday." + + def test_replace_expressions() -> None: df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"], "value": ["A", "B"]}) out = df.select([pl.col("foo").str.replace(pl.col("foo").first(), pl.col("value"))])