Skip to content

Commit

Permalink
fix: Capture groups should be ignored in replace_all when literal=True (
Browse files Browse the repository at this point in the history
  • Loading branch information
corwinjoy authored Oct 22, 2024
1 parent eb596c9 commit c731d11
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 4 deletions.
18 changes: 14 additions & 4 deletions crates/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use polars_core::utils::handle_casting_failures;
#[cfg(feature = "dtype-struct")]
use polars_utils::format_pl_smallstr;
#[cfg(feature = "regex")]
use regex::{escape, Regex};
use regex::{escape, NoExpand, Regex};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

Expand Down Expand Up @@ -888,15 +888,25 @@ fn replace_all<'a>(
"replacement value length ({}) does not match string column length ({})",
len_val, ca.len(),
);
let literal = literal || is_literal_pat(&pat);

if literal {
let literal_pat = literal || is_literal_pat(&pat);

if literal_pat {
pat = escape(&pat)
}

let reg = Regex::new(&pat)?;

let f = |s: &'a str, val: &'a str| reg.replace_all(s, val);
let f = |s: &'a str, val: &'a str| {
// According to the docs for replace_all
// when literal = True then capture groups are ignored.
if literal {
reg.replace_all(s, NoExpand(val))
} else {
reg.replace_all(s, val)
}
};

Ok(iter_and_replace(ca, val, f))
},
_ => polars_bail!(
Expand Down
28 changes: 28 additions & 0 deletions py-polars/tests/unit/operations/namespaces/string/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1006,6 +1006,34 @@ def test_replace_all() -> None:
)


def test_replace_literal_no_caputures() -> None:
# When using literal = True, capture groups should be disabled

# Single row code path in Rust
df = pl.DataFrame({"text": ["I found <amt> yesterday."], "amt": ["$1"]})
df = df.with_columns(
pl.col("text")
.str.replace_all("<amt>", pl.col("amt"), literal=True)
.alias("text2")
)
assert df.get_column("text2")[0] == "I found $1 yesterday."

# Multi-row code path in Rust
df2 = pl.DataFrame(
{
"text": ["I found <amt> yesterday.", "I lost <amt> yesterday."],
"amt": ["$1", "$2"],
}
)
df2 = df2.with_columns(
pl.col("text")
.str.replace_all("<amt>", pl.col("amt"), literal=True)
.alias("text2")
)
assert df2.get_column("text2")[0] == "I found $1 yesterday."
assert df2.get_column("text2")[1] == "I lost $2 yesterday."


def test_replace_expressions() -> None:
df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"], "value": ["A", "B"]})
out = df.select([pl.col("foo").str.replace(pl.col("foo").first(), pl.col("value"))])
Expand Down

0 comments on commit c731d11

Please sign in to comment.