Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Expressify to_integer #15604

Merged
merged 2 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 35 additions & 13 deletions crates/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,25 +63,47 @@ pub trait StringNameSpaceImpl: AsString {

#[cfg(feature = "string_to_integer")]
// Parse a string number with base _radix_ into a decimal (i64)
fn to_integer(&self, base: u32, strict: bool) -> PolarsResult<Int64Chunked> {
fn to_integer(&self, base: &UInt32Chunked, strict: bool) -> PolarsResult<Int64Chunked> {
let ca = self.as_string();
let f = |opt_s: Option<&str>| -> Option<i64> {
opt_s.and_then(|s| <i64 as Num>::from_str_radix(s, base).ok())
let f = |opt_s: Option<&str>, opt_base: Option<u32>| -> Option<i64> {
match (opt_s, opt_base) {
(Some(s), Some(base)) => <i64 as Num>::from_str_radix(s, base).ok(),
_ => None,
}
};
let out: Int64Chunked = ca.apply_generic(f);

let out = broadcast_binary_elementwise(ca, base, f);
if strict && ca.null_count() != out.null_count() {
let failure_mask = !ca.is_null() & out.is_null();
let failure_mask = ca.is_not_null() & out.is_null() & base.is_not_null();
let all_failures = ca.filter(&failure_mask)?;
if all_failures.is_empty() {
return Ok(out);
}
let n_failures = all_failures.len();
let some_failures = all_failures.unique()?.slice(0, 10).sort(false);
let some_error_msg = some_failures
.get(0)
.and_then(|s| <i64 as Num>::from_str_radix(s, base).err())
.map_or_else(
|| unreachable!("failed to extract ParseIntError"),
|e| format!("{}", e),
);
let some_error_msg = match base.len() {
1 => {
// we can ensure that base is not null.
let base = base.get(0).unwrap();
some_failures
.get(0)
.and_then(|s| <i64 as Num>::from_str_radix(s, base).err())
.map_or_else(
|| unreachable!("failed to extract ParseIntError"),
|e| format!("{}", e),
)
},
_ => {
let base_filures = base.filter(&failure_mask)?;
some_failures
.get(0)
.zip(base_filures.get(0))
.and_then(|(s, base)| <i64 as Num>::from_str_radix(s, base).err())
.map_or_else(
|| unreachable!("failed to extract ParseIntError"),
|e| format!("{}", e),
)
},
};
polars_bail!(
ComputeError:
"strict integer parsing failed for {} value(s): {}; error message for the \
Expand Down
12 changes: 7 additions & 5 deletions crates/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ pub enum StringFunction {
strict: bool,
},
#[cfg(feature = "string_to_integer")]
ToInteger(u32, bool),
ToInteger(bool),
LenBytes,
LenChars,
Lowercase,
Expand Down Expand Up @@ -343,7 +343,7 @@ impl From<StringFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
StripPrefix => map_as_slice!(strings::strip_prefix),
StripSuffix => map_as_slice!(strings::strip_suffix),
#[cfg(feature = "string_to_integer")]
ToInteger(base, strict) => map!(strings::to_integer, base, strict),
ToInteger(strict) => map_as_slice!(strings::to_integer, strict),
Slice => map_as_slice!(strings::str_slice),
#[cfg(feature = "string_encoding")]
HexEncode => map!(strings::hex_encode),
Expand Down Expand Up @@ -888,9 +888,11 @@ pub(super) fn reverse(s: &Series) -> PolarsResult<Series> {
}

#[cfg(feature = "string_to_integer")]
pub(super) fn to_integer(s: &Series, base: u32, strict: bool) -> PolarsResult<Series> {
let ca = s.str()?;
ca.to_integer(base, strict).map(|ok| ok.into_series())
pub(super) fn to_integer(s: &[Series], strict: bool) -> PolarsResult<Series> {
let ca = s[0].str()?;
let base = s[1].strict_cast(&DataType::UInt32)?;
ca.to_integer(base.u32()?, strict)
.map(|ok| ok.into_series())
}
pub(super) fn str_slice(s: &[Series]) -> PolarsResult<Series> {
// Calculate the post-broadcast length and ensure everything is consistent.
Expand Down
12 changes: 7 additions & 5 deletions crates/polars-plan/src/dsl/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -483,11 +483,13 @@ impl StringNameSpace {

#[cfg(feature = "string_to_integer")]
/// Parse string in base radix into decimal.
pub fn to_integer(self, base: u32, strict: bool) -> Expr {
self.0
.map_private(FunctionExpr::StringExpr(StringFunction::ToInteger(
base, strict,
)))
pub fn to_integer(self, base: Expr, strict: bool) -> Expr {
self.0.map_many_private(
FunctionExpr::StringExpr(StringFunction::ToInteger(strict)),
&[base],
false,
false,
)
}

/// Return the length of each string as the number of bytes.
Expand Down
8 changes: 6 additions & 2 deletions py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -2234,14 +2234,17 @@ def explode(self) -> Expr:
"""
return wrap_expr(self._pyexpr.str_explode())

def to_integer(self, *, base: int = 10, strict: bool = True) -> Expr:
def to_integer(
self, *, base: int | IntoExprColumn = 10, strict: bool = True
) -> Expr:
"""
Convert a String column into an Int64 column with base radix.

Parameters
----------
base
Positive integer which is the base of the string we are parsing.
Positive integer or expression which is the base of the string
we are parsing.
Default: 10.
strict
Bool, Default=True will raise any ParseError or overflow as ComputeError.
Expand Down Expand Up @@ -2282,6 +2285,7 @@ def to_integer(self, *, base: int = 10, strict: bool = True) -> Expr:
│ null ┆ null │
└──────┴────────┘
"""
base = parse_as_expression(base, str_as_lit=False)
return wrap_expr(self._pyexpr.str_to_integer(base, strict))

@deprecate_renamed_function("to_integer", version="0.19.14")
Expand Down
3 changes: 2 additions & 1 deletion py-polars/polars/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1690,7 +1690,8 @@ def to_integer(self, *, base: int = 10, strict: bool = True) -> Series:
Parameters
----------
base
Positive integer which is the base of the string we are parsing.
Positive integer or expression which is the base of the string
we are parsing.
Default: 10.
strict
Bool, Default=True will raise any ParseError or overflow as ComputeError.
Expand Down
4 changes: 2 additions & 2 deletions py-polars/src/expr/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -205,11 +205,11 @@ impl PyExpr {
self.inner.clone().str().base64_decode(strict).into()
}

fn str_to_integer(&self, base: u32, strict: bool) -> Self {
fn str_to_integer(&self, base: Self, strict: bool) -> Self {
self.inner
.clone()
.str()
.to_integer(base, strict)
.to_integer(base.inner, strict)
.with_fmt("str.to_integer")
.into()
}
Expand Down
17 changes: 16 additions & 1 deletion py-polars/tests/unit/namespaces/string/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,22 @@ def test_str_to_integer() -> None:
hex.str.to_integer(base=16)


def test_str_to_integer_df() -> None:
def test_str_to_integer_base_expr() -> None:
df = pl.DataFrame(
{"str": ["110", "ff00", "234", None, "130"], "base": [2, 16, 10, 8, None]}
)
out = df.select(base_expr=pl.col("str").str.to_integer(base="base"))
expected = pl.DataFrame({"base_expr": [6, 65280, 234, None, None]})
assert_frame_equal(out, expected)

# test strict raise
df = pl.DataFrame({"str": ["110", "ff00", "cafe", None], "base": [2, 10, 10, 8]})

with pytest.raises(pl.ComputeError, match="failed for 2 value"):
df.select(pl.col("str").str.to_integer(base="base"))


def test_str_to_integer_base_literal() -> None:
df = pl.DataFrame(
{
"bin": ["110", "101", "-010", "invalid", None],
Expand Down
Loading