From a36bb7757c3067f36fa235bfaf6028e9af9a7bfd Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 13 Feb 2024 11:24:09 -0800 Subject: [PATCH 1/6] Return null for overflow when casting string to integer --- arrow-cast/src/cast.rs | 19 +++++++++++++++++++ arrow-cast/src/parse.rs | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index a813c5f6c87e..7868946532c4 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -4911,6 +4911,25 @@ mod tests { assert!(c.is_null(2)); } + #[test] + fn test_cast_string_to_integral_overflow() { + let str = Arc::new(StringArray::from(vec![ + Some("123"), + Some("-123"), + Some("86374"), + None, + ])) as ArrayRef; + + let options = CastOptions { + safe: true, + format_options: FormatOptions::default(), + }; + let res = cast_with_options(&str, &DataType::Int16, &options).expect("should cast to i16"); + let expected = + Arc::new(Int16Array::from(vec![Some(123), Some(-123), None, None])) as ArrayRef; + assert_eq!(&res, &expected); + } + #[test] fn test_cast_string_to_timestamp() { let a1 = Arc::new(StringArray::from(vec![ diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 50e9fda672f6..e39e0964bc5b 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -438,7 +438,7 @@ macro_rules! parser_primitive { ($t:ty) => { impl Parser for $t { fn parse(string: &str) -> Option { - lexical_core::parse::(string.as_bytes()).ok() + string.parse::().ok() } } }; From ef5e05d876cdf90481a133f1d5d91d4e4b596adc Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 14 Feb 2024 10:26:02 -0800 Subject: [PATCH 2/6] Use atoi_simd --- arrow-cast/Cargo.toml | 1 + arrow-cast/src/parse.rs | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 19b857297d14..5a144fe1848b 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -49,6 +49,7 @@ chrono = { workspace = true } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } +atoi_simd = "0.15.6" comfy-table = { version = "7.0", optional = true, default-features = false } base64 = "0.21" diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index e39e0964bc5b..f81bd2105bf3 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -438,7 +438,7 @@ macro_rules! parser_primitive { ($t:ty) => { impl Parser for $t { fn parse(string: &str) -> Option { - string.parse::().ok() + atoi_simd::parse(string.as_bytes()).ok() } } }; From 9914866cc99482b4569cb84021d456d0c246d0f7 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 14 Feb 2024 12:32:31 -0800 Subject: [PATCH 3/6] Use atoi --- arrow-cast/Cargo.toml | 2 +- arrow-cast/src/parse.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 5a144fe1848b..81dd0ebd415f 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -49,7 +49,7 @@ chrono = { workspace = true } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } -atoi_simd = "0.15.6" +atoi = "2.0.0" comfy-table = { version = "7.0", optional = true, default-features = false } base64 = "0.21" diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index f81bd2105bf3..0fe88aa8c552 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -438,7 +438,7 @@ macro_rules! parser_primitive { ($t:ty) => { impl Parser for $t { fn parse(string: &str) -> Option { - atoi_simd::parse(string.as_bytes()).ok() + atoi::atoi::(string.as_bytes()) } } }; From 53dd0479a5221e6bc7b6447389abdf712a1819ad Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 14 Feb 2024 14:04:45 -0800 Subject: [PATCH 4/6] Return to str.parse. --- arrow-cast/Cargo.toml | 1 - arrow-cast/src/parse.rs | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 81dd0ebd415f..19b857297d14 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -49,7 +49,6 @@ chrono = { workspace = true } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } -atoi = "2.0.0" comfy-table = { version = "7.0", optional = true, default-features = false } base64 = "0.21" diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 0fe88aa8c552..e39e0964bc5b 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -438,7 +438,7 @@ macro_rules! parser_primitive { ($t:ty) => { impl Parser for $t { fn parse(string: &str) -> Option { - atoi::atoi::(string.as_bytes()) + string.parse::().ok() } } }; From c22e5cd9deca4564472930bcb4b2e48299a78502 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 14 Feb 2024 14:55:13 -0800 Subject: [PATCH 5/6] Revert "Return to str.parse." This reverts commit 53dd0479a5221e6bc7b6447389abdf712a1819ad. --- arrow-cast/Cargo.toml | 1 + arrow-cast/src/parse.rs | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 19b857297d14..81dd0ebd415f 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -49,6 +49,7 @@ chrono = { workspace = true } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } +atoi = "2.0.0" comfy-table = { version = "7.0", optional = true, default-features = false } base64 = "0.21" diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index e39e0964bc5b..0fe88aa8c552 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -438,7 +438,7 @@ macro_rules! parser_primitive { ($t:ty) => { impl Parser for $t { fn parse(string: &str) -> Option { - string.parse::().ok() + atoi::atoi::(string.as_bytes()) } } }; From ebe2b58b242ab6ebfdbbc5b51f6b4a6801cb7151 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 14 Feb 2024 14:56:13 -0800 Subject: [PATCH 6/6] Check trailing string --- arrow-cast/src/parse.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 0fe88aa8c552..72942af8394a 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -438,7 +438,12 @@ macro_rules! parser_primitive { ($t:ty) => { impl Parser for $t { fn parse(string: &str) -> Option { - atoi::atoi::(string.as_bytes()) + match atoi::FromRadix10SignedChecked::from_radix_10_signed_checked( + string.as_bytes(), + ) { + (Some(n), x) if x == string.len() => Some(n), + _ => None, + } } } };