Skip to content

Commit 00cf79b

Browse files
authored
chore: Use twox-hash 2.0 xxhash64 oneshot api instead of custom implementation (#1041)
1 parent d670af7 commit 00cf79b

File tree

12 files changed

+26
-236
lines changed

12 files changed

+26
-236
lines changed

LICENSE.txt

+1 -28
Original file line number | Diff line number | Diff line change
@@ -210,31 +210,4 @@ This project includes code from Apache Aurora.
210210

211211
Copyright: 2016 The Apache Software Foundation.
212212
Home page: https://aurora.apache.org/
213-
License: http://www.apache.org/licenses/LICENSE-2.0
214-
215-
--------------------------------------------------------------------------------
216-
217-
This project includes software from the twox-hash project
218-
https://github.com/shepmaster/twox-hash
219-
220-
The MIT License (MIT)
221-
222-
Copyright (c) 2015 Jake Goulding
223-
224-
Permission is hereby granted, free of charge, to any person obtaining a copy
225-
of this software and associated documentation files (the "Software"), to deal
226-
in the Software without restriction, including without limitation the rights
227-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
228-
copies of the Software, and to permit persons to whom the Software is
229-
furnished to do so, subject to the following conditions:
230-
231-
The above copyright notice and this permission notice shall be included in
232-
all copies or substantial portions of the Software.
233-
234-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
235-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
236-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
237-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
238-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
239-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
240-
THE SOFTWARE.
213+
License: http://www.apache.org/licenses/LICENSE-2.0

NOTICE.txt

-3
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,6 @@ Copyright 2024 The Apache Software Foundation
44
This product includes software developed at
55
The Apache Software Foundation (http://www.apache.org/).
66

7-
This product includes software from the twox-hash project (MIT License)
8-
https://github.com/shepmaster/twox-hash
9-
107
This product includes software developed at
118
Apache Gluten (https://github.com/apache/incubator-gluten/)
129
Specifically:

native/Cargo.lock

+12 -4
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

native/core/src/execution/sort.rs

+1 -1
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ use std::{cmp, mem, ptr};
1919

2020
/// This is a copy of the `rdxsort-rs` crate, with the following changes:
2121
/// - removed `Rdx` implementations for all types except for i64 which is the packed representation
22-
/// of row addresses and partition ids from Spark.
22+
/// of row addresses and partition ids from Spark.
2323
2424
pub trait Rdx {
2525
/// Sets the number of buckets used by the generic implementation.

native/core/src/parquet/util/test_common/page_util.rs

+1 -1
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,7 @@ pub trait DataPageBuilder {
4848
/// - add_def_levels()
4949
/// - add_values() for normal data page / add_indices() for dictionary data page
5050
/// - consume()
51-
/// in order to populate and obtain a data page.
51+
/// in order to populate and obtain a data page.
5252
pub struct DataPageBuilderImpl {
5353
desc: ColumnDescPtr,
5454
encoding: Option<Encoding>,

native/spark-expr/Cargo.toml

+2 -1
Original file line number | Diff line number | Diff line change
@@ -39,12 +39,13 @@ chrono-tz = { workspace = true }
3939
num = { workspace = true }
4040
regex = { workspace = true }
4141
thiserror = { workspace = true }
42+
twox-hash = "2.0.0"
4243

4344
[dev-dependencies]
4445
arrow-data = {workspace = true}
4546
criterion = "0.5.1"
4647
rand = { workspace = true}
47-
twox-hash = "1.6.3"
48+
4849

4950
[lib]
5051
name = "datafusion_comet_spark_expr"

native/spark-expr/benches/conditional.rs

+2 -2
Original file line number | Diff line number | Diff line change
@@ -51,12 +51,12 @@ fn criterion_benchmark(c: &mut Criterion) {
5151
if i % 7 == 0 {
5252
c2.append_null();
5353
} else {
54-
c2.append_value(&format!("string {i}"));
54+
c2.append_value(format!("string {i}"));
5555
}
5656
if i % 9 == 0 {
5757
c3.append_null();
5858
} else {
59-
c3.append_value(&format!("other string {i}"));
59+
c3.append_value(format!("other string {i}"));
6060
}
6161
}
6262
let c1 = Arc::new(c1.finish());

native/spark-expr/src/cast.rs

+1 -3
Original file line number | Diff line number | Diff line change
@@ -1568,9 +1568,7 @@ fn get_timestamp_values<T: TimeZone>(
15681568
timestamp_type: &str,
15691569
tz: &T,
15701570
) -> SparkResult<Option<i64>> {
1571-
let values: Vec<_> = value
1572-
.split(|c| c == 'T' || c == '-' || c == ':' || c == '.')
1573-
.collect();
1571+
let values: Vec<_> = value.split(['T', '-', ':', '.']).collect();
15741572
let year = values[0].parse::<i32>().unwrap_or_default();
15751573
let month = values.get(1).map_or(1, |m| m.parse::<u32>().unwrap_or(1));
15761574
let day = values.get(2).map_or(1, |d| d.parse::<u32>().unwrap_or(1));

native/spark-expr/src/lib.rs

-1
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,6 @@ mod temporal;
3333
pub mod timezone;
3434
mod to_json;
3535
pub mod utils;
36-
mod xxhash64;
3736

3837
pub use cast::{spark_cast, Cast};
3938
pub use error::{SparkError, SparkResult};

native/spark-expr/src/spark_hash.rs

+5 -1
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ use arrow::{
2222
datatypes::{ArrowNativeTypeOp, UInt16Type, UInt32Type, UInt64Type, UInt8Type},
2323
};
2424
use std::sync::Arc;
25+
use twox_hash::XxHash64;
2526

2627
use datafusion::{
2728
arrow::{
@@ -34,7 +35,10 @@ use datafusion::{
3435
error::{DataFusionError, Result},
3536
};
3637

37-
use crate::xxhash64::spark_compatible_xxhash64;
38+
#[inline]
39+
pub(crate) fn spark_compatible_xxhash64<T: AsRef<[u8]>>(data: T, seed: u64) -> u64 {
40+
XxHash64::oneshot(seed, data.as_ref())
41+
}
3842

3943
/// Spark-compatible murmur3 hash function
4044
#[inline]

native/spark-expr/src/xxhash64.rs

-190
This file was deleted.

rust-toolchain.toml

+1 -1
Original file line number | Diff line number | Diff line change
@@ -16,5 +16,5 @@
1616
# under the License.
1717

1818
[toolchain]
19-
channel = "1.79"
19+
channel = "1.81"
2020
components = ["rustfmt", "clippy", "rust-analyzer"]

0 commit comments

Comments
 (0)