Skip to content

Commit b5446ef

Browse files
Upgrade datafusion 39 (#728)
* deps: update datafusion to 39.0.0, pyo3 to 0.21, and object_store to 0.10.1 `datafusion-common` also depends on `pyo3`, so they need to be upgraded together. * feat: remove GetIndexField datafusion replaced Expr::GetIndexField with a FieldAccessor trait. Ref apache/datafusion#10568 Ref apache/datafusion#10769 * feat: update ScalarFunction The field `func_name` was changed to `func` as part of removing `ScalarFunctionDefinition` upstream. Ref apache/datafusion#10325 * feat: incorporate upstream array_slice fixes Fixes #670 * update ExectionPlan::children impl for DatasetExec Ref apache/datafusion#10543 * update value_interval_daytime Ref apache/arrow-rs#5769 * update regexp_replace and regexp_match Fixes #677 * add gil-refs feature to pyo3 This silences pyo3's deprecation warnings for its new Bounds api. It's the 1st step of the migration, and should be removed before merge. Ref https://pyo3.rs/v0.21.0/migration#from-020-to-021 * fix signature for octet_length Ref apache/datafusion#10726 * update signature for covar_samp AggregateUDF expressions now have a builder API design, which removes arguments like filter and order_by Ref apache/datafusion#10545 Ref apache/datafusion#10492 * convert covar_pop to expr_fn api Ref: https://github.com/apache/datafusion/pull/10418/files * convert median to expr_fn api Ref apache/datafusion#10644 * convert variance sample to UDF Ref apache/datafusion#10667 * convert first_value and last_value to UDFs Ref apache/datafusion#10648 * checkpointing with a few todos to fix remaining compile errors * impl PyExpr::python_value for IntervalDayTime and IntervalMonthDayNano * convert sum aggregate function to UDF * remove unnecessary clone on double reference * apply cargo fmt * remove duplicate allow-dead-code annotation * update tpch examples for new pyarrow interval Fixes #665 * marked q11 tpch example as expected fail Ref #730 * add default stride of None back to array_slice
1 parent 860283a commit b5446ef

23 files changed

+467
-327
lines changed

Cargo.lock

+321-219
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

+11-11
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
[package]
1919
name = "datafusion-python"
20-
version = "38.0.1"
20+
version = "39.0.0"
2121
homepage = "https://datafusion.apache.org/python"
2222
repository = "https://github.com/apache/datafusion-python"
2323
authors = ["Apache DataFusion <[email protected]>"]
@@ -36,28 +36,28 @@ substrait = ["dep:datafusion-substrait"]
3636
[dependencies]
3737
tokio = { version = "1.35", features = ["macros", "rt", "rt-multi-thread", "sync"] }
3838
rand = "0.8"
39-
pyo3 = { version = "0.20", features = ["extension-module", "abi3", "abi3-py38"] }
40-
datafusion = { version = "38.0.0", features = ["pyarrow", "avro", "unicode_expressions"] }
41-
datafusion-common = { version = "38.0.0", features = ["pyarrow"] }
42-
datafusion-expr = "38.0.0"
43-
datafusion-functions-array = "38.0.0"
44-
datafusion-optimizer = "38.0.0"
45-
datafusion-sql = "38.0.0"
46-
datafusion-substrait = { version = "38.0.0", optional = true }
39+
pyo3 = { version = "0.21", features = ["extension-module", "abi3", "abi3-py38", "gil-refs"] }
40+
datafusion = { version = "39.0.0", features = ["pyarrow", "avro", "unicode_expressions"] }
41+
datafusion-common = { version = "39.0.0", features = ["pyarrow"] }
42+
datafusion-expr = "39.0.0"
43+
datafusion-functions-array = "39.0.0"
44+
datafusion-optimizer = "39.0.0"
45+
datafusion-sql = "39.0.0"
46+
datafusion-substrait = { version = "39.0.0", optional = true }
4747
prost = "0.12"
4848
prost-types = "0.12"
4949
uuid = { version = "1.8", features = ["v4"] }
5050
mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] }
5151
async-trait = "0.1"
5252
futures = "0.3"
53-
object_store = { version = "0.9.1", features = ["aws", "gcp", "azure"] }
53+
object_store = { version = "0.10.1", features = ["aws", "gcp", "azure"] }
5454
parking_lot = "0.12"
5555
regex-syntax = "0.8.1"
5656
syn = "2.0.43"
5757
url = "2.2"
5858

5959
[build-dependencies]
60-
pyo3-build-config = "0.20.0"
60+
pyo3-build-config = "0.21"
6161

6262
[lib]
6363
name = "datafusion_python"

docs/source/user-guide/common-operations/functions.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -92,12 +92,13 @@ DataFusion offers a range of helpful options.
9292
f.left(col('"Name"'), literal(4)).alias("code")
9393
)
9494
95-
This also includes the functions for regular expressions like :func:`.regexp_match`
95+
This also includes the functions for regular expressions like :func:`.regexp_replace` and :func:`.regexp_match`
9696

9797
.. ipython:: python
9898
9999
df.select(
100100
f.regexp_match(col('"Name"'), literal("Char")).alias("dragons"),
101+
f.regexp_replace(col('"Name"'), literal("saur"), literal("fleur")).alias("flowers")
101102
)
102103
103104

examples/tpch/_tests.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,10 @@ def check_q17(df):
7272
("q08_market_share", "q8"),
7373
("q09_product_type_profit_measure", "q9"),
7474
("q10_returned_item_reporting", "q10"),
75-
("q11_important_stock_identification", "q11"),
75+
pytest.param(
76+
"q11_important_stock_identification", "q11",
77+
marks=pytest.mark.xfail # https://github.com/apache/datafusion-python/issues/730
78+
),
7679
("q12_ship_mode_order_priority", "q12"),
7780
("q13_customer_distribution", "q13"),
7881
("q14_promotion_effect", "q14"),

examples/tpch/q01_pricing_summary_report.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,7 @@
4848
# want to report results for. It should be between 60-120 days before the end.
4949
DAYS_BEFORE_FINAL = 90
5050

51-
# Note: this is a hack on setting the values. It should be set differently once
52-
# https://github.com/apache/datafusion-python/issues/665 is resolved.
53-
interval = pa.scalar((0, 0, DAYS_BEFORE_FINAL), type=pa.month_day_nano_interval())
51+
interval = pa.scalar((0, DAYS_BEFORE_FINAL, 0), type=pa.month_day_nano_interval())
5452

5553
print("Final date in database:", greatest_ship_date)
5654

examples/tpch/q04_order_priority_checking.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,7 @@
4949
# Create a date object from the string
5050
date = datetime.strptime(DATE_OF_INTEREST, "%Y-%m-%d").date()
5151

52-
# Note: this is a hack on setting the values. It should be set differently once
53-
# https://github.com/apache/datafusion-python/issues/665 is resolved.
54-
interval = pa.scalar((0, 0, INTERVAL_DAYS), type=pa.month_day_nano_interval())
52+
interval = pa.scalar((0, INTERVAL_DAYS, 0), type=pa.month_day_nano_interval())
5553

5654
# Limit results to cases where commitment date before receipt date
5755
# Aggregate the results so we only get one row to join with the order table.

examples/tpch/q05_local_supplier_volume.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,7 @@
4141

4242
date = datetime.strptime(DATE_OF_INTEREST, "%Y-%m-%d").date()
4343

44-
# Note: this is a hack on setting the values. It should be set differently once
45-
# https://github.com/apache/datafusion-python/issues/665 is resolved.
46-
interval = pa.scalar((0, 0, INTERVAL_DAYS), type=pa.month_day_nano_interval())
44+
interval = pa.scalar((0, INTERVAL_DAYS, 0), type=pa.month_day_nano_interval())
4745

4846
# Load the dataframes we need
4947

examples/tpch/q06_forecasting_revenue_change.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,7 @@
4545

4646
date = datetime.strptime(DATE_OF_INTEREST, "%Y-%m-%d").date()
4747

48-
# Note: this is a hack on setting the values. It should be set differently once
49-
# https://github.com/apache/datafusion-python/issues/665 is resolved.
50-
interval = pa.scalar((0, 0, INTERVAL_DAYS), type=pa.month_day_nano_interval())
48+
interval = pa.scalar((0, INTERVAL_DAYS, 0), type=pa.month_day_nano_interval())
5149

5250
# Load the dataframes we need
5351

examples/tpch/q10_returned_item_reporting.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,7 @@
3838

3939
date_start_of_quarter = lit(datetime.strptime(DATE_START_OF_QUARTER, "%Y-%m-%d").date())
4040

41-
# Note: this is a hack on setting the values. It should be set differently once
42-
# https://github.com/apache/datafusion-python/issues/665 is resolved.
43-
interval_one_quarter = lit(pa.scalar((0, 0, 92), type=pa.month_day_nano_interval()))
41+
interval_one_quarter = lit(pa.scalar((0, 92, 0), type=pa.month_day_nano_interval()))
4442

4543
# Load the dataframes we need
4644

examples/tpch/q12_ship_mode_order_priority.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,7 @@
5151

5252
date = datetime.strptime(DATE_OF_INTEREST, "%Y-%m-%d").date()
5353

54-
# Note: this is a hack on setting the values. It should be set differently once
55-
# https://github.com/apache/datafusion-python/issues/665 is resolved.
56-
interval = pa.scalar((0, 0, 365), type=pa.month_day_nano_interval())
54+
interval = pa.scalar((0, 365, 0), type=pa.month_day_nano_interval())
5755

5856

5957
df = df_lineitem.filter(col("l_receiptdate") >= lit(date)).filter(

examples/tpch/q14_promotion_effect.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,8 @@
3434
DATE = "1995-09-01"
3535

3636
date_of_interest = lit(datetime.strptime(DATE, "%Y-%m-%d").date())
37-
# Note: this is a hack on setting the values. It should be set differently once
38-
# https://github.com/apache/datafusion-python/issues/665 is resolved.
39-
interval_one_month = lit(pa.scalar((0, 0, 30), type=pa.month_day_nano_interval()))
37+
38+
interval_one_month = lit(pa.scalar((0, 30, 0), type=pa.month_day_nano_interval()))
4039

4140
# Load the dataframes we need
4241

examples/tpch/q15_top_supplier.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,8 @@
3434
DATE = "1996-01-01"
3535

3636
date_of_interest = lit(datetime.strptime(DATE, "%Y-%m-%d").date())
37-
# Note: this is a hack on setting the values. It should be set differently once
38-
# https://github.com/apache/datafusion-python/issues/665 is resolved.
39-
interval_3_months = lit(pa.scalar((0, 0, 91), type=pa.month_day_nano_interval()))
37+
38+
interval_3_months = lit(pa.scalar((0, 91, 0), type=pa.month_day_nano_interval()))
4039

4140
# Load the dataframes we need
4241

examples/tpch/q20_potential_part_promotion.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,7 @@
5656

5757
date = datetime.strptime(DATE_OF_INTEREST, "%Y-%m-%d").date()
5858

59-
# Note: this is a hack on setting the values. It should be set differently once
60-
# https://github.com/apache/datafusion-python/issues/665 is resolved.
61-
interval = pa.scalar((0, 0, 365), type=pa.month_day_nano_interval())
59+
interval = pa.scalar((0, 365, 0), type=pa.month_day_nano_interval())
6260

6361
# Filter down dataframes
6462
df_nation = df_nation.filter(col("n_name") == lit(NATION_OF_INTEREST))

python/datafusion/__init__.py

-2
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@
5353
ScalarVariable,
5454
Sort,
5555
TableScan,
56-
GetIndexedField,
5756
Not,
5857
IsNotNull,
5958
IsTrue,
@@ -116,7 +115,6 @@
116115
"SimilarTo",
117116
"ScalarVariable",
118117
"Alias",
119-
"GetIndexedField",
120118
"Not",
121119
"IsNotNull",
122120
"IsTrue",

python/datafusion/tests/test_functions.py

-1
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,6 @@ def py_flatten(arr):
461461
pytest.param(
462462
lambda col: f.list_slice(col, literal(-1), literal(2)),
463463
lambda data: [arr[-1:2] for arr in data],
464-
marks=pytest.mark.xfail,
465464
),
466465
[
467466
lambda col: f.array_intersect(col, literal([3.0, 4.0])),

python/datafusion/tests/test_imports.py

-2
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@
5353
SimilarTo,
5454
ScalarVariable,
5555
Alias,
56-
GetIndexedField,
5756
Not,
5857
IsNotNull,
5958
IsTrue,
@@ -126,7 +125,6 @@ def test_class_module_is_datafusion():
126125
SimilarTo,
127126
ScalarVariable,
128127
Alias,
129-
GetIndexedField,
130128
Not,
131129
IsNotNull,
132130
IsTrue,

src/common/data_type.rs

+1
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,7 @@ impl DataTypeMap {
251251
pub fn map_from_scalar_to_arrow(scalar_val: &ScalarValue) -> Result<DataType, PyErr> {
252252
match scalar_val {
253253
ScalarValue::Boolean(_) => Ok(DataType::Boolean),
254+
ScalarValue::Float16(_) => Ok(DataType::Float16),
254255
ScalarValue::Float32(_) => Ok(DataType::Float32),
255256
ScalarValue::Float64(_) => Ok(DataType::Float64),
256257
ScalarValue::Decimal128(_, precision, scale) => {

src/dataset_exec.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ impl ExecutionPlan for DatasetExec {
164164
self.schema.clone()
165165
}
166166

167-
fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> {
167+
fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
168168
// this is a leaf node and has no children
169169
vec![]
170170
}

src/expr.rs

+14-18
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,12 @@ use std::sync::Arc;
2323

2424
use datafusion::arrow::datatypes::{DataType, Field};
2525
use datafusion::arrow::pyarrow::PyArrowType;
26+
use datafusion::functions::core::expr_ext::FieldAccessor;
2627
use datafusion::scalar::ScalarValue;
2728
use datafusion_expr::{
2829
col,
2930
expr::{AggregateFunction, InList, InSubquery, ScalarFunction, Sort, WindowFunction},
30-
lit, Between, BinaryExpr, Case, Cast, Expr, GetFieldAccess, GetIndexedField, Like, Operator,
31-
TryCast,
31+
lit, Between, BinaryExpr, Case, Cast, Expr, Like, Operator, TryCast,
3232
};
3333

3434
use crate::common::data_type::{DataTypeMap, RexType};
@@ -71,7 +71,6 @@ pub mod filter;
7171
pub mod grouping_set;
7272
pub mod in_list;
7373
pub mod in_subquery;
74-
pub mod indexed_field;
7574
pub mod join;
7675
pub mod like;
7776
pub mod limit;
@@ -216,13 +215,7 @@ impl PyExpr {
216215
}
217216

218217
fn __getitem__(&self, key: &str) -> PyResult<PyExpr> {
219-
Ok(Expr::GetIndexedField(GetIndexedField::new(
220-
Box::new(self.expr.clone()),
221-
GetFieldAccess::NamedStructField {
222-
name: ScalarValue::Utf8(Some(key.to_string())),
223-
},
224-
))
225-
.into())
218+
Ok(self.expr.clone().field(key).into())
226219
}
227220

228221
#[staticmethod]
@@ -263,7 +256,7 @@ impl PyExpr {
263256
pub fn rex_type(&self) -> PyResult<RexType> {
264257
Ok(match self.expr {
265258
Expr::Alias(..) => RexType::Alias,
266-
Expr::Column(..) | Expr::GetIndexedField { .. } => RexType::Reference,
259+
Expr::Column(..) => RexType::Reference,
267260
Expr::ScalarVariable(..) | Expr::Literal(..) => RexType::Literal,
268261
Expr::BinaryExpr { .. }
269262
| Expr::Not(..)
@@ -314,6 +307,11 @@ impl PyExpr {
314307
),
315308
)),
316309
ScalarValue::Boolean(v) => Ok(v.into_py(py)),
310+
ScalarValue::Float16(_) => Err(py_datafusion_err(
311+
datafusion_common::DataFusionError::NotImplemented(
312+
"ScalarValue::Float16".to_string(),
313+
),
314+
)),
317315
ScalarValue::Float32(v) => Ok(v.into_py(py)),
318316
ScalarValue::Float64(v) => Ok(v.into_py(py)),
319317
ScalarValue::Decimal128(v, _, _) => Ok(v.into_py(py)),
@@ -355,8 +353,10 @@ impl PyExpr {
355353
ScalarValue::TimestampMicrosecond(v, _) => Ok(v.into_py(py)),
356354
ScalarValue::TimestampNanosecond(v, _) => Ok(v.into_py(py)),
357355
ScalarValue::IntervalYearMonth(v) => Ok(v.into_py(py)),
358-
ScalarValue::IntervalDayTime(v) => Ok(v.into_py(py)),
359-
ScalarValue::IntervalMonthDayNano(v) => Ok(v.into_py(py)),
356+
ScalarValue::IntervalDayTime(v) => Ok(ScalarValue::IntervalDayTime(*v).into_py(py)),
357+
ScalarValue::IntervalMonthDayNano(v) => {
358+
Ok(ScalarValue::IntervalMonthDayNano(*v).into_py(py))
359+
}
360360
ScalarValue::DurationSecond(v) => Ok(v.into_py(py)),
361361
ScalarValue::DurationMicrosecond(v) => Ok(v.into_py(py)),
362362
ScalarValue::DurationNanosecond(v) => Ok(v.into_py(py)),
@@ -417,7 +417,6 @@ impl PyExpr {
417417
| Expr::IsNotFalse(expr)
418418
| Expr::IsNotUnknown(expr)
419419
| Expr::Negative(expr)
420-
| Expr::GetIndexedField(GetIndexedField { expr, .. })
421420
| Expr::Cast(Cast { expr, .. })
422421
| Expr::TryCast(TryCast { expr, .. })
423422
| Expr::Sort(Sort { expr, .. })
@@ -513,9 +512,7 @@ impl PyExpr {
513512
op,
514513
right: _,
515514
}) => format!("{op}"),
516-
Expr::ScalarFunction(ScalarFunction { func_def, args: _ }) => {
517-
func_def.name().to_string()
518-
}
515+
Expr::ScalarFunction(ScalarFunction { func, args: _ }) => func.name().to_string(),
519516
Expr::Cast { .. } => "cast".to_string(),
520517
Expr::Between { .. } => "between".to_string(),
521518
Expr::Case { .. } => "case".to_string(),
@@ -674,7 +671,6 @@ pub(crate) fn init_module(m: &PyModule) -> PyResult<()> {
674671
m.add_class::<cast::PyCast>()?;
675672
m.add_class::<cast::PyTryCast>()?;
676673
m.add_class::<between::PyBetween>()?;
677-
m.add_class::<indexed_field::PyGetIndexedField>()?;
678674
m.add_class::<explain::PyExplain>()?;
679675
m.add_class::<limit::PyLimit>()?;
680676
m.add_class::<aggregate::PyAggregate>()?;

src/expr/literal.rs

+1-6
Original file line numberDiff line numberDiff line change
@@ -137,12 +137,7 @@ impl PyLiteral {
137137

138138
pub fn value_interval_day_time(&self) -> PyResult<Option<(i32, i32)>> {
139139
match &self.value {
140-
ScalarValue::IntervalDayTime(Some(iv)) => {
141-
let interval = *iv as u64;
142-
let days = (interval >> 32) as i32;
143-
let ms = interval as i32;
144-
Ok(Some((days, ms)))
145-
}
140+
ScalarValue::IntervalDayTime(Some(iv)) => Ok(Some((iv.days, iv.milliseconds))),
146141
ScalarValue::IntervalDayTime(None) => Ok(None),
147142
other => Err(unexpected_literal_value(other)),
148143
}

src/expr/signature.rs

-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ use pyo3::prelude::*;
2020

2121
#[allow(dead_code)]
2222
#[pyclass(name = "Signature", module = "datafusion.expr", subclass)]
23-
#[allow(dead_code)]
2423
#[derive(Clone)]
2524
pub struct PySignature {
2625
type_signature: TypeSignature,

0 commit comments

Comments
 (0)