Skip to content

Commit 1f74d8e

Browse files
committed
Remove Sort expression (Expr::Sort)
Remove sort as an expression, i.e. remove `Expr::Sort` from the `Expr` enum. Use `expr::Sort` directly when sorting. The sort expression was only used in the context of ordering (sort, topk, create table, file sorting). Those places require their sort expressions to be of type `Sort` anyway, and no other expression was allowed, so this change improves static typing. Sort as an expression was illegal in all other contexts.
1 parent 840d843 commit 1f74d8e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

65 files changed

+708
-876
lines changed

datafusion-examples/examples/advanced_udwf.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ async fn main() -> Result<()> {
219219
let window_expr = smooth_it
220220
.call(vec![col("speed")]) // smooth_it(speed)
221221
.partition_by(vec![col("car")]) // PARTITION BY car
222-
.order_by(vec![col("time").sort(true, true).to_expr()]) // ORDER BY time ASC
222+
.order_by(vec![col("time").sort(true, true)]) // ORDER BY time ASC
223223
.window_frame(WindowFrame::new(None))
224224
.build()?;
225225
let df = ctx.table("cars").await?.window(vec![window_expr])?;

datafusion-examples/examples/expr_api.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ fn expr_fn_demo() -> Result<()> {
9999
// such as `FIRST_VALUE(price FILTER quantity > 100 ORDER BY ts )
100100
let agg = first_value
101101
.call(vec![col("price")])
102-
.order_by(vec![col("ts").sort(false, false).to_expr()])
102+
.order_by(vec![col("ts").sort(false, false)])
103103
.filter(col("quantity").gt(lit(100)))
104104
.build()?; // build the aggregate
105105
assert_eq!(

datafusion-examples/examples/file_stream_provider.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ mod non_windows {
3939
use datafusion::datasource::TableProvider;
4040
use datafusion::prelude::{SessionConfig, SessionContext};
4141
use datafusion_common::{exec_err, Result};
42-
use datafusion_expr::Expr;
42+
use datafusion_expr::SortExpr;
4343

4444
// Number of lines written to FIFO
4545
const TEST_BATCH_SIZE: usize = 5;
@@ -49,7 +49,7 @@ mod non_windows {
4949
fn fifo_table(
5050
schema: SchemaRef,
5151
path: impl Into<PathBuf>,
52-
sort: Vec<Vec<Expr>>,
52+
sort: Vec<Vec<SortExpr>>,
5353
) -> Arc<dyn TableProvider> {
5454
let source = FileStreamProvider::new_file(schema, path.into())
5555
.with_batch_size(TEST_BATCH_SIZE)
@@ -157,7 +157,7 @@ mod non_windows {
157157
]));
158158

159159
// Specify the ordering:
160-
let order = vec![vec![datafusion_expr::col("a1").sort(true, false).to_expr()]];
160+
let order = vec![vec![datafusion_expr::col("a1").sort(true, false)]];
161161

162162
let provider = fifo_table(schema.clone(), fifo_path, order.clone());
163163
ctx.register_table("fifo", provider)?;

datafusion-examples/examples/parse_sql_expr.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ async fn query_parquet_demo() -> Result<()> {
114114
)?
115115
// Directly parsing the SQL text into a sort expression is not supported yet, so
116116
// construct it programmatically
117-
.sort(vec![col("double_col").sort(false, false).to_expr()])?
117+
.sort(vec![col("double_col").sort(false, false)])?
118118
.limit(0, Some(1))?;
119119

120120
let result = df.collect().await?;

datafusion-examples/examples/simple_udwf.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ async fn main() -> Result<()> {
121121
let window_expr = smooth_it
122122
.call(vec![col("speed")]) // smooth_it(speed)
123123
.partition_by(vec![col("car")]) // PARTITION BY car
124-
.order_by(vec![col("time").sort(true, true).to_expr()]) // ORDER BY time ASC
124+
.order_by(vec![col("time").sort(true, true)]) // ORDER BY time ASC
125125
.window_frame(WindowFrame::new(None))
126126
.build()?;
127127
let df = ctx.table("cars").await?.window(vec![window_expr])?;

datafusion/core/src/dataframe/mod.rs

Lines changed: 38 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ use datafusion_common::config::{CsvOptions, JsonOptions};
5252
use datafusion_common::{
5353
plan_err, Column, DFSchema, DataFusionError, ParamValues, SchemaError, UnnestOptions,
5454
};
55-
use datafusion_expr::{case, is_null, lit};
55+
use datafusion_expr::{case, is_null, lit, SortExpr};
5656
use datafusion_expr::{
5757
utils::COUNT_STAR_EXPANSION, TableProviderFilterPushDown, UNNAMED_TABLE,
5858
};
@@ -62,7 +62,6 @@ use datafusion_functions_aggregate::expr_fn::{
6262

6363
use async_trait::async_trait;
6464
use datafusion_catalog::Session;
65-
use datafusion_expr::expr::sort_vec_from_expr;
6665

6766
/// Contains options that control how data is
6867
/// written out from a DataFrame
@@ -578,7 +577,7 @@ impl DataFrame {
578577
self,
579578
on_expr: Vec<Expr>,
580579
select_expr: Vec<Expr>,
581-
sort_expr: Option<Vec<Expr>>,
580+
sort_expr: Option<Vec<SortExpr>>,
582581
) -> Result<DataFrame> {
583582
let plan = LogicalPlanBuilder::from(self.plan)
584583
.distinct_on(on_expr, select_expr, sort_expr)?
@@ -777,6 +776,15 @@ impl DataFrame {
777776
})
778777
}
779778

779+
/// Sort the DataFrame by the provided expressions using the default direction (ascending, nulls last)
780+
pub fn sort_by(self, expr: Vec<Expr>) -> Result<DataFrame> {
781+
self.sort(
782+
expr.into_iter()
783+
.map(|e| e.sort(true, false))
784+
.collect::<Vec<SortExpr>>(),
785+
)
786+
}
787+
780788
/// Sort the DataFrame by the specified sorting expressions.
781789
///
782790
/// Note that any expression can be turned into
@@ -792,16 +800,14 @@ impl DataFrame {
792800
/// let ctx = SessionContext::new();
793801
/// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
794802
/// let df = df.sort(vec![
795-
/// col("a").sort(true, true).to_expr(), // a ASC, nulls first
796-
/// col("b").sort(false, false).to_expr(), // b DESC, nulls last
803+
/// col("a").sort(true, true), // a ASC, nulls first
804+
/// col("b").sort(false, false), // b DESC, nulls last
797805
/// ])?;
798806
/// # Ok(())
799807
/// # }
800808
/// ```
801-
pub fn sort(self, expr: Vec<Expr>) -> Result<DataFrame> {
802-
let plan = LogicalPlanBuilder::from(self.plan)
803-
.sort(sort_vec_from_expr(expr))?
804-
.build()?;
809+
pub fn sort(self, expr: Vec<SortExpr>) -> Result<DataFrame> {
810+
let plan = LogicalPlanBuilder::from(self.plan).sort(expr)?.build()?;
805811
Ok(DataFrame {
806812
session_state: self.session_state,
807813
plan,
@@ -1322,7 +1328,7 @@ impl DataFrame {
13221328
/// let ctx = SessionContext::new();
13231329
/// // Sort the data by column "b" and write it to a new location
13241330
/// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?
1325-
/// .sort(vec![col("b").sort(true, true).to_expr()])? // sort by b asc, nulls first
1331+
/// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first
13261332
/// .write_csv(
13271333
/// "output.csv",
13281334
/// DataFrameWriteOptions::new(),
@@ -1382,7 +1388,7 @@ impl DataFrame {
13821388
/// let ctx = SessionContext::new();
13831389
/// // Sort the data by column "b" and write it to a new location
13841390
/// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?
1385-
/// .sort(vec![col("b").sort(true, true).to_expr()])? // sort by b asc, nulls first
1391+
/// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first
13861392
/// .write_json(
13871393
/// "output.json",
13881394
/// DataFrameWriteOptions::new(),
@@ -2406,10 +2412,7 @@ mod tests {
24062412

24072413
Expr::WindowFunction(w)
24082414
.null_treatment(NullTreatment::IgnoreNulls)
2409-
.order_by(vec![
2410-
col("c2").sort(true, true).to_expr(),
2411-
col("c3").sort(true, true).to_expr(),
2412-
])
2415+
.order_by(vec![col("c2").sort(true, true), col("c3").sort(true, true)])
24132416
.window_frame(WindowFrame::new_bounds(
24142417
WindowFrameUnits::Rows,
24152418
WindowFrameBound::Preceding(ScalarValue::UInt64(None)),
@@ -2499,7 +2502,7 @@ mod tests {
24992502
.unwrap()
25002503
.distinct()
25012504
.unwrap()
2502-
.sort(vec![col("c1").sort(true, true).to_expr()])
2505+
.sort(vec![col("c1").sort(true, true)])
25032506
.unwrap();
25042507

25052508
let df_results = plan.clone().collect().await?;
@@ -2530,7 +2533,7 @@ mod tests {
25302533
.distinct()
25312534
.unwrap()
25322535
// try to sort on some value not present in input to distinct
2533-
.sort(vec![col("c2").sort(true, true).to_expr()])
2536+
.sort(vec![col("c2").sort(true, true)])
25342537
.unwrap_err();
25352538
assert_eq!(err.strip_backtrace(), "Error during planning: For SELECT DISTINCT, ORDER BY expressions c2 must appear in select list");
25362539

@@ -2577,10 +2580,10 @@ mod tests {
25772580
.distinct_on(
25782581
vec![col("c1")],
25792582
vec![col("c1")],
2580-
Some(vec![col("c1").sort(true, true).to_expr()]),
2583+
Some(vec![col("c1").sort(true, true)]),
25812584
)
25822585
.unwrap()
2583-
.sort(vec![col("c1").sort(true, true).to_expr()])
2586+
.sort(vec![col("c1").sort(true, true)])
25842587
.unwrap();
25852588

25862589
let df_results = plan.clone().collect().await?;
@@ -2611,11 +2614,11 @@ mod tests {
26112614
.distinct_on(
26122615
vec![col("c1")],
26132616
vec![col("c1")],
2614-
Some(vec![col("c1").sort(true, true).to_expr()]),
2617+
Some(vec![col("c1").sort(true, true)]),
26152618
)
26162619
.unwrap()
26172620
// try to sort on some value not present in input to distinct
2618-
.sort(vec![col("c2").sort(true, true).to_expr()])
2621+
.sort(vec![col("c2").sort(true, true)])
26192622
.unwrap_err();
26202623
assert_eq!(err.strip_backtrace(), "Error during planning: For SELECT DISTINCT, ORDER BY expressions c2 must appear in select list");
26212624

@@ -3021,7 +3024,7 @@ mod tests {
30213024
)?
30223025
.sort(vec![
30233026
// make the test deterministic
3024-
col("t1.c1").sort(true, true).to_expr(),
3027+
col("t1.c1").sort(true, true),
30253028
])?
30263029
.limit(0, Some(1))?;
30273030

@@ -3098,7 +3101,7 @@ mod tests {
30983101
)?
30993102
.sort(vec![
31003103
// make the test deterministic
3101-
col("t1.c1").sort(true, true).to_expr(),
3104+
col("t1.c1").sort(true, true),
31023105
])?
31033106
.limit(0, Some(1))?;
31043107

@@ -3131,9 +3134,9 @@ mod tests {
31313134
.filter(col("c2").eq(lit(3)).and(col("c1").eq(lit("a"))))?
31323135
.sort(vec![
31333136
// make the test deterministic
3134-
col("c1").sort(true, true).to_expr(),
3135-
col("c2").sort(true, true).to_expr(),
3136-
col("c3").sort(true, true).to_expr(),
3137+
col("c1").sort(true, true),
3138+
col("c2").sort(true, true),
3139+
col("c3").sort(true, true),
31373140
])?
31383141
.limit(0, Some(1))?
31393142
.with_column("sum", col("c2") + col("c3"))?;
@@ -3211,12 +3214,12 @@ mod tests {
32113214
)?
32123215
.sort(vec![
32133216
// make the test deterministic
3214-
col("t1.c1").sort(true, true).to_expr(),
3215-
col("t1.c2").sort(true, true).to_expr(),
3216-
col("t1.c3").sort(true, true).to_expr(),
3217-
col("t2.c1").sort(true, true).to_expr(),
3218-
col("t2.c2").sort(true, true).to_expr(),
3219-
col("t2.c3").sort(true, true).to_expr(),
3217+
col("t1.c1").sort(true, true),
3218+
col("t1.c2").sort(true, true),
3219+
col("t1.c3").sort(true, true),
3220+
col("t2.c1").sort(true, true),
3221+
col("t2.c2").sort(true, true),
3222+
col("t2.c3").sort(true, true),
32203223
])?
32213224
.limit(0, Some(1))?;
32223225

@@ -3289,9 +3292,9 @@ mod tests {
32893292
.limit(0, Some(1))?
32903293
.sort(vec![
32913294
// make the test deterministic
3292-
col("c1").sort(true, true).to_expr(),
3293-
col("c2").sort(true, true).to_expr(),
3294-
col("c3").sort(true, true).to_expr(),
3295+
col("c1").sort(true, true),
3296+
col("c2").sort(true, true),
3297+
col("c3").sort(true, true),
32953298
])?
32963299
.select_columns(&["c1"])?;
32973300

datafusion/core/src/datasource/listing/helpers.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,10 @@ pub fn expr_applicable_for_cols(col_names: &[String], expr: &Expr) -> bool {
102102
}
103103

104104
// TODO other expressions are not handled yet:
105-
// - AGGREGATE, WINDOW and SORT should not end up in filter conditions, except maybe in some edge cases
105+
// - AGGREGATE and WINDOW should not end up in filter conditions, except maybe in some edge cases
106106
// - Can `Wildcard` be considered as a `Literal`?
107107
// - ScalarVariable could be `applicable`, but that would require access to the context
108108
Expr::AggregateFunction { .. }
109-
| Expr::Sort { .. }
110109
| Expr::WindowFunction { .. }
111110
| Expr::Wildcard { .. }
112111
| Expr::Unnest { .. }

datafusion/core/src/datasource/listing/table.rs

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ use datafusion_physical_expr::{
5151

5252
use async_trait::async_trait;
5353
use datafusion_catalog::Session;
54-
use datafusion_expr::expr::sort_vec_vec_to_expr;
5554
use futures::{future, stream, StreamExt, TryStreamExt};
5655
use itertools::Itertools;
5756
use object_store::ObjectStore;
@@ -714,10 +713,7 @@ impl ListingTable {
714713

715714
/// If file_sort_order is specified, creates the appropriate physical expressions
716715
fn try_create_output_ordering(&self) -> Result<Vec<LexOrdering>> {
717-
create_ordering(
718-
&self.table_schema,
719-
&sort_vec_vec_to_expr(self.options.file_sort_order.clone()),
720-
)
716+
create_ordering(&self.table_schema, &self.options.file_sort_order)
721717
}
722718
}
723719

@@ -1068,7 +1064,6 @@ mod tests {
10681064
use datafusion_physical_expr::PhysicalSortExpr;
10691065
use datafusion_physical_plan::ExecutionPlanProperties;
10701066

1071-
use datafusion_expr::expr::sort_vec_vec_from_expr;
10721067
use tempfile::TempDir;
10731068

10741069
#[tokio::test]
@@ -1159,6 +1154,7 @@ mod tests {
11591154

11601155
use crate::datasource::file_format::parquet::ParquetFormat;
11611156
use datafusion_physical_plan::expressions::col as physical_col;
1157+
use std::ops::Add;
11621158

11631159
// (file_sort_order, expected_result)
11641160
let cases = vec![
@@ -1207,9 +1203,7 @@ mod tests {
12071203
];
12081204

12091205
for (file_sort_order, expected_result) in cases {
1210-
let options = options.clone().with_file_sort_order(sort_vec_vec_from_expr(
1211-
sort_vec_vec_to_expr(file_sort_order),
1212-
));
1206+
let options = options.clone().with_file_sort_order(file_sort_order);
12131207

12141208
let config = ListingTableConfig::new(table_path.clone())
12151209
.with_listing_options(options)

datafusion/core/src/datasource/listing_table_factory.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ use datafusion_expr::CreateExternalTable;
3333

3434
use async_trait::async_trait;
3535
use datafusion_catalog::Session;
36-
use datafusion_expr::expr::sort_vec_vec_from_expr;
3736

3837
/// A `TableProviderFactory` capable of creating new `ListingTable`s
3938
#[derive(Debug, Default)]
@@ -115,7 +114,7 @@ impl TableProviderFactory for ListingTableFactory {
115114
.with_file_extension(file_extension)
116115
.with_target_partitions(state.config().target_partitions())
117116
.with_table_partition_cols(table_partition_cols)
118-
.with_file_sort_order(sort_vec_vec_from_expr(cmd.order_exprs.clone()));
117+
.with_file_sort_order(cmd.order_exprs.clone());
119118

120119
options
121120
.validate_partitions(session_state, &table_path)

datafusion/core/src/datasource/memory.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ use datafusion_physical_plan::metrics::MetricsSet;
4343

4444
use async_trait::async_trait;
4545
use datafusion_catalog::Session;
46+
use datafusion_expr::SortExpr;
4647
use futures::StreamExt;
4748
use log::debug;
4849
use parking_lot::Mutex;
@@ -64,7 +65,7 @@ pub struct MemTable {
6465
column_defaults: HashMap<String, Expr>,
6566
/// Optional pre-known sort order(s). Must be `SortExpr`s.
6667
/// inserting data into this table removes the order
67-
pub sort_order: Arc<Mutex<Vec<Vec<Expr>>>>,
68+
pub sort_order: Arc<Mutex<Vec<Vec<SortExpr>>>>,
6869
}
6970

7071
impl MemTable {
@@ -118,7 +119,7 @@ impl MemTable {
118119
///
119120
/// Note that multiple sort orders are supported, if some are known to be
120121
/// equivalent,
121-
pub fn with_sort_order(self, mut sort_order: Vec<Vec<Expr>>) -> Self {
122+
pub fn with_sort_order(self, mut sort_order: Vec<Vec<SortExpr>>) -> Self {
122123
std::mem::swap(self.sort_order.lock().as_mut(), &mut sort_order);
123124
self
124125
}

0 commit comments

Comments
 (0)