Skip to content

Commit 83487e3

Browse files
authored
feat: Improve datafusion-cli memory usage and considering reserve memory for the result batches (#14766)
* feat: Improve datafusion-cli memory usage and considering reserve memory for the result batches
* Address new comments
* Address new comments
* fix test
* fix test
* Address comments
* Fix doc
* Fix row count showing
* Fix fmt
* fix corner case
* remove unused code
1 parent 84232d8 commit 83487e3

File tree

3 files changed

+38
-11
lines changed

3 files changed

+38
-11
lines changed

datafusion-cli/src/command.rs

+3-1
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,9 @@ impl Command {
6262
Self::Help => {
6363
let now = Instant::now();
6464
let command_batch = all_commands_info();
65-
print_options.print_batches(command_batch.schema(), &[command_batch], now)
65+
let schema = command_batch.schema();
66+
let num_rows = command_batch.num_rows();
67+
print_options.print_batches(schema, &[command_batch], now, num_rows)
6668
}
6769
Self::ListTables => {
6870
exec_and_print(ctx, print_options, "SHOW TABLES".into()).await

datafusion-cli/src/exec.rs

+34-9
Original file line number | Diff line number | Diff line change
@@ -17,11 +17,6 @@
1717

1818
//! Execution functions
1919
20-
use std::collections::HashMap;
21-
use std::fs::File;
22-
use std::io::prelude::*;
23-
use std::io::BufReader;
24-
2520
use crate::cli_context::CliSessionContext;
2621
use crate::helper::split_from_semicolon;
2722
use crate::print_format::PrintFormat;
@@ -31,6 +26,11 @@ use crate::{
3126
object_storage::get_object_store,
3227
print_options::{MaxRows, PrintOptions},
3328
};
29+
use futures::StreamExt;
30+
use std::collections::HashMap;
31+
use std::fs::File;
32+
use std::io::prelude::*;
33+
use std::io::BufReader;
3434

3535
use datafusion::common::instant::Instant;
3636
use datafusion::common::{plan_datafusion_err, plan_err};
@@ -39,10 +39,12 @@ use datafusion::datasource::listing::ListingTableUrl;
3939
use datafusion::error::{DataFusionError, Result};
4040
use datafusion::logical_expr::{DdlStatement, LogicalPlan};
4141
use datafusion::physical_plan::execution_plan::EmissionType;
42-
use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties};
42+
use datafusion::physical_plan::{execute_stream, ExecutionPlanProperties};
4343
use datafusion::sql::parser::{DFParser, Statement};
4444
use datafusion::sql::sqlparser::dialect::dialect_from_str;
4545

46+
use datafusion::execution::memory_pool::MemoryConsumer;
47+
use datafusion::physical_plan::spill::get_record_batch_memory_size;
4648
use datafusion::sql::sqlparser;
4749
use rustyline::error::ReadlineError;
4850
use rustyline::Editor;
@@ -235,6 +237,10 @@ pub(super) async fn exec_and_print(
235237
let df = ctx.execute_logical_plan(plan).await?;
236238
let physical_plan = df.create_physical_plan().await?;
237239

240+
// Track memory usage for the query result if it's bounded
241+
let mut reservation =
242+
MemoryConsumer::new("DataFusion-Cli").register(task_ctx.memory_pool());
243+
238244
if physical_plan.boundedness().is_unbounded() {
239245
if physical_plan.pipeline_behavior() == EmissionType::Final {
240246
return plan_err!(
@@ -247,10 +253,29 @@ pub(super) async fn exec_and_print(
247253
let stream = execute_stream(physical_plan, task_ctx.clone())?;
248254
print_options.print_stream(stream, now).await?;
249255
} else {
250-
// Bounded stream; collected results are printed after all input consumed.
256+
// Bounded stream; collected results size is limited by the maxrows option
251257
let schema = physical_plan.schema();
252-
let results = collect(physical_plan, task_ctx.clone()).await?;
253-
adjusted.into_inner().print_batches(schema, &results, now)?;
258+
let mut stream = execute_stream(physical_plan, task_ctx.clone())?;
259+
let mut results = vec![];
260+
let mut row_count = 0_usize;
261+
while let Some(batch) = stream.next().await {
262+
let batch = batch?;
263+
let curr_num_rows = batch.num_rows();
264+
if let MaxRows::Limited(max_rows) = print_options.maxrows {
265+
// Stop collecting results if the number of rows exceeds the limit
266+
// results batch should include the last batch that exceeds the limit
267+
if row_count < max_rows + curr_num_rows {
268+
// Try to grow the reservation to accommodate the batch in memory
269+
reservation.try_grow(get_record_batch_memory_size(&batch))?;
270+
results.push(batch);
271+
}
272+
}
273+
row_count += curr_num_rows;
274+
}
275+
adjusted
276+
.into_inner()
277+
.print_batches(schema, &results, now, row_count)?;
278+
reservation.free();
254279
}
255280
}
256281

datafusion-cli/src/print_options.rs

+1-1
Original file line number | Diff line number | Diff line change
@@ -102,14 +102,14 @@ impl PrintOptions {
102102
schema: SchemaRef,
103103
batches: &[RecordBatch],
104104
query_start_time: Instant,
105+
row_count: usize,
105106
) -> Result<()> {
106107
let stdout = std::io::stdout();
107108
let mut writer = stdout.lock();
108109

109110
self.format
110111
.print_batches(&mut writer, schema, batches, self.maxrows, true)?;
111112

112-
let row_count: usize = batches.iter().map(|b| b.num_rows()).sum();
113113
let formatted_exec_details = get_execution_details_formatted(
114114
row_count,
115115
if self.format == PrintFormat::Table {

0 commit comments

Comments
 (0)