Skip to content

Commit eca0e07

Browse files
alan910127 and jonahgao authored
refactor: add get_available_parallelism function (#13595)
Co-authored-by: jonahgao <[email protected]>
1 parent 703b10d commit eca0e07

File tree

13 files changed: +48 -77 lines changed

benchmarks/src/bin/external_aggr.rs

+4-7
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,9 @@
1818
//! external_aggr binary entrypoint
1919
2020
use std::collections::HashMap;
21-
use std::num::NonZero;
2221
use std::path::PathBuf;
2322
use std::sync::Arc;
2423
use std::sync::OnceLock;
25-
use std::thread::available_parallelism;
2624
use structopt::StructOpt;
2725

2826
use arrow::record_batch::RecordBatch;
@@ -41,6 +39,7 @@ use datafusion::physical_plan::{collect, displayable};
4139
use datafusion::prelude::*;
4240
use datafusion_benchmarks::util::{BenchmarkRun, CommonOpt};
4341
use datafusion_common::instant::Instant;
42+
use datafusion_common::utils::get_available_parallelism;
4443
use datafusion_common::{exec_datafusion_err, exec_err, DEFAULT_PARQUET_EXTENSION};
4544

4645
#[derive(Debug, StructOpt)]
@@ -327,11 +326,9 @@ impl ExternalAggrConfig {
327326
}
328327

329328
fn partitions(&self) -> usize {
330-
self.common.partitions.unwrap_or(
331-
available_parallelism()
332-
.unwrap_or(NonZero::new(1).unwrap())
333-
.get(),
334-
)
329+
self.common
330+
.partitions
331+
.unwrap_or(get_available_parallelism())
335332
}
336333

337334
/// Parse memory limit from string to number of bytes

benchmarks/src/bin/h2o.rs

+2-5
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,9 @@ use datafusion::datasource::MemTable;
2727
use datafusion::prelude::CsvReadOptions;
2828
use datafusion::{arrow::util::pretty, error::Result, prelude::SessionContext};
2929
use datafusion_benchmarks::util::BenchmarkRun;
30-
use std::num::NonZero;
30+
use datafusion_common::utils::get_available_parallelism;
3131
use std::path::PathBuf;
3232
use std::sync::Arc;
33-
use std::thread::available_parallelism;
3433
use structopt::StructOpt;
3534
use tokio::time::Instant;
3635

@@ -93,9 +92,7 @@ async fn group_by(opt: &GroupBy) -> Result<()> {
9392
.with_listing_options(ListingOptions::new(Arc::new(CsvFormat::default())))
9493
.with_schema(Arc::new(schema));
9594
let csv = ListingTable::try_new(listing_config)?;
96-
let partition_size = available_parallelism()
97-
.unwrap_or(NonZero::new(1).unwrap())
98-
.get();
95+
let partition_size = get_available_parallelism();
9996
let memtable =
10097
MemTable::load(Arc::new(csv), Some(partition_size), &ctx.state()).await?;
10198
ctx.register_table("x", Arc::new(memtable))?;

benchmarks/src/imdb/run.rs

+4-7
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use std::num::NonZero;
1918
use std::path::PathBuf;
2019
use std::sync::Arc;
21-
use std::thread::available_parallelism;
2220

2321
use super::{get_imdb_table_schema, get_query_sql, IMDB_TABLES};
2422
use crate::util::{BenchmarkRun, CommonOpt};
@@ -37,6 +35,7 @@ use datafusion::physical_plan::display::DisplayableExecutionPlan;
3735
use datafusion::physical_plan::{collect, displayable};
3836
use datafusion::prelude::*;
3937
use datafusion_common::instant::Instant;
38+
use datafusion_common::utils::get_available_parallelism;
4039
use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION};
4140

4241
use log::info;
@@ -470,11 +469,9 @@ impl RunOpt {
470469
}
471470

472471
fn partitions(&self) -> usize {
473-
self.common.partitions.unwrap_or(
474-
available_parallelism()
475-
.unwrap_or(NonZero::new(1).unwrap())
476-
.get(),
477-
)
472+
self.common
473+
.partitions
474+
.unwrap_or(get_available_parallelism())
478475
}
479476
}
480477

benchmarks/src/sort.rs

+4-8
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use std::num::NonZero;
1918
use std::path::PathBuf;
2019
use std::sync::Arc;
21-
use std::thread::available_parallelism;
2220

2321
use crate::util::{AccessLogOpt, BenchmarkRun, CommonOpt};
2422

@@ -30,7 +28,7 @@ use datafusion::physical_plan::sorts::sort::SortExec;
3028
use datafusion::prelude::{SessionConfig, SessionContext};
3129
use datafusion::test_util::parquet::TestParquetFile;
3230
use datafusion_common::instant::Instant;
33-
31+
use datafusion_common::utils::get_available_parallelism;
3432
use structopt::StructOpt;
3533

3634
/// Test performance of sorting large datasets
@@ -149,11 +147,9 @@ impl RunOpt {
149147
rundata.start_new_case(title);
150148
for i in 0..self.common.iterations {
151149
let config = SessionConfig::new().with_target_partitions(
152-
self.common.partitions.unwrap_or(
153-
available_parallelism()
154-
.unwrap_or(NonZero::new(1).unwrap())
155-
.get(),
156-
),
150+
self.common
151+
.partitions
152+
.unwrap_or(get_available_parallelism()),
157153
);
158154
let ctx = SessionContext::new_with_config(config);
159155
let (rows, elapsed) =

benchmarks/src/sort_tpch.rs

+4-7
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,8 @@
2222
//! runs end-to-end sort queries and tests the performance on multiple CPU cores.
2323
2424
use futures::StreamExt;
25-
use std::num::NonZero;
2625
use std::path::PathBuf;
2726
use std::sync::Arc;
28-
use std::thread::available_parallelism;
2927
use structopt::StructOpt;
3028

3129
use datafusion::datasource::file_format::parquet::ParquetFormat;
@@ -39,6 +37,7 @@ use datafusion::physical_plan::display::DisplayableExecutionPlan;
3937
use datafusion::physical_plan::{displayable, execute_stream};
4038
use datafusion::prelude::*;
4139
use datafusion_common::instant::Instant;
40+
use datafusion_common::utils::get_available_parallelism;
4241
use datafusion_common::DEFAULT_PARQUET_EXTENSION;
4342

4443
use crate::util::{BenchmarkRun, CommonOpt};
@@ -317,10 +316,8 @@ impl RunOpt {
317316
}
318317

319318
fn partitions(&self) -> usize {
320-
self.common.partitions.unwrap_or(
321-
available_parallelism()
322-
.unwrap_or(NonZero::new(1).unwrap())
323-
.get(),
324-
)
319+
self.common
320+
.partitions
321+
.unwrap_or(get_available_parallelism())
325322
}
326323
}

benchmarks/src/tpch/run.rs

+4-7
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use std::num::NonZero;
1918
use std::path::PathBuf;
2019
use std::sync::Arc;
21-
use std::thread::available_parallelism;
2220

2321
use super::{
2422
get_query_sql, get_tbl_tpch_table_schema, get_tpch_table_schema, TPCH_TABLES,
@@ -39,6 +37,7 @@ use datafusion::physical_plan::display::DisplayableExecutionPlan;
3937
use datafusion::physical_plan::{collect, displayable};
4038
use datafusion::prelude::*;
4139
use datafusion_common::instant::Instant;
40+
use datafusion_common::utils::get_available_parallelism;
4241
use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION};
4342

4443
use log::info;
@@ -298,11 +297,9 @@ impl RunOpt {
298297
}
299298

300299
fn partitions(&self) -> usize {
301-
self.common.partitions.unwrap_or(
302-
available_parallelism()
303-
.unwrap_or(NonZero::new(1).unwrap())
304-
.get(),
305-
)
300+
self.common
301+
.partitions
302+
.unwrap_or(get_available_parallelism())
306303
}
307304
}
308305

benchmarks/src/util/options.rs

+2-7
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use std::{num::NonZero, thread::available_parallelism};
19-
2018
use datafusion::prelude::SessionConfig;
19+
use datafusion_common::utils::get_available_parallelism;
2120
use structopt::StructOpt;
2221

2322
// Common benchmark options (don't use doc comments otherwise this doc
@@ -51,11 +50,7 @@ impl CommonOpt {
5150
pub fn update_config(&self, config: SessionConfig) -> SessionConfig {
5251
config
5352
.with_target_partitions(
54-
self.partitions.unwrap_or(
55-
available_parallelism()
56-
.unwrap_or(NonZero::new(1).unwrap())
57-
.get(),
58-
),
53+
self.partitions.unwrap_or(get_available_parallelism()),
5954
)
6055
.with_batch_size(self.batch_size)
6156
}

benchmarks/src/util/run.rs

+2-5
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,12 @@
1616
// under the License.
1717

1818
use datafusion::{error::Result, DATAFUSION_VERSION};
19+
use datafusion_common::utils::get_available_parallelism;
1920
use serde::{Serialize, Serializer};
2021
use serde_json::Value;
2122
use std::{
2223
collections::HashMap,
23-
num::NonZero,
2424
path::Path,
25-
thread::available_parallelism,
2625
time::{Duration, SystemTime},
2726
};
2827

@@ -70,9 +69,7 @@ impl RunContext {
7069
Self {
7170
benchmark_version: env!("CARGO_PKG_VERSION").to_owned(),
7271
datafusion_version: DATAFUSION_VERSION.to_owned(),
73-
num_cpus: available_parallelism()
74-
.unwrap_or(NonZero::new(1).unwrap())
75-
.get(),
72+
num_cpus: get_available_parallelism(),
7673
start_time: SystemTime::now(),
7774
arguments: std::env::args().skip(1).collect::<Vec<String>>(),
7875
}

datafusion/common/src/config.rs

+3-4
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,11 @@
2020
use std::any::Any;
2121
use std::collections::{BTreeMap, HashMap};
2222
use std::fmt::{self, Display};
23-
use std::num::NonZero;
2423
use std::str::FromStr;
25-
use std::thread::available_parallelism;
2624

2725
use crate::error::_config_err;
2826
use crate::parsers::CompressionTypeVariant;
27+
use crate::utils::get_available_parallelism;
2928
use crate::{DataFusionError, Result};
3029

3130
/// A macro that wraps a configuration struct and automatically derives
@@ -252,7 +251,7 @@ config_namespace! {
252251
/// concurrency.
253252
///
254253
/// Defaults to the number of CPU cores on the system
255-
pub target_partitions: usize, default = available_parallelism().unwrap_or(NonZero::new(1).unwrap()).get()
254+
pub target_partitions: usize, default = get_available_parallelism()
256255

257256
/// The default time zone
258257
///
@@ -268,7 +267,7 @@ config_namespace! {
268267
/// This is mostly use to plan `UNION` children in parallel.
269268
///
270269
/// Defaults to the number of CPU cores on the system
271-
pub planning_concurrency: usize, default = available_parallelism().unwrap_or(NonZero::new(1).unwrap()).get()
270+
pub planning_concurrency: usize, default = get_available_parallelism()
272271

273272
/// When set to true, skips verifying that the schema produced by
274273
/// planning the input of `LogicalPlan::Aggregate` exactly matches the

datafusion/common/src/utils/mod.rs

+12
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,10 @@ use sqlparser::parser::Parser;
3939
use std::borrow::{Borrow, Cow};
4040
use std::cmp::{min, Ordering};
4141
use std::collections::HashSet;
42+
use std::num::NonZero;
4243
use std::ops::Range;
4344
use std::sync::Arc;
45+
use std::thread::available_parallelism;
4446

4547
/// Applies an optional projection to a [`SchemaRef`], returning the
4648
/// projected schema
@@ -761,6 +763,16 @@ pub fn combine_limit(
761763
(combined_skip, combined_fetch)
762764
}
763765

766+
/// Returns the estimated number of threads available for parallel execution.
767+
///
768+
/// This is a wrapper around `std::thread::available_parallelism`, providing a default value
769+
/// of `1` if the system's parallelism cannot be determined.
770+
pub fn get_available_parallelism() -> usize {
771+
available_parallelism()
772+
.unwrap_or(NonZero::new(1).expect("literal value `1` shouldn't be zero"))
773+
.get()
774+
}
775+
764776
#[cfg(test)]
765777
mod tests {
766778
use crate::ScalarValue::Null;

datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs

+3-5
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,15 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use std::{cmp, num::NonZero, sync::Arc, thread::available_parallelism};
18+
use std::{cmp, sync::Arc};
1919

2020
use datafusion::{
2121
datasource::MemTable,
2222
prelude::{SessionConfig, SessionContext},
2323
};
2424
use datafusion_catalog::TableProvider;
25-
use datafusion_common::error::Result;
2625
use datafusion_common::ScalarValue;
26+
use datafusion_common::{error::Result, utils::get_available_parallelism};
2727
use datafusion_expr::col;
2828
use rand::{thread_rng, Rng};
2929

@@ -73,9 +73,7 @@ impl SessionContextGenerator {
7373
];
7474

7575
let max_batch_size = cmp::max(1, dataset_ref.total_rows_num);
76-
let max_target_partitions = available_parallelism()
77-
.unwrap_or(NonZero::new(1).unwrap())
78-
.get();
76+
let max_target_partitions = get_available_parallelism();
7977

8078
Self {
8179
dataset: dataset_ref,

datafusion/core/tests/sql/mod.rs

+2-8
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use std::num::NonZero;
1918
use std::sync::Arc;
20-
use std::thread::available_parallelism;
2119

2220
use arrow::{
2321
array::*, datatypes::*, record_batch::RecordBatch,
@@ -34,6 +32,7 @@ use datafusion::prelude::*;
3432
use datafusion::test_util;
3533
use datafusion::{assert_batches_eq, assert_batches_sorted_eq};
3634
use datafusion::{execution::context::SessionContext, physical_plan::displayable};
35+
use datafusion_common::utils::get_available_parallelism;
3736
use datafusion_common::{assert_contains, assert_not_contains};
3837
use object_store::path::Path;
3938
use std::fs::File;
@@ -261,12 +260,7 @@ impl ExplainNormalizer {
261260

262261
// convert things like partitioning=RoundRobinBatch(16)
263262
// to partitioning=RoundRobinBatch(NUM_CORES)
264-
let needle = format!(
265-
"RoundRobinBatch({})",
266-
available_parallelism()
267-
.unwrap_or(NonZero::new(1).unwrap())
268-
.get()
269-
);
263+
let needle = format!("RoundRobinBatch({})", get_available_parallelism());
270264
replacements.push((needle, "RoundRobinBatch(NUM_CORES)".to_string()));
271265

272266
Self { replacements }

datafusion/sqllogictest/bin/sqllogictests.rs

+2-7
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,10 @@
1717

1818
use std::ffi::OsStr;
1919
use std::fs;
20-
use std::num::NonZero;
2120
use std::path::{Path, PathBuf};
22-
use std::thread::available_parallelism;
2321

2422
use clap::Parser;
23+
use datafusion_common::utils::get_available_parallelism;
2524
use datafusion_sqllogictest::{DataFusion, TestContext};
2625
use futures::stream::StreamExt;
2726
use itertools::Itertools;
@@ -114,11 +113,7 @@ async fn run_tests() -> Result<()> {
114113
.join()
115114
})
116115
// run up to num_cpus streams in parallel
117-
.buffer_unordered(
118-
available_parallelism()
119-
.unwrap_or(NonZero::new(1).unwrap())
120-
.get(),
121-
)
116+
.buffer_unordered(get_available_parallelism())
122117
.flat_map(|result| {
123118
// Filter out any Ok() leaving only the DataFusionErrors
124119
futures::stream::iter(match result {

0 commit comments

Comments
 (0)