Skip to content

Commit 3ac92ad

Browse files
authored
use FileFormat::get_ext as the default file extension filter (#12417)
* use default file extension filter from FileFormat * use with_file_extension_opt api
1 parent 1e31093 commit 3ac92ad

File tree

1 file changed

+55
-7
lines changed
  • datafusion/core/src/datasource/listing

1 file changed

+55
-7
lines changed

datafusion/core/src/datasource/listing/table.rs

Lines changed: 55 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -228,13 +228,13 @@ pub struct ListingOptions {
228228
impl ListingOptions {
229229
/// Creates an options instance with the given format
230230
/// Default values:
231-
/// - no file extension filter
231+
/// - file extension filter defaults to the format's own extension (`FileFormat::get_ext`)
232232
/// - no input partition to discover
233233
/// - one target partition
234234
/// - stat collection
235235
pub fn new(format: Arc<dyn FileFormat>) -> Self {
236236
Self {
237-
file_extension: String::new(),
237+
file_extension: format.get_ext(),
238238
format,
239239
table_partition_cols: vec![],
240240
collect_stat: true,
@@ -1314,6 +1314,7 @@ mod tests {
13141314
"test:///bucket/key-prefix/",
13151315
12,
13161316
5,
1317+
Some(""),
13171318
)
13181319
.await?;
13191320

@@ -1328,6 +1329,7 @@ mod tests {
13281329
"test:///bucket/key-prefix/",
13291330
4,
13301331
4,
1332+
Some(""),
13311333
)
13321334
.await?;
13331335

@@ -1343,12 +1345,19 @@ mod tests {
13431345
"test:///bucket/key-prefix/",
13441346
2,
13451347
2,
1348+
Some(""),
13461349
)
13471350
.await?;
13481351

13491352
// no files => no groups
1350-
assert_list_files_for_scan_grouping(&[], "test:///bucket/key-prefix/", 2, 0)
1351-
.await?;
1353+
assert_list_files_for_scan_grouping(
1354+
&[],
1355+
"test:///bucket/key-prefix/",
1356+
2,
1357+
0,
1358+
Some(""),
1359+
)
1360+
.await?;
13521361

13531362
// files that don't match the prefix
13541363
assert_list_files_for_scan_grouping(
@@ -1360,6 +1369,21 @@ mod tests {
13601369
"test:///bucket/key-prefix/",
13611370
10,
13621371
2,
1372+
Some(""),
1373+
)
1374+
.await?;
1375+
1376+
// files that don't match the prefix or the default file extension
1377+
assert_list_files_for_scan_grouping(
1378+
&[
1379+
"bucket/key-prefix/file0.avro",
1380+
"bucket/key-prefix/file1.parquet",
1381+
"bucket/other-prefix/roguefile.avro",
1382+
],
1383+
"test:///bucket/key-prefix/",
1384+
10,
1385+
1,
1386+
None,
13631387
)
13641388
.await?;
13651389
Ok(())
@@ -1380,6 +1404,7 @@ mod tests {
13801404
&["test:///bucket/key1/", "test:///bucket/key2/"],
13811405
12,
13821406
5,
1407+
Some(""),
13831408
)
13841409
.await?;
13851410

@@ -1396,6 +1421,7 @@ mod tests {
13961421
&["test:///bucket/key1/", "test:///bucket/key2/"],
13971422
5,
13981423
5,
1424+
Some(""),
13991425
)
14001426
.await?;
14011427

@@ -1412,11 +1438,13 @@ mod tests {
14121438
&["test:///bucket/key1/"],
14131439
2,
14141440
2,
1441+
Some(""),
14151442
)
14161443
.await?;
14171444

14181445
// no files => no groups
1419-
assert_list_files_for_multi_paths(&[], &["test:///bucket/key1/"], 2, 0).await?;
1446+
assert_list_files_for_multi_paths(&[], &["test:///bucket/key1/"], 2, 0, Some(""))
1447+
.await?;
14201448

14211449
// files that don't match the prefix
14221450
assert_list_files_for_multi_paths(
@@ -1431,6 +1459,24 @@ mod tests {
14311459
&["test:///bucket/key3/"],
14321460
2,
14331461
1,
1462+
Some(""),
1463+
)
1464+
.await?;
1465+
1466+
// files that don't match the prefix or the default file ext
1467+
assert_list_files_for_multi_paths(
1468+
&[
1469+
"bucket/key1/file0.avro",
1470+
"bucket/key1/file1.csv",
1471+
"bucket/key1/file2.avro",
1472+
"bucket/key2/file3.csv",
1473+
"bucket/key2/file4.avro",
1474+
"bucket/key3/file5.csv",
1475+
],
1476+
&["test:///bucket/key1/", "test:///bucket/key3/"],
1477+
2,
1478+
2,
1479+
None,
14341480
)
14351481
.await?;
14361482
Ok(())
@@ -1458,14 +1504,15 @@ mod tests {
14581504
table_prefix: &str,
14591505
target_partitions: usize,
14601506
output_partitioning: usize,
1507+
file_ext: Option<&str>,
14611508
) -> Result<()> {
14621509
let ctx = SessionContext::new();
14631510
register_test_store(&ctx, &files.iter().map(|f| (*f, 10)).collect::<Vec<_>>());
14641511

14651512
let format = AvroFormat {};
14661513

14671514
let opt = ListingOptions::new(Arc::new(format))
1468-
.with_file_extension("")
1515+
.with_file_extension_opt(file_ext)
14691516
.with_target_partitions(target_partitions);
14701517

14711518
let schema = Schema::new(vec![Field::new("a", DataType::Boolean, false)]);
@@ -1491,14 +1538,15 @@ mod tests {
14911538
table_prefix: &[&str],
14921539
target_partitions: usize,
14931540
output_partitioning: usize,
1541+
file_ext: Option<&str>,
14941542
) -> Result<()> {
14951543
let ctx = SessionContext::new();
14961544
register_test_store(&ctx, &files.iter().map(|f| (*f, 10)).collect::<Vec<_>>());
14971545

14981546
let format = AvroFormat {};
14991547

15001548
let opt = ListingOptions::new(Arc::new(format))
1501-
.with_file_extension("")
1549+
.with_file_extension_opt(file_ext)
15021550
.with_target_partitions(target_partitions);
15031551

15041552
let schema = Schema::new(vec![Field::new("a", DataType::Boolean, false)]);

0 commit comments

Comments
 (0)