Skip to content

Commit 3c09c19

Browse files
committed
Minor: Add more documentation about table_partition_columns
1 parent 1f8ede5 commit 3c09c19

File tree

1 file changed

+39
-5
lines changed
  • datafusion/core/src/datasource/listing

1 file changed

+39
-5
lines changed

datafusion/core/src/datasource/listing/table.rs

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -213,10 +213,7 @@ pub struct ListingOptions {
213213
/// The file format
214214
pub format: Arc<dyn FileFormat>,
215215
/// The expected partition column names in the folder structure.
216-
/// For example `Vec["a", "b"]` means that the two first levels of
217-
/// partitioning expected should be named "a" and "b":
218-
/// - If there is a third level of partitioning it will be ignored.
219-
/// - Files that don't follow this partitioning will be ignored.
216+
/// See [`Self::with_table_partition_cols`] for details.
220217
pub table_partition_cols: Vec<(String, DataType)>,
221218
/// Set true to try to guess statistics from the files.
222219
/// This can add a lot of overhead as it will usually require files
@@ -298,14 +295,49 @@ impl ListingOptions {
298295
self
299296
}
300297

301-
/// Set table partition column names on [`ListingOptions`] and returns self.
298+
/// Set `table partition columns` on [`ListingOptions`] and returns self.
299+
///
300+
/// "partition columns," used to support [Hive Partitioning], are
301+
/// columns added to the data that is read, based on the folder
302+
/// structure where the data resides.
303+
///
304+
/// For example, given the following files in your filesystem:
305+
///
306+
/// ```text
307+
/// /mnt/nyctaxi/year=2022/month=01/tripdata.parquet
308+
/// /mnt/nyctaxi/year=2021/month=12/tripdata.parquet
309+
/// /mnt/nyctaxi/year=2021/month=11/tripdata.parquet
310+
/// ```
311+
///
312+
/// A [`ListingTable`] created at `/mnt/nyctaxi/` with partition
313+
/// columns "year" and "month" will include new `year` and `month`
314+
/// columns while reading the files. The `year` column would have
315+
/// value `2022` and the `month` column would have value `01` for
316+
/// the rows read from
317+
/// `/mnt/nyctaxi/year=2022/month=01/tripdata.parquet`
318+
///
319+
/// # Notes
320+
///
321+
/// - If only one level (e.g. `year` in the example above) is specified, the other levels are ignored
322+
/// but the files are still read.
323+
///
324+
/// - Files that don't follow this partitioning scheme will be
325+
/// ignored.
326+
///
327+
/// - Since partition columns (such as dates) have the same value
328+
/// for all rows read from each individual file, they are typically
329+
/// dictionary encoded for efficiency.
330+
///
331+
/// # Example
302332
///
303333
/// ```
304334
/// # use std::sync::Arc;
305335
/// # use arrow::datatypes::DataType;
306336
/// # use datafusion::prelude::col;
307337
/// # use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat};
308338
///
339+
/// // listing options for files with paths such as `/mnt/data/col_a=x/col_b=y/data.parquet`
340+
/// // `col_a` and `col_b` will be included in the data read from those files
309341
/// let listing_options = ListingOptions::new(Arc::new(
310342
/// ParquetFormat::default()
311343
/// ))
@@ -315,6 +347,8 @@ impl ListingOptions {
315347
/// assert_eq!(listing_options.table_partition_cols, vec![("col_a".to_string(), DataType::Utf8),
316348
/// ("col_b".to_string(), DataType::Utf8)]);
317349
/// ```
350+
///
351+
/// [Hive Partitioning]: https://docs.cloudera.com/HDPDocuments/HDP2/HDP-2.1.3/bk_system-admin-guide/content/hive_partitioned_tables.html
318352
pub fn with_table_partition_cols(
319353
mut self,
320354
table_partition_cols: Vec<(String, DataType)>,

0 commit comments

Comments
 (0)