Minor: Add more documentation about table_partition_columns

alamb · alamb · commit 3c09c19583bd · 2023-03-13T12:33:51.000-04:00
diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
@@ -213,10 +213,7 @@ pub struct ListingOptions {
     /// The file format
     pub format: Arc<dyn FileFormat>,
     /// The expected partition column names in the folder structure.
-    /// For example `Vec["a", "b"]` means that the two first levels of
-    /// partitioning expected should be named "a" and "b":
-    /// - If there is a third level of partitioning it will be ignored.
-    /// - Files that don't follow this partitioning will be ignored.
+    /// See [`Self::with_table_partition_cols`] for details.
     pub table_partition_cols: Vec<(String, DataType)>,
     /// Set true to try to guess statistics from the files.
     /// This can add a lot of overhead as it will usually require files
@@ -298,14 +295,49 @@ impl ListingOptions {
         self
     }
 
-    /// Set table partition column names on [`ListingOptions`] and returns self.
+    /// Set `table partition columns` on [`ListingOptions`] and returns self.
+    ///
+    /// "partition columns," used to support [Hive Partitioning], are
+    /// columns added to the data that is read, based on the folder
+    /// structure where the data resides.
+    ///
+    /// For example, given the following files in your filesystem:
+    ///
+    /// ```text
+    /// /mnt/nyctaxi/year=2022/month=01/tripdata.parquet
+    /// /mnt/nyctaxi/year=2021/month=12/tripdata.parquet
+    /// /mnt/nyctaxi/year=2021/month=11/tripdata.parquet
+    /// ```
+    ///
+    /// A [`ListingTable`] created at `/mnt/nyctaxi/` with partition
+    /// columns "year" and "month" will include new `year` and `month`
+    /// columns while reading the files. The `year` column would have
+    /// value `2022` and the `month` column would have value `01` for
+    /// the rows read from
+    /// `/mnt/nyctaxi/year=2022/month=01/tripdata.parquet`.
+    ///
+    /// # Notes
+    ///
+    /// - If only one level (e.g. `year` in the example above) is
+    ///   specified, the other levels are ignored but the files are still read.
+    ///
+    /// - Files that don't follow this partitioning scheme will be
+    ///   ignored.
+    ///
+    /// - Since the partition columns (such as dates) have the same
+    ///   value for all rows read from each individual file, they are
+    ///   typically dictionary encoded for efficiency.
+    ///
+    /// # Example
     ///
     /// ```
     /// # use std::sync::Arc;
     /// # use arrow::datatypes::DataType;
     /// # use datafusion::prelude::col;
     /// # use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat};
     ///
+    /// // listing options for files with paths such as `/mnt/data/col_a=x/col_b=y/data.parquet`
+    /// // `col_a` and `col_b` will be included in the data read from those files
     /// let listing_options = ListingOptions::new(Arc::new(
     ///     ParquetFormat::default()
     ///   ))
@@ -315,6 +347,8 @@ impl ListingOptions {
     /// assert_eq!(listing_options.table_partition_cols, vec![("col_a".to_string(), DataType::Utf8),
     ///     ("col_b".to_string(), DataType::Utf8)]);
     /// ```
+    ///
+    /// [Hive Partitioning]: https://docs.cloudera.com/HDPDocuments/HDP2/HDP-2.1.3/bk_system-admin-guide/content/hive_partitioned_tables.html
     pub fn with_table_partition_cols(
         mut self,
         table_partition_cols: Vec<(String, DataType)>,