@@ -212,10 +212,7 @@ pub struct ListingOptions {
212
212
/// The file format
213
213
pub format : Arc < dyn FileFormat > ,
214
214
/// The expected partition column names in the folder structure.
215
- /// For example `Vec["a", "b"]` means that the two first levels of
216
- /// partitioning expected should be named "a" and "b":
217
- /// - If there is a third level of partitioning it will be ignored.
218
- /// - Files that don't follow this partitioning will be ignored.
215
+ /// See [Self::with_table_partition_cols] for details
219
216
pub table_partition_cols : Vec < ( String , DataType ) > ,
220
217
/// Set true to try to guess statistics from the files.
221
218
/// This can add a lot of overhead as it will usually require files
@@ -297,16 +294,54 @@ impl ListingOptions {
297
294
self
298
295
}
299
296
300
- /// Set table partition column names on [`ListingOptions`] and returns self.
297
+ /// Set `table partition columns` on [`ListingOptions`] and returns self.
301
298
///
302
- /// You may use [`wrap_partition_type_in_dict`] to request a dictionary-encoded type.
299
+ /// "partition columns," used to support [Hive Partitioning], are
300
+ /// columns added to the data that is read, based on the folder
301
+ /// structure where the data resides.
302
+ ///
303
+ /// For example, given the following files in your filesystem:
304
+ ///
305
+ /// ```text
306
+ /// /mnt/nyctaxi/year=2022/month=01/tripdata.parquet
307
+ /// /mnt/nyctaxi/year=2021/month=12/tripdata.parquet
308
+ /// /mnt/nyctaxi/year=2021/month=11/tripdata.parquet
309
+ /// ```
310
+ ///
311
+ /// A [`ListingTable`] created at `/mnt/nyctaxi/` with partition
312
+ /// columns "year" and "month" will include new `year` and `month`
313
+ /// columns while reading the files. The `year` column would have
314
+ /// value `2022` and the `month` column would have value `01` for
315
+ /// the rows read from
316
+ /// `/mnt/nyctaxi/year=2022/month=01/tripdata.parquet`
317
+ ///
318
+ /// # Notes
319
+ ///
320
+ /// - If only one level (e.g. `year` in the example above) is
321
+ /// specified, the other levels are ignored but the files are
322
+ /// still read.
323
+ ///
324
+ /// - Files that don't follow this partitioning scheme will be
325
+ /// ignored.
326
+ ///
327
+ /// - Since the columns have the same value for all rows read from
328
+ /// each individual file (such as dates), they are typically
329
+ /// dictionary encoded for efficiency. You may use
330
+ /// [`wrap_partition_type_in_dict`] to request a
331
+ /// dictionary-encoded type.
332
+ ///
333
+ /// - The partition columns are extracted solely from the file path; in particular, they are NOT part of the Parquet files themselves.
334
+ ///
335
+ /// # Example
303
336
///
304
337
/// ```
305
338
/// # use std::sync::Arc;
306
339
/// # use arrow::datatypes::DataType;
307
340
/// # use datafusion::prelude::col;
308
341
/// # use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat};
309
342
///
343
+ /// // listing options for files with paths such as `/mnt/data/col_a=x/col_b=y/data.parquet`
344
+ /// // `col_a` and `col_b` will be included in the data read from those files
310
345
/// let listing_options = ListingOptions::new(Arc::new(
311
346
/// ParquetFormat::default()
312
347
/// ))
@@ -317,7 +352,7 @@ impl ListingOptions {
317
352
/// ("col_b".to_string(), DataType::Utf8)]);
318
353
/// ```
319
354
///
320
- ///
355
+ /// [Hive Partitioning]: https://docs.cloudera.com/HDPDocuments/HDP2/HDP-2.1.3/bk_system-admin-guide/content/hive_partitioned_tables.html
321
356
/// [`wrap_partition_type_in_dict`]: crate::physical_plan::file_format::wrap_partition_type_in_dict
322
357
pub fn with_table_partition_cols (
323
358
mut self ,
0 commit comments