Skip to content

Commit 0f6931c

Browse files
alamb and crepererum
authored
Minor: Add more documentation about table_partition_columns (#5576)
* Minor: Add more documentation about `table_partition_columns` * Update datafusion/core/src/datasource/listing/table.rs Co-authored-by: Marco Neumann <[email protected]> * Update docs some more --------- Co-authored-by: Marco Neumann <[email protected]>
1 parent a578150 commit 0f6931c

File tree

2 files changed

+53
-17
lines changed
  • datafusion/core/src

2 files changed

+53
-17
lines changed

datafusion/core/src/datasource/listing/table.rs

+42-7
Original file line numberDiff line numberDiff line change
@@ -212,10 +212,7 @@ pub struct ListingOptions {
212212
/// The file format
213213
pub format: Arc<dyn FileFormat>,
214214
/// The expected partition column names in the folder structure.
215-
/// For example `Vec["a", "b"]` means that the two first levels of
216-
/// partitioning expected should be named "a" and "b":
217-
/// - If there is a third level of partitioning it will be ignored.
218-
/// - Files that don't follow this partitioning will be ignored.
215+
/// See [Self::with_table_partition_cols] for details
219216
pub table_partition_cols: Vec<(String, DataType)>,
220217
/// Set true to try to guess statistics from the files.
221218
/// This can add a lot of overhead as it will usually require files
@@ -297,16 +294,54 @@ impl ListingOptions {
297294
self
298295
}
299296

300-
/// Set table partition column names on [`ListingOptions`] and returns self.
297+
/// Set `table partition columns` on [`ListingOptions`] and returns self.
301298
///
302-
/// You may use [`wrap_partition_type_in_dict`] to request a dictionary-encoded type.
299+
/// "partition columns," used to support [Hive Partitioning], are
300+
/// columns added to the data that is read, based on the folder
301+
/// structure where the data resides.
302+
///
303+
/// For example, given the following files in your filesystem:
304+
///
305+
/// ```text
306+
/// /mnt/nyctaxi/year=2022/month=01/tripdata.parquet
307+
/// /mnt/nyctaxi/year=2021/month=12/tripdata.parquet
308+
/// /mnt/nyctaxi/year=2021/month=11/tripdata.parquet
309+
/// ```
310+
///
311+
/// A [`ListingTable`] created at `/mnt/nyctaxi/` with partition
312+
/// columns "year" and "month" will include new `year` and `month`
313+
/// columns while reading the files. The `year` column would have
314+
/// value `2022` and the `month` column would have value `01` for
315+
/// the rows read from
316+
/// `/mnt/nyctaxi/year=2022/month=01/tripdata.parquet`
317+
///
318+
/// # Notes
319+
///
320+
/// - If only one level (e.g. `year` in the example above) is
321+
/// specified, the other levels are ignored but the files are
322+
/// still read.
323+
///
324+
/// - Files that don't follow this partitioning scheme will be
325+
/// ignored.
326+
///
327+
/// - Since the columns have the same value for all rows read from
328+
/// each individual file (such as dates), they are typically
329+
/// dictionary encoded for efficiency. You may use
330+
/// [`wrap_partition_type_in_dict`] to request a
331+
/// dictionary-encoded type.
332+
///
333+
/// - The partition columns are extracted solely from the file path. In particular, they are NOT part of the parquet files themselves.
334+
///
335+
/// # Example
303336
///
304337
/// ```
305338
/// # use std::sync::Arc;
306339
/// # use arrow::datatypes::DataType;
307340
/// # use datafusion::prelude::col;
308341
/// # use datafusion::datasource::{listing::ListingOptions, file_format::parquet::ParquetFormat};
309342
///
343+
/// // listing options for files with paths such as `/mnt/data/col_a=x/col_b=y/data.parquet`
344+
/// // `col_a` and `col_b` will be included in the data read from those files
310345
/// let listing_options = ListingOptions::new(Arc::new(
311346
/// ParquetFormat::default()
312347
/// ))
@@ -317,7 +352,7 @@ impl ListingOptions {
317352
/// ("col_b".to_string(), DataType::Utf8)]);
318353
/// ```
319354
///
320-
///
355+
/// [Hive Partitioning]: https://docs.cloudera.com/HDPDocuments/HDP2/HDP-2.1.3/bk_system-admin-guide/content/hive_partitioned_tables.html
321356
/// [`wrap_partition_type_in_dict`]: crate::physical_plan::file_format::wrap_partition_type_in_dict
322357
pub fn with_table_partition_cols(
323358
mut self,

datafusion/core/src/physical_plan/file_format/mod.rs

+11-10
Original file line numberDiff line numberDiff line change
@@ -69,22 +69,23 @@ use std::{
6969

7070
use super::{ColumnStatistics, Statistics};
7171

72-
/// Convert logical type of partition column to physical type: `Dictionary(UInt16, val_type)`.
72+
/// Convert type to a type suitable for use as a [`ListingTable`]
73+
/// partition column. Returns `Dictionary(UInt16, val_type)`, which is
74+
/// a reasonable trade-off between the supported number of partition
75+
/// values and space efficiency.
7376
///
74-
/// You CAN use this to specify types for partition columns. However you MAY also choose not to dictionary-encode the
75-
/// data or to use a different dictionary type.
77+
/// Use this to specify types for partition columns. However
78+
/// you MAY also choose not to dictionary-encode the data or to use a
79+
/// different dictionary type.
7680
///
77-
/// Use [`wrap_partition_value_in_dict`] to wrap the values.
81+
/// Use [`wrap_partition_value_in_dict`] to wrap a [`ScalarValue`] in the same way.
7882
pub fn wrap_partition_type_in_dict(val_type: DataType) -> DataType {
7983
DataType::Dictionary(Box::new(DataType::UInt16), Box::new(val_type))
8084
}
8185

82-
/// Convert scalar value of partition columns to physical type: `Dictionary(UInt16, val_type)` .
83-
///
84-
/// You CAN use this to specify types for partition columns. However you MAY also choose not to dictionary-encode the
85-
/// data or to use a different dictionary type.
86-
///
87-
/// Use [`wrap_partition_type_in_dict`] to wrap the types.
86+
/// Convert a [`ScalarValue`] of partition columns to a type, as
87+
/// described in the documentation of [`wrap_partition_type_in_dict`],
88+
/// which can wrap the types.
8889
pub fn wrap_partition_value_in_dict(val: ScalarValue) -> ScalarValue {
8990
ScalarValue::Dictionary(Box::new(DataType::UInt16), Box::new(val))
9091
}

0 commit comments

Comments
 (0)