From f56472a03750dd46f0cd88add2531450b25f9951 Mon Sep 17 00:00:00 2001 From: christophermcdermott Date: Thu, 20 Mar 2025 13:41:29 -0400 Subject: [PATCH] fix: write hive partitions for any int/uint/float --- datafusion/datasource/src/write/demux.rs | 60 ++++++++++++++++++++- datafusion/sqllogictest/test_files/copy.slt | 34 ++++++++---- 2 files changed, 81 insertions(+), 13 deletions(-) diff --git a/datafusion/datasource/src/write/demux.rs b/datafusion/datasource/src/write/demux.rs index 111d22060c0d..fc2e5daf92b6 100644 --- a/datafusion/datasource/src/write/demux.rs +++ b/datafusion/datasource/src/write/demux.rs @@ -33,8 +33,10 @@ use arrow::array::{ }; use arrow::datatypes::{DataType, Schema}; use datafusion_common::cast::{ - as_boolean_array, as_date32_array, as_date64_array, as_int32_array, as_int64_array, - as_string_array, as_string_view_array, + as_boolean_array, as_date32_array, as_date64_array, as_float16_array, + as_float32_array, as_float64_array, as_int16_array, as_int32_array, as_int64_array, + as_int8_array, as_string_array, as_string_view_array, as_uint16_array, + as_uint32_array, as_uint64_array, as_uint8_array, }; use datafusion_common::{exec_datafusion_err, not_impl_err, DataFusionError}; use datafusion_common_runtime::SpawnedTask; @@ -407,6 +409,18 @@ fn compute_partition_keys_by_row<'a>( partition_values.push(Cow::from(date)); } } + DataType::Int8 => { + let array = as_int8_array(col_array)?; + for i in 0..rb.num_rows() { + partition_values.push(Cow::from(array.value(i).to_string())); + } + } + DataType::Int16 => { + let array = as_int16_array(col_array)?; + for i in 0..rb.num_rows() { + partition_values.push(Cow::from(array.value(i).to_string())); + } + } DataType::Int32 => { let array = as_int32_array(col_array)?; for i in 0..rb.num_rows() { @@ -419,6 +433,48 @@ fn compute_partition_keys_by_row<'a>( partition_values.push(Cow::from(array.value(i).to_string())); } } + DataType::UInt8 => { + let array = as_uint8_array(col_array)?; + for i in 0..rb.num_rows() { + partition_values.push(Cow::from(array.value(i).to_string())); + } + } + DataType::UInt16 => { + let array = as_uint16_array(col_array)?; + for i in 0..rb.num_rows() { + partition_values.push(Cow::from(array.value(i).to_string())); + } + } + DataType::UInt32 => { + let array = as_uint32_array(col_array)?; + for i in 0..rb.num_rows() { + partition_values.push(Cow::from(array.value(i).to_string())); + } + } + DataType::UInt64 => { + let array = as_uint64_array(col_array)?; + for i in 0..rb.num_rows() { + partition_values.push(Cow::from(array.value(i).to_string())); + } + } + DataType::Float16 => { + let array = as_float16_array(col_array)?; + for i in 0..rb.num_rows() { + partition_values.push(Cow::from(array.value(i).to_string())); + } + } + DataType::Float32 => { + let array = as_float32_array(col_array)?; + for i in 0..rb.num_rows() { + partition_values.push(Cow::from(array.value(i).to_string())); + } + } + DataType::Float64 => { + let array = as_float64_array(col_array)?; + for i in 0..rb.num_rows() { + partition_values.push(Cow::from(array.value(i).to_string())); + } + } DataType::Dictionary(_, _) => { downcast_dictionary_array!( col_array => { diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index e2bb23e35732..925f96bd4ac0 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -110,24 +110,36 @@ a # Copy to directory as partitioned files query I -COPY (values (1::int, 2::bigint, 19968::date, arrow_cast(1725235200000, 'Date64'), false, 'x'), - (11::int, 22::bigint, 19969::date, arrow_cast(1725148800000, 'Date64'), true, 'y') +COPY (values (arrow_cast(1, 'Int8'), arrow_cast(2, 'UInt8'), arrow_cast(3, 'Int16'), arrow_cast(4, 'UInt16'), + arrow_cast(5, 'Int32'), arrow_cast(6, 'UInt32'), arrow_cast(7, 'Int64'), arrow_cast(8, 'UInt64'), + arrow_cast(9.1015625, 'Float16'), arrow_cast(10.1, 'Float32'), arrow_cast(11.1, 'Float64'), 19968::date, + arrow_cast(1725235200000, 'Date64'), false, 'x'), + (arrow_cast(11, 'Int8'), arrow_cast(22, 'UInt8'), arrow_cast(33, 'Int16'), arrow_cast(44, 'UInt16'), + arrow_cast(55, 'Int32'), arrow_cast(66, 'UInt32'), arrow_cast(77, 'Int64'), arrow_cast(88, 'UInt64'), + arrow_cast(9.203125, 'Float16'), arrow_cast(10.2, 'Float32'), arrow_cast(11.2, 'Float64'), 19969::date, + arrow_cast(1725148800000, 'Date64'), true, 'y') ) -TO 'test_files/scratch/copy/partitioned_table5/' STORED AS parquet PARTITIONED BY (column1, column2, column3, column4, column5) +TO 'test_files/scratch/copy/partitioned_table5/' STORED AS parquet PARTITIONED BY (column1, column2, column3, column4, + column5, column6, column7, column8, column9, column10, column11, column12, column13, column14) OPTIONS ('format.compression' 'zstd(10)'); ---- 2 # validate partitioning statement ok -CREATE EXTERNAL TABLE validate_partitioned_parquet5 (column1 int, column2 bigint, column3 date, column4 date, column5 boolean, column6 varchar) STORED AS PARQUET -LOCATION 'test_files/scratch/copy/partitioned_table5/' PARTITIONED BY (column1, column2, column3, column4, column5); - -query IIDDBT -select column1, column2, column3, column4, column5, column6 from validate_partitioned_parquet5 order by column1,column2,column3,column4,column5; ----- -1 2 2024-09-02 2024-09-02 false x -11 22 2024-09-03 2024-09-01 true y +CREATE EXTERNAL TABLE validate_partitioned_parquet5 (column1 int, column2 int, column3 int, column4 int, column5 int, + column6 int, column7 bigint, column8 bigint, column9 float, column10 float, column11 float, column12 date, + column13 date, column14 boolean, column15 varchar) STORED AS PARQUET +LOCATION 'test_files/scratch/copy/partitioned_table5/' PARTITIONED BY (column1, column2, column3, column4, column5, + column6, column7, column8, column9, column10, column11, column12, column13, column14); + +query IIIIIIIIRRRDDBT +select column1, column2, column3, column4, column5, column6, column7, column8, column9, column10, column11, column12, + column13, column14, column15 from validate_partitioned_parquet5 order by column1, column2, column3, column4, + column5, column6, column7, column8, column9, column10, column11, column12, column13; +---- +1 2 3 4 5 6 7 8 9.1015625 10.1 11.1 2024-09-02 2024-09-02 false x +11 22 33 44 55 66 77 88 9.203125 10.2 11.2 2024-09-03 2024-09-01 true y statement ok