diff --git a/arrow-integration-test/data/integration.json b/arrow-integration-test/data/integration.json index 7e4a22cddba6..fb1471889413 100644 --- a/arrow-integration-test/data/integration.json +++ b/arrow-integration-test/data/integration.json @@ -319,6 +319,89 @@ "children": [] } ] + }, + { + "name": "utf8views", + "type": { + "name": "utf8view" + }, + "nullable": true, + "children": [] + }, + { + "name": "binaryviews", + "type": { + "name": "binaryview" + }, + "nullable": true, + "children": [] + }, + { + "name": "listviews", + "type": { + "name": "listview" + }, + "nullable": true, + "children": [ + { + "name": "item", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 32 + }, + "nullable": true, + "children": [] + } + ] + }, + { + "name": "largelistviews", + "type": { + "name": "largelistview" + }, + "nullable": true, + "children": [ + { + "name": "item", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 32 + }, + "nullable": true, + "children": [] + } + ] + }, + { + "name": "runendencoded", + "type": { + "name": "runendencoded" + }, + "nullable": true, + "children": [ + { + "name": "run_ends", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 16 + }, + "nullable": false, + "children": [] + }, + { + "name": "values", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 32 + }, + "nullable": true, + "children": [] + } + ] } ] }, @@ -801,6 +884,162 @@ ] } ] + }, + { + "name": "utf8views", + "count": 3, + "VALIDITY": [ + 1, + 0, + 1 + ], + "VIEWS": [ + { + "SIZE": 5, + "INLINED": "hello" + }, + { + "SIZE": 0, + "INLINED": "" + }, + { + "SIZE": 19, + "PREFIX_HEX": "74686973", + "BUFFER_INDEX": 0, + "OFFSET": 0 + } + ], + "VARIADIC_DATA_BUFFERS": ["74686973206973206E6F7420696E6C696E6564"] + }, + { + "name": "binaryviews", + "count": 3, + "VALIDITY": [ + 1, + 1, + 0 + ], + "VIEWS": [ + { + "SIZE": 2, + "INLINED": "F34D" + }, + { + "SIZE": 16, + "PREFIX_HEX": "00010203", + "BUFFER_INDEX": 0, + "OFFSET": 0 + }, + { + "SIZE": 0, + "INLINED": "" + } + ], + "VARIADIC_DATA_BUFFERS": ["000102030405060708090A0B0C0D0E0F"] + }, + { + "name": "listviews", + "count": 3, + "VALIDITY": [ + 1, + 0, + 1 + ], + "OFFSET": [ + 0, + 2, + 2 + ], + "SIZE": [ + 2, + 0, + 3 + ], + "children": [ + { + "name": "item", + "count": 5, + "VALIDITY": [ + 1, + 1, + 1, + 0, + 1 + ], + "DATA": [ + 1, + 2, + 3, + 4, + 5 + ] + } + ] + }, + { + "name": "largelistviews", + "count": 3, + "VALIDITY": [ + 1, + 1, + 0 + ], + "OFFSET": [ + "0", + "2", + "3" + ], + "SIZE": [ + "2", + "1", + "0" + ], + "children": [ + { + "name": "item", + "count": 3, + "VALIDITY": [ + 1, + 0, + 1 + ], + "DATA": [ + 10, + 20, + 30 + ] + } + ] + }, + { + "name": "runendencoded", + "count": 3, + "children": [ + { + "name": "run_ends", + "count": 2, + "VALIDITY": [ + 1, + 1 + ], + "DATA": [ + 2, + 3 + ] + }, + { + "name": "values", + "count": 2, + "VALIDITY": [ + 1, + 0 + ], + "DATA": [ + 100, + 200 + ] + } + ] } ] } diff --git a/arrow-integration-test/src/datatype.rs b/arrow-integration-test/src/datatype.rs index 4c17fbe76be7..69174a1c221e 100644 --- a/arrow-integration-test/src/datatype.rs +++ b/arrow-integration-test/src/datatype.rs @@ -29,6 +29,8 @@ pub fn data_type_from_json(json: &serde_json::Value) -> Result { Some(s) if s == "bool" => Ok(DataType::Boolean), Some(s) if s == "binary" => Ok(DataType::Binary), Some(s) if s == "largebinary" => Ok(DataType::LargeBinary), + Some(s) if s == "binaryview" => Ok(DataType::BinaryView), + Some(s) if s == "utf8view" => Ok(DataType::Utf8View), Some(s) if s == "utf8" => Ok(DataType::Utf8), Some(s) if s == "largeutf8" => Ok(DataType::LargeUtf8), Some(s) if s == "fixedsizebinary" => { @@ -182,6 +184,14 @@ pub fn data_type_from_json(json: &serde_json::Value) -> Result { // return a largelist with any type as its child isn't defined in the map Ok(DataType::LargeList(default_field)) } + Some(s) if s == "listview" => { + // return a listview with any type as its child isn't defined in the map + Ok(DataType::ListView(default_field)) + } + Some(s) if s == "largelistview" => { + // return a large listview with any type as its child isn't defined in the map + Ok(DataType::LargeListView(default_field)) + } Some(s) if s == "fixedsizelist" => { // return a list with any type as its child isn't defined in the map if let Some(Value::Number(size)) = map.get("listSize") { @@ -199,6 +209,13 @@ pub fn data_type_from_json(json: &serde_json::Value) -> Result { // return an empty `struct` type as its children aren't defined in the map Ok(DataType::Struct(Fields::empty())) } + Some(s) if s == "runendencoded" => { + // return a run end encoded with placeholder types as children aren't defined in the map + Ok(DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + default_field, + )) + } Some(s) if s == "map" => { if let Some(Value::Bool(keys_sorted)) = map.get("keysSorted") { // Return a map with an empty type as its children aren't defined in the map @@ -271,9 +288,8 @@ pub fn data_type_to_json(data_type: &DataType) -> serde_json::Value { DataType::LargeUtf8 => json!({"name": "largeutf8"}), DataType::Binary => json!({"name": "binary"}), DataType::LargeBinary => json!({"name": "largebinary"}), - DataType::BinaryView | DataType::Utf8View => { - unimplemented!("BinaryView/Utf8View not implemented") - } + DataType::BinaryView => json!({"name": "binaryview"}), + DataType::Utf8View => json!({"name": "utf8view"}), DataType::FixedSizeBinary(byte_width) => { json!({"name": "fixedsizebinary", "byteWidth": byte_width}) } @@ -281,9 +297,8 @@ pub fn data_type_to_json(data_type: &DataType) -> serde_json::Value { DataType::Union(_, _) => json!({"name": "union"}), DataType::List(_) => json!({ "name": "list"}), DataType::LargeList(_) => json!({ "name": "largelist"}), - DataType::ListView(_) | DataType::LargeListView(_) => { - unimplemented!("ListView/LargeListView not implemented") - } + DataType::ListView(_) => json!({ "name": "listview"}), + DataType::LargeListView(_) => json!({ "name": "largelistview"}), DataType::FixedSizeList(_, length) => { json!({"name":"fixedsizelist", "listSize": length}) } @@ -352,7 +367,7 @@ pub fn data_type_to_json(data_type: &DataType) -> serde_json::Value { DataType::Map(_, keys_sorted) => { json!({"name": "map", "keysSorted": keys_sorted}) } - DataType::RunEndEncoded(_, _) => todo!(), + DataType::RunEndEncoded(_, _) => json!({"name": "runendencoded"}), } } diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs index 8b0ca264e02e..2a32fa9fcbdf 100644 --- a/arrow-integration-test/src/field.rs +++ b/arrow-integration-test/src/field.rs @@ -114,43 +114,50 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { // if data_type is a struct or list, get its children let data_type = match data_type { - DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _) => { - match map.get("children") { - Some(Value::Array(values)) => { - if values.len() != 1 { - return Err(ArrowError::ParseError( - "Field 'children' must have one element for a list data type" - .to_string(), - )); - } - match data_type { - DataType::List(_) => { - DataType::List(Arc::new(field_from_json(&values[0])?)) - } - DataType::LargeList(_) => { - DataType::LargeList(Arc::new(field_from_json(&values[0])?)) - } - DataType::FixedSizeList(_, int) => DataType::FixedSizeList( - Arc::new(field_from_json(&values[0])?), - int, - ), - _ => unreachable!( - "Data type should be a list, largelist or fixedsizelist" - ), - } - } - Some(_) => { + DataType::List(_) + | DataType::LargeList(_) + | DataType::ListView(_) + | DataType::LargeListView(_) + | DataType::FixedSizeList(_, _) => match map.get("children") { + Some(Value::Array(values)) => { + if values.len() != 1 { return Err(ArrowError::ParseError( - "Field 'children' must be an array".to_string(), + "Field 'children' must have one element for a list data type" + .to_string(), )); } - None => { - return Err(ArrowError::ParseError( - "Field missing 'children' attribute".to_string(), - )); + match data_type { + DataType::List(_) => { + DataType::List(Arc::new(field_from_json(&values[0])?)) + } + DataType::LargeList(_) => { + DataType::LargeList(Arc::new(field_from_json(&values[0])?)) + } + DataType::ListView(_) => { + DataType::ListView(Arc::new(field_from_json(&values[0])?)) + } + DataType::LargeListView(_) => { + DataType::LargeListView(Arc::new(field_from_json(&values[0])?)) + } + DataType::FixedSizeList(_, int) => { + DataType::FixedSizeList(Arc::new(field_from_json(&values[0])?), int) + } + _ => unreachable!( + "Data type should be a list, largelist, listview, largelistview or fixedsizelist" + ), } } - } + Some(_) => { + return Err(ArrowError::ParseError( + "Field 'children' must be an array".to_string(), + )); + } + None => { + return Err(ArrowError::ParseError( + "Field missing 'children' attribute".to_string(), + )); + } + }, DataType::Struct(_) => match map.get("children") { Some(Value::Array(values)) => { DataType::Struct(values.iter().map(field_from_json).collect::>()?) @@ -215,6 +222,29 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { )); } }, + DataType::RunEndEncoded(_, _) => match map.get("children") { + Some(Value::Array(values)) => { + if values.len() != 2 { + return Err(ArrowError::ParseError( + "Field 'children' must have exactly 2 elements for RunEndEncoded" + .to_string(), + )); + } + let run_ends = Arc::new(field_from_json(&values[0])?); + let values_field = Arc::new(field_from_json(&values[1])?); + DataType::RunEndEncoded(run_ends, values_field) + } + Some(_) => { + return Err(ArrowError::ParseError( + "Field 'children' must be an array".to_string(), + )); + } + None => { + return Err(ArrowError::ParseError( + "Field missing 'children' attribute".to_string(), + )); + } + }, _ => data_type, }; @@ -269,8 +299,13 @@ pub fn field_to_json(field: &Field) -> serde_json::Value { DataType::Struct(fields) => fields.iter().map(|x| field_to_json(x.as_ref())).collect(), DataType::List(field) | DataType::LargeList(field) + | DataType::ListView(field) + | DataType::LargeListView(field) | DataType::FixedSizeList(field, _) | DataType::Map(field, _) => vec![field_to_json(field)], + DataType::RunEndEncoded(run_ends, values) => { + vec![field_to_json(run_ends), field_to_json(values)] + } _ => vec![], }; diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index 0f0b4fe2ffee..e0aa3ecf855a 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -203,6 +203,15 @@ pub struct ArrowJsonColumn { /// The type id for union types #[serde(rename = "TYPE_ID")] pub type_id: Option>, + /// The sizes for ListView/LargeListView types + #[serde(rename = "SIZE")] + pub size: Option>, + /// The views for BinaryView/Utf8View types + #[serde(rename = "VIEWS")] + pub views: Option>, + /// The variadic data buffers for BinaryView/Utf8View types + #[serde(rename = "VARIADIC_DATA_BUFFERS")] + pub variadic_data_buffers: Option>, /// The children columns for nested types pub children: Option>, } @@ -772,6 +781,66 @@ pub fn array_from_json( .unwrap(); Ok(Arc::new(LargeListArray::from(list_data))) } + DataType::ListView(child_field) => { + let null_buf = create_null_buf(&json_col); + let children = json_col.children.clone().unwrap(); + let child_array = array_from_json(child_field, children[0].clone(), dictionaries)?; + let offsets: Vec = json_col + .offset + .unwrap() + .iter() + .map(|v| v.as_i64().unwrap() as i32) + .collect(); + let sizes: Vec = json_col + .size + .unwrap() + .iter() + .map(|v| v.as_i64().unwrap() as i32) + .collect(); + let list_data = ArrayData::builder(field.data_type().clone()) + .len(json_col.count) + .add_buffer(Buffer::from(offsets.to_byte_slice())) + .add_buffer(Buffer::from(sizes.to_byte_slice())) + .add_child_data(child_array.into_data()) + .null_bit_buffer(Some(null_buf)) + .build() + .unwrap(); + Ok(Arc::new(ListViewArray::from(list_data))) + } + DataType::LargeListView(child_field) => { + let null_buf = create_null_buf(&json_col); + let children = json_col.children.clone().unwrap(); + let child_array = array_from_json(child_field, children[0].clone(), dictionaries)?; + let offsets: Vec = json_col + .offset + .unwrap() + .iter() + .map(|v| match v { + Value::Number(n) => n.as_i64().unwrap(), + Value::String(s) => s.parse::().unwrap(), + _ => panic!("64-bit offset must be either string or number"), + }) + .collect(); + let sizes: Vec = json_col + .size + .unwrap() + .iter() + .map(|v| match v { + Value::Number(n) => n.as_i64().unwrap(), + Value::String(s) => s.parse::().unwrap(), + _ => panic!("64-bit size must be either string or number"), + }) + .collect(); + let list_data = ArrayData::builder(field.data_type().clone()) + .len(json_col.count) + .add_buffer(Buffer::from(offsets.to_byte_slice())) + .add_buffer(Buffer::from(sizes.to_byte_slice())) + .add_child_data(child_array.into_data()) + .null_bit_buffer(Some(null_buf)) + .build() + .unwrap(); + Ok(Arc::new(LargeListViewArray::from(list_data))) + } DataType::FixedSizeList(child_field, _) => { let children = json_col.children.clone().unwrap(); let child_array = array_from_json(child_field, children[0].clone(), dictionaries)?; @@ -953,6 +1022,86 @@ pub fn array_from_json( UnionArray::try_new(fields.clone(), type_ids.into(), offset, children).unwrap(); Ok(Arc::new(array)) } + DataType::Utf8View => { + let views = json_col.views.ok_or_else(|| { + ArrowError::JsonError("Utf8View requires VIEWS field".to_string()) + })?; + let variadic_buffers = json_col.variadic_data_buffers.unwrap_or_default(); + let validity = json_col.validity.as_ref(); + + let mut builder = StringViewBuilder::new(); + for (i, view) in views.iter().enumerate() { + let is_valid = validity.map_or(1, |v| v[i]); + if is_valid == 0 { + builder.append_null(); + } else { + let view_obj = view.as_object().unwrap(); + let size = view_obj["SIZE"].as_u64().unwrap() as usize; + // Check for INLINED key presence - inlined if SIZE <= 12 + if let Some(inlined) = view_obj.get("INLINED") { + builder.append_value(inlined.as_str().unwrap()); + } else { + // Reference to variadic buffer + let buffer_index = view_obj["BUFFER_INDEX"].as_u64().unwrap() as usize; + let offset = view_obj["OFFSET"].as_u64().unwrap() as usize; + let buffer_data = hex::decode(&variadic_buffers[buffer_index]).unwrap(); + let s = std::str::from_utf8(&buffer_data[offset..offset + size]).unwrap(); + builder.append_value(s); + } + } + } + Ok(Arc::new(builder.finish())) + } + DataType::BinaryView => { + let views = json_col.views.ok_or_else(|| { + ArrowError::JsonError("BinaryView requires VIEWS field".to_string()) + })?; + let variadic_buffers = json_col.variadic_data_buffers.unwrap_or_default(); + let validity = json_col.validity.as_ref(); + + let mut builder = BinaryViewBuilder::new(); + for (i, view) in views.iter().enumerate() { + let is_valid = validity.map_or(1, |v| v[i]); + if is_valid == 0 { + builder.append_null(); + } else { + let view_obj = view.as_object().unwrap(); + let size = view_obj["SIZE"].as_u64().unwrap() as usize; + // Check for INLINED key presence - inlined if SIZE <= 12 + if let Some(inlined) = view_obj.get("INLINED") { + let data = hex::decode(inlined.as_str().unwrap()).unwrap(); + builder.append_value(&data); + } else { + // Reference to variadic buffer + let buffer_index = view_obj["BUFFER_INDEX"].as_u64().unwrap() as usize; + let offset = view_obj["OFFSET"].as_u64().unwrap() as usize; + let buffer_data = hex::decode(&variadic_buffers[buffer_index]).unwrap(); + builder.append_value(&buffer_data[offset..offset + size]); + } + } + } + Ok(Arc::new(builder.finish())) + } + DataType::RunEndEncoded(run_ends_field, values_field) => { + let children = json_col.children.clone().unwrap(); + if children.len() != 2 { + return Err(ArrowError::JsonError( + "RunEndEncoded requires exactly 2 children".to_string(), + )); + } + let run_ends_array = + array_from_json(run_ends_field, children[0].clone(), dictionaries)?; + let values_array = array_from_json(values_field, children[1].clone(), dictionaries)?; + + let run_array_data = ArrayData::builder(field.data_type().clone()) + .len(json_col.count) + .add_child_data(run_ends_array.into_data()) + .add_child_data(values_array.into_data()) + .build() + .unwrap(); + + Ok(make_array(run_array_data)) + } t => Err(ArrowError::JsonError(format!( "data type {t} not supported" ))), @@ -1092,6 +1241,9 @@ impl ArrowJsonBatch { data: Some(data), offset: None, type_id: None, + size: None, + views: None, + variadic_data_buffers: None, children: None, } } @@ -1102,6 +1254,9 @@ impl ArrowJsonBatch { data: None, offset: None, type_id: None, + size: None, + views: None, + variadic_data_buffers: None, children: None, }, }; @@ -1265,6 +1420,26 @@ mod tests { ])), true, ), + Field::new("utf8views", DataType::Utf8View, true), + Field::new("binaryviews", DataType::BinaryView, true), + Field::new( + "listviews", + DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, true))), + true, + ), + Field::new( + "largelistviews", + DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int32, true))), + true, + ), + Field::new( + "runendencoded", + DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int16, false)), + Arc::new(Field::new("values", DataType::Int32, true)), + ), + true, + ), ]); let bools_with_metadata_map = BooleanArray::from(vec![Some(true), None, Some(false)]); @@ -1336,6 +1511,58 @@ mod tests { .unwrap(); let structs = StructArray::from(struct_data); + let utf8views = + StringViewArray::from(vec![Some("hello"), None, Some("this is not inlined")]); + let binaryviews = BinaryViewArray::from_iter(vec![ + Some(b"\xf3\x4d".as_slice()), + Some(b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f".as_slice()), + None, + ]); + + let listview_value_data = Int32Array::from(vec![Some(1), Some(2), Some(3), None, Some(5)]); + let listview_offsets = Buffer::from_slice_ref([0i32, 2, 2]); + let listview_sizes = Buffer::from_slice_ref([2i32, 0, 3]); + let listview_data_type = + DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, true))); + let listview_data = ArrayData::builder(listview_data_type) + .len(3) + .add_buffer(listview_offsets) + .add_buffer(listview_sizes) + .add_child_data(listview_value_data.into_data()) + .null_bit_buffer(Some(Buffer::from([0b00000101]))) + .build() + .unwrap(); + let listviews = ListViewArray::from(listview_data); + + let largelistview_value_data = Int32Array::from(vec![Some(10), None, Some(30)]); + let largelistview_offsets = Buffer::from_slice_ref([0i64, 2, 3]); + let largelistview_sizes = Buffer::from_slice_ref([2i64, 1, 0]); + let largelistview_data_type = + DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int32, true))); + let largelistview_data = ArrayData::builder(largelistview_data_type) + .len(3) + .add_buffer(largelistview_offsets) + .add_buffer(largelistview_sizes) + .add_child_data(largelistview_value_data.into_data()) + .null_bit_buffer(Some(Buffer::from([0b00000011]))) + .build() + .unwrap(); + let largelistviews = LargeListViewArray::from(largelistview_data); + + let ree_run_ends = Int16Array::from(vec![2, 3]); + let ree_values = Int32Array::from(vec![Some(100), None]); + let ree_data_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int16, false)), + Arc::new(Field::new("values", DataType::Int32, true)), + ); + let ree_data = ArrayData::builder(ree_data_type) + .len(3) + .add_child_data(ree_run_ends.into_data()) + .add_child_data(ree_values.into_data()) + .build() + .unwrap(); + let runendencoded = RunArray::::from(ree_data); + let record_batch = RecordBatch::try_new( Arc::new(schema.clone()), vec![ @@ -1369,6 +1596,11 @@ mod tests { Arc::new(utf8s), Arc::new(lists), Arc::new(structs), + Arc::new(utf8views), + Arc::new(binaryviews), + Arc::new(listviews), + Arc::new(largelistviews), + Arc::new(runendencoded), ], ) .unwrap();