diff --git a/r/inst/test-data/README.md b/r/inst/test-data/README.md new file mode 100644 index 000000000..7704e1419 --- /dev/null +++ b/r/inst/test-data/README.md @@ -0,0 +1,40 @@ + + +# Extra test files for R + +## complex-map.arrows + +Sourced from the [Overture Maps Foundation](https://overturemaps.org) "divisions_area" table using +[SedonaDB](https://sedona.apache.org/sedonadb). + +```r +library(sedonadb) + +sd_sql("SET datafusion.execution.batch_size = 1024") + +sd_read_parquet("/Volumes/data/overture/data/theme=divisions/type=division_area/") |> + sd_to_view("division_area", overwrite = TRUE) + +sd_sql("SELECT ROW_NUMBER() OVER (ORDER BY names.primary) as idx, names FROM division_area WHERE names.common IS NOT NULL") |> + sd_compute() |> sd_to_view("names_with_common", overwrite = TRUE) + +sd_sql("SELECT * FROM names_with_common WHERE (idx % 100) = 0 ORDER BY idx") |> + nanoarrow::write_nanoarrow("inst/test-data/complex-map.arrows") +``` diff --git a/r/inst/test-data/complex-map.arrows b/r/inst/test-data/complex-map.arrows new file mode 100644 index 000000000..965297e11 Binary files /dev/null and b/r/inst/test-data/complex-map.arrows differ diff --git a/r/src/materialize.c b/r/src/materialize.c index 77db5cbf7..2d42245e1 100644 --- a/r/src/materialize.c +++ b/r/src/materialize.c @@ -64,17 +64,41 @@ static int has_attrib_safe(SEXP x, SEXP sym) { } R_xlen_t nanoarrow_data_frame_size(SEXP x) { - if (Rf_length(x) > 0) { - // This both avoids materializing the row.names attribute and - // makes this work with struct-style vctrs that don't have a - // row.names attribute but that always have one or more element + // If this is actually a data frame, check the rownames. ALTREP ensures + // that "expanding" these rownames by accessing the attribute does not + // actually compute millions of strings. + if (Rf_inherits(x, "data.frame")) { + return Rf_xlength(Rf_getAttrib(x, R_RowNamesSymbol)); + } + + // If we are here, this is a struct style vctr. If we have a plain vctr + // as the first element we can just compute its length from the Rf_xlength() + // of the first column. + if (Rf_length(x) > 0 && !Rf_isObject(VECTOR_ELT(x, 0))) { return Rf_xlength(VECTOR_ELT(x, 0)); } else { - // Since ALTREP was introduced, materializing the row.names attribute is - // usually deferred such that values in the form c(NA, -nrow), 1:nrow, or - // as.character(1:nrow) are never actually computed when the length is - // taken. - return Rf_xlength(Rf_getAttrib(x, R_RowNamesSymbol)); + // Otherwise, we need S3 dispatch to compute the length for us. + SEXP length_sym = PROTECT(Rf_install("length")); + SEXP call_sexp = PROTECT(Rf_lang2(length_sym, x)); + SEXP length_sexp = PROTECT(Rf_eval(call_sexp, R_BaseEnv)); + if (Rf_xlength(length_sexp) != 1) { + Rf_error("length() return value with size != 1"); + } + + R_xlen_t out = 0; + switch (TYPEOF(length_sexp)) { + case INTSXP: + out = INTEGER(length_sexp)[0]; + break; + case REALSXP: + out = (R_xlen_t)REAL(length_sexp)[0]; + break; + default: + Rf_error("length() return value with unexpected type"); + } + + UNPROTECT(3); + return out; } } @@ -559,6 +583,7 @@ static int nanoarrow_materialize_list_of(struct RConverter* converter, case NANOARROW_TYPE_NA: return NANOARROW_OK; case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_MAP: for (int64_t i = 0; i < dst->length; i++) { if (!ArrowArrayViewIsNull(src->array_view, src->offset + i)) { offset = offsets[raw_src_offset + i]; diff --git a/r/tests/testthat/test-convert-array-stream.R b/r/tests/testthat/test-convert-array-stream.R index c1db77616..068e3958d 100644 --- a/r/tests/testthat/test-convert-array-stream.R +++ b/r/tests/testthat/test-convert-array-stream.R @@ -252,6 +252,40 @@ test_that("convert array stream works for fixed_size_list_of() with parent nulls ) }) +test_that("complex nested struct/map combination can be read", { + skip_if_not_installed("arrow") + skip_if_not_installed("vctrs") + + complex_map_file <- system.file("test-data/complex-map.arrows", package = "nanoarrow") + df_from_arrow <- as.data.frame(arrow::read_ipc_stream(complex_map_file)) + df_from_nanoarrow <- as.data.frame(read_nanoarrow(complex_map_file)) + + expect_identical( + df_from_nanoarrow$idx, + as.double(df_from_arrow$idx) + ) + + expect_identical( + df_from_nanoarrow$names$primary, + df_from_arrow$names$primary + ) + + expect_identical( + df_from_nanoarrow$names$common[[3000]], + as.data.frame(df_from_arrow$names$common[[3000]]) + ) + + expect_identical( + lengths(df_from_nanoarrow$names$rules), + lengths(df_from_arrow$names$rules) + ) + + expect_identical( + df_from_nanoarrow$names$rules[[4238]][c("variant", "language", "value")], + as.data.frame(df_from_arrow$names$rules[[4238]][c("variant", "language", "value")]) + ) +}) + test_that("convert array stream respects the value of n", { batches <- list( data.frame(x = 1:5), diff --git a/r/tests/testthat/test-convert-array.R b/r/tests/testthat/test-convert-array.R index 973fc8612..7b7be069a 100644 --- a/r/tests/testthat/test-convert-array.R +++ b/r/tests/testthat/test-convert-array.R @@ -154,6 +154,17 @@ test_that("convert to vector works for tibble", { ) }) +test_that("convert to vector works for a rcrd-sytle vctr with complex columns", { + skip_if_not_installed("vctrs") + + rcrd <- vctrs::new_rcrd(list(x = data.frame(y = 1:10))) + rcrd_array <- as_nanoarrow_array(vctrs::vec_data(rcrd)) + expect_identical( + convert_array(rcrd_array, rcrd), + rcrd + ) +}) + test_that("convert to vector works for nanoarrow_vctr()", { array <- as_nanoarrow_array(c("one", "two", "three")) @@ -1084,6 +1095,30 @@ test_that("convert to vector works for null -> vctrs::list_of()", { ) }) +test_that("convert to vector works for map -> vctrs::list_of", { + skip_if_not_installed("arrow") + skip_if_not_installed("vctrs") + + values <- vctrs::list_of( + data.frame(key = "key1", value = 1L), + data.frame(key = c("key2", "key3"), value = c(2L, 3L)), + NULL + ) + + array_list <- as_nanoarrow_array( + arrow::Array$create( + values, + type = arrow::map_of(arrow::string(), arrow::int32()) + ) + ) + + # Default conversion + expect_identical( + convert_array(array_list), + values + ) +}) + test_that("convert to vector works for fixed_size_list_of() -> matrix()", { mat <- matrix(1:6, ncol = 2, byrow = TRUE) array <- as_nanoarrow_array(mat)