From 02e340923e815543bdfd1f5da42e05fd47917849 Mon Sep 17 00:00:00 2001 From: Henrik Lindberg Date: Thu, 14 Jul 2022 21:29:12 +0200 Subject: [PATCH 1/3] Improve performance of transpose_list() This improves the runtime significantly for loading data with many columns. The order of loop nesting as well as a much more efficient binary search does the trick. In a real world example, fetching ~300k rows with ~50 columns from MongoDB, this brings the query + load time from 70 seconds to ~40. Microbenchmark with synthetic data on an AMD 5950X, 128GB RAM, Fedora Linux 36, R 4.1.3, jsonlite 1.8.0.9000 commit 80854359 ``` > set.seed(1) > rows <- 10000 > columns <- 100 > p_missing <- 0.2 > > recordlist <- lapply(1:rows, function(rownum) { + row <- as.list(1:columns) + names(row) <- paste0("col_", row) + row[runif(columns) > p_missing] + }) > columns <- unique(unlist(lapply(recordlist, names), recursive = FALSE, + use.names = FALSE)) ``` Before this change ``` > microbenchmark::microbenchmark( + jsonlite:::transpose_list(recordlist, columns), + times = 10 + ) Unit: milliseconds expr min lq mean median uq max neval jsonlite:::transpose_list(recordlist, columns) 577.8338 589.4064 593.0518 591.6895 599.4221 607.3057 10 ``` With this change ``` > microbenchmark::microbenchmark( + jsonlite:::transpose_list(recordlist, columns), + times = 10 + ) Unit: milliseconds expr min lq mean median uq max neval jsonlite:::transpose_list(recordlist, columns) 41.37537 43.22655 43.88987 43.76705 45.43552 46.81052 10 ``` --- R/transpose_list.R | 10 +++++- src/transpose_list.c | 47 ++++++++++++++++++++----- tests/testthat/test-simplifyDataFrame.R | 33 +++++++++++++++++ 3 files changed, 80 insertions(+), 10 deletions(-) create mode 100644 tests/testthat/test-simplifyDataFrame.R diff --git a/R/transpose_list.R b/R/transpose_list.R index f5501a22..416ffaf6 100644 --- a/R/transpose_list.R +++ b/R/transpose_list.R @@ -1,4 +1,12 @@ #' @useDynLib jsonlite C_transpose_list transpose_list <- function(x, names) { - .Call(C_transpose_list, x, names) + # Sort names before entering C, allowing for a binary search + LC_COLLATE <- "LC_COLLATE" + collate_before <- Sys.getlocale(LC_COLLATE) + on.exit(Sys.setlocale(LC_COLLATE, collate_before)) + Sys.setlocale(LC_COLLATE, "C") + sorted_names <- sort(names) + + transposed <- .Call(C_transpose_list, x, sorted_names) + transposed[match(names, sorted_names)] } diff --git a/src/transpose_list.c b/src/transpose_list.c index 9607a38e..632f594d 100644 --- a/src/transpose_list.c +++ b/src/transpose_list.c @@ -1,26 +1,55 @@ #include #include +// names is assumed to be sorted, to make names matching faster +// by using a binary search SEXP C_transpose_list(SEXP x, SEXP names) { size_t ncol = Rf_length(names); size_t nrow = Rf_length(x); SEXP out = PROTECT(allocVector(VECSXP, ncol)); + + // Allocate output for(size_t i = 0; i < ncol; i++){ - const char * targetname = CHAR(STRING_ELT(names, i)); SEXP col = PROTECT(allocVector(VECSXP, nrow)); - for(size_t j = 0; j < nrow; j++){ - //search for 'targetname' in each record j - SEXP list = VECTOR_ELT(x, j); - SEXP listnames = getAttrib(list, R_NamesSymbol); - for(size_t k = 0; k < Rf_length(listnames); k++){ - if(!strcmp(CHAR(STRING_ELT(listnames, k)), targetname)){ + SET_VECTOR_ELT(out, i, col); + UNPROTECT(1); + } + + // Find and save all elements in their transposed place + for(size_t j = 0; j < nrow; j++){ + SEXP list = VECTOR_ELT(x, j); + SEXP listnames = getAttrib(list, R_NamesSymbol); + size_t listlength = Rf_length(listnames); + + for(size_t k = 0; k < listlength; k++){ + const char * listname = CHAR(STRING_ELT(listnames, k)); + + // Binary search for a name match + size_t low = 0; + size_t high = ncol - 1; + size_t mid; + while(1){ + mid = (low + high) / 2; + const char * targetname = CHAR(STRING_ELT(names, mid)); + + int strcmp_result = strcmp(listname, targetname); + if(strcmp_result == 0){ + // Match! + SEXP col = VECTOR_ELT(out, mid); SET_VECTOR_ELT(col, j, VECTOR_ELT(list, k)); break; + } else if (strcmp_result > 0){ + low = mid + 1; + } else { + high = mid - 1; + } + + if (low > high) { + // No match to be found + break; } } } - SET_VECTOR_ELT(out, i, col); - UNPROTECT(1); } //setAttrib(out, R_NamesSymbol, names); UNPROTECT(1); diff --git a/tests/testthat/test-simplifyDataFrame.R b/tests/testthat/test-simplifyDataFrame.R new file mode 100644 index 00000000..c8d07afa --- /dev/null +++ b/tests/testthat/test-simplifyDataFrame.R @@ -0,0 +1,33 @@ +context("simplifyDataFrame") + +test_that("simplifyDataFrame() works", { + source <- list( + list(a = 11, b = 12), + list(d = 24), + list(a = 31, c = 33) + ) + + actual <- simplifyDataFrame(source, flatten = TRUE) + + # Check that column order is preserved as discovered in the data + expect_equal(colnames(actual), c("a", "b", "d", "c")) + + expect_row_equals <- function(number, expected) { + expect_equal( + as.numeric(actual[number, ]), + expected + ) + } + # a b d c + expect_row_equals(1, c(11, 12, NA, NA)) + expect_row_equals(2, c(NA, NA, 24, NA)) + expect_row_equals(3, c(31, NA, NA, 33)) +}) + +test_that("transpose_list() does not change locale", { + locale_before <- Sys.getlocale() + transpose_list(list(a = 1), c("a")) + locale_after <- Sys.getlocale() + + expect_equal(locale_before, locale_after) +}) From 8b467f62835602014a52483929a4639dcd7c4fa9 Mon Sep 17 00:00:00 2001 From: Henrik Lindberg Date: Sun, 17 Jul 2022 11:57:05 +0200 Subject: [PATCH 2/3] Protect from edge case infinite loop If a name exists in the data, sorted less than the smallest being requested, the previous code would end up in an infinite loop. --- src/transpose_list.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/transpose_list.c b/src/transpose_list.c index 632f594d..c2e5d095 100644 --- a/src/transpose_list.c +++ b/src/transpose_list.c @@ -28,7 +28,7 @@ SEXP C_transpose_list(SEXP x, SEXP names) { size_t low = 0; size_t high = ncol - 1; size_t mid; - while(1){ + while(low <= high){ mid = (low + high) / 2; const char * targetname = CHAR(STRING_ELT(names, mid)); @@ -41,13 +41,11 @@ SEXP C_transpose_list(SEXP x, SEXP names) { } else if (strcmp_result > 0){ low = mid + 1; } else { + if (high == 0) { + break; + } high = mid - 1; } - - if (low > high) { - // No match to be found - break; - } } } } From 5a9331e35ea1a81d449b56deb1d6d07e95207316 Mon Sep 17 00:00:00 2001 From: Henrik Lindberg Date: Sun, 17 Jul 2022 12:19:14 +0200 Subject: [PATCH 3/3] Fix typo --- src/transpose_list.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transpose_list.c b/src/transpose_list.c index c2e5d095..677af8a4 100644 --- a/src/transpose_list.c +++ b/src/transpose_list.c @@ -41,7 +41,7 @@ SEXP C_transpose_list(SEXP x, SEXP names) { } else if (strcmp_result > 0){ low = mid + 1; } else { - if (high == 0) { + if (mid == 0) { break; } high = mid - 1;