From f6e29baf59d0a95c75dd109ecda5e4df346221aa Mon Sep 17 00:00:00 2001
From: mike
Date: Sat, 16 Mar 2024 11:33:54 +1000
Subject: [PATCH] Added read_ndjson_str()

---
 NAMESPACE                    |   1 +
 NEWS.md                      |   1 +
 R/ndjson.R                   |  53 +++++
 man/read_json_conn.Rd        |   3 +-
 man/read_json_file.Rd        |   3 +-
 man/read_json_raw.Rd         |   3 +-
 man/read_json_str.Rd         |   3 +-
 man/read_ndjson_file.Rd      |   3 +-
 man/read_ndjson_str.Rd       |  68 ++++++
 src/Makevars                 |   2 +-
 src/init.c                   |   6 +
 src/ndjson-parse.c           | 422 +++++++++++++++++++++++++++++++++++
 src/utils.c                  |   2 +-
 tests/testthat/test-ndjson.R |  42 +++-
 14 files changed, 601 insertions(+), 11 deletions(-)
 create mode 100644 man/read_ndjson_str.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 4ec2d9c..ef2ea92 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -7,6 +7,7 @@ export(read_json_file)
 export(read_json_raw)
 export(read_json_str)
 export(read_ndjson_file)
+export(read_ndjson_str)
 export(validate_json_file)
 export(validate_json_str)
 export(write_json_file)
diff --git a/NEWS.md b/NEWS.md
index 7b01bf1..b36a90c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -3,6 +3,7 @@
 # yyjsonr 0.1.18.9004 2024-03-15
 
 * Re-introduce NDJSON support
+* Add `read_ndjson_str()`
 
 # yyjsonr 0.1.18.9003 2024-03-13
 
diff --git a/R/ndjson.R b/R/ndjson.R
index 81e2aba..ff3ef9c 100644
--- a/R/ndjson.R
+++ b/R/ndjson.R
@@ -63,6 +63,59 @@ read_ndjson_file <- function(filename, type = c('df', 'list'), nread = -1, nskip
 }
 
 
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#' Parse an NDJSON string to a data.frame or list
+#'
+#' If reading as data.frame, each row of NDJSON becomes a row in the data.frame.
+#' If reading as a list, then each row becomes an element in the list.
+#'
+#' If parsing NDJSON to a data.frame it is usually better if the json objects
+#' are consistent from line-to-line. Type inference for the data.frame is done
+#' during initialisation by reading through \code{nprobe} lines. Warning: if
+#' there is a type-mismatch further into the string than was probed, then you
+#' will get missing values in the data.frame, or JSON values not captured in
+#' the R data.
+#'
+#' No flattening of the namespace is done i.e. nested objects remain nested.
+#'
+#' @inheritParams read_ndjson_file
+#' @param x string containing NDJSON
+#'
+#' @examples
+#' json <- write_ndjson_str(head(mtcars))
+#' read_ndjson_str(json, type = 'list')
+#' read_ndjson_str(json, type = 'df')
+#'
+#' @family JSON Parsers
+#' @return NDJSON data read into R as list or data.frame depending
+#'         on \code{'type'} argument
+#' @export
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+read_ndjson_str <- function(x, type = c('df', 'list'), nread = -1, nskip = 0, nprobe = 100, opts = list(), ...) {
+
+  type <- match.arg(type)
+
+  if (type == 'list') {
+    .Call(
+      parse_ndjson_str_as_list_,
+      x,
+      nread,
+      nskip,
+      modify_list(opts, list(...))
+    )
+  } else {
+    .Call(
+      parse_ndjson_str_as_df_,
+      x,
+      nread,
+      nskip,
+      nprobe,
+      modify_list(opts, list(...))
+    )
+  }
+}
+
+
 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #' Write list or data.frame object to NDJSON in a file
 #'
diff --git a/man/read_json_conn.Rd b/man/read_json_conn.Rd
index 10a56b8..bdd3d5d 100644
--- a/man/read_json_conn.Rd
+++ b/man/read_json_conn.Rd
@@ -37,6 +37,7 @@ Other JSON Parsers:
 \code{\link{read_json_file}()},
 \code{\link{read_json_raw}()},
 \code{\link{read_json_str}()},
-\code{\link{read_ndjson_file}()}
+\code{\link{read_ndjson_file}()},
+\code{\link{read_ndjson_str}()}
 }
 \concept{JSON Parsers}
diff --git a/man/read_json_file.Rd b/man/read_json_file.Rd
index aac53cf..82edf0a 100644
--- a/man/read_json_file.Rd
+++ b/man/read_json_file.Rd
@@ -30,6 +30,7 @@ Other JSON Parsers:
 \code{\link{read_json_conn}()},
 \code{\link{read_json_raw}()},
 \code{\link{read_json_str}()},
-\code{\link{read_ndjson_file}()}
+\code{\link{read_ndjson_file}()},
+\code{\link{read_ndjson_str}()}
 }
 \concept{JSON Parsers}
diff --git a/man/read_json_raw.Rd b/man/read_json_raw.Rd
index 55a7f63..185b424 100644
--- a/man/read_json_raw.Rd
+++ b/man/read_json_raw.Rd
@@ -29,6 +29,7 @@ Other JSON Parsers:
 \code{\link{read_json_conn}()},
 \code{\link{read_json_file}()},
 \code{\link{read_json_str}()},
-\code{\link{read_ndjson_file}()}
+\code{\link{read_ndjson_file}()},
+\code{\link{read_ndjson_str}()}
 }
 \concept{JSON Parsers}
diff --git a/man/read_json_str.Rd b/man/read_json_str.Rd
index 6891972..f11bac0 100644
--- a/man/read_json_str.Rd
+++ b/man/read_json_str.Rd
@@ -28,6 +28,7 @@ Other JSON Parsers:
 \code{\link{read_json_conn}()},
 \code{\link{read_json_file}()},
 \code{\link{read_json_raw}()},
-\code{\link{read_ndjson_file}()}
+\code{\link{read_ndjson_file}()},
+\code{\link{read_ndjson_str}()}
 }
 \concept{JSON Parsers}
diff --git a/man/read_ndjson_file.Rd b/man/read_ndjson_file.Rd
index 49d9512..f9119a4 100644
--- a/man/read_ndjson_file.Rd
+++ b/man/read_ndjson_file.Rd
@@ -63,6 +63,7 @@ Other JSON Parsers:
 \code{\link{read_json_conn}()},
 \code{\link{read_json_file}()},
 \code{\link{read_json_raw}()},
-\code{\link{read_json_str}()}
+\code{\link{read_json_str}()},
+\code{\link{read_ndjson_str}()}
 }
 \concept{JSON Parsers}
diff --git a/man/read_ndjson_str.Rd b/man/read_ndjson_str.Rd
new file mode 100644
index 0000000..5d736e3
--- /dev/null
+++ b/man/read_ndjson_str.Rd
@@ -0,0 +1,68 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ndjson.R
+\name{read_ndjson_str}
+\alias{read_ndjson_str}
+\title{Parse an NDJSON string to a data.frame or list}
+\usage{
+read_ndjson_str(
+  x,
+  type = c("df", "list"),
+  nread = -1,
+  nskip = 0,
+  nprobe = 100,
+  opts = list(),
+  ...
+)
+}
+\arguments{
+\item{x}{string containing NDJSON}
+
+\item{type}{The type of R object the JSON should be parsed into. Valid
+values are 'df' or 'list'. Default: 'df' (data.frame)}
+
+\item{nread}{Number of records to read. Default: -1 (reads all JSON strings)}
+
+\item{nskip}{Number of records to skip before starting to read. Default: 0
+(skip no data)}
+
+\item{nprobe}{Number of lines to read to determine types for data.frame
+columns. Default: 100. Use \code{-1} to probe entire file.}
+
+\item{opts}{Named list of options for parsing. Usually created by \code{opts_read_json()}}
+
+\item{...}{Other named options can be used to override any options in \code{opts}.
+The valid named options are identical to arguments to \code{\link[=opts_read_json]{opts_read_json()}}}
+}
+\value{
+NDJSON data read into R as list or data.frame depending
+on \code{'type'} argument
+}
+\description{
+If reading as data.frame, each row of NDJSON becomes a row in the data.frame.
+If reading as a list, then each row becomes an element in the list.
+}
+\details{
+If parsing NDJSON to a data.frame it is usually better if the json objects
+are consistent from line-to-line. Type inference for the data.frame is done
+during initialisation by reading through \code{nprobe} lines. Warning: if
+there is a type-mismatch further into the string than was probed, then you
+will get missing values in the data.frame, or JSON values not captured in
+the R data.
+
+No flattening of the namespace is done i.e. nested objects remain nested.
+}
+\examples{
+json <- write_ndjson_str(head(mtcars))
+read_ndjson_str(json, type = 'list')
+read_ndjson_str(json, type = 'df')
+
+}
+\seealso{
+Other JSON Parsers:
+\code{\link{read_json_conn}()},
+\code{\link{read_json_file}()},
+\code{\link{read_json_raw}()},
+\code{\link{read_json_str}()},
+\code{\link{read_ndjson_file}()}
+}
+\concept{JSON Parsers}
diff --git a/src/Makevars b/src/Makevars
index 10fb6a8..cffea85 100644
--- a/src/Makevars
+++ b/src/Makevars
@@ -1,2 +1,2 @@
 PKG_LIBS=-lz
-#PKG_CFLAGS += -Wconversion
\ No newline at end of file
+#PKG_CFLAGS += -Wconversion
diff --git a/src/init.c b/src/init.c
index 94810cf..fa05a26 100644
--- a/src/init.c
+++ b/src/init.c
@@ -26,6 +26,9 @@ extern SEXP validate_json_str_ (SEXP str_ , SEXP verbose_, SEXP parse_opts_)
 extern SEXP parse_ndjson_file_as_df_  (SEXP filename_, SEXP nread_, SEXP nskip_, SEXP nprobe_, SEXP parse_opts_);
 extern SEXP parse_ndjson_file_as_list_(SEXP filename_, SEXP nread_, SEXP nskip_, SEXP parse_opts_);
 
+extern SEXP parse_ndjson_str_as_df_  (SEXP str_, SEXP nread_, SEXP nskip_, SEXP nprobe_, SEXP parse_opts_);
+extern SEXP parse_ndjson_str_as_list_(SEXP str_, SEXP nread_, SEXP nskip_, SEXP parse_opts_);
+
 extern SEXP serialize_df_to_ndjson_str_ (SEXP robj_, SEXP serialize_opts_);
 extern SEXP serialize_df_to_ndjson_file_(SEXP robj_, SEXP filename_, SEXP serialize_opts_);
 
@@ -58,6 +61,9 @@ static const R_CallMethodDef CEntries[] = {
   {"parse_ndjson_file_as_df_"  , (DL_FUNC) &parse_ndjson_file_as_df_  , 5},
   {"parse_ndjson_file_as_list_", (DL_FUNC) &parse_ndjson_file_as_list_, 4},
 
+  {"parse_ndjson_str_as_df_"  , (DL_FUNC) &parse_ndjson_str_as_df_  , 5},
+  {"parse_ndjson_str_as_list_", (DL_FUNC) &parse_ndjson_str_as_list_, 4},
+
   {"serialize_df_to_ndjson_str_" , (DL_FUNC) &serialize_df_to_ndjson_str_ , 2},
   {"serialize_df_to_ndjson_file_", (DL_FUNC) &serialize_df_to_ndjson_file_, 3},
 
diff --git a/src/ndjson-parse.c b/src/ndjson-parse.c
index 736a191..3bb1ca2 100644
--- a/src/ndjson-parse.c
+++ b/src/ndjson-parse.c
@@ -206,6 +206,130 @@ SEXP parse_ndjson_file_as_list_(SEXP filename_, SEXP nread_, SEXP nskip_, SEXP p
 
 
 
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// Parse ndjson as a list of R objects: one-r-object-per-line-of-input
+//
+// Compared to parsing to data.frame
+//   PRO: Simple
+//   PRO: Can handle any type without worrying about data.frame column types
+//        being consistent across multiple input lines
+//   CON: Slower: Every object on every line gets allocated into an R object
+//        Compared to data.frame which allocates all its space at once and
+//        just slots values into this memory.
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+SEXP parse_ndjson_str_as_list_(SEXP str_, SEXP nread_, SEXP nskip_, SEXP parse_opts_) {
+
+  parse_options opt = create_parse_options(parse_opts_);
+  opt.yyjson_read_flag |= YYJSON_READ_STOP_WHEN_DONE;
+
+  int nread = asInteger(nread_);
+  int nskip = asInteger(nskip_);
+  // Negative 'nread' means "read all records" (avoids signed/unsigned compare)
+  if (nread < 0) { nread = INT32_MAX; }
+
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Allocating a list with a default starting size to grow into.
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SEXP list_ = PROTECT(allocVector(VECSXP, 64));
+  R_xlen_t list_size = XLENGTH(list_);
+
+
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Iterate over the string. For each line
+  //   - check if new data would overflow list
+  //      - if so, then grow list
+  //   - create a yyjson doc from this line
+  //   - if document is NULL
+  //       insert a NULL into list
+  //   - otherwise
+  //       insert resulting robject into list
+  //   - free the doc
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  char *str = (char *)CHAR( STRING_ELT(str_, 0) );
+  size_t str_size = strlen(str);
+  size_t orig_str_size = strlen(str);
+  size_t total_read = 0;
+
+
+
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Skip lines if requested
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  while (nskip > 0 && total_read < orig_str_size) {
+    yyjson_read_err err;
+    yyjson_doc *doc = yyjson_read_opts(str, str_size, opt.yyjson_read_flag, NULL, &err);
+    size_t pos = yyjson_doc_get_read_size(doc);
+    yyjson_doc_free(doc);
+
+    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // Advance string
+    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    total_read += pos + 1;
+    str += pos + 1;
+    str_size -= (pos + 1);
+
+    nskip--;
+  }
+
+
+  unsigned int i = 0;
+  while (total_read < orig_str_size) {
+
+    if (i >= nread) {
+      break;
+    }
+
+    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // Grow list if we need more room
+    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    if (i >= list_size) {
+      UNPROTECT(1);
+      list_ = PROTECT(grow_list(list_));
+      list_size = XLENGTH(list_);
+    }
+
+    yyjson_read_err err;
+    yyjson_doc *doc = yyjson_read_opts(str, str_size, opt.yyjson_read_flag, NULL, &err);
+    size_t pos = yyjson_doc_get_read_size(doc);
+
+
+    if (doc == NULL) {
+      warning("Couldn't parse NDJSON row %i. Inserting 'NULL'\n", i + 1);
+      SET_VECTOR_ELT(list_, i, R_NilValue);
+    } else {
+      SET_VECTOR_ELT(list_, i, parse_json_from_str(str, str_size, &opt));
+    }
+    i++;
+
+    yyjson_doc_free(doc);
+
+
+    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // Advance string
+    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    total_read += pos + 1;
+    str += pos + 1;
+    str_size -= (pos + 1);
+  }
+
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // In-situ faux truncation of a VECSXP object.
+  // This just hides the trailing elements from R
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SETLENGTH(list_, i);
+  SET_TRUELENGTH(list_, list_size);
+  SET_GROWABLE_BIT(list_);
+
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Tidy memory and return (there is no input handle to close for a string)
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  UNPROTECT(1);
+  return list_;
+}
+
+
+
+
 
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 // Parse ndjson as a data.frame one-rorw-per-line-of-input
@@ -508,3 +632,301 @@ SEXP parse_ndjson_file_as_df_(SEXP filename_, SEXP nread_, SEXP nskip_, SEXP npr
   return df_;
 }
 
+
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// Parse string into data.frame
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+SEXP parse_ndjson_str_as_df_(SEXP str_, SEXP nread_, SEXP nskip_, SEXP nprobe_, SEXP parse_opts_) {
+
+  int nprotect = 0;
+  parse_options opt = create_parse_options(parse_opts_);
+  opt.yyjson_read_flag |= YYJSON_READ_STOP_WHEN_DONE;
+
+  int nread  = asInteger(nread_);
+  int nskip  = asInteger(nskip_);
+  int nprobe = asInteger(nprobe_);
+
+  if (nread  <= 0) { nread  = INT32_MAX; }
+  if (nprobe <= 0) { nprobe = INT32_MAX; }
+
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Set up a cursor over the NDJSON string:
+  //   - 'str'           points at the start of the next unread record
+  //   - 'str_size'      number of bytes remaining from 'str' onwards
+  //   - 'orig_str_size' total length of the input string
+  //   - 'total_read'    number of bytes consumed so far
+  // Every record is read with YYJSON_READ_STOP_WHEN_DONE (set above), and
+  // yyjson_doc_get_read_size() reports how many bytes that record used.
+  // The cursor then advances by that size plus one separator character
+  // (the '+ 1' in each "Advance string" step below).
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  char *str = (char *)CHAR( STRING_ELT(str_, 0) );
+  size_t str_size = strlen(str);
+  size_t orig_str_size = strlen(str);
+  size_t total_read = 0;
+
+
+
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Skip lines if requested
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  while (nskip > 0 && total_read < orig_str_size) {
+    yyjson_read_err err;
+    yyjson_doc *doc = yyjson_read_opts(str, str_size, opt.yyjson_read_flag, NULL, &err);
+    size_t pos = yyjson_doc_get_read_size(doc);
+    yyjson_doc_free(doc);
+
+    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // Advance string
+    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    total_read += pos + 1;
+    str += pos + 1;
+    str_size -= (pos + 1);
+
+    nskip--;
+  }
+
+
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Accumulation of unique key-names in the objects
+  // These will become the column names of the data.frame.
+  // Each column also has a 'type_bitset' to keep track of the type of each
+  // value across the different {}-objects
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  char *colname[MAX_DF_COLS];
+  unsigned int type_bitset[MAX_DF_COLS] = {0};
+  unsigned int sexp_type[MAX_DF_COLS] = {0};
+  int ncols = 0;
+  int nrows = 0;
+
+
+  char *mark_str = str;
+  size_t mark_str_size = str_size;
+  size_t mark_total_read = total_read;
+
+
+  while (nprobe > 0 && total_read < orig_str_size) {
+    yyjson_read_err err;
+    yyjson_doc *doc = yyjson_read_opts(str, str_size, opt.yyjson_read_flag, NULL, &err);
+    size_t pos = yyjson_doc_get_read_size(doc);
+    if (doc == NULL) {
+      // output_verbose_error(buf, err);
+      error("Couldn't parse JSON during probe line %i\n", nrows + 1);
+    }
+
+    yyjson_val *obj = yyjson_doc_get_root(doc);
+    yyjson_val *key;
+    yyjson_obj_iter obj_iter = yyjson_obj_iter_with(obj); // MUST be an object
+
+    while ((key = yyjson_obj_iter_next(&obj_iter))) {
+      yyjson_val *val = yyjson_obj_iter_get_val(key);
+
+      int name_idx = -1;
+      for (int i = 0; i < ncols; i++) {
+        if (yyjson_equals_str(key, colname[i])) {
+          name_idx = i;
+          break;
+        }
+      }
+      if (name_idx < 0) {
+        // Name has not been seen yet
+        name_idx = ncols;
+        colname[ncols] = (char *)yyjson_get_str(key); // pointer into 'doc'
+        ncols++;
+        if (ncols == MAX_DF_COLS) {
+          error("Maximum columns for data.frame exceeded: %i", MAX_DF_COLS);
+        }
+      }
+
+      type_bitset[name_idx] = update_type_bitset(type_bitset[name_idx], val, &opt);
+    }
+
+    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // Advance string
+    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    total_read += pos + 1;
+    str += pos + 1;
+    str_size -= (pos + 1);
+    // 'doc' is not freed here: 'colname[]' holds pointers into its memory
+
+    nrows++;
+    nprobe--;
+  }
+  // Rprintf("Step X0: nrows = %i\n", nrows);
+
+  // json <- write_ndjson_str(head(mtcars)); read_ndjson_str(json)
+
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Read the rest of the string to figure out how many rows there are in total
+  // by counting the newlines which terminate each remaining record
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  if (total_read < orig_str_size) {
+    for (size_t sp = 0; sp < str_size; sp++) {
+      if (str[sp] == '\n') {
+        nrows++;
+      }
+    }
+    if (str[str_size - 1] != '\n') {
+      // String does not end in newline, so need to manually count the last row
+      nrows++;
+    }
+  }
+
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // How many rows does the user want to read vs how many do we have
+  // and how many they want to skip.
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  nrows = nrows > nread ? nread : nrows;
+
+
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Create a data.frame.
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SEXP df_ = PROTECT(allocVector(VECSXP, ncols)); nprotect++;
+
+
+
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // For each column name,
+  //   - determine the best SEXP to represent the 'type_bitset'
+  //   - Call a parse function which will
+  //        - loop through the entire []-array, plucking the value from each
+  //          {}-object
+  //        - return an atomic vector or a list
+  //   - place this vector as a column in the data.frame
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  for (unsigned int col = 0; col < ncols; col++) {
+    sexp_type[col] = get_best_sexp_to_represent_type_bitset(type_bitset[col], &opt);
+
+    // INT64SXP is actually contained in a REALSXP
+    unsigned int alloc_type = sexp_type[col] == INT64SXP ? REALSXP : sexp_type[col];
+
+    // Allocate memory for column
+    SEXP vec_ = PROTECT(allocVector(alloc_type, nrows));
+    if (sexp_type[col] == INT64SXP) {
+      setAttrib(vec_, R_ClassSymbol, mkString("integer64"));
+    }
+
+    // place vector into data.frame
+    SET_VECTOR_ELT(df_, col, vec_);
+    UNPROTECT(1); // no longer needs protection once part of data.frame
+  }
+
+
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Parse string (rewind the cursor to where probing started)
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  str = mark_str;
+  str_size = mark_str_size;
+  total_read = mark_total_read;
+
+  // keep track of actual number of rows parsed.
+  // This might not be the same as 'nrow' as we can skip rows that we
+  // can't parse.
+  int row = 0;
+
+  for (unsigned int i = 0; i < nrows; i++) {
+    yyjson_read_err err;
+    yyjson_doc *doc = yyjson_read_opts(str, str_size, opt.yyjson_read_flag, NULL, &err);
+    size_t pos = yyjson_doc_get_read_size(doc);
+    if (doc == NULL) {
+      // output_verbose_error(buf, err);
+      error("Couldn't parse JSON on line %i\n", i + 1);
+    }
+
+    yyjson_val *obj = yyjson_doc_get_root(doc);
+    if (yyjson_get_type(obj) != YYJSON_TYPE_OBJ) {
+      error("parse_ndjson_as_df() only works if all lines represent JSON objects");
+    }
+
+    for (unsigned int col = 0; col < ncols; col++) {
+      SEXP column_ = VECTOR_ELT(df_, col);
+
+      yyjson_val *val = yyjson_obj_get(obj, colname[col]);
+
+      switch(sexp_type[col]) {
+      case LGLSXP:
+        LOGICAL(column_)[row] = json_val_to_logical(val, &opt);
+        break;
+      case INTSXP:
+        INTEGER(column_)[row] = json_val_to_integer(val, &opt);
+        break;
+      case INT64SXP: {
+        long long tmp = json_val_to_integer64(val, &opt);
+        ((long long *)(REAL(column_)))[row] = tmp;
+      }
+        break;
+      case REALSXP:
+        REAL(column_)[row] = json_val_to_double(val, &opt);
+        break;
+      case STRSXP:
+        if (val == NULL) {
+          SET_STRING_ELT(column_, row, NA_STRING);
+        } else {
+          SET_STRING_ELT(column_, row, json_val_to_charsxp(val, &opt));
+        }
+        break;
+      case VECSXP:
+        if (val == NULL) {
+          SET_VECTOR_ELT(column_, row, opt.df_missing_list_elem);
+        } else {
+          SET_VECTOR_ELT(column_, row, json_as_robj(val, &opt));
+        }
+        break;
+      default:
+        error("parse_ndjson_str_as_df_(): Unknown type");
+      }
+
+    }
+
+    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // Advance string
+    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    total_read += pos + 1;
+    str += pos + 1;
+    str_size -= (pos + 1);
+    yyjson_doc_free(doc); // row values were converted to R objects above
+
+    row++;
+  }
+
+
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Set colnames on data.frame
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SEXP nms_ = PROTECT(allocVector(STRSXP, ncols)); nprotect++;
+  for (unsigned int i = 0; i < ncols; i++) {
+    SET_STRING_ELT(nms_, i, mkChar(colname[i]));
+  }
+  Rf_setAttrib(df_, R_NamesSymbol, nms_);
+
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Resize each data.frame column vector to match the actual data length
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  if (nrows != row) {
+    int allocated_length = nrows;
+    int data_length = row;
+    for (int i=0; i < length(df_); i++) {
+      SETLENGTH(VECTOR_ELT(df_, i), data_length);
+      SET_TRUELENGTH(VECTOR_ELT(df_, i), allocated_length);
+      SET_GROWABLE_BIT(VECTOR_ELT(df_, i));
+    }
+  }
+
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Set empty rownames on data.frame
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SEXP rownames = PROTECT(allocVector(INTSXP, 2)); nprotect++;
+  SET_INTEGER_ELT(rownames, 0, NA_INTEGER);
+  SET_INTEGER_ELT(rownames, 1, -row);
+  setAttrib(df_, R_RowNamesSymbol, rownames);
+
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Set 'data.frame' class
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SET_CLASS(df_, mkString("data.frame"));
+
+  UNPROTECT(nprotect);
+  return df_;
+}
diff --git a/src/utils.c b/src/utils.c
index c8606de..4a87ce3 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -20,4 +20,4 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 SEXP yyjson_version_(void) {
   return mkString(YYJSON_VERSION_STRING);
-}
\ No newline at end of file
+}
diff --git a/tests/testthat/test-ndjson.R b/tests/testthat/test-ndjson.R
index d099142..6138914 100644
--- a/tests/testthat/test-ndjson.R
+++ b/tests/testthat/test-ndjson.R
@@ -59,9 +59,9 @@ test_that("write_ndjson_file df works", {
 test_that("write_ndjson_str df works", {
   file <- tempfile()
   write_ndjson_file(iris, file)
-  ref <- write_ndjson_str(iris)
+  ref2 <- write_ndjson_str(iris)
   res <- paste(readLines(file), collapse = "\n")
-  expect_identical(res, ref)
+  expect_identical(res, ref2)
 })
 
 test_that("write_ndjson_file list works", {
@@ -74,11 +74,45 @@ test_that("write_ndjson_file list works", {
 
 
 test_that("write_ndjson_str list works", {
+
+  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  # Check write_ndjson_file() and write_ndjson_str() agree
+  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   file <- tempfile()
   write_ndjson_file(tref, file)
-  ref <- write_ndjson_str(tref)
+  ref2 <- write_ndjson_str(tref)
   res <- paste(readLines(file), collapse = "\n")
-  expect_identical(res, ref)
+  expect_identical(res, ref2)
+
+  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  # Read NDJSON string as list
+  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  json <- write_ndjson_str(tref)
+  ref2 <- read_ndjson_str(json, type = 'list')
+  expect_identical(ref2, tref)
+
+  json <- write_ndjson_str(tref)
+  ref2 <- read_ndjson_str(json, type = 'list', nskip = 1)
+  expect_identical(ref2, tref[-1])
+
+  json <- write_ndjson_str(tref)
+  ref2 <- read_ndjson_str(json, type = 'list', nskip = 2, nread = 3)
+  expect_identical(ref2, tref[3:5])
+
+  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  # Read NDJSON string as data.frame
+  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  json <- write_ndjson_str(ref)
+  ref2 <- read_ndjson_str(json, type = 'df')
+  expect_identical(ref2, ref)
+
+  json <- write_ndjson_str(ref)
+  ref2 <- read_ndjson_str(json, type = 'df', nskip = 1)
+  expect_identical(ref2, ref[-1, ], ignore_attr = TRUE)
+
+  json <- write_ndjson_str(ref)
+  ref2 <- read_ndjson_str(json, type = 'df', nskip = 2, nread = 3)
+  expect_identical(ref2, ref[3:5, ], ignore_attr = TRUE)
 })
 
 