diff --git a/base/db/.Rbuildignore b/base/db/.Rbuildignore index 91114bf2f2b..02548d35d12 100644 --- a/base/db/.Rbuildignore +++ b/base/db/.Rbuildignore @@ -1,2 +1,3 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +try\.sqlite diff --git a/base/db/.gitignore b/base/db/.gitignore new file mode 100644 index 00000000000..c1e2f2a3c7b --- /dev/null +++ b/base/db/.gitignore @@ -0,0 +1,2 @@ +try.sqlite +inst/import-try/data-proc diff --git a/base/db/DESCRIPTION b/base/db/DESCRIPTION index 0e1d9183067..82366762c69 100644 --- a/base/db/DESCRIPTION +++ b/base/db/DESCRIPTION @@ -18,6 +18,10 @@ Imports: PEcAn.utils, dbplyr (>= 1.2.0), dplyr, + tibble, + purrr, + tidyr, + glue, lubridate, magrittr, ncdf4, @@ -25,7 +29,12 @@ Imports: udunits2 Suggests: RPostgreSQL, - testthat (>= 1.0.2) + RSQLite, + testthat (>= 1.0.2), + tidyverse, + data.table, + rcrossref, + here License: FreeBSD + file LICENSE Copyright: Authors LazyLoad: yes diff --git a/base/db/NAMESPACE b/base/db/NAMESPACE index 78717a7f0b4..01bcbcd748e 100644 --- a/base/db/NAMESPACE +++ b/base/db/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export("%>%") export(append.covariate) export(assign.treatments) export(bety2pecan) @@ -12,6 +13,7 @@ export(db.print.connections) export(db.query) export(db.showQueries) export(dbHostInfo) +export(db_merge_into) export(dbfile.check) export(dbfile.file) export(dbfile.id) @@ -32,7 +34,9 @@ export(get_run_ids) export(get_users) export(get_var_names) export(get_workflow_ids) +export(insert_table) export(load_data_single_run) +export(match_dbcols) export(ncdays2date) export(query.file.path) export(query.format.vars) @@ -43,7 +47,9 @@ export(query.trait.data) export(query.traits) export(rename_jags_columns) export(runs) +export(search_references) export(take.samples) +export(try2sqlite) export(var_names_all) export(workflow) export(workflows) diff --git a/base/db/R/db_merge_into.R b/base/db/R/db_merge_into.R new file mode 100644 index 00000000000..a00718e5ac3 --- /dev/null +++ b/base/db/R/db_merge_into.R @@ -0,0 +1,29 @@ +#' Merge local data frame into SQL table +#' +#' @inheritParams insert_table +#' @inheritDotParams insert_table +#' @param by Character vector of columns by which to perform merge. Defaults to all columns in `values` +#' @return Data frame: Inner join of SQL table and input data frame (as unevaluated "lazy query" table) +#' @export +#' @examples +#' irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") +#' dplyr::copy_to(irisdb, iris[1:10,], name = "iris", overwrite = TRUE) +#' db_merge_into(iris[1:12,], "iris", irisdb) +#' dplyr::tbl(irisdb, "iris") %>% dplyr::count() +db_merge_into <- function(values, table, con, by = NULL, drop = FALSE, ...) { + values_fixed <- match_dbcols(values, table, con, drop = FALSE) + if (is.null(by)) { + by <- match_colnames(values, table, con) + } + sql_tbl <- dplyr::tbl(con, table) + values_merge <- dplyr::anti_join(values_fixed, sql_tbl, by = by, copy = TRUE) + if (nrow(values_merge) < 1 || ncol(values_merge) < 1) { + PEcAn.logger::logger.warn( + "Input table for merge is empty." + ) + } else { + insert <- insert_table(values_merge, table, con, ...) + } + dplyr::tbl(con, table) %>% + dplyr::inner_join(values_fixed, copy = TRUE) +} diff --git a/base/db/R/insert_table.R b/base/db/R/insert_table.R new file mode 100644 index 00000000000..f9e45c488bf --- /dev/null +++ b/base/db/R/insert_table.R @@ -0,0 +1,112 @@ +#' Insert R data frame into SQL database +#' +#' First, subset to matching columns. Then, make sure the local and SQL column +#' classes match, coercing local to SQL as necessary (or throwing an error). +#' Then, build an SQL string for the insert statement. Finally, insert into the +#' database. +#' +#' @param values `data.frame` of values to write to SQL database +#' @param table Name of target SQL table, as character +#' @param coerce_col_class logical, whether or not to coerce local data columns +#' to SQL classes. Default = `TRUE.` +#' @param drop logical. If `TRUE` (default), drop columns not found in SQL table. +#' @inheritParams db.query +#' @inherit db.query return +#' @export +#' @examples +#' irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") +#' dplyr::copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE) +#' insert_table(iris[-1,], "iris", irisdb$con) +#' dplyr::tbl(irisdb, "iris") +insert_table <- function(values, table, con, coerce_col_class = TRUE, drop = TRUE) { + values_fixed <- match_dbcols(values, table, con, coerce_col_class, drop = TRUE) + insert_query <- build_insert_query(values_fixed, table, .con = con) + db.query(insert_query, con) +} + +#' Match column names and classes between local and SQL table +#' +#' @inheritParams insert_table +#' @return `values` `data.frame` with column names and classes matched to SQL +#' @export +match_dbcols <- function(values, table, con, coerce_col_class = TRUE, drop = TRUE) { + use_cols <- match_colnames(values, table, con) + if (length(use_cols) < 1) { + PEcAn.logger::logger.severe( + "No columns match between input and target table." + ) + } + PEcAn.logger::logger.debug( + "Matched the following cols: ", + paste(use_cols, collapse = ", ") + ) + values_sub <- values[, use_cols] + # Load one row to get column types + sql_row <- dplyr::tbl(con, table) %>% head(1) %>% dplyr::collect() + sql_types <- purrr::map(sql_row, class) %>% + purrr::map_chr(1) %>% + .[use_cols] + values_types <- purrr::map(values_sub, class) %>% purrr::map_chr(1) + type_mismatch <- sql_types != values_types + if (sum(type_mismatch) > 0) { + mismatch_string <- sprintf( + "%s: local is %s, SQL is %s", + names(values_types), + values_types, + sql_types + )[type_mismatch] + PEcAn.logger::logger.info( + "Found type mismatches in the following columns: ", + paste0(mismatch_string, collapse = "; ") + ) + if (!coerce_col_class) { + PEcAn.logger::logger.severe( + "Type mismatch detected, and `coerce_col_class` is `FALSE`. ", + "Fix column class mismatches manually." + ) + } else { + PEcAn.logger::logger.info( + "Coercing local column types to match SQL." + ) + # Coerce values data frame to these types + values_fixed <- purrr::map2_dfc(values_sub, sql_types, as) + } + } else { + values_fixed <- values_sub + } + if (drop) { + values_fixed + } else { + drop_cols <- colnames(values)[!colnames(values) %in% use_cols] + dplyr::bind_cols(values_fixed, values[, drop_cols]) + } +} + +#' Match names of local data frame to SQL table +#' +#' @inheritParams insert_table +match_colnames <- function(values, table, con) { + tbl_db <- dplyr::tbl(con, table) + table_cols <- dplyr::tbl_vars(tbl_db) + values_cols <- colnames(values) + intersect(values_cols, table_cols) +} + +#' Build query to insert R data frame into SQL table +#' +#' @inheritParams insert_table +#' @inheritParams glue::glue_sql +build_insert_query <- function(values, table, .con) { + value_list <- purrr::map(seq_len(nrow(values)), ~as.list(values[.x, ])) + + insert_list <- value_list %>% + purrr::map(unname) %>% + purrr::map(dbplyr::escape) %>% + purrr::map(dbplyr::sql_vector) + + glue::glue_sql( + "INSERT INTO {`table`} ({`colnames(values)`*}) ", + "VALUES {insert_list*}", + .con = .con + ) +} diff --git a/base/db/R/search_references.R b/base/db/R/search_references.R new file mode 100644 index 00000000000..14b61adc7bb --- /dev/null +++ b/base/db/R/search_references.R @@ -0,0 +1,67 @@ +#' Perform crossref search for a list of references +#' +#' @param queries Character vector of queries +#' @inheritDotParams search_reference_single +#' @inherit search_reference_single description return +#' @export +search_references <- function(queries, ...) { + search_fun <- search_reference_single %>% + purrr::partial(...) %>% + purrr::possibly(otherwise = data.frame(title = "Not found")) + encodeString(queries) %>% + purrr::map_dfr(search_fun) +} + +#' Perform crossref search for a single reference +#' +#' Requires the `rcrossref` package. +#' +#' @param query Citation string (length 1) to search for DOI +#' @param min_score Minimum match score. Default (85) is fairly strict. +#' @param limit Number of results to return +#' @return `data.frame` containing crossref information converted to match bety citations table. +search_reference_single <- function(query, limit = 1, min_score = 85) { + stopifnot(length(query) == 1) + PEcAn.logger::logger.debug("Processing query:\n", query) + crsearch <- rcrossref::cr_works(query = query, limit = limit) + if (is.null(crsearch[["data"]])) { + PEcAn.logger::logger.warn( + "Error in crossref query. ", + "Setting title to search string and leaving other fields blank." + ) + return(tibble::tibble(query = query)) + } + crdata <- crsearch[["data"]] %>% + dplyr::mutate(score = as.numeric(score)) %>% + dplyr::filter(score > min_score) + if (nrow(crdata) < 1) { + PEcAn.logger::logger.info( + "No matches found. ", + "Setting title to search string and leaving other fields blank.") + return(tibble::tibble(query = query)) + } + keep_cols <- c( + "author", + "year", + "title", + journal = "container.title", + vol = "volume", + pg = "page", + doi = "DOI", + "score", + "query" + ) + proc_search <- crdata %>% + dplyr::mutate( + # Get the first author only -- this is the BETY format + author_family = purrr::map(author, list("family", 1)), + author_given = purrr::map(author, list("given", 1)), + author = paste(author_family, author_given, sep = ", "), + year = gsub("([[:digit:]]{4}).*", "\\1", issued) %>% as.numeric(), + query = query, + score = as.numeric(score) + ) + use_cols <- keep_cols[keep_cols %in% colnames(proc_search)] + dplyr::select(proc_search, !!!use_cols) +} + diff --git a/base/db/R/try2sqlite.R b/base/db/R/try2sqlite.R new file mode 100644 index 00000000000..828afd00957 --- /dev/null +++ b/base/db/R/try2sqlite.R @@ -0,0 +1,91 @@ +#' Convert TRY text file to SQLite database +#' +#' The TRY file is huge and unnecessarily long, which makes it difficult to +#' work with. The resulting SQLite database is much smaller on disk, and can be +#' read much faster thanks to lazy evaluation. +#' +#' The resulting TRY SQLite database contains the following tables: +#' - `values` -- The actual TRY data. Links to all other tables through ID columns. +#' - `traits` -- Description of trait and data names. Links to `values` through `DataID`. Similar to BETY `variables` table. +#' - `datasets` -- Description of datasets and references/citations. Links to `values` through `DatasetID` and `ReferenceID`. +#' - `species` -- Species. Links to `values` through `AccSpeciesID`. +#' +#' @param try_files Character vector of file names containing TRY data. +#' Multiple files are combined with `data.table::rbindlist`. +#' @param sqlite_file Target SQLite database file name, as character. +#' @export +try2sqlite <- function(try_files, sqlite_file = "try.sqlite") { + # Read files + PEcAn.logger::logger.info("Reading in TRY data...") + raw_data <- Map(data.table::fread, try_files) %>% + data.table::rbindlist() + + # Create integer reference ID for compact storage + PEcAn.logger::logger.info("Adding ReferenceID column") + raw_data[["ReferenceID"]] <- as.integer(factor(raw_data[["Reference"]])) + + # Create tables + PEcAn.logger::logger.info("Extracting data values table.") + data_cols <- c( + "ObsDataID", # TRY row ID -- unique to each observation of a given trait + "ObservationID", # TRY "entity" ID -- identifies a set of trait measurements (e.g. leaf) + "DataID", # Links to data ID + "StdValue", # Standardized, QA-QC'ed value + "UnitName", # Standardized unit + "AccSpeciesID", # Link to 'species' table + "DatasetID", # Link to 'datasets' table. + "ReferenceID", # Link to 'try_references' table. + "ValueKindName", # Type of value, e.g. mean, min, max, etc. + "UncertaintyName", # Kind of uncertainty + "Replicates", # Number of replicates + "RelUncertaintyPercent", + "OrigValueStr", # Original data, as character string (before QA/QC) + "OrigUnitStr", # Original unit, as character string (before QA/QC) + "OrigUncertaintyStr" # Original uncertainty, as character string (before QA/QC) + ) + data_values <- unique(raw_data[, data_cols, with = FALSE]) + + PEcAn.logger::logger.info("Extrating datasets table...") + datasets_cols <- c( + "DatasetID", + "Dataset", + "LastName", + "FirstName", + "Reference", + "ReferenceID" + ) + datasets_values <- unique(raw_data[, datasets_cols, with = FALSE]) + + PEcAn.logger::logger.info("Extracting traits table...") + traits_cols <- c( + "DataID", + "DataName", + "TraitID", + "TraitName" + ) + traits_values <- unique(raw_data[, traits_cols, with = FALSE]) + + PEcAn.logger::logger.info("Extracting species table...") + species_cols <- c( + "AccSpeciesID", + "AccSpeciesName", + "SpeciesName" + ) + species_values <- unique(raw_data[, species_cols, with = FALSE]) + + PEcAn.logger::logger.info("Writing tables to SQLite database...") + con <- DBI::dbConnect(RSQLite::SQLite(), sqlite_file) + on.exit(DBI::dbDisconnect(con)) + PEcAn.logger::logger.info("Writing values table...") + DBI::dbWriteTable(con, "values", data_values) + PEcAn.logger::logger.info("Writing traits table...") + DBI::dbWriteTable(con, "traits", traits_values) + PEcAn.logger::logger.info("Writing datasets table...") + DBI::dbWriteTable(con, "datasets", datasets_values) + PEcAn.logger::logger.info("Writing species table...") + DBI::dbWriteTable(con, "species", species_values) + + PEcAn.logger::logger.info("Done creating TRY SQLite database!") + + NULL +} diff --git a/base/db/R/zz.imports.R b/base/db/R/zz.imports.R index 050b5a37cf9..4eb0423a534 100644 --- a/base/db/R/zz.imports.R +++ b/base/db/R/zz.imports.R @@ -1,3 +1,9 @@ -##' Imports from other packages -##' -##' @importFrom magrittr `%>%` +#' Imports from other packages +#' +#' @name otherimports +#' @importFrom magrittr %>% +#' @export +magrittr::`%>%` + +#' @rdname otherimports +#' @importFrom rlang !! !!! diff --git a/base/db/inst/import-try/01_try_sqlite.R b/base/db/inst/import-try/01_try_sqlite.R new file mode 100644 index 00000000000..9480ea8687e --- /dev/null +++ b/base/db/inst/import-try/01_try_sqlite.R @@ -0,0 +1,18 @@ +# Create the TRY SQLite database +library(PEcAn.DB) +library(PEcAn.logger) +library(here) + +configfile <- here("inst", "import-try", "config.R") +source(configfile) + +if (!exists("overwrite")) { + overwrite <- FALSE +} + +if (!file.exists(sqlite_file) || overwrite) { + file.remove(sqlite_file) + try2sqlite(try_files, sqlite_file) +} else { + logger.info("TRY SQLite database already exists and `overwrite` is FALSE. ") +} diff --git a/base/db/inst/import-try/02_citations.R b/base/db/inst/import-try/02_citations.R new file mode 100644 index 00000000000..98c1dc56758 --- /dev/null +++ b/base/db/inst/import-try/02_citations.R @@ -0,0 +1,71 @@ +# Add DOIs to TRY citations +library(tidyverse) +library(DBI) +library(RSQLite) +library(rcrossref) +library(PEcAn.logger) +library(PEcAn.DB) +library(here) + +wd <- here("inst", "import-try") +configfile <- file.path(wd, "config.R") +source(configfile) + +data_dir <- file.path(wd, "data-proc") +dir.create(data_dir, showWarnings = FALSE) + +bety <- db.open(betyparams) +if (!"notes" %in% dbListFields(bety, "citations")) { + logger.severe( + "`notes` column required in Bety citations table ", + "but not found in this version of Bety. ", + "Please make sure you have performed schema migration ", + "version 20180206152600 (relax_citations)." + ) +} + +trydb <- dbConnect(SQLite(), sqlite_file) + +reference_dat <- tbl(trydb, "datasets") %>% + distinct(Reference, ReferenceID) %>% + collect() + +refs_proc_file <- file.path(data_dir, "refs_proc.rds") +if (file.exists(refs_proc_file)) { + refs_proc <- readRDS(refs_proc_file) +} else { + logger.setLevel("DEBUG") # To get status messages + refs_proc <- reference_dat %>% + mutate(cr_df = map(Reference, search_references, min_score = 40)) %>% + unnest() + logger.setLevel("INFO") + saveRDS(refs_proc, refs_proc_file) +} + +# Replace bad matches with NA +minscore <- 85 +fill_na <- function(field, score) { + na <- as(NA, class(field)) + if_else(score > minscore, field, na) +} +refs_proc2 <- refs_proc %>% + mutate_at( + c("title", "author", "year", "journal", "vol", "pg", "doi"), + fill_na, + score = .$score + ) %>% + mutate( + title = if_else(!is.na(title), title, paste0("TRY ReferenceID ", ReferenceID)), + author = if_else(!is.na(author), author, "Unknown TRY data (see title)"), + author = substr(author, 0, 254), # Trim author to 255 characters + journal = if_else(!is.na(journal), journal, "Unknown TRY data (see title)"), + # Use the Kattge 2007 TRY paper's DOI as a placeholder + doi = if_else(!is.na(doi), doi, "10.1111/j.1365-2486.2011.02451.x"), + year = if_else(!is.na(year), year, 2018), + pg = if_else(!is.na(pg), pg, "9999"), + notes = paste("Original TRY reference: ", Reference) + ) + +bety_refs <- db_merge_into(refs_proc2, "citations", bety, "notes") %>% + collect() +saveRDS(bety_refs, file.path(data_dir, "refs_bety.rds")) diff --git a/base/db/inst/import-try/01.global.subset.R b/base/db/inst/import-try/91.global.subset.R similarity index 100% rename from base/db/inst/import-try/01.global.subset.R rename to base/db/inst/import-try/91.global.subset.R diff --git a/base/db/inst/import-try/02.data.specific.subset.R b/base/db/inst/import-try/92.data.specific.subset.R similarity index 100% rename from base/db/inst/import-try/02.data.specific.subset.R rename to base/db/inst/import-try/92.data.specific.subset.R diff --git a/base/db/inst/import-try/03.create.try.sites.R b/base/db/inst/import-try/93.create.try.sites.R similarity index 100% rename from base/db/inst/import-try/03.create.try.sites.R rename to base/db/inst/import-try/93.create.try.sites.R diff --git a/base/db/inst/import-try/04.match.species.R b/base/db/inst/import-try/94.match.species.R similarity index 100% rename from base/db/inst/import-try/04.match.species.R rename to base/db/inst/import-try/94.match.species.R diff --git a/base/db/inst/import-try/05.citations.R b/base/db/inst/import-try/95.citations.R similarity index 100% rename from base/db/inst/import-try/05.citations.R rename to base/db/inst/import-try/95.citations.R diff --git a/base/db/inst/import-try/06.load.data.R b/base/db/inst/import-try/96.load.data.R similarity index 100% rename from base/db/inst/import-try/06.load.data.R rename to base/db/inst/import-try/96.load.data.R diff --git a/base/db/inst/import-try/README.md b/base/db/inst/import-try/README.md index 33ec4ade800..7ad5b00de76 100644 --- a/base/db/inst/import-try/README.md +++ b/base/db/inst/import-try/README.md @@ -1,8 +1,13 @@ --- -title: "Import TRY database into BETY"" +title: "Import TRY database into BETY" Author: "Alexey Shiklomanov" --- +# Workflow + +1. `01_try_sqlite.R` -- Read TRY data files (huge tab-delimited text files) and convert them to an SQLite database, which is much faster and easier to work with. +2. `02_citations.R` -- Add DOIs to TRY references, if possible. + # Package dependencies 1. `data.table` -- Makes it remotely possible to work with the TRY database. Requires an up-to-date version, so if parts of the workflow break, try re-installing this. 2. `bit64` -- Used by `data.table` to read and store large integers, which constitude most of the ID's in TRY and BETY. diff --git a/base/db/inst/import-try/bety_connect.R b/base/db/inst/import-try/bety_connect.R new file mode 100644 index 00000000000..89d20b37f12 --- /dev/null +++ b/base/db/inst/import-try/bety_connect.R @@ -0,0 +1,6 @@ +bety <- PEcAn.DB::db.open(list( + host = "localhost", + dbname = "bety", + user = "bety", + password = "bety" +)) diff --git a/base/db/inst/import-try/config.R b/base/db/inst/import-try/config.R new file mode 100644 index 00000000000..eff2c8ce7a1 --- /dev/null +++ b/base/db/inst/import-try/config.R @@ -0,0 +1,14 @@ +# Character vector containing TRY file paths +try_files <- c("~/Projects/try-data/4143.txt") +stopifnot(all(file.exists(try_files))) + +# Path to generated TRY SQLite file +sqlite_file <- "inst/import-try/try.sqlite" + +# Bety connection configuration +betyparams <- list( + host = "localhost", + dbname = "bety", + user = "bety", + password = "bety" +) diff --git a/base/db/man/build_insert_query.Rd b/base/db/man/build_insert_query.Rd new file mode 100644 index 00000000000..bbfb23fcfd4 --- /dev/null +++ b/base/db/man/build_insert_query.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/insert_table.R +\name{build_insert_query} +\alias{build_insert_query} +\title{Build query to insert R data frame into SQL table} +\usage{ +build_insert_query(values, table, .con) +} +\arguments{ +\item{values}{`data.frame` of values to write to SQL database} + +\item{table}{Name of target SQL table, as character} + +\item{.con}{[\code{DBIConnection}]:A DBI connection object obtained from \code{DBI::dbConnect()}.} +} +\description{ +Build query to insert R data frame into SQL table +} diff --git a/base/db/man/db_merge_into.Rd b/base/db/man/db_merge_into.Rd new file mode 100644 index 00000000000..63752d4b027 --- /dev/null +++ b/base/db/man/db_merge_into.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/db_merge_into.R +\name{db_merge_into} +\alias{db_merge_into} +\title{Merge local data frame into SQL table} +\usage{ +db_merge_into(values, table, con, by = NULL, drop = FALSE, ...) +} +\arguments{ +\item{values}{`data.frame` of values to write to SQL database} + +\item{table}{Name of target SQL table, as character} + +\item{con}{database connection object} + +\item{by}{Character vector of columns by which to perform merge. Defaults to all columns in `values`} + +\item{drop}{logical. If `TRUE` (default), drop columns not found in SQL table.} + +\item{...}{Arguments passed on to \code{insert_table} +\describe{ + \item{values}{`data.frame` of values to write to SQL database} + \item{table}{Name of target SQL table, as character} + \item{coerce_col_class}{logical, whether or not to coerce local data columns +to SQL classes. Default = `TRUE.`} + \item{drop}{logical. If `TRUE` (default), drop columns not found in SQL table.} + \item{con}{database connection object} +}} +} +\value{ +Data frame: Inner join of SQL table and input data frame (as unevaluated "lazy query" table) +} +\description{ +Merge local data frame into SQL table +} +\examples{ +irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") +dplyr::copy_to(irisdb, iris[1:10,], name = "iris", overwrite = TRUE) +db_merge_into(iris[1:12,], "iris", irisdb) +dplyr::tbl(irisdb, "iris") \%>\% dplyr::count() +} diff --git a/base/db/man/insert_table.Rd b/base/db/man/insert_table.Rd new file mode 100644 index 00000000000..c51740868ce --- /dev/null +++ b/base/db/man/insert_table.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/insert_table.R +\name{insert_table} +\alias{insert_table} +\title{Insert R data frame into SQL database} +\usage{ +insert_table(values, table, con, coerce_col_class = TRUE, drop = TRUE) +} +\arguments{ +\item{values}{`data.frame` of values to write to SQL database} + +\item{table}{Name of target SQL table, as character} + +\item{con}{database connection object} + +\item{coerce_col_class}{logical, whether or not to coerce local data columns +to SQL classes. Default = `TRUE.`} + +\item{drop}{logical. If `TRUE` (default), drop columns not found in SQL table.} +} +\value{ +data frame with query results +} +\description{ +First, subset to matching columns. Then, make sure the local and SQL column +classes match, coercing local to SQL as necessary (or throwing an error). +Then, build an SQL string for the insert statement. Finally, insert into the +database. +} +\examples{ +irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") +dplyr::copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE) +insert_table(iris[-1,], "iris", irisdb$con) +dplyr::tbl(irisdb, "iris") +} diff --git a/base/db/man/match_colnames.Rd b/base/db/man/match_colnames.Rd new file mode 100644 index 00000000000..38450a89bfa --- /dev/null +++ b/base/db/man/match_colnames.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/insert_table.R +\name{match_colnames} +\alias{match_colnames} +\title{Match names of local data frame to SQL table} +\usage{ +match_colnames(values, table, con) +} +\arguments{ +\item{values}{`data.frame` of values to write to SQL database} + +\item{table}{Name of target SQL table, as character} + +\item{con}{database connection object} +} +\description{ +Match names of local data frame to SQL table +} diff --git a/base/db/man/match_dbcols.Rd b/base/db/man/match_dbcols.Rd new file mode 100644 index 00000000000..ba0b6ae8d00 --- /dev/null +++ b/base/db/man/match_dbcols.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/insert_table.R +\name{match_dbcols} +\alias{match_dbcols} +\title{Match column names and classes between local and SQL table} +\usage{ +match_dbcols(values, table, con, coerce_col_class = TRUE, drop = TRUE) +} +\arguments{ +\item{values}{`data.frame` of values to write to SQL database} + +\item{table}{Name of target SQL table, as character} + +\item{con}{database connection object} + +\item{coerce_col_class}{logical, whether or not to coerce local data columns +to SQL classes. Default = `TRUE.`} + +\item{drop}{logical. If `TRUE` (default), drop columns not found in SQL table.} +} +\value{ +`values` `data.frame` with column names and classes matched to SQL +} +\description{ +Match column names and classes between local and SQL table +} diff --git a/base/db/man/otherimports.Rd b/base/db/man/otherimports.Rd new file mode 100644 index 00000000000..dd79f7fbbc3 --- /dev/null +++ b/base/db/man/otherimports.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/zz.imports.R +\docType{import} +\name{otherimports} +\alias{otherimports} +\alias{\%>\%} +\title{Imports from other packages} +\description{ +Imports from other packages +} +\keyword{internal} +\description{ +These objects are imported from other packages. Follow the links +below to see their documentation. + +\describe{ + \item{magrittr}{\code{\link[magrittr]{\%>\%}}} +}} + diff --git a/base/db/man/search_reference_single.Rd b/base/db/man/search_reference_single.Rd new file mode 100644 index 00000000000..c689ccdb86d --- /dev/null +++ b/base/db/man/search_reference_single.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/search_references.R +\name{search_reference_single} +\alias{search_reference_single} +\title{Perform crossref search for a single reference} +\usage{ +search_reference_single(query, limit = 1, min_score = 85) +} +\arguments{ +\item{query}{Citation string (length 1) to search for DOI} + +\item{limit}{Number of results to return} + +\item{min_score}{Minimum match score. Default (85) is fairly strict.} +} +\value{ +`data.frame` containing crossref information converted to match bety citations table. +} +\description{ +Requires the `rcrossref` package. +} diff --git a/base/db/man/search_references.Rd b/base/db/man/search_references.Rd new file mode 100644 index 00000000000..8af809bf7a1 --- /dev/null +++ b/base/db/man/search_references.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/search_references.R +\name{search_references} +\alias{search_references} +\title{Perform crossref search for a list of references} +\usage{ +search_references(queries, ...) +} +\arguments{ +\item{queries}{Character vector of queries} + +\item{...}{Arguments passed on to \code{search_reference_single} +\describe{ + \item{query}{Citation string (length 1) to search for DOI} + \item{min_score}{Minimum match score. Default (85) is fairly strict.} + \item{limit}{Number of results to return} +}} +} +\value{ +`data.frame` containing crossref information converted to match bety citations table. +} +\description{ +Perform crossref search for a list of references +} diff --git a/base/db/man/try2sqlite.Rd b/base/db/man/try2sqlite.Rd new file mode 100644 index 00000000000..e4b6407712b --- /dev/null +++ b/base/db/man/try2sqlite.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/try2sqlite.R +\name{try2sqlite} +\alias{try2sqlite} +\title{Convert TRY text file to SQLite database} +\usage{ +try2sqlite(try_files, sqlite_file = "try.sqlite") +} +\arguments{ +\item{try_files}{Character vector of file names containing TRY data. +Multiple files are combined with `data.table::rbindlist`.} + +\item{sqlite_file}{Target SQLite database file name, as character.} +} +\description{ +The TRY file is huge and unnecessarily long, which makes it difficult to +work with. The resulting SQLite database is much smaller on disk, and can be +read much faster thanks to lazy evaluation. +} +\details{ +The resulting TRY SQLite database contains the following tables: + - `values` -- The actual TRY data. Links to all other tables through ID columns. + - `traits` -- Description of trait and data names. Links to `values` through `DataID`. Similar to BETY `variables` table. + - `datasets` -- Description of datasets and references/citations. Links to `values` through `DatasetID` and `ReferenceID`. + - `species` -- Species. Links to `values` through `AccSpeciesID`. +} diff --git a/base/db/tests/testthat/test.insert.R b/base/db/tests/testthat/test.insert.R new file mode 100644 index 00000000000..95dd80a459e --- /dev/null +++ b/base/db/tests/testthat/test.insert.R @@ -0,0 +1,49 @@ +library(PEcAn.DB) +library(testthat) +context("SQL insertion helper functions") + +test_that( + "RSQLite-dependent tests work", + { + skip_if_not_installed("RSQLite") + irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") + iris <- tibble::as_tibble(iris) %>% + dplyr::mutate( + Species = as.character(Species) + ) + dplyr::copy_to(irisdb, iris[1,], "iris", overwrite = TRUE) + # Add extra column to see if it's successfully ignored + iris2 <- dplyr::mutate(iris, extracol = seq_len(nrow(iris))) + iris_insert <- iris2[2:10,] + .insert <- insert_table(iris_insert, "iris", irisdb) + test_that( + "Subset of iris was inserted into database", + { + iris_insert_test <- dplyr::tbl(irisdb, "iris") %>% dplyr::collect() + expect_equal(iris[1:10,], iris_insert_test) + } + ) + + iris_merge <- iris2[5:12,] + out_merge <- db_merge_into(iris_merge, "iris", irisdb) %>% + dplyr::collect() + iris_merge_nrow <- dplyr::tbl(irisdb, "iris") %>% + dplyr::count() %>% + dplyr::pull(n) + test_that( + "Only subset of iris data were merged", + { + expect_equal(out_merge, iris2[5:12,]) + out_merge2 <- db_merge_into(iris_merge, "iris", irisdb) %>% + dplyr::collect() + expect_equal(out_merge, out_merge2) + } + ) + + test_that( + "Extra column (not in SQL) was retained in `out_merge`", + { + expect_true("extracol" %in% colnames(out_merge)) + } + ) + })