Skip to content

Commit 20dd9d7

Browse files
authored
Merge pull request #1848 from ashiklom/try-import
Import TRY citations, and some useful DB functions
2 parents 02ff3a8 + 894dd27 commit 20dd9d7

30 files changed

+719
-5
lines changed

base/db/.Rbuildignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
^.*\.Rproj$
22
^\.Rproj\.user$
3+
try\.sqlite

base/db/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
try.sqlite
2+
inst/import-try/data-proc

base/db/DESCRIPTION

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,23 @@ Imports:
1818
PEcAn.utils,
1919
dbplyr (>= 1.2.0),
2020
dplyr,
21+
tibble,
22+
purrr,
23+
tidyr,
24+
glue,
2125
lubridate,
2226
magrittr,
2327
ncdf4,
2428
plyr (>= 1.8.4),
2529
udunits2
2630
Suggests:
2731
RPostgreSQL,
28-
testthat (>= 1.0.2)
32+
RSQLite,
33+
testthat (>= 1.0.2),
34+
tidyverse,
35+
data.table,
36+
rcrossref,
37+
here
2938
License: FreeBSD + file LICENSE
3039
Copyright: Authors
3140
LazyLoad: yes

base/db/NAMESPACE

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Generated by roxygen2: do not edit by hand
22

3+
export("%>%")
34
export(append.covariate)
45
export(assign.treatments)
56
export(bety2pecan)
@@ -12,6 +13,7 @@ export(db.print.connections)
1213
export(db.query)
1314
export(db.showQueries)
1415
export(dbHostInfo)
16+
export(db_merge_into)
1517
export(dbfile.check)
1618
export(dbfile.file)
1719
export(dbfile.id)
@@ -32,7 +34,9 @@ export(get_run_ids)
3234
export(get_users)
3335
export(get_var_names)
3436
export(get_workflow_ids)
37+
export(insert_table)
3538
export(load_data_single_run)
39+
export(match_dbcols)
3640
export(ncdays2date)
3741
export(query.file.path)
3842
export(query.format.vars)
@@ -43,7 +47,9 @@ export(query.trait.data)
4347
export(query.traits)
4448
export(rename_jags_columns)
4549
export(runs)
50+
export(search_references)
4651
export(take.samples)
52+
export(try2sqlite)
4753
export(var_names_all)
4854
export(workflow)
4955
export(workflows)

base/db/R/db_merge_into.R

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#' Merge local data frame into SQL table
2+
#'
3+
#' @inheritParams insert_table
4+
#' @inheritDotParams insert_table
5+
#' @param by Character vector of columns by which to perform merge. Defaults to all columns in `values`
6+
#' @return Data frame: Inner join of SQL table and input data frame (as unevaluated "lazy query" table)
7+
#' @export
8+
#' @examples
9+
#' irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
10+
#' dplyr::copy_to(irisdb, iris[1:10,], name = "iris", overwrite = TRUE)
11+
#' db_merge_into(iris[1:12,], "iris", irisdb)
12+
#' dplyr::tbl(irisdb, "iris") %>% dplyr::count()
13+
db_merge_into <- function(values, table, con, by = NULL, drop = FALSE, ...) {
14+
values_fixed <- match_dbcols(values, table, con, drop = FALSE)
15+
if (is.null(by)) {
16+
by <- match_colnames(values, table, con)
17+
}
18+
sql_tbl <- dplyr::tbl(con, table)
19+
values_merge <- dplyr::anti_join(values_fixed, sql_tbl, by = by, copy = TRUE)
20+
if (nrow(values_merge) < 1 || ncol(values_merge) < 1) {
21+
PEcAn.logger::logger.warn(
22+
"Input table for merge is empty."
23+
)
24+
} else {
25+
insert <- insert_table(values_merge, table, con, ...)
26+
}
27+
dplyr::tbl(con, table) %>%
28+
dplyr::inner_join(values_fixed, copy = TRUE)
29+
}

base/db/R/insert_table.R

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
#' Insert R data frame into SQL database
2+
#'
3+
#' First, subset to matching columns. Then, make sure the local and SQL column
4+
#' classes match, coercing local to SQL as necessary (or throwing an error).
5+
#' Then, build an SQL string for the insert statement. Finally, insert into the
6+
#' database.
7+
#'
8+
#' @param values `data.frame` of values to write to SQL database
9+
#' @param table Name of target SQL table, as character
10+
#' @param coerce_col_class logical, whether or not to coerce local data columns
11+
#' to SQL classes. Default = `TRUE.`
12+
#' @param drop logical. If `TRUE` (default), drop columns not found in SQL table.
13+
#' @inheritParams db.query
14+
#' @inherit db.query return
15+
#' @export
16+
#' @examples
17+
#' irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
18+
#' dplyr::copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE)
19+
#' insert_table(iris[-1,], "iris", irisdb$con)
20+
#' dplyr::tbl(irisdb, "iris")
21+
insert_table <- function(values, table, con, coerce_col_class = TRUE, drop = TRUE) {
22+
values_fixed <- match_dbcols(values, table, con, coerce_col_class, drop = TRUE)
23+
insert_query <- build_insert_query(values_fixed, table, .con = con)
24+
db.query(insert_query, con)
25+
}
26+
27+
#' Match column names and classes between local and SQL table
28+
#'
29+
#' @inheritParams insert_table
30+
#' @return `values` `data.frame` with column names and classes matched to SQL
31+
#' @export
32+
match_dbcols <- function(values, table, con, coerce_col_class = TRUE, drop = TRUE) {
33+
use_cols <- match_colnames(values, table, con)
34+
if (length(use_cols) < 1) {
35+
PEcAn.logger::logger.severe(
36+
"No columns match between input and target table."
37+
)
38+
}
39+
PEcAn.logger::logger.debug(
40+
"Matched the following cols: ",
41+
paste(use_cols, collapse = ", ")
42+
)
43+
values_sub <- values[, use_cols]
44+
# Load one row to get column types
45+
sql_row <- dplyr::tbl(con, table) %>% head(1) %>% dplyr::collect()
46+
sql_types <- purrr::map(sql_row, class) %>%
47+
purrr::map_chr(1) %>%
48+
.[use_cols]
49+
values_types <- purrr::map(values_sub, class) %>% purrr::map_chr(1)
50+
type_mismatch <- sql_types != values_types
51+
if (sum(type_mismatch) > 0) {
52+
mismatch_string <- sprintf(
53+
"%s: local is %s, SQL is %s",
54+
names(values_types),
55+
values_types,
56+
sql_types
57+
)[type_mismatch]
58+
PEcAn.logger::logger.info(
59+
"Found type mismatches in the following columns: ",
60+
paste0(mismatch_string, collapse = "; ")
61+
)
62+
if (!coerce_col_class) {
63+
PEcAn.logger::logger.severe(
64+
"Type mismatch detected, and `coerce_col_class` is `FALSE`. ",
65+
"Fix column class mismatches manually."
66+
)
67+
} else {
68+
PEcAn.logger::logger.info(
69+
"Coercing local column types to match SQL."
70+
)
71+
# Coerce values data frame to these types
72+
values_fixed <- purrr::map2_dfc(values_sub, sql_types, as)
73+
}
74+
} else {
75+
values_fixed <- values_sub
76+
}
77+
if (drop) {
78+
values_fixed
79+
} else {
80+
drop_cols <- colnames(values)[!colnames(values) %in% use_cols]
81+
dplyr::bind_cols(values_fixed, values[, drop_cols])
82+
}
83+
}
84+
85+
#' Match names of local data frame to SQL table
86+
#'
87+
#' @inheritParams insert_table
88+
match_colnames <- function(values, table, con) {
89+
tbl_db <- dplyr::tbl(con, table)
90+
table_cols <- dplyr::tbl_vars(tbl_db)
91+
values_cols <- colnames(values)
92+
intersect(values_cols, table_cols)
93+
}
94+
95+
#' Build query to insert R data frame into SQL table
96+
#'
97+
#' @inheritParams insert_table
98+
#' @inheritParams glue::glue_sql
99+
build_insert_query <- function(values, table, .con) {
100+
value_list <- purrr::map(seq_len(nrow(values)), ~as.list(values[.x, ]))
101+
102+
insert_list <- value_list %>%
103+
purrr::map(unname) %>%
104+
purrr::map(dbplyr::escape) %>%
105+
purrr::map(dbplyr::sql_vector)
106+
107+
glue::glue_sql(
108+
"INSERT INTO {`table`} ({`colnames(values)`*}) ",
109+
"VALUES {insert_list*}",
110+
.con = .con
111+
)
112+
}

base/db/R/search_references.R

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#' Perform crossref search for a list of references
2+
#'
3+
#' @param queries Character vector of queries
4+
#' @inheritDotParams search_reference_single
5+
#' @inherit search_reference_single description return
6+
#' @export
7+
search_references <- function(queries, ...) {
8+
search_fun <- search_reference_single %>%
9+
purrr::partial(...) %>%
10+
purrr::possibly(otherwise = data.frame(title = "Not found"))
11+
encodeString(queries) %>%
12+
purrr::map_dfr(search_fun)
13+
}
14+
15+
#' Perform crossref search for a single reference
16+
#'
17+
#' Requires the `rcrossref` package.
18+
#'
19+
#' @param query Citation string (length 1) to search for DOI
20+
#' @param min_score Minimum match score. Default (85) is fairly strict.
21+
#' @param limit Number of results to return
22+
#' @return `data.frame` containing crossref information converted to match bety citations table.
23+
search_reference_single <- function(query, limit = 1, min_score = 85) {
24+
stopifnot(length(query) == 1)
25+
PEcAn.logger::logger.debug("Processing query:\n", query)
26+
crsearch <- rcrossref::cr_works(query = query, limit = limit)
27+
if (is.null(crsearch[["data"]])) {
28+
PEcAn.logger::logger.warn(
29+
"Error in crossref query. ",
30+
"Setting title to search string and leaving other fields blank."
31+
)
32+
return(tibble::tibble(query = query))
33+
}
34+
crdata <- crsearch[["data"]] %>%
35+
dplyr::mutate(score = as.numeric(score)) %>%
36+
dplyr::filter(score > min_score)
37+
if (nrow(crdata) < 1) {
38+
PEcAn.logger::logger.info(
39+
"No matches found. ",
40+
"Setting title to search string and leaving other fields blank.")
41+
return(tibble::tibble(query = query))
42+
}
43+
keep_cols <- c(
44+
"author",
45+
"year",
46+
"title",
47+
journal = "container.title",
48+
vol = "volume",
49+
pg = "page",
50+
doi = "DOI",
51+
"score",
52+
"query"
53+
)
54+
proc_search <- crdata %>%
55+
dplyr::mutate(
56+
# Get the first author only -- this is the BETY format
57+
author_family = purrr::map(author, list("family", 1)),
58+
author_given = purrr::map(author, list("given", 1)),
59+
author = paste(author_family, author_given, sep = ", "),
60+
year = gsub("([[:digit:]]{4}).*", "\\1", issued) %>% as.numeric(),
61+
query = query,
62+
score = as.numeric(score)
63+
)
64+
use_cols <- keep_cols[keep_cols %in% colnames(proc_search)]
65+
dplyr::select(proc_search, !!!use_cols)
66+
}
67+

base/db/R/try2sqlite.R

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#' Convert TRY text file to SQLite database
2+
#'
3+
#' The TRY file is huge and unnecessarily long, which makes it difficult to
4+
#' work with. The resulting SQLite database is much smaller on disk, and can be
5+
#' read much faster thanks to lazy evaluation.
6+
#'
7+
#' The resulting TRY SQLite database contains the following tables:
8+
#' - `values` -- The actual TRY data. Links to all other tables through ID columns.
9+
#' - `traits` -- Description of trait and data names. Links to `values` through `DataID`. Similar to BETY `variables` table.
10+
#' - `datasets` -- Description of datasets and references/citations. Links to `values` through `DatasetID` and `ReferenceID`.
11+
#' - `species` -- Species. Links to `values` through `AccSpeciesID`.
12+
#'
13+
#' @param try_files Character vector of file names containing TRY data.
14+
#' Multiple files are combined with `data.table::rbindlist`.
15+
#' @param sqlite_file Target SQLite database file name, as character.
16+
#' @export
17+
try2sqlite <- function(try_files, sqlite_file = "try.sqlite") {
18+
# Read files
19+
PEcAn.logger::logger.info("Reading in TRY data...")
20+
raw_data <- Map(data.table::fread, try_files) %>%
21+
data.table::rbindlist()
22+
23+
# Create integer reference ID for compact storage
24+
PEcAn.logger::logger.info("Adding ReferenceID column")
25+
raw_data[["ReferenceID"]] <- as.integer(factor(raw_data[["Reference"]]))
26+
27+
# Create tables
28+
PEcAn.logger::logger.info("Extracting data values table.")
29+
data_cols <- c(
30+
"ObsDataID", # TRY row ID -- unique to each observation of a given trait
31+
"ObservationID", # TRY "entity" ID -- identifies a set of trait measurements (e.g. leaf)
32+
"DataID", # Links to data ID
33+
"StdValue", # Standardized, QA-QC'ed value
34+
"UnitName", # Standardized unit
35+
"AccSpeciesID", # Link to 'species' table
36+
"DatasetID", # Link to 'datasets' table.
37+
"ReferenceID", # Link to 'try_references' table.
38+
"ValueKindName", # Type of value, e.g. mean, min, max, etc.
39+
"UncertaintyName", # Kind of uncertainty
40+
"Replicates", # Number of replicates
41+
"RelUncertaintyPercent",
42+
"OrigValueStr", # Original data, as character string (before QA/QC)
43+
"OrigUnitStr", # Original unit, as character string (before QA/QC)
44+
"OrigUncertaintyStr" # Original uncertainty, as character string (before QA/QC)
45+
)
46+
data_values <- unique(raw_data[, data_cols, with = FALSE])
47+
48+
PEcAn.logger::logger.info("Extrating datasets table...")
49+
datasets_cols <- c(
50+
"DatasetID",
51+
"Dataset",
52+
"LastName",
53+
"FirstName",
54+
"Reference",
55+
"ReferenceID"
56+
)
57+
datasets_values <- unique(raw_data[, datasets_cols, with = FALSE])
58+
59+
PEcAn.logger::logger.info("Extracting traits table...")
60+
traits_cols <- c(
61+
"DataID",
62+
"DataName",
63+
"TraitID",
64+
"TraitName"
65+
)
66+
traits_values <- unique(raw_data[, traits_cols, with = FALSE])
67+
68+
PEcAn.logger::logger.info("Extracting species table...")
69+
species_cols <- c(
70+
"AccSpeciesID",
71+
"AccSpeciesName",
72+
"SpeciesName"
73+
)
74+
species_values <- unique(raw_data[, species_cols, with = FALSE])
75+
76+
PEcAn.logger::logger.info("Writing tables to SQLite database...")
77+
con <- DBI::dbConnect(RSQLite::SQLite(), sqlite_file)
78+
on.exit(DBI::dbDisconnect(con))
79+
PEcAn.logger::logger.info("Writing values table...")
80+
DBI::dbWriteTable(con, "values", data_values)
81+
PEcAn.logger::logger.info("Writing traits table...")
82+
DBI::dbWriteTable(con, "traits", traits_values)
83+
PEcAn.logger::logger.info("Writing datasets table...")
84+
DBI::dbWriteTable(con, "datasets", datasets_values)
85+
PEcAn.logger::logger.info("Writing species table...")
86+
DBI::dbWriteTable(con, "species", species_values)
87+
88+
PEcAn.logger::logger.info("Done creating TRY SQLite database!")
89+
90+
NULL
91+
}

base/db/R/zz.imports.R

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1-
##' Imports from other packages
2-
##'
3-
##' @importFrom magrittr `%>%`
1+
#' Imports from other packages
2+
#'
3+
#' @name otherimports
4+
#' @importFrom magrittr %>%
5+
#' @export
6+
magrittr::`%>%`
7+
8+
#' @rdname otherimports
9+
#' @importFrom rlang !! !!!

0 commit comments

Comments
 (0)