From 683a2be06ee6109ef42bbb272e5ab7e47e2a040b Mon Sep 17 00:00:00 2001 From: abourgoin Date: Thu, 31 Oct 2024 13:10:44 +0100 Subject: [PATCH 01/10] refactor build_bcpc_idx, allow for more sources (fr, iupac, ru) --- R/bcpc.R | 125 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 77 insertions(+), 48 deletions(-) diff --git a/R/bcpc.R b/R/bcpc.R index b1ef45b5..197e9752 100644 --- a/R/bcpc.R +++ b/R/bcpc.R @@ -32,7 +32,7 @@ #' # use CAS-numbers #' bcpc_query("79622-59-6", from = 'cas') #' } -bcpc_query <- function(query, from = c("name", "cas"), +bcpc_query <- function(query, from = c("name", "cas", "inchikey", "name_fr"), verbose = getOption("verbose"), type, ...) { @@ -194,54 +194,28 @@ build_bcpc_idx <- function(verbose = getOption("verbose"), force_build = FALSE) dir.create(paste0(tempdir(), "/data")) } if (verbose) message("Building index. ", appendLF = FALSE) - idx1_url <- "https://pesticidecompendium.bcpc.org/index_rn.html" - idx4_url <- "https://pesticidecompendium.bcpc.org/index_cn.html" - res1 <- try(httr::RETRY("GET", - idx1_url, - httr::user_agent(webchem_url()), - config = httr::config(accept_encoding = "identity"), - terminate_on = 404, - quiet = TRUE), silent= TRUE) - if (inherits(res1, "try-error")) { - if (verbose) webchem_message("service_down") - return(NA) - } - if (verbose) message(httr::message_for_status(res1)) - if (res1$status_code == 200){ - idx1 <- read_html(res1) - prep_idx <- function(y) { - names <- xml_text(xml_find_all(y, "//dl/dt")) - links <- xml_attr( - xml_find_all(y, "//dt/following-sibling::dd[1]/a[1]"), "href") - linknames <- xml_text( - xml_find_all(y, "//dt/following-sibling::dd[1]/a[1]")) - return(data.frame(names, links, linknames, stringsAsFactors = FALSE)) + index_sources <- data.frame( + name = c("rn", "inchikey", "cn", "name_fr", "name_ru"), + code = c(TRUE, TRUE, FALSE, FALSE, FALSE), + url = c( + "https://pesticidecompendium.bcpc.org/index_rn.html", + "www.bcpcpesticidecompendium.org/index-inchikey.html", + "https://pesticidecompendium.bcpc.org/index_cn.html", + "https://pesticidecompendium.bcpc.org/index-fr.html", + "https://pesticidecompendium.bcpc.org/index-ru.html" + ) + ) + idxs <- apply(index_sources, 1, function(source) { + res <- query_idx(source[["url"]]) + if (source[["code"]]) { + idx <- prep_idx_code(res, source = source[["name"]]) + } else { + idx <- prep_idx_named(res, source = source[["name"]], xpath = "//p/a") } - bcpc_idx <- rbind(prep_idx(idx1)) - bcpc_idx[["source"]] <- "rn" - res4 <- try(httr::RETRY("GET", - idx4_url, - httr::user_agent(webchem_url()), - config = httr::config(accept_encoding = "identity"), - terminate_on = 404, - quiet = TRUE), silent= TRUE) - if (inherits(res4, "try-error")) { - if (verbose) webchem_message("service_down") - return(NA) - } - idx4 <- read_html(res4) - n <- xml_find_all(idx4, "//a") - names <- xml_text(n) - rm <- names == "" - names <- names[!rm] - links <- xml_attr(n, "href") - links <- links[!rm] - idx4 <- data.frame(names = NA, links = links, linknames = names, - source = "cn", stringsAsFactors = FALSE) - bcpc_idx <- rbind(bcpc_idx, idx4) - - # fix encoding - ln <- bcpc_idx$linknames + }) + bcpc_idx <- do.call(rbind, idxs) + # fix encoding + ln <- bcpc_idx$linknames Encoding(ln) <- "latin1" ln <- iconv(ln, from = "latin1", to = "ASCII", sub = "") bcpc_idx$linknames <- ln @@ -251,3 +225,58 @@ build_bcpc_idx <- function(verbose = getOption("verbose"), force_build = FALSE) } return(bcpc_idx) } + +load_cas_idx <- function() { + idx_cas_url <- "https://pesticidecompendium.bcpc.org/index_rn.html" +} + +query_idx <- function(url, verbose = getOption("verbose")) { + res <- try( + httr::RETRY( + "GET", + url, + httr::user_agent(webchem_url()), + config = httr::config(accept_encoding = "identity"), + terminate_on = 404, + quiet = TRUE + ), + silent = TRUE + ) + if (inherits(res, "try-error")) { + if (verbose) webchem_message("service_down") + return(NA) + } + if (verbose) message(httr::message_for_status(res)) + if (res$status_code == 200) return(res) +} + +prep_idx_code <- function(res, source) { + idx <- read_html(res) + names <- xml_text(xml_find_all(idx, "//dl/dt")) + links <- xml_attr( + xml_find_all(idx, "//dt/following-sibling::dd[1]/a[1]"), "href" + ) + linknames <- xml_text( + xml_find_all(idx, "//dt/following-sibling::dd[1]/a[1]") + ) + df_idx <- data.frame(names, links, linknames, stringsAsFactors = FALSE) + df_idx[["source"]] <- source + return(df_idx) +} + +prep_idx_named <- function(res, source, xpath) { + idx <- read_html(res) + n <- xml_find_all(idx, xpath) + names <- xml_text(n) + keep <- nzchar(names) + names <- names[keep] + links <- xml_attr(n, "href") + links <- links[keep] + data.frame( + names = NA, + links = links, + linknames = names, + source = source, + stringsAsFactors = FALSE + ) +} \ No newline at end of file From 4dbd4bff2231319d4265f61228d9afe9bcf676ec Mon Sep 17 00:00:00 2001 From: abourgoin Date: Thu, 31 Oct 2024 13:13:40 +0100 Subject: [PATCH 02/10] fix --- R/bcpc.R | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/R/bcpc.R b/R/bcpc.R index 197e9752..13b96627 100644 --- a/R/bcpc.R +++ b/R/bcpc.R @@ -221,14 +221,11 @@ build_bcpc_idx <- function(verbose = getOption("verbose"), force_build = FALSE) bcpc_idx$linknames <- ln attr(bcpc_idx, "date") <- Sys.Date() save(bcpc_idx, file = paste0(tempdir(), "/data/bcpc_idx.rda")) - } + } return(bcpc_idx) } -load_cas_idx <- function() { - idx_cas_url <- "https://pesticidecompendium.bcpc.org/index_rn.html" -} query_idx <- function(url, verbose = getOption("verbose")) { res <- try( From 6542ab5b304ce8085982b2cd72042e465a5ced05 Mon Sep 17 00:00:00 2001 From: abourgoin Date: Thu, 7 Nov 2024 16:14:42 +0100 Subject: [PATCH 03/10] extract function foo into proper named function. Added name_fr, name_zh, name_ru as scraped fields from page --- R/bcpc.R | 274 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 159 insertions(+), 115 deletions(-) diff --git a/R/bcpc.R b/R/bcpc.R index 13b96627..74d25b14 100644 --- a/R/bcpc.R +++ b/R/bcpc.R @@ -52,121 +52,7 @@ bcpc_query <- function(query, from = c("name", "cas", "inchikey", "name_fr"), bcpc_idx <- build_bcpc_idx(verbose, ...) names(query) <- query - - foo <- function(query, from, verbose) { - if (from == "cas") { - query <- as.cas(query, verbose = verbose) - names <- bcpc_idx$names[bcpc_idx$source == "rn"] - # select only first link - links <- bcpc_idx$links[bcpc_idx$source == "rn"] - linknames <- bcpc_idx$linknames[bcpc_idx$source == "rn"] - cname <- linknames[tolower(names) == tolower(query)] - } - if (is.na(query)) { - if (verbose) webchem_message("na") - return(NA) - } - if (verbose) webchem_message("query", query, appendLF = FALSE) - # search links in indexes - if (from == "name") { - links <- bcpc_idx$links[bcpc_idx$source == "cn"] - names <- bcpc_idx$linknames[bcpc_idx$source == "cn"] - cname <- query - } - - takelink <- links[tolower(names) == tolower(query)] - if (length(takelink) == 0) { - if (verbose) message("Not found.") - return(NA) - } - if (length(takelink) > 1) { - takelink <- unique(takelink) - if (length(takelink) > 1) { - message("More then one link found. Returning first.") - takelink <- takelink[1] - } - } - - qurl <- paste0("https://pesticidecompendium.bcpc.org/", takelink) - webchem_sleep(type = 'scrape') - res <- try(httr::RETRY("GET", - qurl, - httr::user_agent(webchem_url()), - terminate_on = 404, - config = httr::config(accept_encoding = "identity"), - quiet = TRUE), silent = TRUE) - if (inherits(res, "try-error")) { - if (verbose) webchem_message("service_down") - return(NA) - } - if (verbose) message(httr::message_for_status(res)) - if (res$status_code == 200){ - ttt <- read_html(res) - status <- xml_text( - xml_find_all(ttt, "//tr/th[@id='r1']/following-sibling::td")) - pref_iupac_name <- xml_text( - xml_find_all(ttt, "//tr/th[@id='r2']/following-sibling::td")) - iupac_name <- xml_text( - xml_find_all(ttt, "//tr/th[@id='r3']/following-sibling::td")) - cas <- xml_text( - xml_find_all(ttt, "//tr/th[@id='r5']/following-sibling::td")) - formula <- xml_text( - xml_find_all(ttt, "//tr/th[@id='r6']/following-sibling::td")) - activity_text <- as.character(xml_find_all(ttt, "//tr/th[@id='r7']/following-sibling::td")) - a_tmp_1 <- trimws(gsub("(.*)", "\\1", activity_text)) - a_tmp_2 <- gsub("", "", a_tmp_1) - a_tmp_3 <- gsub("", "", a_tmp_2) - a_split <- strsplit(a_tmp_3, "
")[[1]] - activity <- unname(sapply(a_split, function(x) gsub(" \\(.*\\)$", "", x))) - subactivity <- unname(sapply(a_split, function(x) { - if (grepl("\\(.*\\)", x)) gsub(".*\\((.*)\\)$", "\\1", x) - else NA})) - inchikey_r <- xml_text( - xml_find_all(ttt, "//tr/th[@id='r11']/following-sibling::td")) - if (length(inchikey_r) == 0) { - inchikey <- NA - } else { - if (grepl("isomer", inchikey_r)) { - inchikey <- c( - s_isomer = gsub( - ".*\\(S\\)-isomer:(.*)(minor component.*)", "\\1", inchikey_r), - r_isomer = gsub(".*\\(R\\)-isomer:(.*)", "\\1", inchikey_r)) - } - if (grepl("identifier", inchikey_r)) { - inchikey <- c(gsub("(.*)identifier.*", "\\1", inchikey_r), - gsub(".*identifier.*:(.*)", "\\1", inchikey_r)) - names(inchikey) <- c("inchikey", - gsub(".*(identifier.*:).*", "\\1", inchikey_r) - ) - } - if (!grepl("isomer", inchikey_r) & !grepl("identifier", inchikey_r)) - inchikey <- inchikey_r - } - - inchi <- xml_text( - xml_find_all(ttt, "//tr/th[@id='r12']/following-sibling::td")) - if (length(inchi) == 0) { - inchi <- NA - } else { - if (grepl("isomer", inchi)) { - inchi <- c(s_isomer = gsub(".*\\(S\\)-isomer:(.*)(minor component.*)", - "\\1", inchi), - r_isomer = gsub(".*\\(R\\)-isomer:(.*)", "\\1", inchi)) - } - } - - out <- list(cname = cname, status = status, - pref_iupac_name = pref_iupac_name, iupac_name = iupac_name, - cas = cas, formula = formula, activity = activity, - subactivity = subactivity, inchikey = inchikey, inchi = inchi, - source_url = qurl) - return(out) - } - else { - return(NA) - } - } - out <- lapply(query, function(x) foo(x, from = from, verbose = verbose)) + out <- lapply(query, function(x) scrape_bcpc_frame(x, from = from, idx = bcpc_idx, verbose = verbose)) class(out) <- c("bcpc_query", "list") return(out) } @@ -276,4 +162,162 @@ prep_idx_named <- function(res, source, xpath) { source = source, stringsAsFactors = FALSE ) +} + +#' Function to handle synonymous arguments +#' +#' This function returns a dataframe of linknames, links and sources +#' @param from argument (single) string +#' @return string +#' @seealso \code{\link{bcpc_query}} +#' @noRd +convert_from_arg_bcpc <- function(from) { + from <- gsub("name|name_en", "cn", from) + from <- gsub("cas", "rn", from) + from +} + +#' Function to scrape information from a substance page +#' +#' This function returns a named list of substance attributes and values +#' @param query string to query in index +#' @param from source argument +#' @param idx data.frame of bcpc index +#' @return named list +#' @seealso \code{\link{bcpc_query}}, \code{\link{build_bcpc_idx}} +#' @noRd +scrape_bcpc_frame <- function(query, from, idx, verbose) { + idx <- subset(idx, source == from) + if (is.na(query)) { + if (verbose) webchem_message("na") + return(NA) + } + if (verbose) webchem_message("query", query, appendLF = FALSE) + # generic selection + if (from == "cas") query <- as.cas(query, verbose = verbose) + names <- idx$names + links <- idx$links + linknames <- idx$linknames + cname <- linknames[tolower(names) == tolower(query)] + takelink <- links[tolower(names) == tolower(query)] + # end new + # search links in indexes + if (length(takelink) == 0) { + if (verbose) message("Not found.") + return(NA) + } + if (length(takelink) > 1) { + takelink <- unique(takelink) + if (length(takelink) > 1) { + message("More then one link found. Returning first.") + takelink <- takelink[1] + } + } + + qurl <- paste0("https://pesticidecompendium.bcpc.org/", takelink) + webchem_sleep(type = "scrape") + res <- query_bcpc_url(qurl) + if (!length(res) || !res$status_code == 200) { + return(NA) + } + + ttt <- read_html(res) + status <- xml_text( + xml_find_all(ttt, "//tr/th[@id='r1']/following-sibling::td") + ) + name_fr <- xml_find_all(ttt, "//h2/span/following::span[@lang='fr'][1]") + # remove noun gender + name_fr <- xml_text( + read_xml( + gsub(" \\(.*<\\/abbr>\\)", "", name_fr) + ) + ) + name_ru <- xml_text( + xml_find_all(ttt, "//h2/span/following::span[@lang='ru'][1]") + ) + name_zh <- xml_text( + xml_find_all(ttt, "//h2/span/following::span[@lang='zh-Hans'][1]") + ) + pref_iupac_name <- xml_text( + xml_find_all(ttt, "//tr/th[@id='r2']/following-sibling::td") + ) + iupac_name <- xml_text( + xml_find_all(ttt, "//tr/th[@id='r3']/following-sibling::td") + ) + cas <- xml_text( + xml_find_all(ttt, "//tr/th[@id='r5']/following-sibling::td") + ) + formula <- xml_text( + xml_find_all(ttt, "//tr/th[@id='r6']/following-sibling::td") + ) + activity_text <- as.character(xml_find_all(ttt, "//tr/th[@id='r7']/following-sibling::td")) + a_tmp_1 <- trimws(gsub("(.*)", "\\1", activity_text)) + a_tmp_2 <- gsub("", "", a_tmp_1) + a_tmp_3 <- gsub("", "", a_tmp_2) + a_split <- strsplit(a_tmp_3, "
")[[1]] + activity <- unname(sapply(a_split, function(x) gsub(" \\(.*\\)$", "", x))) + subactivity <- unname(sapply(a_split, function(x) { + if (grepl("\\(.*\\)", x)) { + gsub(".*\\((.*)\\)$", "\\1", x) + } else { + NA + } + })) + inchikey_r <- xml_text( + xml_find_all(ttt, "//tr/th[@id='r11']/following-sibling::td") + ) + if (length(inchikey_r) == 0) { + inchikey <- NA + } else { + if (grepl("isomer", inchikey_r)) { + inchikey <- c( + s_isomer = gsub( + ".*\\(S\\)-isomer:(.*)(minor component.*)", "\\1", inchikey_r + ), + r_isomer = gsub(".*\\(R\\)-isomer:(.*)", "\\1", inchikey_r) + ) + } + if (grepl("identifier", inchikey_r)) { + inchikey <- c( + gsub("(.*)identifier.*", "\\1", inchikey_r), + gsub(".*identifier.*:(.*)", "\\1", inchikey_r) + ) + names(inchikey) <- c( + "inchikey", + gsub(".*(identifier.*:).*", "\\1", inchikey_r) + ) + } + if (!grepl("isomer", inchikey_r) & !grepl("identifier", inchikey_r)) { + inchikey <- inchikey_r + } + } + + inchi <- xml_text( + xml_find_all(ttt, "//tr/th[@id='r12']/following-sibling::td") + ) + if (length(inchi) == 0) { + inchi <- NA + } else { + if (grepl("isomer", inchi)) { + inchi <- c( + s_isomer = gsub( + ".*\\(S\\)-isomer:(.*)(minor component.*)", + "\\1", inchi + ), + r_isomer = gsub(".*\\(R\\)-isomer:(.*)", "\\1", inchi) + ) + } + } + out <- list( + cname = cname, + name_fr = name_fr, + name_ru = name_ru, + name_zh = name_zh, + status = status, + pref_iupac_name = pref_iupac_name, iupac_name = iupac_name, + cas = cas, formula = formula, activity = activity, + subactivity = subactivity, inchikey = inchikey, inchi = inchi, + source_url = qurl + ) + return(out) } \ No newline at end of file From 2bcbc9322819a513c632fe8159893d7a4bfa15a7 Mon Sep 17 00:00:00 2001 From: abourgoin Date: Thu, 7 Nov 2024 16:18:37 +0100 Subject: [PATCH 04/10] fix and refactor build_bcpc_idx, fix prep_functions --- R/bcpc.R | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/R/bcpc.R b/R/bcpc.R index 74d25b14..9d8e5f41 100644 --- a/R/bcpc.R +++ b/R/bcpc.R @@ -32,25 +32,25 @@ #' # use CAS-numbers #' bcpc_query("79622-59-6", from = 'cas') #' } -bcpc_query <- function(query, from = c("name", "cas", "inchikey", "name_fr"), +bcpc_query <- function(query, from = c("name", "cas","rn","cn","fr","en","zh","ru","inchikey"), verbose = getOption("verbose"), type, ...) { if (!ping_service("bcpc")) stop(webchem_message("service_down")) + # Deprecations messages if (!missing(type)) { message('"type" is deprecated. Please use "from" instead. ') from <- type } - if ("commonname" %in% from) { message('To search by compound name use "name" instead of "commonname"') from <- "name" } from <- match.arg(from) - bcpc_idx <- build_bcpc_idx(verbose, ...) - + from <- convert_from_arg_bcpc(from) + bcpc_idx <- build_bcpc_idx(verbose=verbose, force_build = FALSE, ...) names(query) <- query out <- lapply(query, function(x) scrape_bcpc_frame(x, from = from, idx = bcpc_idx, verbose = verbose)) class(out) <- c("bcpc_query", "list") @@ -69,30 +69,33 @@ bcpc_query <- function(query, from = c("name", "cas", "inchikey", "name_fr"), #' @seealso \code{\link{bcpc_query}}, \code{\link{tempdir}} #' @source \url{https://pesticidecompendium.bcpc.org} #' @noRd -build_bcpc_idx <- function(verbose = getOption("verbose"), force_build = FALSE) { +build_bcpc_idx <- function(sources = c("rn", "inchikey", "cn", "fr", "ru", "zh"), verbose = getOption("verbose"), force_build = FALSE) { if (!ping_service("bcpc")) stop(webchem_message("service_down")) suppressWarnings(try(load(paste0(tempdir(), "/data/bcpc_idx.rda")), silent = TRUE)) - if (!file.exists(paste0(tempdir(), "/data/bcpc_idx.rda")) | - force_build | + if (!file.exists(paste0(tempdir(), "/data/bcpc_idx.rda")) || + force_build || try(Sys.Date() - attr(bcpc_idx, "date"), silent = TRUE) > 30) { if (!dir.exists(paste0(tempdir(), "/data"))) { dir.create(paste0(tempdir(), "/data")) } + sources <- match.arg(sources, several.ok = TRUE) if (verbose) message("Building index. ", appendLF = FALSE) - index_sources <- data.frame( - name = c("rn", "inchikey", "cn", "name_fr", "name_ru"), - code = c(TRUE, TRUE, FALSE, FALSE, FALSE), + idx_sources <- data.frame( + name = c("rn", "inchikey", "cn", "fr", "ru", "zh"), + code = c(TRUE, TRUE, FALSE, FALSE, FALSE, FALSE), url = c( "https://pesticidecompendium.bcpc.org/index_rn.html", "www.bcpcpesticidecompendium.org/index-inchikey.html", "https://pesticidecompendium.bcpc.org/index_cn.html", "https://pesticidecompendium.bcpc.org/index-fr.html", - "https://pesticidecompendium.bcpc.org/index-ru.html" + "https://pesticidecompendium.bcpc.org/index-ru.html", + "https://pesticidecompendium.bcpc.org/index-zh.html" ) ) - idxs <- apply(index_sources, 1, function(source) { - res <- query_idx(source[["url"]]) + selected <- subset(idx_sources, idx_sources$name %in% sources) + idxs <- apply(selected, 1, function(source) { + res <- query_bcpc_url(source[["url"]]) if (source[["code"]]) { idx <- prep_idx_code(res, source = source[["name"]]) } else { @@ -102,12 +105,10 @@ build_bcpc_idx <- function(verbose = getOption("verbose"), force_build = FALSE) bcpc_idx <- do.call(rbind, idxs) # fix encoding ln <- bcpc_idx$linknames - Encoding(ln) <- "latin1" - ln <- iconv(ln, from = "latin1", to = "ASCII", sub = "") + ln <- iconv(ln, to = "UTF-8", sub = "")#is it necessary? bcpc_idx$linknames <- ln attr(bcpc_idx, "date") <- Sys.Date() save(bcpc_idx, file = paste0(tempdir(), "/data/bcpc_idx.rda")) - } return(bcpc_idx) } @@ -153,10 +154,9 @@ prep_idx_named <- function(res, source, xpath) { names <- xml_text(n) keep <- nzchar(names) names <- names[keep] - links <- xml_attr(n, "href") - links <- links[keep] + links <- xml_attr(n, "href")[keep] data.frame( - names = NA, + names = names, links = links, linknames = names, source = source, From 0ebc24a1e1a87d64337f86ced583cd84ef146a32 Mon Sep 17 00:00:00 2001 From: abourgoin Date: Thu, 7 Nov 2024 16:19:05 +0100 Subject: [PATCH 05/10] Documented newly created functions --- R/bcpc.R | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/R/bcpc.R b/R/bcpc.R index 9d8e5f41..908de937 100644 --- a/R/bcpc.R +++ b/R/bcpc.R @@ -113,8 +113,14 @@ build_bcpc_idx <- function(sources = c("rn", "inchikey", "cn", "fr", "ru", "zh") return(bcpc_idx) } - -query_idx <- function(url, verbose = getOption("verbose")) { +#' Function to query bcpc and handle httr errors +#' +#' This function returns an httr response object +#' @param url string; uri to query +#' @param verbose logical; print message during processing to console? +#' @return Httr response object +#' @noRd +query_bcpc_url <- function(url, verbose = getOption("verbose")) { res <- try( httr::RETRY( "GET", @@ -134,6 +140,15 @@ query_idx <- function(url, verbose = getOption("verbose")) { if (res$status_code == 200) return(res) } +#' Function scrape link names and urls from an httr response of a code (CAS, IUPAC, etc.) bcpc index frame +#' +#' This function returns a dataframe of linknames, links and sources +#' @param res httr response object to scrape +#' @param source string of response source +#' @return data.frame +#' @seealso \code{\link{build_bcpc_idx}} for referring function, +#' for named index frame, use \code{\link{prep_idx_named}} +#' @noRd prep_idx_code <- function(res, source) { idx <- read_html(res) names <- xml_text(xml_find_all(idx, "//dl/dt")) @@ -148,6 +163,16 @@ prep_idx_code <- function(res, source) { return(df_idx) } +#' Function scrape link names and urls from an httr response of a named bcpc index frame +#' +#' This function returns a dataframe of linknames, links and sources +#' @param res httr response object to scrape +#' @param source string of response source +#' @param xpath a string of an xpath +#' @return data.frame +#' @seealso \code{\link{build_bcpc_idx}} for referring function, +#' for coded index frame, use \code{\link{prep_idx_named}} +#' @noRd prep_idx_named <- function(res, source, xpath) { idx <- read_html(res) n <- xml_find_all(idx, xpath) From 9331cea62b1b114abde7b0ab37ade0a514fd7f61 Mon Sep 17 00:00:00 2001 From: abourgoin Date: Thu, 7 Nov 2024 16:30:57 +0100 Subject: [PATCH 06/10] added NEWS.md info --- NEWS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS.md b/NEWS.md index 8ebb318b..ee146345 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ # dev + +## NEW FEATURES +* `bcpc_query()` now supports searching for chinese, french, russian names as well as inchi key ## BUG FIXES From e7ff991c2316bbc697e24b6b7d202ec64cffcc32 Mon Sep 17 00:00:00 2001 From: abourgoin Date: Thu, 7 Nov 2024 16:33:46 +0100 Subject: [PATCH 07/10] Added documentation --- DESCRIPTION | 2 +- man/bcpc_query.Rd | 2 +- man/webchem.Rd | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 1eda32f1..5952cf71 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -51,7 +51,7 @@ Suggests: plot.matrix, usethis, vcr -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 VignetteBuilder: knitr Config/testthat/edition: 3 Config/testthat/parallel: true diff --git a/man/bcpc_query.Rd b/man/bcpc_query.Rd index 03758a48..506dd788 100644 --- a/man/bcpc_query.Rd +++ b/man/bcpc_query.Rd @@ -6,7 +6,7 @@ \usage{ bcpc_query( query, - from = c("name", "cas"), + from = c("name", "cas", "rn", "cn", "fr", "en", "zh", "ru", "inchikey"), verbose = getOption("verbose"), type, ... diff --git a/man/webchem.Rd b/man/webchem.Rd index 4ccbaa97..79650323 100644 --- a/man/webchem.Rd +++ b/man/webchem.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/webchem-package.R \docType{package} \name{webchem} +\alias{-package} \alias{webchem} \title{webchem: An R package to retrieve chemical information from the web.} \description{ From bc147c0e33505f2a168bce6742ff0e1875042620 Mon Sep 17 00:00:00 2001 From: abourgoin Date: Thu, 7 Nov 2024 16:42:57 +0100 Subject: [PATCH 08/10] Modified test-bcpc.R to take into account locales retrieved (+3 language, so +3 length for fluazinam), and new `bcpc_query` from arguments. --- tests/testthat/test-bcpc.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-bcpc.R b/tests/testthat/test-bcpc.R index e9202964..6ea02578 100644 --- a/tests/testthat/test-bcpc.R +++ b/tests/testthat/test-bcpc.R @@ -37,7 +37,7 @@ test_that("BCPC pesticide compendium, name", { expect_equal(o1[["Fluazinam"]]$cas, "79622-59-6") expect_equal(length(o1[["S-Metolachlor"]]$inchikey), 2) expect_equal(length(o1[["S-Metolachlor"]]$inchi), 2) - expect_equal(length(o1[["Fluazinam"]]), 11) + expect_equal(length(o1[["Fluazinam"]]), 14) }) @@ -60,7 +60,7 @@ test_that("BCPC pesticide compendium, build_index", { expect_s3_class(idx, "data.frame") expect_equal(ncol(idx), 4) expect_equal(names(idx), c("names", "links", "linknames", "source")) - expect_equal(unique(idx$source), c("rn", "cn")) + expect_equal(unique(idx$source), c("rn", "inchikey", "cn", "fr", "ru", "zh")) expect_equal(idx$names[1], "50-00-0") }) From f0a589e83b53e6b6250be75ba8feb4bcd15b8d19 Mon Sep 17 00:00:00 2001 From: abourgoin Date: Thu, 7 Nov 2024 16:50:49 +0100 Subject: [PATCH 09/10] default force_build to false if not explicit in args --- R/bcpc.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/bcpc.R b/R/bcpc.R index 908de937..dfbed26a 100644 --- a/R/bcpc.R +++ b/R/bcpc.R @@ -50,7 +50,7 @@ bcpc_query <- function(query, from = c("name", "cas","rn","cn","fr","en","zh","r from <- match.arg(from) from <- convert_from_arg_bcpc(from) - bcpc_idx <- build_bcpc_idx(verbose=verbose, force_build = FALSE, ...) + bcpc_idx <- build_bcpc_idx(verbose=verbose, ...) names(query) <- query out <- lapply(query, function(x) scrape_bcpc_frame(x, from = from, idx = bcpc_idx, verbose = verbose)) class(out) <- c("bcpc_query", "list") From 0b8842a4fbb34d235eab8a07091b88ad14f450e4 Mon Sep 17 00:00:00 2001 From: abourgoin Date: Thu, 7 Nov 2024 16:52:24 +0100 Subject: [PATCH 10/10] lint fixes --- R/bcpc.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/bcpc.R b/R/bcpc.R index dfbed26a..fc7663c2 100644 --- a/R/bcpc.R +++ b/R/bcpc.R @@ -50,7 +50,7 @@ bcpc_query <- function(query, from = c("name", "cas","rn","cn","fr","en","zh","r from <- match.arg(from) from <- convert_from_arg_bcpc(from) - bcpc_idx <- build_bcpc_idx(verbose=verbose, ...) + bcpc_idx <- build_bcpc_idx(verbose = verbose, ...) names(query) <- query out <- lapply(query, function(x) scrape_bcpc_frame(x, from = from, idx = bcpc_idx, verbose = verbose)) class(out) <- c("bcpc_query", "list") @@ -105,10 +105,10 @@ build_bcpc_idx <- function(sources = c("rn", "inchikey", "cn", "fr", "ru", "zh") bcpc_idx <- do.call(rbind, idxs) # fix encoding ln <- bcpc_idx$linknames - ln <- iconv(ln, to = "UTF-8", sub = "")#is it necessary? - bcpc_idx$linknames <- ln - attr(bcpc_idx, "date") <- Sys.Date() - save(bcpc_idx, file = paste0(tempdir(), "/data/bcpc_idx.rda")) + ln <- iconv(ln, to = "UTF-8", sub = "")#is it necessary? + bcpc_idx$linknames <- ln + attr(bcpc_idx, "date") <- Sys.Date() + save(bcpc_idx, file = paste0(tempdir(), "/data/bcpc_idx.rda")) } return(bcpc_idx) } @@ -163,7 +163,7 @@ prep_idx_code <- function(res, source) { return(df_idx) } -#' Function scrape link names and urls from an httr response of a named bcpc index frame +#' Scrapes link names and urls from an httr response of a named bcpc index frame #' #' This function returns a dataframe of linknames, links and sources #' @param res httr response object to scrape