From 683a2be06ee6109ef42bbb272e5ab7e47e2a040b Mon Sep 17 00:00:00 2001
From: abourgoin <alexis.bourgoin@bretagne-environnement.fr>
Date: Thu, 31 Oct 2024 13:10:44 +0100
Subject: [PATCH 01/10] refactor build_bcpc_idx, allow for more sources (fr,
 iupac, ru)

---
 R/bcpc.R | 125 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 77 insertions(+), 48 deletions(-)

diff --git a/R/bcpc.R b/R/bcpc.R
index b1ef45b5..197e9752 100644
--- a/R/bcpc.R
+++ b/R/bcpc.R
@@ -32,7 +32,7 @@
 #' # use CAS-numbers
 #' bcpc_query("79622-59-6", from = 'cas')
 #' }
-bcpc_query <- function(query, from = c("name", "cas"),
+bcpc_query <- function(query, from = c("name", "cas", "inchikey", "name_fr"),
                        verbose = getOption("verbose"),
                        type, ...) {
 
@@ -194,54 +194,28 @@ build_bcpc_idx <- function(verbose = getOption("verbose"), force_build = FALSE)
       dir.create(paste0(tempdir(), "/data"))
     }
     if (verbose) message("Building index. ", appendLF = FALSE)
-    idx1_url <- "https://pesticidecompendium.bcpc.org/index_rn.html"
-    idx4_url <- "https://pesticidecompendium.bcpc.org/index_cn.html"
-    res1 <- try(httr::RETRY("GET",
-                           idx1_url,
-                           httr::user_agent(webchem_url()),
-                           config = httr::config(accept_encoding = "identity"),
-                           terminate_on = 404,
-                           quiet = TRUE), silent= TRUE)
-    if (inherits(res1, "try-error")) {
-      if (verbose) webchem_message("service_down")
-      return(NA)
-    }
-    if (verbose) message(httr::message_for_status(res1))
-    if (res1$status_code == 200){
-      idx1 <- read_html(res1)
-      prep_idx <- function(y) {
-        names <- xml_text(xml_find_all(y, "//dl/dt"))
-        links <- xml_attr(
-          xml_find_all(y, "//dt/following-sibling::dd[1]/a[1]"), "href")
-        linknames <- xml_text(
-          xml_find_all(y, "//dt/following-sibling::dd[1]/a[1]"))
-        return(data.frame(names, links, linknames, stringsAsFactors = FALSE))
+    index_sources <- data.frame(
+      name = c("rn", "inchikey", "cn", "name_fr", "name_ru"),
+      code = c(TRUE, TRUE, FALSE, FALSE, FALSE),
+      url = c(
+        "https://pesticidecompendium.bcpc.org/index_rn.html",
+        "www.bcpcpesticidecompendium.org/index-inchikey.html",
+        "https://pesticidecompendium.bcpc.org/index_cn.html",
+        "https://pesticidecompendium.bcpc.org/index-fr.html",
+        "https://pesticidecompendium.bcpc.org/index-ru.html"
+      )
+    )
+    idxs <- apply(index_sources, 1, function(source) {
+      res <- query_idx(source[["url"]])
+      if (source[["code"]]) {
+        idx <- prep_idx_code(res, source = source[["name"]])
+      } else {
+        idx <- prep_idx_named(res, source = source[["name"]], xpath = "//p/a")
       }
-      bcpc_idx <- rbind(prep_idx(idx1))
-      bcpc_idx[["source"]] <- "rn"
-      res4 <- try(httr::RETRY("GET",
-                             idx4_url,
-                             httr::user_agent(webchem_url()),
-                             config = httr::config(accept_encoding = "identity"),
-                             terminate_on = 404,
-                             quiet = TRUE), silent= TRUE)
-    if (inherits(res4, "try-error")) {
-      if (verbose) webchem_message("service_down")
-      return(NA)
-    }
-      idx4 <- read_html(res4)
-      n <- xml_find_all(idx4, "//a")
-      names <- xml_text(n)
-      rm <- names == ""
-      names <- names[!rm]
-      links <- xml_attr(n, "href")
-      links <- links[!rm]
-      idx4 <- data.frame(names = NA, links = links, linknames = names,
-                         source = "cn", stringsAsFactors = FALSE)
-      bcpc_idx <- rbind(bcpc_idx, idx4)
-
-      # fix encoding
-      ln <- bcpc_idx$linknames
+    })
+    bcpc_idx <- do.call(rbind, idxs)
+    # fix encoding
+    ln <- bcpc_idx$linknames
       Encoding(ln) <- "latin1"
       ln <- iconv(ln, from = "latin1", to = "ASCII", sub = "")
       bcpc_idx$linknames <- ln
@@ -251,3 +225,58 @@ build_bcpc_idx <- function(verbose = getOption("verbose"), force_build = FALSE)
   }
   return(bcpc_idx)
 }
+
+load_cas_idx <- function() {
+  idx_cas_url <- "https://pesticidecompendium.bcpc.org/index_rn.html"
+}
+
+query_idx <- function(url, verbose = getOption("verbose")) {
+  res <- try(
+    httr::RETRY(
+      "GET",
+      url,
+      httr::user_agent(webchem_url()),
+      config = httr::config(accept_encoding = "identity"),
+      terminate_on = 404,
+      quiet = TRUE
+    ),
+    silent = TRUE
+  )
+  if (inherits(res, "try-error")) {
+    if (verbose) webchem_message("service_down")
+    return(NA)
+  }
+  if (verbose) message(httr::message_for_status(res))
+  if (res$status_code == 200) return(res)
+}
+
+prep_idx_code <- function(res, source) {
+  idx <- read_html(res)
+  names <- xml_text(xml_find_all(idx, "//dl/dt"))
+  links <- xml_attr(
+    xml_find_all(idx, "//dt/following-sibling::dd[1]/a[1]"), "href"
+  )
+  linknames <- xml_text(
+    xml_find_all(idx, "//dt/following-sibling::dd[1]/a[1]")
+  )
+  df_idx <- data.frame(names, links, linknames, stringsAsFactors = FALSE)
+  df_idx[["source"]] <- source
+  return(df_idx)
+}
+
+prep_idx_named <- function(res, source, xpath) {
+  idx <- read_html(res)
+  n <- xml_find_all(idx, xpath)
+  names <- xml_text(n)
+  keep <- nzchar(names)
+  names <- names[keep]
+  links <- xml_attr(n, "href")
+  links <- links[keep]
+  data.frame(
+    names = NA,
+    links = links,
+    linknames = names,
+    source = source,
+    stringsAsFactors = FALSE
+  )
+}
\ No newline at end of file

From 4dbd4bff2231319d4265f61228d9afe9bcf676ec Mon Sep 17 00:00:00 2001
From: abourgoin <alexis.bourgoin@bretagne-environnement.fr>
Date: Thu, 31 Oct 2024 13:13:40 +0100
Subject: [PATCH 02/10] fix

---
 R/bcpc.R | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/R/bcpc.R b/R/bcpc.R
index 197e9752..13b96627 100644
--- a/R/bcpc.R
+++ b/R/bcpc.R
@@ -221,14 +221,11 @@ build_bcpc_idx <- function(verbose = getOption("verbose"), force_build = FALSE)
       bcpc_idx$linknames <- ln
       attr(bcpc_idx, "date") <- Sys.Date()
       save(bcpc_idx, file = paste0(tempdir(), "/data/bcpc_idx.rda"))
-    }
+    
   }
   return(bcpc_idx)
 }
 
-load_cas_idx <- function() {
-  idx_cas_url <- "https://pesticidecompendium.bcpc.org/index_rn.html"
-}
 
 query_idx <- function(url, verbose = getOption("verbose")) {
   res <- try(

From 6542ab5b304ce8085982b2cd72042e465a5ced05 Mon Sep 17 00:00:00 2001
From: abourgoin <alexis.bourgoin@bretagne-environnement.fr>
Date: Thu, 7 Nov 2024 16:14:42 +0100
Subject: [PATCH 03/10] extract function foo into proper named function. Added
 name_fr, name_zh, name_ru as scraped fields from page

---
 R/bcpc.R | 274 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 159 insertions(+), 115 deletions(-)

diff --git a/R/bcpc.R b/R/bcpc.R
index 13b96627..74d25b14 100644
--- a/R/bcpc.R
+++ b/R/bcpc.R
@@ -52,121 +52,7 @@ bcpc_query <- function(query, from = c("name", "cas", "inchikey", "name_fr"),
   bcpc_idx <- build_bcpc_idx(verbose, ...)
 
   names(query) <- query
-
-  foo <- function(query, from, verbose) {
-    if (from == "cas") {
-      query <- as.cas(query, verbose = verbose)
-      names <- bcpc_idx$names[bcpc_idx$source == "rn"]
-      # select only first link
-      links <- bcpc_idx$links[bcpc_idx$source == "rn"]
-      linknames <- bcpc_idx$linknames[bcpc_idx$source == "rn"]
-      cname <-  linknames[tolower(names) == tolower(query)]
-    }
-    if (is.na(query)) {
-      if (verbose) webchem_message("na")
-      return(NA)
-    }
-    if (verbose) webchem_message("query", query, appendLF = FALSE)
-  # search links in indexes
-    if (from == "name") {
-      links <- bcpc_idx$links[bcpc_idx$source == "cn"]
-      names <- bcpc_idx$linknames[bcpc_idx$source == "cn"]
-      cname <-  query
-    }
-
-    takelink <- links[tolower(names) == tolower(query)]
-    if (length(takelink) == 0) {
-      if (verbose) message("Not found.")
-      return(NA)
-    }
-    if (length(takelink) > 1) {
-      takelink <- unique(takelink)
-      if (length(takelink) > 1) {
-        message("More then one link found. Returning first.")
-        takelink <- takelink[1]
-      }
-    }
-
-    qurl <- paste0("https://pesticidecompendium.bcpc.org/", takelink)
-    webchem_sleep(type = 'scrape')
-    res <- try(httr::RETRY("GET",
-                           qurl,
-                           httr::user_agent(webchem_url()),
-                           terminate_on = 404,
-                           config = httr::config(accept_encoding = "identity"),
-                           quiet = TRUE), silent = TRUE)
-    if (inherits(res, "try-error")) {
-      if (verbose) webchem_message("service_down")
-      return(NA)
-    }
-    if (verbose) message(httr::message_for_status(res))
-    if (res$status_code == 200){
-      ttt <- read_html(res)
-      status <- xml_text(
-        xml_find_all(ttt, "//tr/th[@id='r1']/following-sibling::td"))
-      pref_iupac_name <- xml_text(
-        xml_find_all(ttt, "//tr/th[@id='r2']/following-sibling::td"))
-      iupac_name <- xml_text(
-        xml_find_all(ttt, "//tr/th[@id='r3']/following-sibling::td"))
-      cas <- xml_text(
-        xml_find_all(ttt, "//tr/th[@id='r5']/following-sibling::td"))
-      formula <- xml_text(
-        xml_find_all(ttt, "//tr/th[@id='r6']/following-sibling::td"))
-      activity_text <- as.character(xml_find_all(ttt, "//tr/th[@id='r7']/following-sibling::td"))
-      a_tmp_1 <- trimws(gsub("<td.*?>(.*)</td>", "\\1", activity_text))
-      a_tmp_2 <- gsub("<a.*?>", "", a_tmp_1)
-      a_tmp_3 <- gsub("</a>", "", a_tmp_2)
-      a_split <- strsplit(a_tmp_3, "<br>")[[1]]
-      activity <- unname(sapply(a_split, function(x) gsub(" \\(.*\\)$", "", x)))
-      subactivity <- unname(sapply(a_split, function(x) {
-          if (grepl("\\(.*\\)", x)) gsub(".*\\((.*)\\)$", "\\1", x)
-          else NA}))
-      inchikey_r <- xml_text(
-        xml_find_all(ttt, "//tr/th[@id='r11']/following-sibling::td"))
-      if (length(inchikey_r) == 0) {
-        inchikey <- NA
-      } else {
-        if (grepl("isomer", inchikey_r)) {
-          inchikey <- c(
-            s_isomer = gsub(
-              ".*\\(S\\)-isomer:(.*)(minor component.*)", "\\1", inchikey_r),
-            r_isomer = gsub(".*\\(R\\)-isomer:(.*)", "\\1", inchikey_r))
-        }
-        if (grepl("identifier", inchikey_r)) {
-          inchikey <- c(gsub("(.*)identifier.*", "\\1", inchikey_r),
-                        gsub(".*identifier.*:(.*)", "\\1", inchikey_r))
-          names(inchikey) <- c("inchikey",
-                               gsub(".*(identifier.*:).*", "\\1", inchikey_r)
-          )
-        }
-        if (!grepl("isomer", inchikey_r) & !grepl("identifier", inchikey_r))
-          inchikey <- inchikey_r
-      }
-
-      inchi <- xml_text(
-        xml_find_all(ttt, "//tr/th[@id='r12']/following-sibling::td"))
-      if (length(inchi) == 0) {
-        inchi <- NA
-      } else {
-        if (grepl("isomer", inchi)) {
-          inchi <- c(s_isomer = gsub(".*\\(S\\)-isomer:(.*)(minor component.*)",
-                                     "\\1", inchi),
-                     r_isomer = gsub(".*\\(R\\)-isomer:(.*)", "\\1", inchi))
-        }
-      }
-
-      out <- list(cname = cname, status = status,
-                  pref_iupac_name = pref_iupac_name, iupac_name = iupac_name,
-                  cas = cas, formula = formula, activity = activity,
-                  subactivity = subactivity, inchikey = inchikey, inchi = inchi,
-                  source_url = qurl)
-      return(out)
-    }
-    else {
-      return(NA)
-    }
-  }
-  out <- lapply(query, function(x) foo(x, from = from, verbose = verbose))
+  out <- lapply(query, function(x) scrape_bcpc_frame(x, from = from, idx = bcpc_idx, verbose = verbose))
   class(out) <- c("bcpc_query", "list")
   return(out)
 }
@@ -276,4 +162,162 @@ prep_idx_named <- function(res, source, xpath) {
     source = source,
     stringsAsFactors = FALSE
   )
+}
+
+#' Function to handle synonymous arguments
+#'
+#' This function returns a dataframe of linknames, links and sources
+#' @param from argument (single) string
+#' @return string
+#' @seealso \code{\link{bcpc_query}}
+#' @noRd
+convert_from_arg_bcpc <- function(from) {
+  from <- gsub("name|name_en", "cn", from)
+  from <- gsub("cas", "rn", from)
+  from
+}
+
+#' Function to scrape information from a substance page
+#'
+#' This function returns a named list of substance attributes and values
+#' @param query string to query in index
+#' @param from source argument
+#' @param idx data.frame of bcpc index
+#' @return named list
+#' @seealso \code{\link{bcpc_query}}, \code{\link{build_bcpc_idx}}
+#' @noRd
+scrape_bcpc_frame <- function(query, from, idx, verbose) {
+  idx <- subset(idx, source == from)
+  if (is.na(query)) {
+    if (verbose) webchem_message("na")
+    return(NA)
+  }
+  if (verbose) webchem_message("query", query, appendLF = FALSE)
+  # generic selection
+  if (from == "cas") query <- as.cas(query, verbose = verbose)
+  names <- idx$names
+  links <- idx$links
+  linknames <- idx$linknames
+  cname <- linknames[tolower(names) == tolower(query)]
+  takelink <- links[tolower(names) == tolower(query)]
+  # end new
+  # search links in indexes
+  if (length(takelink) == 0) {
+    if (verbose) message("Not found.")
+    return(NA)
+  }
+  if (length(takelink) > 1) {
+    takelink <- unique(takelink)
+    if (length(takelink) > 1) {
+      message("More then one link found. Returning first.")
+      takelink <- takelink[1]
+    }
+  }
+
+  qurl <- paste0("https://pesticidecompendium.bcpc.org/", takelink)
+  webchem_sleep(type = "scrape")
+  res <- query_bcpc_url(qurl)
+  if (!length(res) || !res$status_code == 200) {
+    return(NA)
+  }
+
+  ttt <- read_html(res)
+  status <- xml_text(
+    xml_find_all(ttt, "//tr/th[@id='r1']/following-sibling::td")
+  )
+  name_fr <- xml_find_all(ttt, "//h2/span/following::span[@lang='fr'][1]")
+  # remove noun gender
+  name_fr <- xml_text(
+    read_xml(
+      gsub(" \\(<abbr.*>.*<\\/abbr>\\)", "", name_fr)
+    )
+  )
+  name_ru <- xml_text(
+    xml_find_all(ttt, "//h2/span/following::span[@lang='ru'][1]")
+  )
+  name_zh <- xml_text(
+    xml_find_all(ttt, "//h2/span/following::span[@lang='zh-Hans'][1]")
+  )
+  pref_iupac_name <- xml_text(
+    xml_find_all(ttt, "//tr/th[@id='r2']/following-sibling::td")
+  )
+  iupac_name <- xml_text(
+    xml_find_all(ttt, "//tr/th[@id='r3']/following-sibling::td")
+  )
+  cas <- xml_text(
+    xml_find_all(ttt, "//tr/th[@id='r5']/following-sibling::td")
+  )
+  formula <- xml_text(
+    xml_find_all(ttt, "//tr/th[@id='r6']/following-sibling::td")
+  )
+  activity_text <- as.character(xml_find_all(ttt, "//tr/th[@id='r7']/following-sibling::td"))
+  a_tmp_1 <- trimws(gsub("<td.*?>(.*)</td>", "\\1", activity_text))
+  a_tmp_2 <- gsub("<a.*?>", "", a_tmp_1)
+  a_tmp_3 <- gsub("</a>", "", a_tmp_2)
+  a_split <- strsplit(a_tmp_3, "<br>")[[1]]
+  activity <- unname(sapply(a_split, function(x) gsub(" \\(.*\\)$", "", x)))
+  subactivity <- unname(sapply(a_split, function(x) {
+    if (grepl("\\(.*\\)", x)) {
+      gsub(".*\\((.*)\\)$", "\\1", x)
+    } else {
+      NA
+    }
+  }))
+  inchikey_r <- xml_text(
+    xml_find_all(ttt, "//tr/th[@id='r11']/following-sibling::td")
+  )
+  if (length(inchikey_r) == 0) {
+    inchikey <- NA
+  } else {
+    if (grepl("isomer", inchikey_r)) {
+      inchikey <- c(
+        s_isomer = gsub(
+          ".*\\(S\\)-isomer:(.*)(minor component.*)", "\\1", inchikey_r
+        ),
+        r_isomer = gsub(".*\\(R\\)-isomer:(.*)", "\\1", inchikey_r)
+      )
+    }
+    if (grepl("identifier", inchikey_r)) {
+      inchikey <- c(
+        gsub("(.*)identifier.*", "\\1", inchikey_r),
+        gsub(".*identifier.*:(.*)", "\\1", inchikey_r)
+      )
+      names(inchikey) <- c(
+        "inchikey",
+        gsub(".*(identifier.*:).*", "\\1", inchikey_r)
+      )
+    }
+    if (!grepl("isomer", inchikey_r) & !grepl("identifier", inchikey_r)) {
+      inchikey <- inchikey_r
+    }
+  }
+
+  inchi <- xml_text(
+    xml_find_all(ttt, "//tr/th[@id='r12']/following-sibling::td")
+  )
+  if (length(inchi) == 0) {
+    inchi <- NA
+  } else {
+    if (grepl("isomer", inchi)) {
+      inchi <- c(
+        s_isomer = gsub(
+          ".*\\(S\\)-isomer:(.*)(minor component.*)",
+          "\\1", inchi
+        ),
+        r_isomer = gsub(".*\\(R\\)-isomer:(.*)", "\\1", inchi)
+      )
+    }
+  }
+  out <- list(
+    cname = cname,
+    name_fr = name_fr,
+    name_ru = name_ru,
+    name_zh = name_zh,
+    status = status,
+    pref_iupac_name = pref_iupac_name, iupac_name = iupac_name,
+    cas = cas, formula = formula, activity = activity,
+    subactivity = subactivity, inchikey = inchikey, inchi = inchi,
+    source_url = qurl
+  )
+  return(out)
 }
\ No newline at end of file

From 2bcbc9322819a513c632fe8159893d7a4bfa15a7 Mon Sep 17 00:00:00 2001
From: abourgoin <alexis.bourgoin@bretagne-environnement.fr>
Date: Thu, 7 Nov 2024 16:18:37 +0100
Subject: [PATCH 04/10] fix and refactor build_bcpc_idx, fix prep_functions

---
 R/bcpc.R | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/R/bcpc.R b/R/bcpc.R
index 74d25b14..9d8e5f41 100644
--- a/R/bcpc.R
+++ b/R/bcpc.R
@@ -32,25 +32,25 @@
 #' # use CAS-numbers
 #' bcpc_query("79622-59-6", from = 'cas')
 #' }
-bcpc_query <- function(query, from = c("name", "cas", "inchikey", "name_fr"),
+bcpc_query <- function(query, from = c("name", "cas","rn","cn","fr","en","zh","ru","inchikey"),
                        verbose = getOption("verbose"),
                        type, ...) {
 
   if (!ping_service("bcpc")) stop(webchem_message("service_down"))
 
+  # Deprecations messages
   if (!missing(type)) {
     message('"type" is deprecated. Please use "from" instead. ')
     from <- type
   }
-
   if ("commonname" %in% from) {
     message('To search by compound name use "name" instead of "commonname"')
     from <- "name"
   }
 
   from <- match.arg(from)
-  bcpc_idx <- build_bcpc_idx(verbose, ...)
-
+  from <- convert_from_arg_bcpc(from)
+  bcpc_idx <- build_bcpc_idx(verbose=verbose, force_build = FALSE, ...)
   names(query) <- query
   out <- lapply(query, function(x) scrape_bcpc_frame(x, from = from, idx = bcpc_idx, verbose = verbose))
   class(out) <- c("bcpc_query", "list")
@@ -69,30 +69,33 @@ bcpc_query <- function(query, from = c("name", "cas", "inchikey", "name_fr"),
 #' @seealso \code{\link{bcpc_query}}, \code{\link{tempdir}}
 #' @source \url{https://pesticidecompendium.bcpc.org}
 #' @noRd
-build_bcpc_idx <- function(verbose = getOption("verbose"), force_build = FALSE) {
+build_bcpc_idx <- function(sources = c("rn", "inchikey", "cn", "fr", "ru", "zh"), verbose = getOption("verbose"), force_build = FALSE) {
   if (!ping_service("bcpc")) stop(webchem_message("service_down"))
   suppressWarnings(try(load(paste0(tempdir(), "/data/bcpc_idx.rda")),
                        silent = TRUE))
-  if (!file.exists(paste0(tempdir(), "/data/bcpc_idx.rda")) |
-      force_build |
+  if (!file.exists(paste0(tempdir(), "/data/bcpc_idx.rda")) ||
+      force_build ||
       try(Sys.Date() - attr(bcpc_idx, "date"), silent = TRUE) > 30) {
     if (!dir.exists(paste0(tempdir(), "/data"))) {
       dir.create(paste0(tempdir(), "/data"))
     }
+    sources <- match.arg(sources, several.ok = TRUE)
     if (verbose) message("Building index. ", appendLF = FALSE)
-    index_sources <- data.frame(
-      name = c("rn", "inchikey", "cn", "name_fr", "name_ru"),
-      code = c(TRUE, TRUE, FALSE, FALSE, FALSE),
+    idx_sources <- data.frame(
+      name = c("rn", "inchikey", "cn", "fr", "ru", "zh"),
+      code = c(TRUE, TRUE, FALSE, FALSE, FALSE, FALSE),
       url = c(
         "https://pesticidecompendium.bcpc.org/index_rn.html",
         "www.bcpcpesticidecompendium.org/index-inchikey.html",
         "https://pesticidecompendium.bcpc.org/index_cn.html",
         "https://pesticidecompendium.bcpc.org/index-fr.html",
-        "https://pesticidecompendium.bcpc.org/index-ru.html"
+        "https://pesticidecompendium.bcpc.org/index-ru.html",
+        "https://pesticidecompendium.bcpc.org/index-zh.html"
       )
     )
-    idxs <- apply(index_sources, 1, function(source) {
-      res <- query_idx(source[["url"]])
+    selected <- subset(idx_sources, idx_sources$name %in% sources)
+    idxs <- apply(selected, 1, function(source) {
+      res <- query_bcpc_url(source[["url"]])
       if (source[["code"]]) {
         idx <- prep_idx_code(res, source = source[["name"]])
       } else {
@@ -102,12 +105,10 @@ build_bcpc_idx <- function(verbose = getOption("verbose"), force_build = FALSE)
     bcpc_idx <- do.call(rbind, idxs)
     # fix encoding
     ln <- bcpc_idx$linknames
-      Encoding(ln) <- "latin1"
-      ln <- iconv(ln, from = "latin1", to = "ASCII", sub = "")
+      ln <- iconv(ln, to = "UTF-8", sub = "")#is it necessary?
       bcpc_idx$linknames <- ln
       attr(bcpc_idx, "date") <- Sys.Date()
       save(bcpc_idx, file = paste0(tempdir(), "/data/bcpc_idx.rda"))
-    
   }
   return(bcpc_idx)
 }
@@ -153,10 +154,9 @@ prep_idx_named <- function(res, source, xpath) {
   names <- xml_text(n)
   keep <- nzchar(names)
   names <- names[keep]
-  links <- xml_attr(n, "href")
-  links <- links[keep]
+  links <- xml_attr(n, "href")[keep]
   data.frame(
-    names = NA,
+    names = names,
     links = links,
     linknames = names,
     source = source,

From 0ebc24a1e1a87d64337f86ced583cd84ef146a32 Mon Sep 17 00:00:00 2001
From: abourgoin <alexis.bourgoin@bretagne-environnement.fr>
Date: Thu, 7 Nov 2024 16:19:05 +0100
Subject: [PATCH 05/10] Documented newly created functions

---
 R/bcpc.R | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/R/bcpc.R b/R/bcpc.R
index 9d8e5f41..908de937 100644
--- a/R/bcpc.R
+++ b/R/bcpc.R
@@ -113,8 +113,14 @@ build_bcpc_idx <- function(sources = c("rn", "inchikey", "cn", "fr", "ru", "zh")
   return(bcpc_idx)
 }
 
-
-query_idx <- function(url, verbose = getOption("verbose")) {
+#' Function to query bcpc and handle httr errors
+#'
+#' This function returns an httr response object
+#' @param url string; uri to query
+#' @param verbose logical; print message during processing to console?
+#' @return Httr response object
+#' @noRd
+query_bcpc_url <- function(url, verbose = getOption("verbose")) {
   res <- try(
     httr::RETRY(
       "GET",
@@ -134,6 +140,15 @@ query_idx <- function(url, verbose = getOption("verbose")) {
   if (res$status_code == 200) return(res)
 }
 
+#' Function scrape link names and urls from an httr response of a code (CAS, IUPAC, etc.) bcpc index frame
+#'
+#' This function returns a dataframe of linknames, links and sources
+#' @param res httr response object to scrape
+#' @param source string of response source
+#' @return data.frame
+#' @seealso \code{\link{build_bcpc_idx}} for referring function,
+#' for named index frame, use \code{\link{prep_idx_named}}
+#' @noRd
 prep_idx_code <- function(res, source) {
   idx <- read_html(res)
   names <- xml_text(xml_find_all(idx, "//dl/dt"))
@@ -148,6 +163,16 @@ prep_idx_code <- function(res, source) {
   return(df_idx)
 }
 
+#' Function scrape link names and urls from an httr response of a named bcpc index frame
+#'
+#' This function returns a dataframe of linknames, links and sources
+#' @param res httr response object to scrape
+#' @param source string of response source
+#' @param xpath a string of an xpath
+#' @return data.frame
+#' @seealso \code{\link{build_bcpc_idx}} for referring function,
+#' for coded index frame, use \code{\link{prep_idx_named}}
+#' @noRd
 prep_idx_named <- function(res, source, xpath) {
   idx <- read_html(res)
   n <- xml_find_all(idx, xpath)

From 9331cea62b1b114abde7b0ab37ade0a514fd7f61 Mon Sep 17 00:00:00 2001
From: abourgoin <alexis.bourgoin@bretagne-environnement.fr>
Date: Thu, 7 Nov 2024 16:30:57 +0100
Subject: [PATCH 06/10] added NEWS.md info

---
 NEWS.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 8ebb318b..ee146345 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,7 @@
 # dev
+ 
+## NEW FEATURES
+* `bcpc_query()` now supports searching for chinese, french, russian names as well as inchi key
 
 ## BUG FIXES
 

From e7ff991c2316bbc697e24b6b7d202ec64cffcc32 Mon Sep 17 00:00:00 2001
From: abourgoin <alexis.bourgoin@bretagne-environnement.fr>
Date: Thu, 7 Nov 2024 16:33:46 +0100
Subject: [PATCH 07/10] Added documentation

---
 DESCRIPTION       | 2 +-
 man/bcpc_query.Rd | 2 +-
 man/webchem.Rd    | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 1eda32f1..5952cf71 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -51,7 +51,7 @@ Suggests:
     plot.matrix,
     usethis,
     vcr
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
 VignetteBuilder: knitr
 Config/testthat/edition: 3
 Config/testthat/parallel: true
diff --git a/man/bcpc_query.Rd b/man/bcpc_query.Rd
index 03758a48..506dd788 100644
--- a/man/bcpc_query.Rd
+++ b/man/bcpc_query.Rd
@@ -6,7 +6,7 @@
 \usage{
 bcpc_query(
   query,
-  from = c("name", "cas"),
+  from = c("name", "cas", "rn", "cn", "fr", "en", "zh", "ru", "inchikey"),
   verbose = getOption("verbose"),
   type,
   ...
diff --git a/man/webchem.Rd b/man/webchem.Rd
index 4ccbaa97..79650323 100644
--- a/man/webchem.Rd
+++ b/man/webchem.Rd
@@ -2,6 +2,7 @@
 % Please edit documentation in R/webchem-package.R
 \docType{package}
 \name{webchem}
+\alias{-package}
 \alias{webchem}
 \title{webchem: An R package to retrieve chemical information from the web.}
 \description{

From bc147c0e33505f2a168bce6742ff0e1875042620 Mon Sep 17 00:00:00 2001
From: abourgoin <alexis.bourgoin@bretagne-environnement.fr>
Date: Thu, 7 Nov 2024 16:42:57 +0100
Subject: [PATCH 08/10] Modified test-bcpc.R to take into account locales
 retrieved (+3 language, so +3 length for fluazinam), and new `bcpc_query`
 from arguments.

---
 tests/testthat/test-bcpc.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/testthat/test-bcpc.R b/tests/testthat/test-bcpc.R
index e9202964..6ea02578 100644
--- a/tests/testthat/test-bcpc.R
+++ b/tests/testthat/test-bcpc.R
@@ -37,7 +37,7 @@ test_that("BCPC pesticide compendium, name", {
   expect_equal(o1[["Fluazinam"]]$cas, "79622-59-6")
   expect_equal(length(o1[["S-Metolachlor"]]$inchikey), 2)
   expect_equal(length(o1[["S-Metolachlor"]]$inchi), 2)
-  expect_equal(length(o1[["Fluazinam"]]), 11)
+  expect_equal(length(o1[["Fluazinam"]]), 14)
 })
 
 
@@ -60,7 +60,7 @@ test_that("BCPC pesticide compendium, build_index", {
   expect_s3_class(idx, "data.frame")
   expect_equal(ncol(idx), 4)
   expect_equal(names(idx), c("names", "links", "linknames", "source"))
-  expect_equal(unique(idx$source), c("rn", "cn"))
+  expect_equal(unique(idx$source), c("rn", "inchikey", "cn", "fr", "ru", "zh"))
   expect_equal(idx$names[1], "50-00-0")
 })
 

From f0a589e83b53e6b6250be75ba8feb4bcd15b8d19 Mon Sep 17 00:00:00 2001
From: abourgoin <alexis.bourgoin@bretagne-environnement.fr>
Date: Thu, 7 Nov 2024 16:50:49 +0100
Subject: [PATCH 09/10] default force_build to false if not explicit in args

---
 R/bcpc.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/bcpc.R b/R/bcpc.R
index 908de937..dfbed26a 100644
--- a/R/bcpc.R
+++ b/R/bcpc.R
@@ -50,7 +50,7 @@ bcpc_query <- function(query, from = c("name", "cas","rn","cn","fr","en","zh","r
 
   from <- match.arg(from)
   from <- convert_from_arg_bcpc(from)
-  bcpc_idx <- build_bcpc_idx(verbose=verbose, force_build = FALSE, ...)
+  bcpc_idx <- build_bcpc_idx(verbose=verbose, ...)
   names(query) <- query
   out <- lapply(query, function(x) scrape_bcpc_frame(x, from = from, idx = bcpc_idx, verbose = verbose))
   class(out) <- c("bcpc_query", "list")

From 0b8842a4fbb34d235eab8a07091b88ad14f450e4 Mon Sep 17 00:00:00 2001
From: abourgoin <alexis.bourgoin@bretagne-environnement.fr>
Date: Thu, 7 Nov 2024 16:52:24 +0100
Subject: [PATCH 10/10] lint fixes

---
 R/bcpc.R | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/R/bcpc.R b/R/bcpc.R
index dfbed26a..fc7663c2 100644
--- a/R/bcpc.R
+++ b/R/bcpc.R
@@ -50,7 +50,7 @@ bcpc_query <- function(query, from = c("name", "cas","rn","cn","fr","en","zh","r
 
   from <- match.arg(from)
   from <- convert_from_arg_bcpc(from)
-  bcpc_idx <- build_bcpc_idx(verbose=verbose, ...)
+  bcpc_idx <- build_bcpc_idx(verbose = verbose, ...)
   names(query) <- query
   out <- lapply(query, function(x) scrape_bcpc_frame(x, from = from, idx = bcpc_idx, verbose = verbose))
   class(out) <- c("bcpc_query", "list")
@@ -105,10 +105,10 @@ build_bcpc_idx <- function(sources = c("rn", "inchikey", "cn", "fr", "ru", "zh")
     bcpc_idx <- do.call(rbind, idxs)
     # fix encoding
     ln <- bcpc_idx$linknames
-      ln <- iconv(ln, to = "UTF-8", sub = "")#is it necessary?
-      bcpc_idx$linknames <- ln
-      attr(bcpc_idx, "date") <- Sys.Date()
-      save(bcpc_idx, file = paste0(tempdir(), "/data/bcpc_idx.rda"))
+    ln <- iconv(ln, to = "UTF-8", sub = "")#is it necessary?
+    bcpc_idx$linknames <- ln
+    attr(bcpc_idx, "date") <- Sys.Date()
+    save(bcpc_idx, file = paste0(tempdir(), "/data/bcpc_idx.rda"))
   }
   return(bcpc_idx)
 }
@@ -163,7 +163,7 @@ prep_idx_code <- function(res, source) {
   return(df_idx)
 }
 
-#' Function scrape link names and urls from an httr response of a named bcpc index frame
+#' Scrapes link names and urls from an httr response of a named bcpc index frame
 #'
 #' This function returns a dataframe of linknames, links and sources
 #' @param res httr response object to scrape