diff --git a/.Rbuildignore b/.Rbuildignore index ad70c79..85e3081 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -3,3 +3,5 @@ ^.*\.Rproj$ ^\.Rproj\.user$ ^\.travis\.yml$ +^LICENSE$ +^inst/Pres$ diff --git a/DESCRIPTION b/DESCRIPTION index 0cadb42..f487198 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,17 +8,32 @@ Authors@R: c( person("Irene", "Steves", comment = "https://github.com/isteves", role = "aut"), person("Mitchell", "Maier", email = "mitchell.maier@gmail.com", role = "aut")) Maintainer: Julien Brun -Description: Tools to download data and metadata from DataONE (https://www.dataone.org) and load this information in R. +Description: Tools to download data and metadata from DataONE (https://www.dataone.org) and load this information in R. License: Apache License (== 2.0) -RoxygenNote: 6.1.1 -Imports: dataone, dplyr, eml2, emld, lubridate, purrr, readr, stats, stringr, tibble, tidyr -Remotes: cboettig/eml2, - cboettig/emld, - DataONEorg/rdataone -Suggests: testthat, +URL: https://github.com/nceas/metajam +BugReports: https://github.com/nceas/metajam/issues +Imports: + dataone, + dplyr, + eml2, + emld, + lubridate, + purrr, + readr, + stats, + stringr, + tibble, + tidyr +Remotes: + cboettig/eml2, + cboettig/emld, + DataONEorg/rdataone +Suggests: + testthat, knitr, rmarkdown, udunits2 +Encoding: UTF-8 VignetteBuilder: knitr -URL: https://github.com/nceas/metajam -BugReports: https://github.com/nceas/metajam/issues +Roxygen: list(markdown = TRUE) +RoxygenNote: 6.1.1 diff --git a/NAMESPACE b/NAMESPACE index cb24ce0..00f6127 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,10 +12,12 @@ import(readr) import(stringr) import(tidyr) importFrom(dataone,CNode) -importFrom(dataone,getSystemMetadata) importFrom(dataone,query) importFrom(emld,as_emld) importFrom(lubridate,ymd_hms) importFrom(purrr,"%||%") importFrom(stats,setNames) +importFrom(stringr,str_trim) importFrom(tibble,enframe) +importFrom(utils,URLdecode) +importFrom(utils,write.csv) diff --git a/R/check_version.R b/R/check_version.R index 6e58f65..9b8e40c 100644 --- a/R/check_version.R +++ b/R/check_version.R @@ -2,12 +2,14 @@ #' #' This function takes an identifier and checks to see if it has been obsoleted. #' -#' @param pid (character) The persistent identifier of a data, metadata, or resource map object on a DataONE member node -#' @param formatType (character) Optional. The format type to return (DATA, METADATA, RESOURCE) +#' @param pid (character) The persistent identifier of a data, metadata, or resource map object on a DataONE member node. +#' @param formatType (character) Optional. The format type to return (DATA, METADATA, RESOURCE). #' -#' @importFrom dataone CNode getSystemMetadata -#' @import stringr +#' @return A data.frame of object version PIDs and related information. +#' +#' @import dataone #' @import dplyr +#' @import stringr #' #' @export #' @@ -15,60 +17,59 @@ #' \dontrun{ #' # Most data URL's and identifiers work #' check_version("https://cn.dataone.org/cn/v2/resolve/urn:uuid:a2834e3e-f453-4c2b-8343-99477662b570") -#' +#' #' # Returns a warning if several identifiers are returned: #' check_version( #' "https://pasta.lternet.edu/package/data/eml/edi/195/2/51abf1c7a36a33a2a8bb05ccbf8c81c6" #' ) #' check_version("doi:10.18739/A2ZF6M") -#' +#' #' # You can specify a formatType (metadata, data, resource) #' check_version("doi:10.18739/A2ZF6M", formatType = "metadata") -#' +#' #' # Returns a warning if the identifier has been obsoleted #' check_version("doi:10.18739/A2HF7Z", formatType = "metadata") -#' +#' #' # Returns an error if no matching identifiers are found #' check_version("a_test_pid") #' } -#' -check_version <- function(pid, formatType = NULL){ - +check_version <- function(pid, formatType = NULL) { + # check that pid is of type character if (!all(is.character(pid), all(nchar(pid) > 0))) { stop("Argument 'pids' must be character class with non-zero number of characters.") } - - while(nchar(pid) > 5) { + + while (nchar(pid) > 5) { results <- suppressMessages( dataone::query(dataone::CNode(), list(q = sprintf('identifier:"%s"', pid), fl = "identifier, dateUploaded, formatType, obsoletedBy, resourceMap"), as = "data.frame") ) - #if results is null or empty dataframe, remove part of the URI + #if results is null or empty dataframe, remove part of the URI if (is.null(results) || nrow(results) == 0) { pid <- gsub("^[^/=]+[/=]*", "", pid) - + } else { #what to do if multiple are returned break } } - - if(nrow(results) == 0){ + + if (nrow(results) == 0) { stop("No matching identifiers were found.") } - + # filter out extra types (resource map/etc with similar pid) - if(!is.null(formatType)){ + if (!is.null(formatType)) { formatType <- toupper(formatType) results <- results[results$formatType == formatType,] } - - if(nrow(results) == 1){ - if(is.null(results$obsoletedBy) || is.na(results$obsoletedBy)){ + + if (nrow(results) == 1) { + if (is.null(results$obsoletedBy) || is.na(results$obsoletedBy)) { message("\n", results$identifier, "\nis the latest version for identifier\n", @@ -79,6 +80,6 @@ check_version <- function(pid, formatType = NULL){ } else { warning("Several identifiers are associated with ", pid) } - + return(results) } diff --git a/R/download_d1_data.R b/R/download_d1_data.R index 5e4176e..7fcf1ab 100644 --- a/R/download_d1_data.R +++ b/R/download_d1_data.R @@ -1,20 +1,25 @@ -#' Downloads data from DataOne along with metadata +#' Download data and metadata from DataONE +#' +#' Downloads data from DataONE along with metadata. #' #' @param data_url (character) An identifier or url for a DataONE object to download. -#' @param path (character) Path to a directory to download data to -#' +#' @param path (character) Path to a directory to download data to. +#' +#' @import dataone #' @import eml2 #' @import purrr -#' @import dataone +#' @import readr #' @import tidyr +#' @import stringr #' @importFrom emld as_emld #' @importFrom lubridate ymd_hms -#' -#' @return (character) Path where data is downloaded to +#' @importFrom utils URLdecode write.csv +#' +#' @return (character) Path where data is downloaded to. #' #' @export -#' -#' @examples +#' +#' @examples #' \dontrun{ #' download_d1_data("urn:uuid:a2834e3e-f453-4c2b-8343-99477662b570", path = "./Data") #' download_d1_data( @@ -25,46 +30,45 @@ download_d1_data <- function(data_url, path) { # TODO: add meta_doi to explicitly specify doi - # TODO: refine summary_metadata Irene - + stopifnot(is.character(data_url)) stopifnot(dir.exists(path)) - + ## Try to get DataONE data_id from data_url --------- data_url <- utils::URLdecode(data_url) data_versions <- check_version(data_url, formatType = "data") - - if(nrow(data_versions) == 1){ + + if (nrow(data_versions) == 1) { data_id <- data_versions$identifier - } else if(nrow(data_versions) > 1){ + } else if (nrow(data_versions) > 1) { #get most recent version data_versions$dateUploaded <- lubridate::ymd_hms(data_versions$dateUploaded) data_id <- data_versions$identifier[data_versions$dateUploaded == max(data_versions$dateUploaded)] } else { - stop("The DataOne ID could not be found for ", data_url) + stop("The DataONE ID could not be found for ", data_url) } - + ## Set Nodes ------------ data_nodes <- dataone::resolve(dataone::CNode("PROD"), data_id) d1c <- dataone::D1Client("PROD", data_nodes$data$nodeIdentifier[[1]]) cn <- dataone::CNode() - + ## Download Metadata ------------ meta_id <- dataone::query( cn, list(q = sprintf('documents:"%s" AND formatType:"METADATA" AND -obsoletedBy:*', data_id), - fl = "identifier")) %>% + fl = "identifier")) %>% unlist() - + # if no results are returned, try without -obsoletedBy if (length(meta_id) == 0) { meta_id <- dataone::query( cn, list(q = sprintf('documents:"%s" AND formatType:"METADATA"', data_id), - fl = "identifier")) %>% + fl = "identifier")) %>% unlist() } - + # depending on results, return warnings if (length(meta_id) == 0) { warning("no metadata records found") @@ -79,54 +83,55 @@ download_d1_data <- function(data_url, path) { ## Get package level metadata ----------- if (!is.null(meta_id)) { message("\nDownloading metadata ", meta_id, " ...") - meta_obj <- dataone::getObject(d1c@mn, meta_id) + meta_obj <- dataone::getObject(d1c@mn, meta_id) message("Download metadata complete") metadata_nodes <- dataone::resolve(cn, meta_id) - + eml <- tryCatch({emld::as_emld(meta_obj, from = "xml")}, # If eml make EML object error = function(e) {NULL}) - + # Get attributes ---------- ## get entity that contains the metadata for the data object entities <- c("dataTable", "spatialRaster", "spatialVector", "storedProcedure", "view", "otherEntity") entities <- entities[entities %in% names(eml$dataset)] - - entity_objs <- purrr::map(entities, ~eml2::eml_get(eml, .x)) %>% + + entity_objs <- purrr::map(entities, ~eml2::eml_get(eml, .x)) %>% # restructure so that all entities are at the same level - purrr::map_if(~!is.null(.x$entityName), list) %>% - unlist(recursive = FALSE) - + purrr::map_if(~!is.null(.x$entityName), list) %>% + unlist(recursive = FALSE) + #sometimes url is stored in ...online$url instead of ...online$url$url #sometimes url needs to be decoded - entity_data <- entity_objs %>% - purrr::keep(~any(grepl(data_id, + entity_data <- entity_objs %>% + purrr::keep(~any(grepl(data_id, purrr::map_chr(.x$physical$distribution$online$url, utils::URLdecode)))) - + if (length(entity_data) == 0) { - warning("No data metadata could not be found for ", data_url) - + warning("No data metadata could be found for ", data_url) + } else { - + if (length(entity_data) > 1) { - warning("multiple data metadata records found:\n", + warning("Multiple data metadata records found:\n", data_url, "\nThe first record was used") } - + entity_data <- entity_data[[1]] } attributeList <- suppressWarnings(eml2::get_attributes(entity_data$attributeList, eml)) - + meta_tabular <- tabularize_eml(eml) %>% tidyr::spread(name, value) - + ## Summary metadata from EML (combine with general metadata later) entity_meta <- suppressWarnings(list( Metadata_ID = meta_id[[1]], Metadata_URL = metadata_nodes$data$url[1], + Metadata_EML_Version = paste(stringr::str_extract_all(meta_tabular$eml.version, "[0-9]", simplify = TRUE), collapse = "."), File_Description = entity_data$entityDescription, File_Label = entity_data$entityLabel, - DataSet_URL = paste0("https://search.dataone.org/#view/", meta_id[[1]]), + Dataset_URL = paste0("https://search.dataone.org/#view/", meta_id[[1]]), Dataset_Title = meta_tabular$title, Dataset_StartDate = meta_tabular$temporalCoverage.beginDate, Dataset_EndDate = meta_tabular$temporalCoverage.endDate, @@ -135,25 +140,26 @@ download_d1_data <- function(data_url, path) { Dataset_EastBoundingCoordinate = meta_tabular$geographicCoverage.eastBoundingCoordinate, Dataset_NorthBoundingCoordinate = meta_tabular$geographicCoverage.northBoundingCoordinate, Dataset_SouthBoundingCoordinate = meta_tabular$geographicCoverage.southBoundingCoordinate, + Dataset_Taxonomy = meta_tabular$taxonomicCoverage, Dataset_Abstract = meta_tabular$abstract, Dataset_Methods = meta_tabular$methods, Dataset_People = meta_tabular$people )) } - + # Write files & download data-------- message("\nDownloading data ", data_id, " ...") data_sys <- suppressMessages(dataone::getSystemMetadata(d1c@cn, data_id)) - + data_name <- data_sys@fileName %|||% ifelse(exists("entity_data"), entity_data$physical$objectName %|||% entity_data$entityName, NA) %|||% data_id data_name <- gsub("[^a-zA-Z0-9. -]+", "_", data_name) #remove special characters & replace with _ data_extension <- gsub("(.*\\.)([^.]*$)", "\\2", data_name) data_name <- gsub("\\.[^.]*$", "", data_name) #remove extension meta_name <- gsub("[^a-zA-Z0-9. -]+", "_", meta_id) #remove special characters & replace with _ - - new_dir <- file.path(path, paste0(meta_name, "__", data_name, "__", data_extension)) - + + new_dir <- file.path(path, paste0(meta_name, "__", data_name, "__", data_extension)) + # Check if the dataset has already been downloaded at this location. If so, exit the function if (dir.exists(new_dir)) { warning("This dataset has already been downloaded. Please delete or move the folder to download the dataset again.") @@ -161,50 +167,50 @@ download_d1_data <- function(data_url, path) { } dir.create(new_dir) - + ## download Data out <- dataone::downloadObject(d1c, data_id, path = new_dir) message("Download complete") - + # change downloaded data object name to data_name data_files <- list.files(new_dir, full.names = TRUE) data_files_ext <- stringr::str_extract(data_files, ".[^.]{1,4}$") file.rename(data_files, file.path(new_dir, paste0(data_name, data_files_ext))) - + entity_meta_general <- list(File_Name = data_name, Date_Downloaded = paste0(Sys.time()), Data_ID = data_id, Data_URL = data_nodes$data$url[[1]] ) - + ## write metadata xml/tabular form if exists - if(exists("eml")) { + if (exists("eml")) { eml2::write_eml(eml, file.path(new_dir, paste0(data_name, "__full_metadata.xml"))) - + entity_meta_combined <- c(entity_meta_general, entity_meta) %>% unlist() %>% enframe() - readr::write_csv(entity_meta_combined, + readr::write_csv(entity_meta_combined, file.path(new_dir, paste0(data_name, "__summary_metadata.csv"))) } else { entity_meta_general <- entity_meta_general %>% unlist() %>% enframe() - readr::write_csv(entity_meta_general, + readr::write_csv(entity_meta_general, file.path(new_dir, paste0(data_name, "__summary_metadata.csv"))) } - + # write attribute tables if data metadata exists - if(exists("attributeList")) { + if (exists("attributeList")) { if (nrow(attributeList$attributes) > 0) { utils::write.csv(x = attributeList$attributes, file = file.path(new_dir, paste0(data_name, "__attribute_metadata.csv")), row.names = FALSE) } - + if (!is.null(attributeList$factors)) { utils::write.csv(x = attributeList$factors, file = file.path(new_dir, paste0(data_name, "__attribute_factor_metadata.csv")), row.names = FALSE) } } - + ## Output folder name return(new_dir) } diff --git a/R/tabularize_eml.R b/R/tabularize_eml.R index 5717f3c..880cdd3 100644 --- a/R/tabularize_eml.R +++ b/R/tabularize_eml.R @@ -2,11 +2,18 @@ #' #' This function takes a path to an EML (.xml) metadata file and returns a data frame. #' -#' @param eml An emld class object, the path to an EML (.xml) metadata file, or a raw EML object -#' @param full (logical) Returns the most commonly used metadata fields by default. -#' If \code{full = TRUE} is specified, the full set of metadata fields are returned. +#' @param eml An emld class object, the path to an EML (.xml) metadata file, or a raw EML object. +#' @param full (logical) Returns the most commonly used metadata fields by default. +#' If \code{full = TRUE} is specified, the full set of metadata fields are returned. #' +#' @return A data.frame of selected EML values. +#' +#' @import dplyr +#' @importFrom emld as_emld #' @importFrom tibble enframe +#' @importFrom stringr str_trim +#' +#' @noRd #' #' @examples #' \dontrun{ @@ -14,25 +21,25 @@ #' tabularize_eml(eml) #' tabularize_eml(eml, full = TRUE) #' } -#' -tabularize_eml <- function(eml, full = FALSE){ - - if(any(class(eml) == "emld")) { +tabularize_eml <- function(eml, full = FALSE) { + + if (any(class(eml) == "emld")) { eml <- eml - } else if(is.character(eml) | is.raw(eml)) { + } else if (is.character(eml) | is.raw(eml)) { eml <- emld::as_emld(eml) } else { - stop("The eml input could not be parsed.") + stop("The EML input could not be parsed.") } - - metadata <- eml %>% - unlist() %>% + + metadata <- eml %>% + unlist() %>% tibble::enframe() - - if(full == FALSE){ - metadata <- metadata %>% - dplyr::mutate(name = case_when( + + if (full == FALSE) { + metadata <- metadata %>% + dplyr::mutate(name = dplyr::case_when( + grepl("schemaLocation", name) ~ "eml.version", grepl("title", name) ~ "title", grepl("individualName", name) ~ "people", grepl("abstract", name) ~ "abstract", @@ -44,18 +51,18 @@ tabularize_eml <- function(eml, full = FALSE){ grepl("southBoundingCoordinate", name) ~ "geographicCoverage.southBoundingCoordinate", grepl("beginDate", name) ~ "temporalCoverage.beginDate", grepl("endDate", name) ~ "temporalCoverage.endDate", - #taxonomicCoverage + grepl("taxonRankValue", name) ~ "taxonomicCoverage", grepl("methods", name) ~ "methods", grepl("objectName", name) ~ "objectName", grepl("online.url", name) ~ "url" - )) %>% - dplyr::filter(!is.na(name)) %>% - dplyr::mutate(value = stringr::str_trim(value)) %>% - dplyr::distinct() %>% - dplyr::group_by(name) %>% - dplyr::summarize(value = paste(value, collapse = "; ")) %>% + )) %>% + dplyr::filter(!is.na(name)) %>% + dplyr::mutate(value = stringr::str_trim(value)) %>% + dplyr::distinct() %>% + dplyr::group_by(name) %>% + dplyr::summarize(value = paste(value, collapse = "; ")) %>% dplyr::mutate(value = gsub("\n", "", value)) #without this, fields get truncated in Excel } - + return(metadata) } diff --git a/man/check_version.Rd b/man/check_version.Rd index cce10da..c7f97b6 100644 --- a/man/check_version.Rd +++ b/man/check_version.Rd @@ -7,9 +7,12 @@ check_version(pid, formatType = NULL) } \arguments{ -\item{pid}{(character) The persistent identifier of a data, metadata, or resource map object on a DataONE member node} +\item{pid}{(character) The persistent identifier of a data, metadata, or resource map object on a DataONE member node.} -\item{formatType}{(character) Optional. The format type to return (DATA, METADATA, RESOURCE)} +\item{formatType}{(character) Optional. The format type to return (DATA, METADATA, RESOURCE).} +} +\value{ +A data.frame of object version PIDs and related information. } \description{ This function takes an identifier and checks to see if it has been obsoleted. @@ -34,5 +37,4 @@ check_version("doi:10.18739/A2HF7Z", formatType = "metadata") # Returns an error if no matching identifiers are found check_version("a_test_pid") } - } diff --git a/man/download_d1_data.Rd b/man/download_d1_data.Rd index c5324e7..b9a836a 100644 --- a/man/download_d1_data.Rd +++ b/man/download_d1_data.Rd @@ -2,20 +2,20 @@ % Please edit documentation in R/download_d1_data.R \name{download_d1_data} \alias{download_d1_data} -\title{Downloads data from DataOne along with metadata} +\title{Download data and metadata from DataONE} \usage{ download_d1_data(data_url, path) } \arguments{ \item{data_url}{(character) An identifier or url for a DataONE object to download.} -\item{path}{(character) Path to a directory to download data to} +\item{path}{(character) Path to a directory to download data to.} } \value{ -(character) Path where data is downloaded to +(character) Path where data is downloaded to. } \description{ -Downloads data from DataOne along with metadata +Downloads data from DataONE along with metadata. } \examples{ \dontrun{ diff --git a/man/tabularize_eml.Rd b/man/tabularize_eml.Rd deleted file mode 100644 index aad5aef..0000000 --- a/man/tabularize_eml.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tabularize_eml.R -\name{tabularize_eml} -\alias{tabularize_eml} -\title{Get tabular metadata} -\usage{ -tabularize_eml(eml, full = FALSE) -} -\arguments{ -\item{eml}{An emld class object, the path to an EML (.xml) metadata file, or a raw EML object} - -\item{full}{(logical) Returns the most commonly used metadata fields by default. -If \code{full = TRUE} is specified, the full set of metadata fields are returned.} -} -\description{ -This function takes a path to an EML (.xml) metadata file and returns a data frame. -} -\examples{ -\dontrun{ - eml <- system.file("example-eml.xml", package = "arcticdatautils") - tabularize_eml(eml) - tabularize_eml(eml, full = TRUE) -} - -} diff --git a/metajam.Rproj b/metajam.Rproj index 21a4da0..398aa14 100644 --- a/metajam.Rproj +++ b/metajam.Rproj @@ -9,9 +9,12 @@ UseSpacesForTab: Yes NumSpacesForTab: 2 Encoding: UTF-8 -RnwWeave: Sweave +RnwWeave: knitr LaTeX: pdfLaTeX +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes + BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source