diff --git a/R/occ_count.r b/R/occ_count.r index 4f11bd44..585768de 100644 --- a/R/occ_count.r +++ b/R/occ_count.r @@ -212,6 +212,61 @@ occ_count <- function(...,occurrenceStatus="PRESENT", curlopts = list()) { lifeStage = args$lifeStage, isInCluster = args$isInCluster, distanceFromCentroidInMeters = args$distanceFromCentroidInMeters, + geoDistance = args$geoDistance, + sex = args$sex, + dwcaExtension = args$dwcaExtension, + gbifId = args$gbifId, + gbifRegion = args$gbifRegion, + projectId = args$projectId, + programme = args$programme, + preparations = args$preparations, + datasetId = args$datasetId, + datasetName = args$datasetName, + publishedByGbifRegion = args$publishedByGbifRegion, + island = args$island, + islandGroup = args$islandGroup, + taxonId = args$taxonId, + taxonConceptId = args$taxonConceptId, + taxonomicStatus = args$taxonomicStatus, + acceptedTaxonKey = args$acceptedTaxonKey, + collectionKey = args$collectionKey, + institutionKey = args$institutionKey, + otherCatalogNumbers = args$otherCatalogNumbers, + georeferencedBy = args$georeferencedBy, + installationKey = args$installationKey, + hostingOrganizationKey = args$hostingOrganizationKey, + crawlId = args$crawlId, + modified = args$modified, + higherGeography = args$higherGeography, + fieldNumber = args$fieldNumber, + parentEventId = args$parentEventId, + samplingProtocol = args$samplingProtocol, + sampleSizeUnit = args$sampleSizeUnit, + pathway = args$pathway, + gadmLevel0Gid = args$gadmLevel0Gid, + gadmLevel1Gid = args$gadmLevel1Gid, + gadmLevel2Gid = args$gadmLevel2Gid, + gadmLevel3Gid = args$gadmLevel3Gid, + earliestEonOrLowestEonothem = args$earliestEonOrLowestEonothem, + latestEonOrHighestEonothem = args$latestEonOrHighestEonothem, + earliestEraOrLowestErathem = args$earliestEraOrLowestErathem, + latestEraOrHighestErathem = args$latestEraOrHighestErathem, + earliestPeriodOrLowestSystem = args$earliestPeriodOrLowestSystem, + latestPeriodOrHighestSystem = args$latestPeriodOrHighestSystem, +
earliestEpochOrLowestSeries = args$earliestEpochOrLowestSeries, + latestEpochOrHighestSeries = args$latestEpochOrHighestSeries, + earliestAgeOrLowestStage = args$earliestAgeOrLowestStage, + latestAgeOrHighestStage = args$latestAgeOrHighestStage, + lowestBiostratigraphicZone = args$lowestBiostratigraphicZone, + highestBiostratigraphicZone = args$highestBiostratigraphicZone, + group = args$group, + formation = args$formation, + member = args$member, + bed = args$bed, + associatedSequences = args$associatedSequences, + isSequenced = args$isSequenced, + startDayOfYear = args$startDayOfYear, + endDayOfYear = args$endDayOfYear, limit=0, start=0, fields = 'all', @@ -224,10 +279,23 @@ occ_count <- function(...,occurrenceStatus="PRESENT", curlopts = list()) { facetLimit = args$facetLimit) if("facet" %in% arg_names) { - not_facet_arg <- c("skip_validate","...","curlopts","facetMultiselect", - "facetMincount", "facet","return","fields","start", - "limit","verbatimTaxonId","geometry", "geom_big", - "geom_size","geom_n","search") + not_facet_arg <- c("skip_validate", + "...", + "curlopts", + "facetMultiselect", + "facetMincount", + "facet", + "return", + "fields", + "start", + "limit", + "verbatimTaxonId", + "geometry", + "geom_big", + "geom_size", + "geom_n", + "search", + "geoDistance") acc_facet_arg <- formal_args[!formal_args %in% not_facet_arg] if(!args$facet %in% acc_facet_arg) stop("Bad facet arg.") count <- stats::setNames(res$facet[[1]],c(args$facet,"count")) diff --git a/R/occ_data.R b/R/occ_data.R index 8e32f108..bddb412b 100644 --- a/R/occ_data.R +++ b/R/occ_data.R @@ -1,37 +1,233 @@ -#' Search for GBIF occurrences - simplified for speed +#' Legacy alternative to occ_search #' -#' @export -#' @template occsearch -#' @template oslimstart -#' @template occ -#' @template occ_data_egs -#' @seealso [downloads()], [occ_search()] -#' @section occ_data vs.
occ_search: -#' This does nearly the same thing as [occ_search()], but -#' is simplified for speed, and is for the most common use case where -#' user just wants occurrence data, and not other information like taxon -#' hierarchies and media (e.g., images). Alot of time in [occ_search()] -#' is used parsing data to be more useable downstream. We do less of that -#' in this function. +#' @param taxonKey (numeric) A taxon key from the GBIF backbone. All included +#' and synonym taxa are included in the search, so a search for aves with +#' taxonKey=212 will match all birds, no matter which species. You can pass +#' many keys to \code{occ_search(taxonKey=c(1,212))}. +#' @param scientificName A scientific name from the GBIF backbone. All included +#' and synonym taxa are included in the search. +#' @param country (character) The 2-letter country code (ISO-3166-1) +#' in which the occurrence was recorded. See \code{enumeration_country()}. +#' @param datasetKey (character) The occurrence dataset uuid key. That can be +#' found in the dataset page url. For example, +#' "7e380070-f762-11e1-a439-00145eb45e9a" is the key for [Natural History Museum (London) Collection Specimens](https://www.gbif.org/dataset/7e380070-f762-11e1-a439-00145eb45e9a). +#' @param eventDate (character) Occurrence date in ISO 8601 format: yyyy, +#' yyyy-MM, yyyy-MM-dd, or MM-dd. Supports range queries, 'smaller,larger' +#' ('1990,1991', whereas '1991,1990' wouldn't work). +#' @param catalogNumber (character) An identifier of any form assigned by the +#' source within a physical collection or digital dataset for the record which +#' may not be unique, but should be fairly unique in combination with the +#' institution and collection code. +#' @param recordedBy (character) The person who recorded the occurrence. +#' @param recordedByID (character) Identifier (e.g. ORCID) for the person who +#' recorded the occurrence. +#' @param identifiedByID (character) Identifier (e.g.
ORCID) for the person who +#' provided the taxonomic identification of the occurrence. +#' @param collectionCode (character) An identifier of any form assigned by the +#' source to identify the physical collection or digital dataset uniquely within +#' the text of an institution. +#' @param institutionCode An identifier of any form assigned by the source to +#' identify the institution the record belongs to. +#' @param basisOfRecord (character) The specific nature of the data record. See +#' [here](https://gbif.github.io/parsers/apidocs/org/gbif/api/vocabulary/BasisOfRecord.html). +#' +#' \itemize{ +#' \item "FOSSIL_SPECIMEN" +#' \item "HUMAN_OBSERVATION" +#' \item "MATERIAL_CITATION" +#' \item "MATERIAL_SAMPLE" +#' \item "LIVING_SPECIMEN" +#' \item "MACHINE_OBSERVATION" +#' \item "OBSERVATION" +#' \item "PRESERVED_SPECIMEN" +#' \item "OCCURRENCE" +#' } +#' @param year The 4 digit year. A year of 98 will be interpreted as AD 98. +#' Supports range queries, 'smaller,larger' (e.g., '1990,1991', whereas 1991, +#' 1990' wouldn't work). +#' @param month The month of the year, starting with 1 for January. Supports +#' range queries, 'smaller,larger' (e.g., '1,2', whereas '2,1' wouldn't work). +#' @param search (character) Query terms. The value for this parameter can be a +#' simple word or a phrase. For example, [search="puma"](https://www.gbif.org/occurrence/search?q=puma) +#' @param decimalLatitude Latitude in decimals between -90 and 90 based on +#' WGS84. Supports range queries, 'smaller,larger' (e.g., '25,30', whereas +#' '30,25' wouldn't work). +#' @param decimalLongitude Longitude in decimals between -180 and 180 based on +#' WGS84. Supports range queries (e.g., '-0.4,-0.2', whereas '-0.2,-0.4' +#' wouldn't work). +#' @param publishingCountry The 2-letter country code (as per ISO-3166-1) of +#' the country in which the occurrence was recorded. See +#' \code{enumeration_country()}. +#' @param elevation Elevation in meters above sea level. 
Supports range +#' queries, 'smaller,larger' (e.g., '5,30', whereas '30,5' wouldn't work). +#' @param depth Depth in meters relative to elevation. For example 10 meters +#' below a lake surface with given elevation. Supports range queries, +#' 'smaller,larger' (e.g., '5,30', whereas '30,5' wouldn't work). +#' @param geometry (character) Searches for occurrences inside a polygon in +#' Well Known Text (WKT) format. A WKT shape written as either +#' +#' \itemize{ +#' \item "POINT" +#' \item "LINESTRING" +#' \item "LINEARRING" +#' \item "POLYGON" +#' \item "MULTIPOLYGON" +#' } +#' +#' For example, "POLYGON((37.08 46.86,38.06 46.86,38.06 47.28,37.08 47.28, +#' 37.08 46.86))". See also the section **WKT** below. +#' @param geom_big (character) One of "bbox" or "asis" (default). +#' @param geom_size (integer) An integer indicating size of the cell. Default: +#' 40. +#' @param geom_n (integer) An integer indicating number of cells in each +#' dimension. Default: 10. +#' @param hasGeospatialIssue (logical) Includes/excludes occurrence records +#' which contain spatial issues (as determined in our record interpretation), +#' i.e. \code{hasGeospatialIssue=TRUE} returns only those records with spatial +#' issues while \code{hasGeospatialIssue=FALSE} includes only records without +#' spatial issues. The absence of this parameter returns any record with or +#' without spatial issues. +#' @param issue (character) One or more of many possible issues with each +#' occurrence record. Issues passed to this parameter filter results by +#' the issue. One of many [options](https://gbif.github.io/gbif-api/apidocs/org/gbif/api/vocabulary/OccurrenceIssue.html). +#' See [here](https://data-blog.gbif.org/post/issues-and-flags/) for definitions. +#' @param hasCoordinate (logical) Return only occurrence records with lat/long +#' data (\code{TRUE}) or all records (\code{FALSE}, default). +#' @param typeStatus Type status of the specimen.
One of many +#' [options](https://www.gbif.org/occurrence/search?type_status=PARATYPE). +#' @param recordNumber Number recorded by collector of the data, different from +#' GBIF record number. +#' @param lastInterpreted Date the record was last modified in GBIF, in ISO +#' 8601 format: yyyy, yyyy-MM, yyyy-MM-dd, or MM-dd. Supports range queries, +#' 'smaller,larger' (e.g., '1990,1991', whereas '1991,1990' wouldn't work). +#' @param continent The source supplied continent. +#' +#' \itemize{ +#' \item "africa" +#' \item "antarctica" +#' \item "asia" +#' \item "europe" +#' \item "north_america" +#' \item "oceania" +#' \item "south_america" +#' } +#' +#' Continent is not inferred but only populated if provided by the +#' dataset publisher. Applying this filter may exclude many relevant records. +#' @param mediaType (character) Media type of "MovingImage", "Sound", or +#' "StillImage". +#' @param repatriated (character) Searches for records whose publishing country +#' is different to the country where the record was recorded in. +#' @param kingdomKey (numeric) Kingdom classification key. +#' @param phylumKey (numeric) Phylum classification key. +#' @param classKey (numeric) Class classification key. +#' @param orderKey (numeric) Order classification key. +#' @param familyKey (numeric) Family classification key. +#' @param genusKey (numeric) Genus classification key. +#' @param speciesKey (numeric) Species classification key. +#' @param subgenusKey (numeric) Subgenus classification key. +#' @param establishmentMeans (character) provides information about whether an +#' organism or organisms have been introduced to a given place and time through +#' the direct or indirect activity of modern humans. #' -#' There are a number of data fields GBIF returns that we drop to speed up -#' processing time within R. 
These fields take extra time to process -#' because they are deeply nested and so take extra time to check if -#' they are empty or not, and if not, figure out how to parse them -#' into a data.frame. The fields are: +#' \itemize{ +#' \item "Introduced" +#' \item "Native" +#' \item "NativeReintroduced" +#' \item "Vagrant" +#' \item "Uncertain" +#' \item "IntroducedAssistedColonisation" +#' } +#' +#' @param degreeOfEstablishment (character) Provides information about the degree to +#' which an Organism survives, reproduces, and expands its range at the given +#' place and time. One of many [options](https://www.gbif.org/occurrence/search?advanced=1&degree_of_establishment=Managed). +#' @param protocol (character) Protocol or mechanism used to provide the +#' occurrence record. One of many [options](https://www.gbif.org/occurrence/search?protocol=DWC_ARCHIVE&advanced=1). +#' @param license (character) The type of license applied to the dataset or record. #' -#' - `gadm` -#' - `media` -#' - `facts` -#' - `relations` -#' - `extensions` -#' - `identifiers` -#' - `recordedByIDs` -#' - `identifiedByIDs` +#' \itemize{ +#' \item "CC0_1_0" +#' \item "CC_BY_4_0" +#' \item "CC_BY_NC_4_0" +#' } +#' +#' @param organismId (numeric) An identifier for the Organism instance (as +#' opposed to a particular digital record of the Organism). May be a globally +#' unique identifier or an identifier specific to the data set. +#' @param publishingOrg (character) The publishing organization key (a UUID). +#' @param stateProvince (character) The name of the next smaller administrative +#' region than country (state, province, canton, department, region, etc.) in +#' which the Location occurs. +#' @param waterBody (character) The name of the water body in which the +#' locations occur +#' @param locality (character) The specific description of the place. +#' @param occurrenceStatus (character) Default is "PRESENT". Specify whether +#' search should return "PRESENT" or "ABSENT" data.
+#' @param gadmGid (character) The gadm id of the area occurrences are desired +#' from. https://gadm.org/. +#' @param coordinateUncertaintyInMeters A number or range between 0-1,000,000 +#' which specifies the desired coordinate uncertainty. A +#' coordinateUncertaintyInMeters=1000 will be interpreted as all records with +#' exactly 1000m. Supports range queries, 'smaller,larger' (e.g., +#' '1000,10000', whereas '10000,1000' wouldn't work). +#' @param verbatimScientificName (character) Scientific name as provided by the +#' source. +#' @param verbatimTaxonId (character) The taxon identifier provided to GBIF by +#' the data publisher. +#' @param eventId (character) identifier(s) for a sampling event. +#' @param identifiedBy (character) names of people, groups, or organizations +#' who assigned the Taxon to the subject. +#' @param networkKey (character) The occurrence network key (a uuid). +#' @param occurrenceId (character) occurrence id from source. +#' @param organismQuantity A number or range which +#' specifies the desired organism quantity. An organismQuantity=5 +#' will be interpreted as all records with exactly 5. Supports range queries, +#' smaller,larger (e.g., '5,20', whereas '20,5' wouldn't work). +#' @param organismQuantityType (character) The type of quantification system +#' used for the quantity of organisms. For example, "individuals" or "biomass". +#' @param relativeOrganismQuantity (numeric) A relativeOrganismQuantity=0.1 will +#' be interpreted as all records with exactly 0.1. The relative measurement of the +#' quantity of the organism (a number between 0-1). Supports range queries, +#' "smaller,larger" (e.g., '0.1,0.5', whereas '0.5,0.1' wouldn't work). +#' @param iucnRedListCategory (character) The IUCN threat status category.
+#' +#' \itemize{ +#' \item "NE" (Not Evaluated) +#' \item "DD" (Data Deficient) +#' \item "LC" (Least Concern) +#' \item "NT" (Near Threatened) +#' \item "VU" (Vulnerable) +#' \item "EN" (Endangered) +#' \item "CR" (Critically Endangered) +#' \item "EX" (Extinct) +#' \item "EW" (Extinct in the Wild) +#' } +#' @param lifeStage (character) the life stage of the occurrence. One of many +#' [options](https://www.gbif.org/occurrence/search?advanced=1&life_stage=Tadpole). +#' @param isInCluster (logical) identify potentially related records on GBIF. +#' @param distanceFromCentroidInMeters A number or range. A value of "2000,*" +#' means at least 2km from known centroids. A value of "0" would mean occurrences +#' exactly on known centroids. A value of "0,2000" would mean within 2km of +#' centroids. Max value is 5000. +#' @param skip_validate (logical) whether to skip wellknown::validate_wkt call +#' or not. passed down to check_wkt(). Default: TRUE +#' @param limit Number of records to return. Default: 500. Note that the per +#' request maximum is 300, but since we set it at 500 for the function, we +#' do two requests to get you the 500 records (if there are that many). +#' Note that there is a hard maximum of 100,000, which is calculated as the +#' \code{limit+start}, so \code{start=99,000} and \code{limit=2000} won't work +#' @param start Record number to start at. Use in combination with limit to +#' page through results. Note that we do the paging internally for you, but +#' you can manually set the \code{start} parameter +#' @param curlopts (list) +#' +#' @details +#' This function is a legacy alternative to `occ_search()`. It is not +#' recommended to use `occ_data()` as it is not as flexible as `occ_search()`. +#' New search terms will not be added to this function and it is only supported +#' for legacy reasons. #' -#' To get these fields use [occ_search()] instead. -#' @note Maximum number of records you can get with this function is 100,000. 
-#' See https://www.gbif.org/developer/occurrence #' @return An object of class `gbif_data`, which is a S3 class list, with #' slots for metadata (`meta`) and the occurrence data itself #' (`data`), and with attributes listing the user supplied arguments @@ -39,7 +235,8 @@ #' two values of the `datasetKey` parameter to searches are done, and #' it's a "many". `meta` is a list of length four with offset, limit, #' endOfRecords and count fields. `data` is a tibble (aka data.frame) - +#' @export +#' occ_data <- function(taxonKey=NULL, scientificName=NULL, country=NULL, @@ -90,9 +287,6 @@ occ_data <- function(taxonKey=NULL, stateProvince = NULL, waterBody = NULL, locality = NULL, - limit=500, - start=0, - skip_validate = TRUE, occurrenceStatus = 'PRESENT', gadmGid = NULL, coordinateUncertaintyInMeters = NULL, @@ -109,6 +303,9 @@ occ_data <- function(taxonKey=NULL, lifeStage = NULL, isInCluster = NULL, distanceFromCentroidInMeters = NULL, + skip_validate = TRUE, + limit=500, + start=0, curlopts = list(http_version=2)) { geometry <- geometry_handler(geometry, geom_big, geom_size, geom_n) diff --git a/R/occ_search.r b/R/occ_search.r index fa6c5bc2..9154fa6c 100644 --- a/R/occ_search.r +++ b/R/occ_search.r @@ -29,38 +29,38 @@ #' data.frame is its taxonomic classification. `media` is a list of media #' objects, where each element holds a set of metadata about the media object. 
-occ_search <- function(taxonKey=NULL, - scientificName=NULL, - country=NULL, - publishingCountry=NULL, - hasCoordinate=NULL, - typeStatus=NULL, - recordNumber=NULL, - lastInterpreted=NULL, - continent=NULL, - geometry=NULL, - geom_big="asis", - geom_size=40, - geom_n=10, - recordedBy=NULL, - recordedByID=NULL, - identifiedByID=NULL, - basisOfRecord=NULL, - datasetKey=NULL, - eventDate=NULL, - catalogNumber=NULL, - year=NULL, - month=NULL, - decimalLatitude=NULL, - decimalLongitude=NULL, - elevation=NULL, - depth=NULL, - institutionCode=NULL, - collectionCode=NULL, - hasGeospatialIssue=NULL, - issue=NULL, - search=NULL, - mediaType=NULL, +occ_search <- function(taxonKey = NULL, + scientificName = NULL, + country = NULL, + publishingCountry = NULL, + hasCoordinate = NULL, + typeStatus = NULL, + recordNumber = NULL, + lastInterpreted = NULL, + continent = NULL, + geometry = NULL, + geom_big = "asis", + geom_size = 40, + geom_n = 10, + recordedBy = NULL, + recordedByID = NULL, + identifiedByID =NULL, + basisOfRecord = NULL, + datasetKey = NULL, + eventDate = NULL, + catalogNumber = NULL, + year = NULL, + month = NULL, + decimalLatitude = NULL, + decimalLongitude = NULL, + elevation = NULL, + depth = NULL, + institutionCode = NULL, + collectionCode = NULL, + hasGeospatialIssue = NULL, + issue = NULL, + search = NULL, + mediaType = NULL, subgenusKey = NULL, repatriated = NULL, phylumKey = NULL, @@ -95,15 +95,71 @@ occ_search <- function(taxonKey=NULL, lifeStage = NULL, isInCluster = NULL, distanceFromCentroidInMeters=NULL, - limit=500, - start=0, + geoDistance = NULL, + sex = NULL, + dwcaExtension = NULL, + gbifId = NULL, + gbifRegion = NULL, + projectId = NULL, + programme = NULL, + preparations = NULL, + datasetId = NULL, + datasetName = NULL, + publishedByGbifRegion = NULL, + island = NULL, + islandGroup = NULL, + taxonId = NULL, + taxonConceptId = NULL, + taxonomicStatus = NULL, + acceptedTaxonKey = NULL, + collectionKey = NULL, + institutionKey = NULL, + 
otherCatalogNumbers = NULL, + georeferencedBy = NULL, + installationKey = NULL, + hostingOrganizationKey = NULL, + crawlId = NULL, + modified = NULL, + higherGeography = NULL, + fieldNumber = NULL, + parentEventId = NULL, + samplingProtocol = NULL, + sampleSizeUnit = NULL, + pathway = NULL, + gadmLevel0Gid = NULL, + gadmLevel1Gid = NULL, + gadmLevel2Gid = NULL, + gadmLevel3Gid = NULL, + earliestEonOrLowestEonothem = NULL, + latestEonOrHighestEonothem = NULL, + earliestEraOrLowestErathem = NULL, + latestEraOrHighestErathem = NULL, + earliestPeriodOrLowestSystem = NULL, + latestPeriodOrHighestSystem = NULL, + earliestEpochOrLowestSeries = NULL, + latestEpochOrHighestSeries = NULL, + earliestAgeOrLowestStage = NULL, + latestAgeOrHighestStage = NULL, + lowestBiostratigraphicZone = NULL, + highestBiostratigraphicZone = NULL, + group = NULL, + formation = NULL, + member = NULL, + bed = NULL, + associatedSequences = NULL, + isSequenced = NULL, + startDayOfYear = NULL, + endDayOfYear = NULL, + limit = 500, + start = 0, fields = 'all', - return=NULL, + return = NULL, facet = NULL, facetMincount = NULL, facetMultiselect = NULL, skip_validate = TRUE, - curlopts = list(http_version=2), ...) { + curlopts = list(http_version=2), + ...) 
{ pchk(return, "occ_search") geometry <- geometry_handler(geometry, geom_big, geom_size, geom_n) @@ -130,6 +186,7 @@ occ_search <- function(taxonKey=NULL, occurrenceStatus = occurrenceStatus, q = search, repatriated = repatriated, + isSequenced = isSequenced, limit = check_limit(as.integer(limit)), isInCluster = isInCluster, offset = check_limit(as.integer(start)) @@ -195,7 +252,61 @@ occ_search <- function(taxonKey=NULL, convmany(occurrenceId), convmany(iucnRedListCategory), convmany(lifeStage), - convmany(distanceFromCentroidInMeters) + convmany(distanceFromCentroidInMeters), + convmany(sex), + convmany(dwcaExtension), + convmany(gbifId), + convmany(gbifRegion), + convmany(projectId), + convmany(programme), + convmany(preparations), + convmany(datasetId), + convmany(datasetName), + convmany(publishedByGbifRegion), + convmany(island), + convmany(islandGroup), + convmany(taxonId), + convmany(taxonConceptId), + convmany(taxonomicStatus), + convmany(acceptedTaxonKey), + convmany(collectionKey), + convmany(institutionKey), + convmany(otherCatalogNumbers), + convmany(georeferencedBy), + convmany(installationKey), + convmany(hostingOrganizationKey), + convmany(crawlId), + convmany(modified), + convmany(higherGeography), + convmany(fieldNumber), + convmany(parentEventId), + convmany(samplingProtocol), + convmany(sampleSizeUnit), + convmany(pathway), + convmany(gadmLevel0Gid), + convmany(gadmLevel1Gid), + convmany(gadmLevel2Gid), + convmany(gadmLevel3Gid), + convmany(earliestEonOrLowestEonothem), + convmany(latestEonOrHighestEonothem), + convmany(earliestEraOrLowestErathem), + convmany(latestEraOrHighestErathem), + convmany(earliestPeriodOrLowestSystem), + convmany(latestPeriodOrHighestSystem), + convmany(earliestEpochOrLowestSeries), + convmany(latestEpochOrHighestSeries), + convmany(earliestAgeOrLowestStage), + convmany(latestAgeOrHighestStage), + convmany(lowestBiostratigraphicZone), + convmany(highestBiostratigraphicZone), + convmany(group), + convmany(formation), + 
convmany(member), + convmany(bed), + convmany(associatedSequences), + convmany(startDayOfYear), + convmany(endDayOfYear), + convmany(geoDistance) ) argscoll <<- args @@ -308,8 +419,63 @@ occ_search <- function(taxonKey=NULL, iucnRedListCategory=iucnRedListCategory, lifeStage=lifeStage, coordinateUncertaintyInMeters=coordinateUncertaintyInMeters, - distanceFromCentroidInMeters=distanceFromCentroidInMeters - ) + distanceFromCentroidInMeters=distanceFromCentroidInMeters, + sex=sex, + dwcaExtension=dwcaExtension, + gbifId=gbifId, + gbifRegion=gbifRegion, + projectId=projectId, + programme=programme, + preparations=preparations, + datasetId=datasetId, + datasetName=datasetName, + publishedByGbifRegion=publishedByGbifRegion, + island=island, + islandGroup=islandGroup, + taxonId=taxonId, + taxonConceptId=taxonConceptId, + taxonomicStatus=taxonomicStatus, + acceptedTaxonKey=acceptedTaxonKey, + collectionKey=collectionKey, + institutionKey=institutionKey, + otherCatalogNumbers=otherCatalogNumbers, + georeferencedBy=georeferencedBy, + installationKey=installationKey, + hostingOrganizationKey=hostingOrganizationKey, + crawlId=crawlId, + modified=modified, + higherGeography=higherGeography, + fieldNumber=fieldNumber, + parentEventId=parentEventId, + samplingProtocol=samplingProtocol, + sampleSizeUnit=sampleSizeUnit, + pathway=pathway, + gadmLevel0Gid=gadmLevel0Gid, + gadmLevel1Gid=gadmLevel1Gid, + gadmLevel2Gid=gadmLevel2Gid, + gadmLevel3Gid=gadmLevel3Gid, + earliestEonOrLowestEonothem=earliestEonOrLowestEonothem, + latestEonOrHighestEonothem=latestEonOrHighestEonothem, + earliestEraOrLowestErathem=earliestEraOrLowestErathem, + latestEraOrHighestErathem=latestEraOrHighestErathem, + earliestPeriodOrLowestSystem=earliestPeriodOrLowestSystem, + latestPeriodOrHighestSystem=latestPeriodOrHighestSystem, + earliestEpochOrLowestSeries=earliestEpochOrLowestSeries, + latestEpochOrHighestSeries=latestEpochOrHighestSeries, + earliestAgeOrLowestStage=earliestAgeOrLowestStage, + 
latestAgeOrHighestStage=latestAgeOrHighestStage, + lowestBiostratigraphicZone=lowestBiostratigraphicZone, + highestBiostratigraphicZone=highestBiostratigraphicZone, + group=group, + formation=formation, + member=member, + bed=bed, + associatedSequences=associatedSequences, + isSequenced=isSequenced, + startDayOfYear=startDayOfYear, + endDayOfYear=endDayOfYear, + geoDistance=geoDistance + ) if (!any(sapply(params, length) > 0)) { stop(sprintf("At least one of these parameters must have a value:\n%s", possparams()), diff --git a/man-roxygen/occsearch.r b/man-roxygen/occsearch.r index d627a0df..92b3949a 100644 --- a/man-roxygen/occsearch.r +++ b/man-roxygen/occsearch.r @@ -207,7 +207,113 @@ #' @param distanceFromCentroidInMeters A number or range. A value of "2000,*" #' means at least 2km from known centroids. A value of "0" would mean occurrences #' exactly on known centroids. A value of "0,2000" would mean within 2km of -#' centroids. Max value is 5000. +#' centroids. Max value is 5000. +#' @param geoDistance (character) Filters to match occurrence records with coordinate values +#' within a specified distance of a coordinate. Distance may be specified in +#' kilometres (km) or metres (m). Example : "90,100,5km" +#' @param sex (character) The sex of the biological individual(s) represented in the occurrence. +#' @param dwcaExtension (character) A known Darwin Core Archive extension RowType. +#' Limits the search to occurrences which have this extension, although they will +#' not necessarily have any useful data recorded using the extension. +#' @param gbifId (numeric) The unique GBIF key for a single occurrence. +#' @param gbifRegion (character) Gbif region based on country code. +#' @param projectId (character) The identifier for a project, which is often +#' assigned by a funded programme. +#' @param programme (character) A group of activities, often associated with a +#' specific funding stream, such as the GBIF BID programme. 
+#' @param preparations (character) Preparation or preservation method for +#' a specimen. +#' @param datasetId (character) The ID of the dataset. Parameter may be +#' repeated. Example : https://doi.org/10.1594/PANGAEA.315492 +#' @param datasetName (character) The exact name of the dataset. Not the same as +#' dataset title. +#' @param publishedByGbifRegion (character) GBIF region based on the owning +#' organization's country. +#' @param island (character) The name of the island on or near which the +#' location occurs. +#' @param islandGroup (character) The name of the island group in which the +#' location occurs. +#' @param taxonId (character) The taxon identifier provided to GBIF by the data +#' publisher. Example : urn:lsid:dyntaxa.se:Taxon:103026 +#' @param taxonConceptId (character) An identifier for the taxonomic concept to +#' which the record refers - not for the nomenclatural details of a taxon. +#' Example : 8fa58e08-08de-4ac1-b69c-1235340b7001 +#' @param taxonomicStatus (character) A taxonomic status. Example : SYNONYM +#' @param acceptedTaxonKey (numeric) A taxon key from the GBIF backbone. Only +#' synonym taxa are included in the search, so a search for Aves with +#' acceptedTaxonKey=212 will match occurrences identified as birds, but not +#' any known family, genus or species of bird. +#' @param collectionKey (character) A key (UUID) for a collection registered in +#' the Global Registry of Scientific Collections. +#' Example : dceb8d52-094c-4c2c-8960-75e0097c6861 +#' @param institutionKey (character) A key (UUID) for an institution registered +#' in the Global Registry of Scientific Collections. +#' @param otherCatalogNumbers (character) Previous or alternate fully qualified +#' catalog numbers. +#' @param georeferencedBy (character) Name of a person, group, or organization +#' who determined the georeference (spatial representation) for the location. 
+#' Example : Brad Millen +#' @param installationKey (character) The occurrence installation key (a UUID). +#' Example : 17a83780-3060-4851-9d6f-029d5fcb81c9 +#' @param hostingOrganizationKey (character) The key (UUID) of the publishing +#' organization whose installation (server) hosts the original dataset. +#' Example : fbca90e3-8aed-48b1-84e3-369afbd000ce +#' @param crawlId (numeric) Crawl attempt that harvested this record. +#' @param modified (character) The most recent date-time on which the +#' occurrence was changed, according to the publisher. Can be a range. +#' Example : 2023-02-20 +#' @param higherGeography (character) Geographic name less specific than the +#' information captured in the locality term. +#' @param fieldNumber (character) An identifier given to the event in the field. +#' Often serves as a link between field notes and the event. +#' @param parentEventId (character) An identifier for the information associated +#' with a sampling event. +#' @param samplingProtocol (character) The name of, reference to, or description +#' of the method or protocol used during a sampling event. +#' Example : malaise trap +#' @param sampleSizeUnit (character) The unit of measurement of the size +#' (time duration, length, area, or volume) of a sample in a sampling event. +#' Example : hectares +#' @param pathway (character) The process by which an organism came to be in a +#' given place at a given time, as defined in the GBIF Pathway vocabulary. +#' Example : Agriculture +#' @param gadmLevel0Gid (character) A GADM geographic identifier at the zero +#' level, for example AGO. +#' @param gadmLevel1Gid (character) A GADM geographic identifier at the first +#' level, for example AGO.1_1. +#' @param gadmLevel2Gid (character) A GADM geographic identifier at the second +#' level, for example AFG.1.1_1. +#' @param gadmLevel3Gid (character) A GADM geographic identifier at the third +#' level, for example AFG.1.1.1_1. 
+#' @param earliestEonOrLowestEonothem (character) geochronologic era term. +#' @param latestEonOrHighestEonothem (character) geochronologic era term. +#' @param earliestEraOrLowestErathem (character) geochronologic era term. +#' @param latestEraOrHighestErathem (character) geochronologic era term. +#' @param earliestPeriodOrLowestSystem (character) geochronologic era term. +#' @param latestPeriodOrHighestSystem (character) geochronologic era term. +#' @param earliestEpochOrLowestSeries (character) geochronologic era term. +#' @param latestEpochOrHighestSeries (character) geochronologic era term. +#' @param earliestAgeOrLowestStage (character) geochronologic era term. +#' @param latestAgeOrHighestStage (character) geochronologic era term. +#' @param lowestBiostratigraphicZone (character) geochronologic era term. +#' @param highestBiostratigraphicZone (character) geochronologic era term. +#' @param group (character) The full name of the lithostratigraphic group from +#' which the material entity was collected. +#' @param formation (character) The full name of the lithostratigraphic +#' formation from which the material entity was collected. +#' @param member (character) The full name of the lithostratigraphic member +#' from which the material entity was collected. +#' @param bed (character) The full name of the lithostratigraphic bed from +#' which the material entity was collected. +#' @param associatedSequences (character) Identifier (publication, global unique +#' identifier, URI) of genetic sequence information associated with the +#' material entity. Example : http://www.ncbi.nlm.nih.gov/nuccore/U34853.1 +#' @param isSequenced (logical) Indicates whether `associatedSequences` genetic +#' sequence information exists. +#' @param startDayOfYear (numeric) The earliest integer day of the year on +#' which the event occurred. +#' @param endDayOfYear (numeric) The latest integer day of the year on +#' which the event occurred. 
#' @param skip_validate (logical) whether to skip `wellknown::validate_wkt` #' call or not. passed down to [check_wkt()]. Default: `TRUE` #' diff --git a/man/occ_data.Rd b/man/occ_data.Rd index 309f547c..6406fe9b 100644 --- a/man/occ_data.Rd +++ b/man/occ_data.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/occ_data.R \name{occ_data} \alias{occ_data} -\title{Search for GBIF occurrences - simplified for speed} +\title{Legacy alternative to occ_search} \usage{ occ_data( taxonKey = NULL, @@ -55,9 +55,6 @@ occ_data( stateProvince = NULL, waterBody = NULL, locality = NULL, - limit = 500, - start = 0, - skip_validate = TRUE, occurrenceStatus = "PRESENT", gadmGid = NULL, coordinateUncertaintyInMeters = NULL, @@ -74,6 +71,9 @@ occ_data( lifeStage = NULL, isInCluster = NULL, distanceFromCentroidInMeters = NULL, + skip_validate = TRUE, + limit = 500, + start = 0, curlopts = list(http_version = 2) ) } @@ -288,19 +288,6 @@ locations occur} \item{locality}{(character) The specific description of the place.} -\item{limit}{Number of records to return. Default: 500. Note that the per -request maximum is 300, but since we set it at 500 for the function, we -do two requests to get you the 500 records (if there are that many). -Note that there is a hard maximum of 100,000, which is calculated as the -\code{limit+start}, so \code{start=99,000} and \code{limit=2000} won't work} - -\item{start}{Record number to start at. Use in combination with limit to -page through results. Note that we do the paging internally for you, but -you can manually set the \code{start} parameter} - -\item{skip_validate}{(logical) whether to skip \code{wellknown::validate_wkt} -call or not. passed down to \code{\link[=check_wkt]{check_wkt()}}. Default: \code{TRUE}} - \item{occurrenceStatus}{(character) Default is "PRESENT". Specify whether search should return "PRESENT" or "ABSENT" data.} @@ -365,9 +352,20 @@ means at least 2km from known centroids. 
A value of "0" would mean occurrences exactly on known centroids. A value of "0,2000" would mean within 2km of centroids. Max value is 5000.} -\item{curlopts}{list of named curl options passed on to -\code{\link[crul]{HttpClient}}. see \code{curl::curl_options} -for curl options} +\item{skip_validate}{(logical) whether to skip wellknown::validate_wkt call +or not. passed down to check_wkt(). Default: TRUE} + +\item{limit}{Number of records to return. Default: 500. Note that the per +request maximum is 300, but since we set it at 500 for the function, we +do two requests to get you the 500 records (if there are that many). +Note that there is a hard maximum of 100,000, which is calculated as the +\code{limit+start}, so \code{start=99,000} and \code{limit=2000} won't work} + +\item{start}{Record number to start at. Use in combination with limit to +page through results. Note that we do the paging internally for you, but +you can manually set the \code{start} parameter} + +\item{curlopts}{(list)} } \value{ An object of class \code{gbif_data}, which is a S3 class list, with @@ -379,496 +377,11 @@ it's a "many". \code{meta} is a list of length four with offset, limit, endOfRecords and count fields. \code{data} is a tibble (aka data.frame) } \description{ -Search for GBIF occurrences - simplified for speed -} -\note{ -Maximum number of records you can get with this function is 100,000. -See https://www.gbif.org/developer/occurrence -} -\section{Multiple values passed to a parameter}{ - -There are some parameters you can pass multiple values to in a vector, -each value of which produces a different request (multiple different -requests = c("a","b")). Some parameters allow multiple values to be passed -in the same request (multiple same request = "a;b") in a semicolon separated -string (e.g., 'a;b'); if given we'll do a single request with that parameter -repeated for each value given (e.g., \code{foo=a&foo=b} if the parameter -is \code{foo}). 
- -See article \href{https://docs.ropensci.org/rgbif/articles/multiple_values.html}{Multiple Values}. -} - -\section{Hierarchies}{ - -Hierarchies are returned with each occurrence object. There is no -option to return them from the API. However, within the \code{occ_search} -function you can select whether to return just hierarchies, just data, all -of data and hierarchies and metadata, or just metadata. If all hierarchies -are the same we just return one for you. -} - -\section{curl debugging}{ - -You can pass parameters not defined in this function into the call to -the GBIF API to control things about the call itself using \code{curlopts}. -See an example below that passes in the \code{verbose} function to get -details on the http call. -} - -\section{WKT}{ - -Examples of valid WKT objects: -\itemize{ -\item 'POLYGON((-19.5 34.1, 27.8 34.1, 35.9 68.1, -25.3 68.1, -19.5 34.1))' -\item 'MULTIPOLYGON(((-123 38,-116 38,-116 43,-123 43,-123 38)),((-97 41,-93 41,-93 45,-97 45,-97 41)))' -\item 'POINT(-120 40)' -\item 'LINESTRING(3 4,10 50,20 25)' -} - -Note that GBIF expects counter-clockwise winding order for WKT. You can -supply clockwise WKT, but GBIF treats it as an exclusion, so you get all -data not inside the WKT area. \code{\link[=occ_download]{occ_download()}} behaves differently -in that you should simply get no data back at all with clockwise WKT. -} - -\section{Long WKT}{ - -Options for handling long WKT strings: -Note that long WKT strings are specially handled when using \code{\link{occ_search}} or -\code{\link{occ_data}}. Here are the three options for long WKT strings (> 1500 characters), -set one of these three via the parameter \code{geom_big}: -\itemize{ -\item asis - the default setting. This means we don't do anything internally. That is, -we just pass on your WKT string just as we've done before in this package. -\item axe - this option is deprecated since rgbif v3.8.0. Might return error, -since the GBIF's polygon interpretation has changed. 
- -This method uses \code{sf::st_make_grid} and \code{sf::st_intersection}, which has -two parameters \code{cellsize} and \code{n}. You can tweak those parameters here by -tweaking \code{geom_size} and \code{geom_n}. \code{geom_size} seems to be more useful in -toggling the number of WKT strings you get back. - -See \code{\link{wkt_parse}} to manually break make WKT bounding box from a larger WKT -string, or break a larger WKT string into many smaller ones. - -\item bbox - this option checks whether your WKT string is longer than 1500 characters, -and if it is we create a bounding box from the WKT, do the GBIF search with that -bounding box, then prune the resulting data to only those occurrences in your original -WKT string. There is a big caveat however. Because we create a bounding box from the WKT, -and the \code{limit} parameter determines some subset of records to get, then when we -prune the resulting data to the WKT, the number of records you get could be less than -what you set with your \code{limit} parameter. However, you could set the limit to be -high enough so that you get all records back found in that bounding box, then you'll -get all the records available within the WKT. -} -} - -\section{Counts}{ - -There is a slight difference in the way records are counted here vs. -results from \code{\link{occ_count}}. For equivalent outcomes, in this -function use \code{hasCoordinate=TRUE}, and \code{hasGeospatialIssue=FALSE} -to have the same outcome using \code{\link{occ_count}} with -\code{isGeoreferenced=TRUE} -} - -\section{occ_data vs. occ_search}{ - -This does nearly the same thing as \code{\link[=occ_search]{occ_search()}}, but -is simplified for speed, and is for the most common use case where -user just wants occurrence data, and not other information like taxon -hierarchies and media (e.g., images). Alot of time in \code{\link[=occ_search]{occ_search()}} -is used parsing data to be more useable downstream. We do less of that -in this function. 
- -There are a number of data fields GBIF returns that we drop to speed up -processing time within R. These fields take extra time to process -because they are deeply nested and so take extra time to check if -they are empty or not, and if not, figure out how to parse them -into a data.frame. The fields are: -\itemize{ -\item \code{gadm} -\item \code{media} -\item \code{facts} -\item \code{relations} -\item \code{extensions} -\item \code{identifiers} -\item \code{recordedByIDs} -\item \code{identifiedByIDs} -} - -To get these fields use \code{\link[=occ_search]{occ_search()}} instead. -} - -\examples{ -\dontrun{ -(key <- name_backbone(name='Encelia californica')$speciesKey) -occ_data(taxonKey = key, limit = 4) -(res <- occ_data(taxonKey = key, limit = 400)) - -# Return 20 results, this is the default by the way -(key <- name_suggest(q='Helianthus annuus', rank='species')$data$key[1]) -occ_data(taxonKey=key, limit=20) - -# Instead of getting a taxon key first, you can search for a name directly -## However, note that using this approach (with \code{scientificName="..."}) -## you are getting synonyms too. The results for using \code{scientifcName} -## and \code{taxonKey} parameters are the same in this case, but I wouldn't -## be surprised if for some names they return different results -occ_data(scientificName = 'Ursus americanus', curlopts=list(verbose=TRUE)) -key <- name_backbone(name = 'Ursus americanus', rank='species')$usageKey -occ_data(taxonKey = key) - -# Search by dataset key -occ_data(datasetKey='7b5d6a48-f762-11e1-a439-00145eb45e9a', limit=10) - -# Search by catalog number -occ_data(catalogNumber="49366", limit=10) -## separate requests: use a vector of strings -occ_data(catalogNumber=c("49366","Bird.27847588"), limit=10) -## one request, many instances of same parameter: use semi-colon sep. string -occ_data(catalogNumber="49366;Bird.27847588", limit=10) - -# Use paging parameters (limit and start) to page. 
Note the different results -# for the two queries below. -occ_data(datasetKey='7b5d6a48-f762-11e1-a439-00145eb45e9a',start=10,limit=5) -occ_data(datasetKey='7b5d6a48-f762-11e1-a439-00145eb45e9a',start=20,limit=5) - -# Many dataset keys -## separate requests: use a vector of strings -occ_data(datasetKey=c("50c9509d-22c7-4a22-a47d-8c48425ef4a7", - "7b5d6a48-f762-11e1-a439-00145eb45e9a"), limit=20) -## one request, many instances of same parameter: use semi-colon sep. string -v="50c9509d-22c7-4a22-a47d-8c48425ef4a7;7b5d6a48-f762-11e1-a439-00145eb45e9a" -occ_data(datasetKey = v, limit=20) - -# Search by recorder -occ_data(recordedBy="smith", limit=20) - -# Many collector names -## separate requests: use a vector of strings -occ_data(recordedBy=c("smith","BJ Stacey"), limit=10) -## one request, many instances of same parameter: use semi-colon sep. string -occ_data(recordedBy="smith;BJ Stacey", limit=10) - -# recordedByID -occ_data(recordedByID="https://orcid.org/0000-0003-1691-239X", limit=20) -## many at once -### separate searches -ids <- c("https://orcid.org/0000-0003-1691-239X", - "https://orcid.org/0000-0001-7569-1828", - "https://orcid.org/0000-0002-0596-5376") -res <- occ_data(recordedByID=ids, limit=20) -res[[1]]$data$recordedByIDs[[1]] -res[[2]]$data$recordedByIDs[[1]] -res[[3]]$data$recordedByIDs[[1]] -### all in one search -res <- occ_data(recordedByID=paste0(ids, collapse=";"), limit=20) -unique(vapply(res$data$recordedByIDs, "[[", "", "value")) - -# identifiedByID -occ_data(identifiedByID="https://orcid.org/0000-0003-4710-2648", limit=20) - -# Pass in curl options for extra fun -occ_data(taxonKey=2433407, limit=20, curlopts=list(verbose=TRUE)) -occ_data(taxonKey=2433407, limit=20, - curlopts = list( - noprogress = FALSE, - progressfunction = function(down, up) { - cat(sprintf("up: \%d | down \%d\n", up, down)) - return(TRUE) - } - ) -) -# occ_data(taxonKey=2433407, limit=20, curlopts=list(timeout_ms=1)) - -# Search for many species -splist <- c('Cyanocitta 
stelleri', 'Junco hyemalis', 'Aix sponsa') -keys <- sapply(splist, function(x) name_suggest(x)$data$key[1], USE.NAMES=FALSE) -## separate requests: use a vector of strings -occ_data(taxonKey = keys, limit=5) -## one request, many instances of same parameter: use semi-colon sep. string -occ_data(taxonKey = paste0(keys, collapse = ";"), limit=5) - -# Search using a synonym name -# Note that you'll see a message printing out that the accepted name will -# be used -occ_data(scientificName = 'Pulsatilla patens', limit=5) - -# Search on latitidue and longitude -occ_data(decimalLatitude=40, decimalLongitude=-120, limit = 10) - -# Search on a bounding box -## in well known text format -### polygon -occ_data(geometry='POLYGON((30.1 10.1,40 40,20 40,10 20,30.1 10.1))', - limit=20) -### multipolygon -wkt <- 'MULTIPOLYGON(((-123 38,-116 38,-116 43,-123 43,-123 38)), - ((-97 41,-93 41,-93 45,-97 45,-97 41)))' -occ_data(geometry = gsub("\n\\\\s+", "", wkt), limit = 20) -### polygon and taxonkey -key <- name_suggest(q='Aesculus hippocastanum')$data$key[1] -occ_data(taxonKey=key, - geometry='POLYGON((30.1 10.1,40 40,20 40,10 20,30.1 10.1))', - limit=20) -## or using bounding box, converted to WKT internally -occ_data(geometry=c(-125.0,38.4,-121.8,40.9), limit=20) - -## you can seaerch on many geometry objects -### separate requests: use a vector of strings -wkts <- -c('POLYGON((-102.2 46,-102.2 43.7,-93.9 43.7,-93.9 46,-102.2 46))', -'POLYGON((30.1 10.1,40 40,20 40,10 20,30.1 10.1))') -occ_data(geometry = wkts, limit=20) -### one request, many instances of same parameter: use semi-colon sep. string -occ_data(geometry = paste0(wkts, collapse = ";"), limit=20) - - -# Search on a long WKT string - too long for a GBIF search API request -## By default, a very long WKT string will likely cause a request failure as -## GBIF only handles strings up to about 1500 characters long. 
You can leave as is, or -## - Alternatively, you can choose to break up your polygon into many, and do a -## data request on each piece, and the output is put back together (see below) -## - Or, 2nd alternatively, you could use the GBIF download API -wkt <- "POLYGON((-9.178796777343678 53.22769021556159, --12.167078027343678 51.56540789297837, --12.958093652343678 49.78333685689162,-11.024499902343678 49.21251756301334, --12.079187402343678 46.68179685941719,-15.067468652343678 45.83103608186854, --15.770593652343678 43.58271629699817,-15.067468652343678 41.57676278827219, --11.815515527343678 40.44938999172728,-12.958093652343678 37.72112962230871, --11.639734277343678 36.52987439429357,-8.299890527343678 34.96062625095747, --8.739343652343678 32.62357394385735,-5.223718652343678 30.90497915232165, -1.1044063476563224 31.80562077746643,1.1044063476563224 30.754036557416256, -6.905187597656322 32.02942785462211,5.147375097656322 32.99292810780193, -9.629796972656322 34.164474406524725,10.860265722656322 32.91918014319603, -14.551671972656322 33.72700959356651,13.409093847656322 34.888564192275204, -16.748937597656322 35.104560368110114,19.561437597656322 34.81643887792552, -18.594640722656322 36.38849705969625,22.989171972656322 37.162874858929854, -19.825109472656322 39.50651757842751,13.760656347656322 38.89353140585116, -14.112218847656322 42.36091601976124,10.596593847656322 41.11488736647705, -9.366125097656322 43.70991402658437,5.059484472656322 42.62015372417812, -2.3348750976563224 45.21526500321446,-0.7412967773436776 46.80225692528942, -6.114171972656322 47.102229890207894,8.047765722656322 45.52399303437107, -12.881750097656322 48.22681126957933,9.190343847656322 48.693079457106684, -8.750890722656322 50.68283120621287,5.059484472656322 50.40356146487845, -4.268468847656322 52.377558897655156,1.4559688476563224 53.28027243658647, -0.8407344726563224 51.62000971578333,0.5770625976563224 49.32721423860726, --2.5869999023436776 
49.49875947592088,-2.4991092773436776 51.18135535408638, --2.0596561523436776 52.53822562473851,-4.696374902343678 51.67454591918756, --5.311609277343678 50.009802108095776,-6.629968652343678 48.75106196817059, --7.684656152343678 50.12263634382465,-6.190515527343678 51.83776110910459, --5.047937402343678 54.267098895684235,-6.893640527343678 53.69860705549198, --8.915124902343678 54.77719740243195,-12.079187402343678 54.52294465763567, --13.573328027343678 53.437631551347174, --11.288171777343678 53.48995552517918, --9.178796777343678 53.22769021556159))" -wkt <- gsub("\n", " ", wkt) - -#### Default option with large WKT string fails -# res <- occ_data(geometry = wkt) - -#### if WKT too long, with 'geom_big=bbox': makes into bounding box -if (interactive()){ -res <- occ_data(geometry = wkt, geom_big = "bbox") -} - - -# Search on country -occ_data(country='US', limit=20) -occ_data(country='FR', limit=20) -occ_data(country='DE', limit=20) -### separate requests: use a vector of strings -occ_data(country=c('US','DE'), limit=20) -### one request, many instances of same parameter: use semi-colon sep. 
string -occ_data(country = 'US;DE', limit=20) - -# Get only occurrences with lat/long data -occ_data(taxonKey=key, hasCoordinate=TRUE, limit=20) - -# Get only occurrences that were recorded as living specimens -occ_data(basisOfRecord="LIVING_SPECIMEN", hasCoordinate=TRUE, limit=20) -## multiple values in a vector = a separate request for each value -occ_data(taxonKey=key, - basisOfRecord=c("OBSERVATION", "HUMAN_OBSERVATION"), limit=20) -## mutiple values in a single string, ";" separated = one request including all values -occ_data(taxonKey=key, - basisOfRecord="OBSERVATION;HUMAN_OBSERVATION", limit=20) - -# Get occurrences for a particular eventDate -occ_data(taxonKey=key, eventDate="2013", limit=20) -occ_data(taxonKey=key, year="2013", limit=20) -occ_data(taxonKey=key, month="6", limit=20) - -# Get occurrences based on depth -key <- name_backbone(name='Salmo salar', kingdom='animals')$speciesKey -occ_data(taxonKey=key, depth=1, limit=20) - -# Get occurrences based on elevation -key <- name_backbone(name='Puma concolor', kingdom='animals')$speciesKey -occ_data(taxonKey=key, elevation=50, hasCoordinate=TRUE, limit=20) - -# Get occurrences based on institutionCode -occ_data(institutionCode="TLMF", limit=20) -### separate requests: use a vector of strings -occ_data(institutionCode=c("TLMF","ArtDatabanken"), limit=20) -### one request, many instances of same parameter: use semi-colon sep. string -occ_data(institutionCode = "TLMF;ArtDatabanken", limit=20) - -# Get occurrences based on collectionCode -occ_data(collectionCode="Floristic Databases MV - Higher Plants", limit=20) -### separate requests: use a vector of strings -occ_data(collectionCode=c("Floristic Databases MV - Higher Plants", - "Artport"), limit = 20) -### one request, many instances of same parameter: use semi-colon sep. 
string -occ_data(collectionCode = "Floristic Databases MV - Higher Plants;Artport", - limit = 20) - -# Get only those occurrences with spatial issues -occ_data(taxonKey=key, hasGeospatialIssue=TRUE, limit=20) - -# Search using a query string -occ_data(search="kingfisher", limit=20) - -# search on repatriated - doesn't work right now -# occ_data(repatriated = "") - -# search on phylumKey -occ_data(phylumKey = 7707728, limit = 5) - -# search on kingdomKey -occ_data(kingdomKey = 1, limit = 5) - -# search on classKey -occ_data(classKey = 216, limit = 5) - -# search on orderKey -occ_data(orderKey = 7192402, limit = 5) - -# search on familyKey -occ_data(familyKey = 3925, limit = 5) - -# search on genusKey -occ_data(genusKey = 1935496, limit = 5) - -# search on establishmentMeans -occ_data(establishmentMeans = "INVASIVE", limit = 5) -occ_data(establishmentMeans = "NATIVE", limit = 5) -occ_data(establishmentMeans = "UNCERTAIN", limit = 5) -### separate requests: use a vector of strings -occ_data(establishmentMeans = c("INVASIVE", "NATIVE"), limit = 5) -### one request, many instances of same parameter: use semi-colon sep. string -occ_data(establishmentMeans = "INVASIVE;NATIVE", limit = 5) - -# search on protocol -occ_data(protocol = "DIGIR", limit = 5) - -# search on license -occ_data(license = "CC_BY_4_0", limit = 5) - -# search on organismId -occ_data(organismId = "100", limit = 5) - -# search on publishingOrg -occ_data(publishingOrg = "28eb1a3f-1c15-4a95-931a-4af90ecb574d", limit = 5) - -# search on stateProvince -occ_data(stateProvince = "California", limit = 5) - -# search on waterBody -occ_data(waterBody = "pacific ocean", limit = 5) - -# search on locality -occ_data(locality = "Trondheim", limit = 5) -### separate requests: use a vector of strings -res <- occ_data(locality = c("Trondheim", "Hovekilen"), limit = 5) -res$Trondheim$data -res$Hovekilen$data -### one request, many instances of same parameter: use semi-colon sep. 
string -occ_data(locality = "Trondheim;Hovekilen", limit = 5) - - -# Range queries -## See Detail for parameters that support range queries -occ_data(depth='50,100', limit = 20) -### this is not a range search, but does two searches for each depth -occ_data(depth=c(50,100), limit = 20) - -## Range search with year -occ_data(year='1999,2000', limit=20) - -## Range search with latitude -occ_data(decimalLatitude='29.59,29.6', limit = 20) - -## Range search with distanceFromCentroidInMeters -occ_data(distanceFromCentroidInMeters = "2000,*") # at least 2km from centroids -occ_data(distanceFromCentroidInMeters = "0,2000") # close to centroids within 2km -occ_data(distanceFromCentroidInMeters = 0) # exactly on centroids - -# Search by specimen type status -## Look for possible values of the typeStatus parameter looking at the typestatus dataset -occ_data(typeStatus = 'allotype', limit = 20)$data[,c('name','typeStatus')] - -# Search by specimen record number -## This is the record number of the person/group that submitted the data, not GBIF's numbers -## You can see that many different groups have record number 1, so not super helpful -occ_data(recordNumber = 1, limit = 20)$data[,c('name','recordNumber','recordedBy')] - -# Search by last time interpreted: Date the record was last modified in GBIF -## The lastInterpreted parameter accepts ISO 8601 format dates, including -## yyyy, yyyy-MM, yyyy-MM-dd, or MM-dd. 
Range queries are accepted for lastInterpreted -occ_data(lastInterpreted = '2016-04-02', limit = 20) - -# Search for occurrences with images -occ_data(mediaType = 'StillImage', limit = 20) -occ_data(mediaType = 'MovingImage', limit = 20) -occ_data(mediaType = 'Sound', limit = 20) - -# Search by continent -## One of africa, antarctica, asia, europe, north_america, oceania, or -## south_america -occ_data(continent = 'south_america', limit = 20)$meta -occ_data(continent = 'africa', limit = 20)$meta -occ_data(continent = 'oceania', limit = 20)$meta -occ_data(continent = 'antarctica', limit = 20)$meta -### separate requests: use a vector of strings -occ_data(continent = c('south_america', 'oceania'), limit = 20) -### one request, many instances of same parameter: use semi-colon sep. string -occ_data(continent = 'south_america;oceania', limit = 20) - -# Query based on issues - see Details for options -## one issue -x <- occ_data(taxonKey=1, issue='DEPTH_UNLIKELY', limit = 20) -x$data[,c('name','key','decimalLatitude','decimalLongitude','depth')] -## two issues -occ_data(taxonKey=1, issue=c('DEPTH_UNLIKELY','COORDINATE_ROUNDED'), limit = 20) -# Show all records in the Arizona State Lichen Collection that cant be matched to the GBIF -# backbone properly: -occ_data(datasetKey='84c0e1a0-f762-11e1-a439-00145eb45e9a', - issue=c('TAXON_MATCH_NONE','TAXON_MATCH_HIGHERRANK'), limit = 20) - -# Parsing output by issue -(res <- occ_data(geometry='POLYGON((30.1 10.1,40 40,20 40,10 20,30.1 10.1))', limit = 50)) -## what do issues mean, can print whole table, or search for matches -head(gbif_issues()) -gbif_issues()[ gbif_issues()$code \%in\% c('cdround','cudc','gass84','txmathi'), ] -## or parse issues in various ways -### remove data rows with certain issue classes -library('magrittr') -res \%>\% occ_issues(gass84) -### split issues into separate columns -res \%>\% occ_issues(mutate = "split") -### expand issues to more descriptive names -res \%>\% occ_issues(mutate = "expand") -### 
split and expand -res \%>\% occ_issues(mutate = "split_expand") -### split, expand, and remove an issue class -res \%>\% occ_issues(-cudc, mutate = "split_expand") -} -} -\references{ -https://www.gbif.org/developer/occurrence#search +Legacy alternative to occ_search } -\seealso{ -\code{\link[=downloads]{downloads()}}, \code{\link[=occ_search]{occ_search()}} +\details{ +This function is a legacy alternative to \code{occ_search()}. It is not +recommended to use \code{occ_data()} as it is not as flexible as \code{occ_search()}. +New search terms will not be added to this function and it is only supported +for legacy reasons. } diff --git a/man/occ_search.Rd b/man/occ_search.Rd index 28cf159e..bc0cf803 100644 --- a/man/occ_search.Rd +++ b/man/occ_search.Rd @@ -71,6 +71,61 @@ occ_search( lifeStage = NULL, isInCluster = NULL, distanceFromCentroidInMeters = NULL, + geoDistance = NULL, + sex = NULL, + dwcaExtension = NULL, + gbifId = NULL, + gbifRegion = NULL, + projectId = NULL, + programme = NULL, + preparations = NULL, + datasetId = NULL, + datasetName = NULL, + publishedByGbifRegion = NULL, + island = NULL, + islandGroup = NULL, + taxonId = NULL, + taxonConceptId = NULL, + taxonomicStatus = NULL, + acceptedTaxonKey = NULL, + collectionKey = NULL, + institutionKey = NULL, + otherCatalogNumbers = NULL, + georeferencedBy = NULL, + installationKey = NULL, + hostingOrganizationKey = NULL, + crawlId = NULL, + modified = NULL, + higherGeography = NULL, + fieldNumber = NULL, + parentEventId = NULL, + samplingProtocol = NULL, + sampleSizeUnit = NULL, + pathway = NULL, + gadmLevel0Gid = NULL, + gadmLevel1Gid = NULL, + gadmLevel2Gid = NULL, + gadmLevel3Gid = NULL, + earliestEonOrLowestEonothem = NULL, + latestEonOrHighestEonothem = NULL, + earliestEraOrLowestErathem = NULL, + latestEraOrHighestErathem = NULL, + earliestPeriodOrLowestSystem = NULL, + latestPeriodOrHighestSystem = NULL, + earliestEpochOrLowestSeries = NULL, + latestEpochOrHighestSeries = NULL, + 
earliestAgeOrLowestStage = NULL, + latestAgeOrHighestStage = NULL, + lowestBiostratigraphicZone = NULL, + highestBiostratigraphicZone = NULL, + group = NULL, + formation = NULL, + member = NULL, + bed = NULL, + associatedSequences = NULL, + isSequenced = NULL, + startDayOfYear = NULL, + endDayOfYear = NULL, limit = 500, start = 0, fields = "all", @@ -358,6 +413,167 @@ means at least 2km from known centroids. A value of "0" would mean occurrences exactly on known centroids. A value of "0,2000" would mean within 2km of centroids. Max value is 5000.} +\item{geoDistance}{(character) Filters to match occurrence records with coordinate values +within a specified distance of a coordinate. Distance may be specified in +kilometres (km) or metres (m). Example : "90,100,5km"} + +\item{sex}{(character) The sex of the biological individual(s) represented in the occurrence.} + +\item{dwcaExtension}{(character) A known Darwin Core Archive extension RowType. +Limits the search to occurrences which have this extension, although they will +not necessarily have any useful data recorded using the extension.} + +\item{gbifId}{(numeric) The unique GBIF key for a single occurrence.} + +\item{gbifRegion}{(character) GBIF region based on country code.} + +\item{projectId}{(character) The identifier for a project, which is often +assigned by a funded programme.} + +\item{programme}{(character) A group of activities, often associated with a +specific funding stream, such as the GBIF BID programme.} + +\item{preparations}{(character) Preparation or preservation method for +a specimen.} + +\item{datasetId}{(character) The ID of the dataset. Parameter may be +repeated. Example : https://doi.org/10.1594/PANGAEA.315492} + +\item{datasetName}{(character) The exact name of the dataset. 
Not the same as +dataset title.} + +\item{publishedByGbifRegion}{(character) GBIF region based on the owning +organization's country.} + +\item{island}{(character) The name of the island on or near which the +location occurs.} + +\item{islandGroup}{(character) The name of the island group in which the +location occurs.} + +\item{taxonId}{(character) The taxon identifier provided to GBIF by the data +publisher. Example : urn:lsid:dyntaxa.se:Taxon:103026} + +\item{taxonConceptId}{(character) An identifier for the taxonomic concept to +which the record refers - not for the nomenclatural details of a taxon. +Example : 8fa58e08-08de-4ac1-b69c-1235340b7001} + +\item{taxonomicStatus}{(character) A taxonomic status. Example : SYNONYM} + +\item{acceptedTaxonKey}{(numeric) A taxon key from the GBIF backbone. Only +synonym taxa are included in the search, so a search for Aves with +acceptedTaxonKey=212 will match occurrences identified as birds, but not +any known family, genus or species of bird.} + +\item{collectionKey}{(character) A key (UUID) for a collection registered in +the Global Registry of Scientific Collections. +Example : dceb8d52-094c-4c2c-8960-75e0097c6861} + +\item{institutionKey}{(character) A key (UUID) for an institution registered +in the Global Registry of Scientific Collections.} + +\item{otherCatalogNumbers}{(character) Previous or alternate fully qualified +catalog numbers.} + +\item{georeferencedBy}{(character) Name of a person, group, or organization +who determined the georeference (spatial representation) for the location. +Example : Brad Millen} + +\item{installationKey}{(character) The occurrence installation key (a UUID). +Example : 17a83780-3060-4851-9d6f-029d5fcb81c9} + +\item{hostingOrganizationKey}{(character) The key (UUID) of the publishing +organization whose installation (server) hosts the original dataset. 
+Example : fbca90e3-8aed-48b1-84e3-369afbd000ce} + +\item{crawlId}{(numeric) Crawl attempt that harvested this record.} + +\item{modified}{(character) The most recent date-time on which the +occurrence was changed, according to the publisher. Can be a range. +Example : 2023-02-20} + +\item{higherGeography}{(character) Geographic name less specific than the +information captured in the locality term.} + +\item{fieldNumber}{(character) An identifier given to the event in the field. +Often serves as a link between field notes and the event.} + +\item{parentEventId}{(character) An identifier for the information associated +with a sampling event.} + +\item{samplingProtocol}{(character) The name of, reference to, or description +of the method or protocol used during a sampling event. +Example : malaise trap} + +\item{sampleSizeUnit}{(character) The unit of measurement of the size +(time duration, length, area, or volume) of a sample in a sampling event. +Example : hectares} + +\item{pathway}{(character) The process by which an organism came to be in a +given place at a given time, as defined in the GBIF Pathway vocabulary. 
+Example : Agriculture} + +\item{gadmLevel0Gid}{(character) A GADM geographic identifier at the zero +level, for example AGO.} + +\item{gadmLevel1Gid}{(character) A GADM geographic identifier at the first +level, for example AGO.1_1.} + +\item{gadmLevel2Gid}{(character) A GADM geographic identifier at the second +level, for example AFG.1.1_1.} + +\item{gadmLevel3Gid}{(character) A GADM geographic identifier at the third +level, for example AFG.1.1.1_1.} + +\item{earliestEonOrLowestEonothem}{(character) Name of the earliest geochronologic eon or lowest chronostratigraphic eonothem.} + +\item{latestEonOrHighestEonothem}{(character) Name of the latest geochronologic eon or highest chronostratigraphic eonothem.} + +\item{earliestEraOrLowestErathem}{(character) Name of the earliest geochronologic era or lowest chronostratigraphic erathem.} + +\item{latestEraOrHighestErathem}{(character) Name of the latest geochronologic era or highest chronostratigraphic erathem.} + +\item{earliestPeriodOrLowestSystem}{(character) Name of the earliest geochronologic period or lowest chronostratigraphic system.} + +\item{latestPeriodOrHighestSystem}{(character) Name of the latest geochronologic period or highest chronostratigraphic system.} + +\item{earliestEpochOrLowestSeries}{(character) Name of the earliest geochronologic epoch or lowest chronostratigraphic series.} + +\item{latestEpochOrHighestSeries}{(character) Name of the latest geochronologic epoch or highest chronostratigraphic series.} + +\item{earliestAgeOrLowestStage}{(character) Name of the earliest geochronologic age or lowest chronostratigraphic stage.} + +\item{latestAgeOrHighestStage}{(character) Name of the latest geochronologic age or highest chronostratigraphic stage.} + +\item{lowestBiostratigraphicZone}{(character) Name of the lowest biostratigraphic zone.} + +\item{highestBiostratigraphicZone}{(character) Name of the highest biostratigraphic zone.} + +\item{group}{(character) The full name of the lithostratigraphic group from +which the material entity was collected.} + +\item{formation}{(character) The full name of the lithostratigraphic +formation from which the material entity was collected.} + +\item{member}{(character) The full name of the lithostratigraphic member +from which the material entity was collected.} + +\item{bed}{(character) The full name of the lithostratigraphic bed from +which the material entity was collected.} + +\item{associatedSequences}{(character) Identifier (publication, global unique +identifier, URI) of genetic sequence information associated with 
the +material entity. Example : http://www.ncbi.nlm.nih.gov/nuccore/U34853.1} + +\item{isSequenced}{(logical) Indicates whether \code{associatedSequences} genetic +sequence information exists.} + +\item{startDayOfYear}{(numeric) The earliest integer day of the year on +which the event occurred.} + +\item{endDayOfYear}{(numeric) The latest integer day of the year on +which the event occurred.} + \item{limit}{Number of records to return. Default: 500. Note that the per request maximum is 300, but since we set it at 500 for the function, we do two requests to get you the 500 records (if there are that many). diff --git a/tests/fixtures/occ_search_geoDistance.yml b/tests/fixtures/occ_search_geoDistance.yml new file mode 100644 index 00000000..7cdeca43 --- /dev/null +++ b/tests/fixtures/occ_search_geoDistance.yml @@ -0,0 +1,102 @@ +http_interactions: +- request: + method: get + uri: https://api.gbif.org/v1/occurrence/search?occurrenceStatus=PRESENT&limit=2&offset=0&geoDistance=50.0%2C10.0%2C10km + body: + encoding: '' + string: '' + headers: + Accept-Encoding: gzip, deflate + Accept: application/json, text/xml, application/xml, */* + response: + status: + status_code: '200' + message: OK + explanation: Request fulfilled, document follows + headers: + status: HTTP/1.1 200 OK + content-type: application/json + body: + encoding: '' + file: no + base64_string: eyJvZmZzZXQiOjAsImxpbWl0IjoyLCJlbmRPZlJlY29yZHMiOmZhbHNlLCJjb3VudCI6NzE4ODQsInJl + c3VsdHMiOlt7ImtleSI6NDUyNjY5OTQ0NCwiZGF0YXNldEtleSI6IjZhYzNmNzc0LWQ5ZmItNDc5Ni1i + M2U5LTkyYmY2YzgxYzA4NCIsInB1Ymxpc2hpbmdPcmdLZXkiOiJiYjY0NmRmZi1hOTA1LTQ0MDMtYTQ5 + Yi02ZDM3OGMyY2YwZDkiLCJpbnN0YWxsYXRpb25LZXkiOiIzNmNhZTQ1Yy0xOTgzLTQ4YmUtOWNmOS0w + NjcyYzRhNjA2MTIiLCJob3N0aW5nT3JnYW5pemF0aW9uS2V5IjoiYmI2NDZkZmYtYTkwNS00NDAzLWE0 + OWItNmQzNzhjMmNmMGQ5IiwicHVibGlzaGluZ0NvdW50cnkiOiJERSIsInByb3RvY29sIjoiQklPQ0FT + RSIsImxhc3RDcmF3bGVkIjoiMjAyNC0wMy0xNVQyMTo0MzoyOS4xOTcrMDA6MDAiLCJsYXN0UGFyc2Vk + 
IjoiMjAyNC0wMy0xNVQyMzoyMjo0NS44MjQrMDA6MDAiLCJjcmF3bElkIjozMDksImV4dGVuc2lvbnMi + OnsiaHR0cDovL3JzLmdiaWYub3JnL3Rlcm1zLzEuMC9NdWx0aW1lZGlhIjpbeyJodHRwOi8vcHVybC5v + cmcvZGMvdGVybXMvZm9ybWF0IjoiaW1hZ2UvanBlZyJ9XX0sImJhc2lzT2ZSZWNvcmQiOiJIVU1BTl9P + QlNFUlZBVElPTiIsIm9jY3VycmVuY2VTdGF0dXMiOiJQUkVTRU5UIiwidGF4b25LZXkiOjI0ODExMzcs + Imtpbmdkb21LZXkiOjEsInBoeWx1bUtleSI6NDQsImNsYXNzS2V5IjoyMTIsIm9yZGVyS2V5Ijo3MTky + NDAyLCJmYW1pbHlLZXkiOjkzMTYsImdlbnVzS2V5Ijo0ODQ4NDUyLCJzcGVjaWVzS2V5Ijo2MDY1ODI0 + LCJhY2NlcHRlZFRheG9uS2V5Ijo2MDY1ODI0LCJzY2llbnRpZmljTmFtZSI6IkxhcnVzIHJpZGlidW5k + dXMgTGlubmFldXMsIDE3NjYiLCJhY2NlcHRlZFNjaWVudGlmaWNOYW1lIjoiQ2hyb2ljb2NlcGhhbHVz + IHJpZGlidW5kdXMgKExpbm5hZXVzLCAxNzY2KSIsImtpbmdkb20iOiJBbmltYWxpYSIsInBoeWx1bSI6 + IkNob3JkYXRhIiwib3JkZXIiOiJDaGFyYWRyaWlmb3JtZXMiLCJmYW1pbHkiOiJMYXJpZGFlIiwiZ2Vu + dXMiOiJDaHJvaWNvY2VwaGFsdXMiLCJzcGVjaWVzIjoiQ2hyb2ljb2NlcGhhbHVzIHJpZGlidW5kdXMi + LCJnZW5lcmljTmFtZSI6IkxhcnVzIiwic3BlY2lmaWNFcGl0aGV0IjoicmlkaWJ1bmR1cyIsInRheG9u + UmFuayI6IlNQRUNJRVMiLCJ0YXhvbm9taWNTdGF0dXMiOiJTWU5PTllNIiwiaXVjblJlZExpc3RDYXRl + Z29yeSI6IkxDIiwiZGVjaW1hbExhdGl0dWRlIjo0OS45ODk0MjIsImRlY2ltYWxMb25naXR1ZGUiOjEw + LjEzNTQwMywiY29vcmRpbmF0ZVVuY2VydGFpbnR5SW5NZXRlcnMiOjI1MC4wLCJjb250aW5lbnQiOiJF + VVJPUEUiLCJnYWRtIjp7ImxldmVsMCI6eyJnaWQiOiJERVUiLCJuYW1lIjoiR2VybWFueSJ9LCJsZXZl + bDEiOnsiZ2lkIjoiREVVLjJfMSIsIm5hbWUiOiJCYXllcm4ifSwibGV2ZWwyIjp7ImdpZCI6IkRFVS4y + Ljg0XzEiLCJuYW1lIjoiU2Nod2VpbmZ1cnQifSwibGV2ZWwzIjp7ImdpZCI6IkRFVS4yLjg0LjI3XzEi + LCJuYW1lIjoiV2VybmVjayJ9fSwieWVhciI6MjAyNCwibW9udGgiOjEsImRheSI6MTksImV2ZW50RGF0 + ZSI6IjIwMjQtMDEtMTlUMDA6MDAiLCJzdGFydERheU9mWWVhciI6MTksImVuZERheU9mWWVhciI6MTks + Imlzc3VlcyI6WyJDT09SRElOQVRFX1JPVU5ERUQiLCJHRU9ERVRJQ19EQVRVTV9BU1NVTUVEX1dHUzg0 + IiwiQ09OVElORU5UX0RFUklWRURfRlJPTV9DT09SRElOQVRFUyIsIk1VTFRJTUVESUFfVVJJX0lOVkFM + SUQiXSwibGFzdEludGVycHJldGVkIjoiMjAyNC0wMy0xNVQyMzoyMjo0NS44MjQrMDA6MDAiLCJsaWNl + bnNlIjoiaHR0cDovL2NyZWF0aXZlY29tbW9ucy5vcmcvbGljZW5zZXMvYnkvNC4wL2xlZ2FsY29kZSIs + 
ImlzU2VxdWVuY2VkIjpmYWxzZSwiaWRlbnRpZmllcnMiOlt7ImlkZW50aWZpZXIiOiJ1cm46Y2F0YWxv + ZzpuYXR1cmd1Y2tlcjpuYXR1cmd1Y2tlcjotMTAwMTQzMTc5OCJ9XSwibWVkaWEiOltdLCJmYWN0cyI6 + W10sInJlbGF0aW9ucyI6W10sImlzSW5DbHVzdGVyIjpmYWxzZSwicmVjb3JkZWRCeSI6IjE2NTU1Njky + MjIiLCJnZW9kZXRpY0RhdHVtIjoiV0dTODQiLCJjbGFzcyI6IkF2ZXMiLCJjb3VudHJ5Q29kZSI6IkRF + IiwicmVjb3JkZWRCeUlEcyI6W10sImlkZW50aWZpZWRCeUlEcyI6W10sImdiaWZSZWdpb24iOiJFVVJP + UEUiLCJjb3VudHJ5IjoiR2VybWFueSIsInB1Ymxpc2hlZEJ5R2JpZlJlZ2lvbiI6IkVVUk9QRSIsImNh + dGFsb2dOdW1iZXIiOiItMTAwMTQzMTc5OCIsImluc3RpdHV0aW9uQ29kZSI6Im5hdHVyZ3Vja2VyIiwi + bG9jYWxpdHkiOiJXaWVzZW5oYXVzIiwiY29sbGVjdGlvbkNvZGUiOiJuYXR1cmd1Y2tlciIsImdiaWZJ + RCI6IjQ1MjY2OTk0NDQifSx7ImtleSI6NDUyNjYwNjQzMiwiZGF0YXNldEtleSI6IjZhYzNmNzc0LWQ5 + ZmItNDc5Ni1iM2U5LTkyYmY2YzgxYzA4NCIsInB1Ymxpc2hpbmdPcmdLZXkiOiJiYjY0NmRmZi1hOTA1 + LTQ0MDMtYTQ5Yi02ZDM3OGMyY2YwZDkiLCJpbnN0YWxsYXRpb25LZXkiOiIzNmNhZTQ1Yy0xOTgzLTQ4 + YmUtOWNmOS0wNjcyYzRhNjA2MTIiLCJob3N0aW5nT3JnYW5pemF0aW9uS2V5IjoiYmI2NDZkZmYtYTkw + NS00NDAzLWE0OWItNmQzNzhjMmNmMGQ5IiwicHVibGlzaGluZ0NvdW50cnkiOiJERSIsInByb3RvY29s + IjoiQklPQ0FTRSIsImxhc3RDcmF3bGVkIjoiMjAyNC0wMy0xNVQyMTo0MzoyOS4xOTcrMDA6MDAiLCJs + YXN0UGFyc2VkIjoiMjAyNC0wMy0xNVQyMzoyODowMy40MzMrMDA6MDAiLCJjcmF3bElkIjozMDksImV4 + dGVuc2lvbnMiOnsiaHR0cDovL3JzLmdiaWYub3JnL3Rlcm1zLzEuMC9NdWx0aW1lZGlhIjpbeyJodHRw + Oi8vcHVybC5vcmcvZGMvdGVybXMvZm9ybWF0IjoiaW1hZ2UvanBlZyJ9XX0sImJhc2lzT2ZSZWNvcmQi + OiJIVU1BTl9PQlNFUlZBVElPTiIsIm9jY3VycmVuY2VTdGF0dXMiOiJQUkVTRU5UIiwidGF4b25LZXki + OjI0Nzc5NjgsImtpbmdkb21LZXkiOjEsInBoeWx1bUtleSI6NDQsImNsYXNzS2V5IjoyMTIsIm9yZGVy + S2V5Ijo3MjQsImZhbWlseUtleSI6OTMzMywiZ2VudXNLZXkiOjI0Nzc4OTksInNwZWNpZXNLZXkiOjI0 + Nzc5NjgsImFjY2VwdGVkVGF4b25LZXkiOjI0Nzc5NjgsInNjaWVudGlmaWNOYW1lIjoiRGVuZHJvY29w + b3MgbWFqb3IgKExpbm5hZXVzLCAxNzU4KSIsImFjY2VwdGVkU2NpZW50aWZpY05hbWUiOiJEZW5kcm9j + b3BvcyBtYWpvciAoTGlubmFldXMsIDE3NTgpIiwia2luZ2RvbSI6IkFuaW1hbGlhIiwicGh5bHVtIjoi + Q2hvcmRhdGEiLCJvcmRlciI6IlBpY2lmb3JtZXMiLCJmYW1pbHkiOiJQaWNpZGFlIiwiZ2VudXMiOiJE + 
ZW5kcm9jb3BvcyIsInNwZWNpZXMiOiJEZW5kcm9jb3BvcyBtYWpvciIsImdlbmVyaWNOYW1lIjoiRGVu + ZHJvY29wb3MiLCJzcGVjaWZpY0VwaXRoZXQiOiJtYWpvciIsInRheG9uUmFuayI6IlNQRUNJRVMiLCJ0 + YXhvbm9taWNTdGF0dXMiOiJBQ0NFUFRFRCIsIml1Y25SZWRMaXN0Q2F0ZWdvcnkiOiJMQyIsImRlY2lt + YWxMYXRpdHVkZSI6NDkuOTIzODY2LCJkZWNpbWFsTG9uZ2l0dWRlIjoxMC4wNDA0NzYsImNvb3JkaW5h + dGVVbmNlcnRhaW50eUluTWV0ZXJzIjoyNTAuMCwiY29udGluZW50IjoiRVVST1BFIiwiZ2FkbSI6eyJs + ZXZlbDAiOnsiZ2lkIjoiREVVIiwibmFtZSI6Ikdlcm1hbnkifSwibGV2ZWwxIjp7ImdpZCI6IkRFVS4y + XzEiLCJuYW1lIjoiQmF5ZXJuIn0sImxldmVsMiI6eyJnaWQiOiJERVUuMi45Nl8xIiwibmFtZSI6IlfD + vHJ6YnVyZyJ9LCJsZXZlbDMiOnsiZ2lkIjoiREVVLjIuOTYuMTJfMSIsIm5hbWUiOiJIYXVzZW4gYi4g + V8O8cnpidXJnIn19LCJ5ZWFyIjoyMDI0LCJtb250aCI6MSwiZGF5IjoxNiwiZXZlbnREYXRlIjoiMjAy + NC0wMS0xNlQwMDowMCIsInN0YXJ0RGF5T2ZZZWFyIjoxNiwiZW5kRGF5T2ZZZWFyIjoxNiwiaXNzdWVz + IjpbIkNPT1JESU5BVEVfUk9VTkRFRCIsIkdFT0RFVElDX0RBVFVNX0FTU1VNRURfV0dTODQiLCJDT05U + SU5FTlRfREVSSVZFRF9GUk9NX0NPT1JESU5BVEVTIiwiTVVMVElNRURJQV9VUklfSU5WQUxJRCJdLCJs + YXN0SW50ZXJwcmV0ZWQiOiIyMDI0LTAzLTE1VDIzOjI4OjAzLjQzMyswMDowMCIsImxpY2Vuc2UiOiJo + dHRwOi8vY3JlYXRpdmVjb21tb25zLm9yZy9saWNlbnNlcy9ieS80LjAvbGVnYWxjb2RlIiwiaXNTZXF1 + ZW5jZWQiOmZhbHNlLCJpZGVudGlmaWVycyI6W3siaWRlbnRpZmllciI6InVybjpjYXRhbG9nOm5hdHVy + Z3Vja2VyOm5hdHVyZ3Vja2VyOi0xMDU2NjM1Mjk5In1dLCJtZWRpYSI6W10sImZhY3RzIjpbXSwicmVs + YXRpb25zIjpbXSwiaXNJbkNsdXN0ZXIiOmZhbHNlLCJyZWNvcmRlZEJ5IjoiMTY1NTU2OTIyMiIsImdl + b2RldGljRGF0dW0iOiJXR1M4NCIsImNsYXNzIjoiQXZlcyIsImNvdW50cnlDb2RlIjoiREUiLCJyZWNv + cmRlZEJ5SURzIjpbXSwiaWRlbnRpZmllZEJ5SURzIjpbXSwiZ2JpZlJlZ2lvbiI6IkVVUk9QRSIsImNv + dW50cnkiOiJHZXJtYW55IiwicHVibGlzaGVkQnlHYmlmUmVnaW9uIjoiRVVST1BFIiwiY2F0YWxvZ051 + bWJlciI6Ii0xMDU2NjM1Mjk5IiwiaW5zdGl0dXRpb25Db2RlIjoibmF0dXJndWNrZXIiLCJsb2NhbGl0 + eSI6IlRLMjUgQmxhdHQgNjAyNi8zIC0gVW50ZXJmcmFua2VuIC8gRXJic2hhdXNlbiAvIEhhdXNlbiIs + ImNvbGxlY3Rpb25Db2RlIjoibmF0dXJndWNrZXIiLCJnYmlmSUQiOiI0NTI2NjA2NDMyIn1dLCJmYWNl + dHMiOltdfQ== + recorded_at: 2024-05-08 12:03:03 GMT + recorded_with: vcr/1.2.0, 
webmockr/0.9.0 diff --git a/tests/testthat/test-occ_search.r b/tests/testthat/test-occ_search.r index 7a074b60..25953a9c 100644 --- a/tests/testthat/test-occ_search.r +++ b/tests/testthat/test-occ_search.r @@ -660,4 +660,14 @@ test_that("multiple values for parameters fails", { "You can have multiple values for only one of") }) +test_that("geoDistance works as expected", { + vcr::use_cassette("occ_search_geoDistance", { + aa <- occ_search(geoDistance = "50.0,10.0,10km", limit = 2) + }, preserve_exact_body_bytes = TRUE) + expect_is(aa, "gbif") + expect_is(aa$data, "data.frame") + expect_equal(attr(aa, "args")$geoDistance, "50.0,10.0,10km") + expect_equal(nrow(aa$data), 2) +}) + diff --git a/vignettes/effectively_using_occ_search.Rmd b/vignettes/effectively_using_occ_search.Rmd new file mode 100644 index 00000000..c192465b --- /dev/null +++ b/vignettes/effectively_using_occ_search.Rmd @@ -0,0 +1,187 @@ +--- +title: "Effectively using occ_search" +author: "John Waller" +date: "2024-05-08" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{effectively_using_occ_search} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +GBIF's [occurrence search](https://www.gbif.org/occurrence/search) is a powerful and versatile tool for accessing GBIF-mediated data. This vignette will provide an overview of the `occ_search()` function and provide examples and advice on how to use it effectively and also when **not** to use it. + +> The function `occ_search()` (and related legacy function `occ_data()`) **should not** be used for serious research. Users sometimes find it easier to use `occ_search()` rather than `occ_download()` because they do not need to supply a username or password, and also do not need to wait for a download to finish. However, any serious research project should always use `occ_download()` instead. + +`occ_search()` is a quick way to get a non-random sample of occurrences from the GBIF mediated data.
It is useful for quickly exploring the data, but it is not suitable for serious research because users are **limited to 100,000 records** per search combination. + +And, even if your search returns fewer than 100,000 records, it is **still** not recommended to use `occ_search()` to retrieve all the records for a serious research project. This is because it is not possible to [cite the data](https://docs.ropensci.org/rgbif/articles/gbif_citations.html) obtained this way in an easy way. + +Here are some examples of some **good** usages of `occ_search()`: + +- Quickly exploring occurrence data +- Getting occurrence counts and statistics (see also `occ_count()` and article [here](https://docs.ropensci.org/rgbif/articles/occ_counts.html)) +- Testing out search parameters before downloading data + +And here are some examples of **bad** usages of `occ_search()`: + +- Looping through a large number of species to extract occurrence data (See article [here](https://docs.ropensci.org/rgbif/articles/downloading_a_long_species_list.html) instead) +- Treating the data as a random sample +- Using `occ_search()` data for citable research + +## basisOfRecord + +One of the more useful fields to search on is `basisOfRecord`, which gives roughly the origin of the occurrence record. Most records on GBIF are either `PRESERVED_SPECIMEN` (museum/herbarium records) or `HUMAN_OBSERVATION` (usually citizen science, but sometimes research observations). + +Other interesting `basisOfRecord` values are `FOSSIL_SPECIMEN` and `LIVING_SPECIMEN` (zoos or botanical gardens), because people typically want to exclude these from their downloads. + +Keep in mind that the `basisOfRecord` values are not guaranteed to be filled in accurately by the publisher. Sometimes records are misclassified or given a `basisOfRecord` that you would not expect or have a [complicated provenance](https://data-blog.gbif.org/post/living-specimen-to-preserved-specimen-understanding-basis-of-record/). 
+ +``` r +occ_search(basisOfRecord="PRESERVED_SPECIMEN") # museum and herbarium records +occ_search(basisOfRecord="HUMAN_OBSERVATION") # citizen science and research observations +occ_search(basisOfRecord="FOSSIL_SPECIMEN") # fossil records +occ_search(basisOfRecord="LIVING_SPECIMEN") # zoo and botanical garden records +occ_search(basisOfRecord="PRESERVED_SPECIMEN;HUMAN_OBSERVATION") # museum/herbarium and citizen science/research observations +occ_search(basisOfRecord="MACHINE_OBSERVATION") # machine observations (e.g. camera traps, acoustic recorders, etc.) +``` + +## Searching with scientificName + +Users are sometimes attracted to `occ_search()` because it is possible to supply a `scientificName` rather than a `taxonKey`. Note that in the background a call is made to the species match service (similar to `name_backbone()`) in order to retrieve a GBIF taxonKey. Because of this, a user can occasionally receive back poorly matched occurrences, particularly if authorship is not supplied. + +``` r +occ_search(scientificName="Calopteryx splendens") +# Or better +occ_search(scientificName="Calopteryx splendens (Harris, 1780)") +``` + +This is equivalent to doing the following: + +``` r +occ_search(taxonKey=name_backbone("Calopteryx splendens")$usageKey) +# OR +occ_search(taxonKey=1427067) +``` + +If your name happens to be a [homotypic synonym](https://docs.ropensci.org/rgbif/articles/taxonomic_names.html#too-many-choices-problem) of another name, you may get back occurrences for the other name, no results, or a higher-rank match. Therefore, it is usually safer to use the GBIF taxonKey. + +## Non-interpreted fields + +Some fields in the GBIF mediated data are "interpreted" by GBIF, meaning that they are standardized in some way. For example, the field `basisOfRecord` is standardized to a controlled vocabulary. Therefore, only a few values are returned no matter what the publisher has supplied.
For instance, "pinned insect", "fish specimen", and "herbarium sheet", will all get mapped to `PRESERVED_SPECIMEN` by GBIF. + +Other fields are "non-interpreted", meaning that they are not standardized in any way. For example, the field `recordedBy` is a free text field. If you search for `recordedBy="John Smith"`, you may not get back occurrences where the `recordedBy` field is some variant such as `J. Smith`, `Smith, J.`, `Smith, John`, etc. + +One strategy for determining whether a search term is free text is by using `occ_count(facet=<"search term">)`. See article of `occ_count()` [here](https://docs.ropensci.org/rgbif/articles/occ_counts.html). + +``` r +occ_count(facet="recordedBy") +occ_count(facet="basisOfRecord") +``` + +If many unique values are returned, then it is likely that the field is free text. + +## Un-intentional mass data removal from NULL values + +Some search parameters are often `NULL` or not supplied from the publisher. In general, `occ_search()` terms that are not required fields or not filled by GBIF during interpretation are often `NULL`. For example, even though `coordinateUncertaintyInMeters` [theoretically applies](https://docs.gbif.org/georeferencing-best-practices/1.0/en/) to all occurrences with coordinates, it is often `NULL` because the publishers choose not to supply this information or it is unknown. Similarly, `sex` might often be left `NULL` more than what would be expected naively. + +Other columns with more `NULL`s than one might expect : + +- `stateProvince` +- `elevation` +- `establishmentMeans` +- `coordinateUncertaintyInMeters` + +Keep in mind that specifying any filter will remove all records with `NULL` in the filter. + +## Searching for locations + +Location searching can sometimes be challenging for new users. Particularly, searching for `stateProvince` can be tricky because the field is free text when one might expect it to be from a controlled vocabulary. 
`stateProvince="California"` will not return occurrences where the publisher has supplied values such as `CA`, `Calif.`, or `Cal.`. Additionally, records with coordinates falling within California may not have been supplied with a `stateProvince` value by the publisher. + +``` r +occ_search(stateProvince="California") +occ_search(stateProvince="CA") # will return different number of records +occ_search(stateProvince="CA;California") # search both variants at the same time +``` + +A usually better choice than searching by `stateProvince` is to search by `gadmGid`. The term `gadmGid` is a GBIF interpreted field that is filled by GBIF when coordinates are available. Looking up the `gadmGid`s can be done on the GBIF [occurrence search page](https://www.gbif.org/occurrence/map?continent=NORTH_AMERICA&has_coordinate=true&has_geospatial_issue=false&gadm_gid=USA.5_1). + +``` r +occ_search(gadmGid="USA.5_1") # search for California +occ_search(gadmGid="JPN.12_1") # search for Hokkaido Japan +occ_search(gadmGid="USA.5_1;USA.6_1") # search for California and Colorado +occ_search(gadmGid="PHL.10_1") # Bataan Philippines +occ_search(gadmGid="USA") # United States "just land" without EEZ area +``` + +Searching by `country` is typically straightforward because the field is standardized and filled by GBIF when coordinates are available. Two-letter country codes are used when searching occurrences. These codes can be looked up using `enumeration_country()`. + +``` r +occ_search(country="US") # search for United States +occ_search(country="JP") # search for Japan +occ_search(country="PH") # search for Philippines +occ_search(country="SE") # search for Sweden +occ_search(country="US;JP") # search for United States and Japan +``` + +Searching by `continent` is also possible, but unlike `country`, this value is **not** filled in when coordinates are available, and instead relies on the publisher filling in this field.
So if the publisher has not filled in a value, then this field will be `NULL`, even if the record obviously lies on a continent. + +The field is however standardized by GBIF, so that supplied values are all mapped to a controlled vocabulary (e.g. "Europa", "Euroopa", "EUR", "Eu" -\> EUROPE; "Afrique", "Afr.", "AF" -\> AFRICA). + +``` r +occ_search(continent="EUROPE") # search for Europe +occ_search(continent="AFRICA") # search for Africa +occ_search(continent="EUROPE;AFRICA") # search for Europe and Africa +``` + +If you need to get all occurrences from a certain continent, I would use the `gadmGid` filter or supply a bounding box or WKT polygon to `geometry`. When using `geometry` make sure that your polygon is wound in the correct order (anti-clockwise). When in doubt, using the GBIF [web UI](https://www.gbif.org/occurrence/map) to draw and debug the polygon can be a good option. Only POLYGON and MULTIPOLYGON are accepted WKT types. + +``` r +occ_search(geometry="POLYGON((13.42436 69.86167,4.6469 67.01976,-8.26114 67.2205,-19.62021 67.81281,-28.39768 64.25374,-27.88135 53.09437,-17.55493 44.99691,-16.52228 30.81969,3.61426 32.57676,19.62021 30.37524,38.72411 32.14062,54.21375 33.87246,66.60546 43.14228,72.80133 50.54193,70.21972 62.16009,38.20778 72.6752,23.23447 73.42765,13.42436 69.86167))") # rough polygon around Europe +``` + +Sometimes it can be useful to select everything **but** a [certain region](https://www.gbif.org/occurrence/map?has_coordinate=true&has_geospatial_issue=false&geometry=POLYGON((-180%20-90,-90%20-90,0%20-90,90%20-90,180%20-90,180%2090,90%2090,0%2090,-90%2090,-180%2090,-180%20-90),(-5%20-5,-5%205,5%205,5%20-5,-5%20-5))&occurrence_status=present), also known as a "polygon with a hole in it". This can be done by formatting your WKT with enough interpolated points.
+ +``` +POLYGON( +(-180 -90,-90 -90,0 -90,90 -90,180 -90,180 90,90 90,0 90,-90 90,-180 90,-180 -90), +(-5 -5,-5 5,5 5,5 -5,-5 -5) +) +``` + +## Searching for dates + +Some records on GBIF can be quite old (1600s), so it is sometimes useful to filter by `year` to remove these records. Year is typically the year of the collection event or the observation event of the record. Almost all occurrences on GBIF supply a `year` value. Therefore filtering by `year` is typically safe from un-intentional mass data filtering from `NULL` values. + + +```r +occ_search(year=1998) # search for occurrences from 1998 +occ_search(year="1998,2024") # search for occurrences from 1998-2024 +occ_search(year="1900;2000") # search for occurrences from 1900 and 2000 +occ_search(year="1950,2024") # search for somewhat modern records +``` + +## Other record ids + +Sometimes users come to GBIF looking for a specific museum record, but they don't know the `gbifid` of the record. In these cases, searching by `occurrenceId`, `catalogNumber`, `recordNumber` or `institutionCode` can be useful. Keep in mind that many of these fields may not be unique across all of GBIF. For example, a few institutions might use the same `institutionCode`, but actually be different institutions. Usually combining a few of these values can get you close to the record you are looking for. + +```r +occ_search(institutionCode="KU") +occ_search(catalogNumber="KU 110") + +``` + +## DWCA extensions + +New users might not be aware that some data publishers supply additional data beyond simple "when-what-where" data. Richer extra data usually comes in the form of `dwcaExtensions`. While `occ_search()` does not return the values from these extensions, it is possible to filter by extension type to see which dataset publishers have published extensions of interest.
+ +```r +occ_search(dwcaExtension="http://rs.gbif.org/terms/1.0/Multimedia") +occ_search(dwcaExtension="http://rs.tdwg.org/dwc/terms/MeasurementOrFact") +occ_search(dwcaExtension="http://rs.gbif.org/terms/1.0/DNADerivedData") +``` + +## Further reading + +[GBIF tech docs](https://techdocs.gbif.org/en/openapi/v1/occurrence#/Searching%20occurrences/searchOccurrence) +