Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tracking coverage of OA entities #211

Merged
merged 26 commits into from
Nov 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ Imports:
httr,
jsonlite,
progress,
rlang,
tibble
Suggests:
testthat (>= 3.0.0),
Expand All @@ -46,6 +47,7 @@ Suggests:
ggplot2,
coro,
rentrez,
rrapply,
covr
Encoding: UTF-8
LazyData: true
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
export(authors2df)
export(concepts2df)
export(funders2df)
export(get_coverage)
export(institutions2df)
export(keywords2df)
export(oa2bibliometrix)
export(oa2df)
export(oa_apikey)
Expand Down
21 changes: 16 additions & 5 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@
# openalexR 2.0.0
* Breaking changes in column names in the output of `oa_fetch`:
* `so` is now `source_display_name`
* `so_id` is now `source_id`
* `ab` is now `abstract`
* `url` is now `landing_page_url`
* the nested columns under `authors` no longer have the `au` prefix
* Works:
* `so` is now `source_display_name`
* `so_id` is now `source_id`
* `host_organization` now contains the id of the host organization
* `host_organization_name` now contains the name of the host organization
* `ab` is now `abstract`
* `url` is now `landing_page_url`
* `author` is now `authorships`
* the nested columns under `authors` no longer have the `au` prefix
* New columns: `fwci`, `referenced_works_count`, `keywords`
* Authors:
* `affiliation*` is removed
* New columns: `last_known_institutions`, `2yr_mean_citedness`, `h_index`, `i10_index`
* Removed `concepts` as an entity
* Added `keywords` as an entity
* Added `get_coverage()` to track the oa2df-mapped columns of OpenAlex fields

* Deprecated `oa2bibliometrix()`. Use `bibliometrix::convert2df()`
(from the **bibliometrix** R package) instead.
Expand Down
13 changes: 13 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,16 @@
#' }
#'
"countrycode"

#' Coverage of OpenAlex entity fields after converting to data frame.
#'
#' List with 8 elements associated with 8 OpenAlex entities.
#'
#' @format Each element is a dataframe with 3 columns
#' \describe{
#' \item{original}{original field name from OpenAlex}
#' \item{oa2df}{new column name in output dataframe from oa2df}
#' \item{comment}{additional notes}
#' }
#'
"oa2df_coverage"
4 changes: 1 addition & 3 deletions R/oa2bibliometrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ oa2bibliometrix <- function(df) {
countrycode$Country <- as.character(countrycode$Country)

# Authors
AU_info <- lapply(df$author[7], function(l) {
AU_info <- lapply(df$authorships[7], function(l) {
if (length(l) == 0 || (length(l) == 1 && is.na(l))){
return(empty_list(
c("AU", "RP", "C1", "AU_UN", "AU_CO")
Expand All @@ -72,8 +72,6 @@ oa2bibliometrix <- function(df) {
})
AU_info <- do.call(rbind.data.frame, AU_info)



# References
df$CR <- unlist(lapply(df$referenced_works, function(l) {
paste(shorten_oaid(l), collapse = ";")
Expand Down
186 changes: 71 additions & 115 deletions R/oa2df.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#' Convert OpenAlex collection from list to data frame
#'
#' It converts bibliographic collections gathered from the OpenAlex database \href{https://openalex.org/}{https://openalex.org/} into a data frame.
#' The function converts a collection of records about works, authors, institutions, venues or concepts obtained using \code{oa_request} into a data frame/tibble.
#' The function converts a collection of records about works, authors, institutions, venues or keywords obtained using \code{oa_request} into a data frame/tibble.
#'
#' @param data List. Output of \code{oa_request}.
#' @param entity Character. Scholarly entity of the search.
#' The argument can be one of
#' c("works", "authors", "institutions", "concepts", "funders", "sources", "publishers", "topics").
#' c("works", "authors", "institutions", "keywords", "funders", "sources", "publishers", "topics").
#' @param abstract Logical. If TRUE, the function also returns the abstract of each item.
#' Ignored if entity is different from "works". Defaults to TRUE.
#' @param verbose Logical.
Expand Down Expand Up @@ -48,6 +48,14 @@
#'
#' @export
oa2df <- function(data, entity, options = NULL, count_only = FALSE, group_by = NULL, abstract = TRUE, verbose = TRUE) {
rlang::warn(
"Note: `oa_fetch` and `oa2df` now return new names for some columns in openalexR v2.0.0.
See NEWS.md for the list of changes.
Call `get_coverage()` to view the all updated columns and their original names in OpenAlex.",
.frequency = "regularly",
.frequency_id = "oa2df_column_change"
)

if (length(data) == 0) {
return(NULL)
}
Expand All @@ -72,6 +80,7 @@ oa2df <- function(data, entity, options = NULL, count_only = FALSE, group_by = N
authors = authors2df(data, verbose),
institutions = institutions2df(data, verbose),
concepts = concepts2df(data, verbose),
keywords = keywords2df(data, verbose),
funders = funders2df(data, verbose),
sources = sources2df(data, verbose),
publishers = publishers2df(data, verbose),
Expand Down Expand Up @@ -137,14 +146,16 @@ oa2df <- function(data, entity, options = NULL, count_only = FALSE, group_by = N
works2df <- function(data, abstract = TRUE, verbose = TRUE,
pb = if (verbose) oa_progress(length(data)) else NULL) {
col_order <- c(
"id", "title", "display_name", "author", "abstract", "publication_date", "relevance_score",
"source_display_name", "source_id", "issn_l", "landing_page_url", "pdf_url",
"license", "version", "first_page", "last_page", "volume", "issue", "is_oa",
"is_oa_anywhere", "oa_status", "oa_url", "any_repository_has_fulltext",
"language", "grants", "cited_by_count", "counts_by_year",
"publication_year", "cited_by_api_url", "ids", "doi", "type",
"referenced_works", "related_works", "is_paratext", "is_retracted",
"concepts", "topics", "apc"
"id", "title", "display_name", "authorships", "abstract", "doi",
"publication_date", "publication_year", "relevance_score", "fwci",
"cited_by_count", "counts_by_year", "cited_by_api_url", "ids", "type",
"is_oa", "is_oa_anywhere", "oa_status", "oa_url",
"any_repository_has_fulltext", "source_display_name", "source_id", "issn_l",
"host_organization", "host_organization_name",
"landing_page_url", "pdf_url", "license", "version", "referenced_works",
"referenced_works_count", "related_works", "concepts", "topics", "keywords",
"is_paratext", "is_retracted", "language", "grants", "apc",
"first_page", "last_page", "volume", "issue"
)
works_process <- tibble::tribble(
~type, ~field,
Expand All @@ -161,9 +172,12 @@ works2df <- function(data, abstract = TRUE, verbose = TRUE,
"identical", "is_retracted",
"identical", "relevance_score",
"identical", "language",
"identical", "fwci",
"identical", "referenced_works_count",
"flat", "grants",
"flat", "referenced_works",
"flat", "related_works",
"rbind_df", "keywords",
"rbind_df", "counts_by_year",
"rbind_df", "concepts",
"flat", "apc_list",
Expand All @@ -174,7 +188,9 @@ works2df <- function(data, abstract = TRUE, verbose = TRUE,
so_cols <- c(
source_id = "id",
source_display_name = "display_name",
issn_l = "issn_l"
issn_l = "issn_l",
host_organization = "host_organization",
host_organization_name = "host_organization_name"
)

n <- length(data)
Expand All @@ -195,7 +211,7 @@ works2df <- function(data, abstract = TRUE, verbose = TRUE,
if (!is.null(sim_fields$publication_date)) {
sim_fields$publication_date <- as.Date(sim_fields$publication_date)
}
author <- process_paper_authors(paper$authorships)
authorships <- process_paper_authors(paper$authorships)
ab <- abstract_build(paper$abstract_inverted_index, abstract)
paper_biblio <- replace_w_na(paper$biblio)
open_access <- replace_w_na(paper$open_access)
Expand All @@ -221,7 +237,7 @@ works2df <- function(data, abstract = TRUE, verbose = TRUE,
}
topics <- process_topics(paper, "score")
out_ls <- c(sim_fields, venue, source, open_access, paper_biblio,
list(author = author, abstract = ab, apc = apc), topics)
list(authorships = authorships, abstract = ab, apc = apc), topics)
out_ls[sapply(out_ls, is.null)] <- NULL
list_df[[i]] <- out_ls
}
Expand All @@ -230,82 +246,6 @@ works2df <- function(data, abstract = TRUE, verbose = TRUE,
out_df[, intersect(col_order, names(out_df))]
}

#' Reconstruct an abstract from its inverted index
#'
#' Each name of `ab` is a word and each element holds the positions at which
#' that word occurs. The words are repeated once per position, sorted by
#' position and joined with single spaces.
#'
#' @param ab List. Inverted index of the abstract (word -> positions).
#' @param build Logical. If FALSE, nothing is built and `NULL` is returned.
#'
#' @return `NULL` when `ab` is `NULL` or `build` is FALSE; `""` when the index
#'   holds no positions; otherwise a single character string.
#' @keywords internal
abstract_build <- function(ab, build = TRUE) {
  if (is.null(ab) || !build) {
    return(NULL)
  }

  # One entry per occurrence: the word, and the position it occupies.
  words <- rep(names(ab), lengths(ab))
  positions <- unlist(ab)

  # An index without any position yields an empty abstract.
  if (is.null(positions)) {
    return("")
  }

  ordered_words <- words[order(positions)]
  paste(ordered_words, collapse = " ")
}

#' Flatten the authorships of a single work
#'
#' @param authorships List. The `authorships` element of one work record.
#'
#' @return `NULL` when `authorships` is `NULL`. Otherwise a list of length one
#'   wrapping the result of `rbind_oa_ls()` applied to the per-author records;
#'   each record combines the author's own fields with `author_position`,
#'   `is_corresponding`, `affiliations` and `affiliation_raw`.
#' @keywords internal
process_paper_authors <- function(authorships) {
  if (is.null(authorships)) {
    return(NULL)
  }

  flatten_one <- function(entry) {
    # Author identity fields; an empty author falls back to empty_list().
    who <- if (length(entry$author)) {
      replace_w_na(entry$author)
    } else {
      empty_list(names(entry$author))
    }

    # Only the first raw affiliation string is retained.
    raw_strings <- entry$raw_affiliation_strings
    first_raw <- NA_character_
    if (length(raw_strings)) {
      first_raw <- raw_strings[[1]]
    }

    c(
      who,
      entry[c("author_position", "is_corresponding")],
      list(
        affiliations = process_affil(entry$institutions),
        affiliation_raw = first_raw
      )
    )
  }

  list(rbind_oa_ls(lapply(authorships, flatten_one)))
}


#' Process affiliations
#'
#' Collapses each institution's `lineage` into one comma-separated string and
#' hands the institutions to `subs_na(..., "rbind_df")`.
#'
#' @param l_institution List of institutions. Nested elements include
#' id, display_name, ror, country_code, type, lineage
#'
#' @return For empty input, a list holding `empty_df()`. Otherwise the result
#' of `subs_na()` on the institutions, with the following columns:
#' id, display_name, ror, country_code, type, lineage
#' @keywords internal
process_affil <- function(l_institution) {
  if (length(l_institution) == 0) {
    return(list(empty_df()))
  }

  collapsed <- lapply(l_institution, function(inst) {
    # lineage may hold several ids; store them as a single string
    inst$lineage <- paste(inst$lineage, collapse = ", ")
    inst
  })

  subs_na(collapsed, "rbind_df")
}



#' Convert OpenAlex collection of authors' records from list format to data frame
#'
Expand Down Expand Up @@ -346,9 +286,6 @@ authors2df <- function(data, verbose = TRUE,
n <- length(data)
list_df <- vector(mode = "list", length = n)

inst_cols <- c("id", "display_name", "ror", "country_code", "type", "lineage")
empty_inst <- empty_list(inst_cols)

author_process <- tibble::tribble(
~type, ~field,
"identical", "id",
Expand Down Expand Up @@ -376,36 +313,23 @@ authors2df <- function(data, verbose = TRUE,
SIMPLIFY = FALSE
)

# current affiliation
sub_affiliation <- item$last_known_institutions
if (!is.null(sub_affiliation) && length(sub_affiliation)) {
sub_affiliation <- sub_affiliation[[1]]
if (is.na(sub_affiliation[[1]])) {
sub_affiliation <- empty_inst
}
sub_affiliation$lineage <- paste(sub_affiliation$lineage, collapse = ", ")
sub_affiliation <- prepend(sub_affiliation, "affiliation")
}
sub_affiliation <- replace_w_na(sub_affiliation)

# all affiliations
if (!is.null(item$affiliations)) {
l_inst <- lapply(item$affiliations, function(x) x$institution)
affs <- list(affiliations = process_affil(l_inst))
# current affiliations
if (!is.null(item$last_known_institutions)) {
l_inst <- item$last_known_institutions
affs <- list(last_known_institutions = process_affil(l_inst))
} else {
affs <- NULL
}

topics <- process_topics(item, "count")
list_df[[i]] <- c(sim_fields, sub_affiliation, affs, topics)
list_df[[i]] <- c(sim_fields, affs, item$summary_stats, topics)
}

col_order <- c(
"id", "display_name", "display_name_alternatives", "relevance_score",
"ids", "orcid", "works_count", "cited_by_count", "counts_by_year",
"affiliation_display_name", "affiliation_id", "affiliation_ror",
"affiliation_country_code", "affiliation_type", "affiliation_lineage",
"affiliations", "topics", "works_api_url"
"2yr_mean_citedness", "h_index", "i10_index",
"last_known_institutions", "topics", "works_api_url"
)

out_df <- rbind_oa_ls(list_df)
Expand Down Expand Up @@ -462,6 +386,7 @@ institutions2df <- function(data, verbose = TRUE,
"identical", "updated_date",
"identical", "created_date",
"identical", "relevance_score",
"flat", "summary_stats",
"flat", "display_name_alternatives",
"flat", "display_name_acronyms",
"row_df", "geo",
Expand All @@ -484,7 +409,7 @@ institutions2df <- function(data, verbose = TRUE,
interna <- NULL
if (!is.null(item$international)) {
interna <- list(
display_name_international = subs_na(
international_display_name = subs_na(
item$international$display_name,
type = "flat"
)
Expand All @@ -497,10 +422,10 @@ institutions2df <- function(data, verbose = TRUE,

col_order <- c(
"id", "display_name", "display_name_alternatives", "display_name_acronyms",
"display_name_international", "ror", "ids", "country_code", "geo", "type",
"international_display_name", "ror", "ids", "country_code", "geo", "type",
"homepage_url", "image_url", "image_thumbnail_url",
"associated_institutions", "relevance_score", "works_count",
"cited_by_count", "counts_by_year",
"cited_by_count", "counts_by_year", "summary_stats",
"works_api_url", "topics", "updated_date", "created_date"
)

Expand Down Expand Up @@ -602,6 +527,37 @@ concepts2df <- function(data, verbose = TRUE,
}


#' Convert keywords from list to data frame
#'
#' The function converts a list of keywords obtained using \code{oa_request} or
#' \code{oa_fetch(output = "list")} into a data frame/tibble. More on keywords
#' at <https://help.openalex.org/hc/en-us/articles/24736201130391-Keywords>.
#'
#' @inheritParams works2df
#'
#' @return A tibble built from the first element of
#' \code{subs_na(data, "rbind_df")}. The \code{verbose} argument is accepted
#' but not used by this function.
#'
#' @examples
#' \dontrun{
#'
#' x <- oa_fetch(
#'   entity = "keywords",
#'   options = list(sample = 5),
#'   output = "list"
#' )
#'
#' df <- oa2df(x, entity = "keywords")
#'
#' df
#' }
#'
#' @export
keywords2df <- function(data, verbose = TRUE) {
  bound <- subs_na(data, "rbind_df")
  tibble::as_tibble(bound[[1]])
}


#' Convert OpenAlex collection of funders' records from list format to data frame
#'
#' It converts bibliographic collection of funders' records gathered from OpenAlex database \href{https://openalex.org/}{https://openalex.org/} into data frame.
Expand Down
Loading
Loading