diff --git a/DESCRIPTION b/DESCRIPTION index 34016f7..86ee2bb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,37 +1,40 @@ Type: Package Package: msigdbr Title: MSigDB Gene Sets for Multiple Organisms in a Tidy Data Format -Version: 2023.1.1 +Version: 9.0.0.9000 Authors@R: person("Igor", "Dolgalev", , "igor.dolgalev@nyumc.org", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-4451-126X")) Description: Provides the 'Molecular Signatures Database' (MSigDB) gene sets typically used with the 'Gene Set Enrichment Analysis' (GSEA) software (Subramanian et al. 2005 <doi:10.1073/pnas.0506580102>, - Liberzon et al. 2015 <doi:10.1016/j.cels.2015.12.004>) in a standard R - data frame with key-value pairs. The package includes the human genes - as listed in MSigDB as well as the corresponding symbols and IDs for - frequently studied model organisms such as mouse, rat, pig, fly, and - yeast. + Liberzon et al. 2015 <doi:10.1016/j.cels.2015.12.004>, Castanza et al. + 2023 <doi:10.1038/s41592-023-02014-7>) as an R data frame. The package + includes the human genes as listed in MSigDB as well as the + corresponding symbols and IDs for frequently studied model organisms + such as mouse, rat, pig, fly, and yeast. 
License: MIT + file LICENSE URL: https://igordot.github.io/msigdbr/ BugReports: https://github.com/igordot/msigdbr/issues Depends: - R (>= 3.6) + R (>= 4.1) Imports: babelgene (>= 22.9), dplyr (>= 1.1.1), - magrittr, + lifecycle, + methods, rlang, tibble, - tidyselect + tidyselect (>= 1.2.0) Suggests: knitr, + msigdbdf, rmarkdown, roxygen2, testthat VignetteBuilder: knitr +Additional_repositories: https://igordot.r-universe.dev Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 diff --git a/NAMESPACE b/NAMESPACE index f1442df..6fb9739 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,11 +1,10 @@ # Generated by roxygen2: do not edit by hand -export("%>%") export(msigdbr) export(msigdbr_collections) -export(msigdbr_show_species) export(msigdbr_species) import(tibble) +import(tidyselect) importFrom(babelgene,orthologs) importFrom(babelgene,species) importFrom(dplyr,arrange) @@ -16,8 +15,10 @@ importFrom(dplyr,inner_join) importFrom(dplyr,mutate) importFrom(dplyr,rename) importFrom(dplyr,select) -importFrom(magrittr,"%>%") +importFrom(lifecycle,deprecated) +importFrom(lifecycle,is_present) +importFrom(methods,is) importFrom(rlang,.data) -importFrom(tibble,as_tibble) -importFrom(tidyselect,any_of) -importFrom(tidyselect,everything) +importFrom(rlang,check_installed) +importFrom(utils,install.packages) +importFrom(utils,menu) diff --git a/R/msigdbr-check-data.R b/R/msigdbr-check-data.R new file mode 100644 index 0000000..36ebeff --- /dev/null +++ b/R/msigdbr-check-data.R @@ -0,0 +1,50 @@ +#' Check that the data package is installed +#' +#' Check that the 'msigdbdf' data package is installed. +#' If not, provide instructions for installation. +#' A dependency listed in DESCRIPTION Suggests is not guaranteed to be installed. 
+#' +#' @importFrom utils install.packages menu +msigdbr_check_data <- function() { + if (!requireNamespace("msigdbdf", quietly = TRUE)) { + message("The 'msigdbdf' package must be installed.") + + install_instructions <- paste0( + "Please run the following command to install the 'msigdbdf' package:\n", + "install.packages('msigdbdf', repos = 'https://igordot.r-universe.dev')" + ) + + error_message <- function(e) { + message(e) + cat(paste0("\nFailed to install the 'msigdbdf' package.\n", install_instructions, "\n")) + } + + if (interactive()) { + # If running R interactively + input <- utils::menu(c("Yes", "No"), title = "Would you like to install 'msigdbdf'?") + if (input == 1) { + # Answered "Yes" + message("Installing the 'msigdbdf' package.") + tryCatch( + utils::install.packages("msigdbdf", repos = c("https://igordot.r-universe.dev", getOption("repos"))), + error = error_message, warning = error_message + ) + } else { + # Answered "No" + stop(install_instructions) + } + } else { + # If not running R interactively + stop(install_instructions) + } + } +} + +.onAttach <- function(libname, pkgname) { + if (!requireNamespace("msigdbdf", quietly = TRUE)) { + packageStartupMessage( + "To access all the data, please install the 'msigdbdf' package with:\n", + "install.packages('msigdbdf', repos = 'https://igordot.r-universe.dev')" + ) + } +} diff --git a/R/msigdbr-collections.R b/R/msigdbr-collections.R index 3d25660..8074701 100644 --- a/R/msigdbr-collections.R +++ b/R/msigdbr-collections.R @@ -2,14 +2,40 @@ #' #' @return A data frame of the available collections. #' +#' @param db_species Species abbreviation for the human or mouse databases (`"Hs"` or `"Mm"`). 
+#' +#' @importFrom dplyr arrange count distinct #' @export #' #' @examples #' msigdbr_collections() -msigdbr_collections <- function() { - msigdbr_genesets %>% - distinct(.data$gs_cat, .data$gs_subcat, .data$gs_id) %>% - count(.data$gs_cat, .data$gs_subcat, name = "num_genesets") %>% - arrange(.data$gs_cat, .data$gs_subcat) +msigdbr_collections <- function(db_species = "Hs") { + # rlang::check_installed("msigdbdf") + msigdbr_check_data() + + # Get the full table of gene sets and their member genes + mc <- msigdbdf::msigdbdf(target_species = db_species) + + # Keep only gene set information (ignores genes) + mc <- dplyr::distinct( + mc, + .data$gs_collection, + .data$gs_subcollection, + .data$gs_collection_name, + .data$gs_id + ) + + # Count the number of gene sets per collection + mc <- dplyr::count( + mc, + .data$gs_collection, + .data$gs_subcollection, + .data$gs_collection_name, + name = "num_genesets" + ) + + # Sort + mc <- dplyr::arrange(mc, .data$gs_collection, .data$gs_subcollection) + + return(mc) } diff --git a/R/msigdbr-package.R b/R/msigdbr-package.R new file mode 100644 index 0000000..a7d75f1 --- /dev/null +++ b/R/msigdbr-package.R @@ -0,0 +1,11 @@ +#' @keywords internal +"_PACKAGE" + +## usethis namespace: start +#' @import tibble +#' @import tidyselect +#' @importFrom lifecycle deprecated is_present +#' @importFrom methods is +#' @importFrom rlang .data check_installed +## usethis namespace: end +NULL diff --git a/R/msigdbr-species.R b/R/msigdbr-species.R index 402e7fb..2009164 100644 --- a/R/msigdbr-species.R +++ b/R/msigdbr-species.R @@ -4,31 +4,19 @@ #' #' @importFrom babelgene species #' @importFrom dplyr arrange distinct select -#' @importFrom tibble as_tibble +#' #' @export #' #' @examples #' msigdbr_species() msigdbr_species <- function() { - species() %>% - as_tibble() %>% + babelgene::species() |> + as_tibble() |> select( species_name = "scientific_name", species_common_name = "common_name" - ) %>% - rbind(c("Homo sapiens", "human")) %>% - 
distinct() %>% + ) |> + rbind(c("Homo sapiens", "human")) |> + distinct() |> arrange(.data$species_name) } - -#' List the species available in the msigdbr package -#' -#' This function is being deprecated and replaced by `msigdbr_species()`. -#' -#' @return A vector of possible species. -#' -#' @export -msigdbr_show_species <- function() { - .Deprecated("msigdbr_species") - sort(msigdbr_species()[["species_name"]]) -} diff --git a/R/msigdbr.R b/R/msigdbr.R index 853e8ac..6a18b58 100644 --- a/R/msigdbr.R +++ b/R/msigdbr.R @@ -1,21 +1,31 @@ #' Retrieve the gene sets data frame #' #' Retrieve a data frame of gene sets and their member genes. -#' The available species and collections can be checked with `msigdbr_species()` and `msigdbr_collections()`. +#' The original human genes can be converted into their corresponding counterparts in various model organisms, including mouse, rat, pig, zebrafish, fly, and yeast. +#' The output includes gene symbols along with NCBI and Ensembl IDs. #' -#' @param species Species name, such as Homo sapiens or Mus musculus. -#' @param category MSigDB collection abbreviation, such as H or C1. -#' @param subcategory MSigDB sub-collection abbreviation, such as CGP or BP. +#' Historically, the MSigDB resource has been tailored to the analysis of human-specific datasets, with gene sets exclusively aligned to the human genome. +#' Starting with release 2022.1, MSigDB incorporated a database of mouse-native gene sets and was split into human and mouse divisions ("Hs" and "Mm"). +#' Each one is provided in the approved gene symbols of its respective species. +#' The versioning convention of MSigDB is in the format `Year.Release.Species`. +#' The genes within each gene set may originate from a species different from the database target species, indicated by the `gs_source_species` and `db_target_species` fields. 
+#' +#' Mouse MSigDB includes gene sets curated from mouse-centric datasets and specified in native mouse gene identifiers, eliminating the need for ortholog mapping. +#' +#' @param species Species name for output genes, such as `"Homo sapiens"` or `"Mus musculus"`. Use `msigdbr_species()` for available options. +#' @param db_species Species abbreviation for the human or mouse databases (`"HS"` or `"MM"`). +#' @param collection Collection abbreviation, such as `"H"` or `"C1"`. Use `msigdbr_collections()` for the available options. +#' @param subcollection Sub-collection abbreviation, such as `"CGP"` or `"BP"`. Use `msigdbr_collections()` for the available options. +#' @param category `r lifecycle::badge("deprecated")` use the `collection` argument +#' @param subcategory `r lifecycle::badge("deprecated")` use the `subcollection` argument #' #' @return A data frame of gene sets with one gene per row. #' -#' @references \url{https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp} +#' @references <https://www.gsea-msigdb.org/gsea/msigdb/index.jsp> #' -#' @import tibble #' @importFrom babelgene orthologs #' @importFrom dplyr arrange distinct filter inner_join mutate rename select -#' @importFrom rlang .data -#' @importFrom tidyselect any_of everything +#' #' @export #' #' @examples @@ -26,87 +36,141 @@ #' #' # get mouse C2 (curated) CGP (chemical and genetic perturbations) gene sets #' \donttest{ -#' msigdbr(species = "Mus musculus", category = "C2", subcategory = "CGP") +#' msigdbr(species = "Mus musculus", collection = "C2", subcollection = "CGP") #' } -msigdbr <- function(species = "Homo sapiens", category = NULL, subcategory = NULL) { - # confirm that only one species is specified +msigdbr <- function(species = "Homo sapiens", db_species = "HS", collection = NULL, subcollection = NULL, category = deprecated(), subcategory = deprecated()) { + # Check that msigdbdf is installed + # a dependency listed in DESCRIPTION Suggests is not guaranteed to be installed + # if (!requireNamespace("msigdbdf", quietly = TRUE)) { + # 
stop("package 'msigdbdf' must be installed to use this function", call. = FALSE) + # } + msigdbr_check_data() + + # Check parameters + if (!is(species, "character")) { + stop("`species` is not a character string") + } if (length(species) > 1) { - stop("please specify only one species at a time") + stop("only one `species` should be specified") + } + if (!is(db_species, "character")) { + stop("`db_species` is not a character string") + } + + # Use only mouse genes for mouse database + db_species <- toupper(db_species) + if (db_species == "MM" && !(species %in% c("Mus musculus", "mouse", "house mouse"))) { + stop("set species to mouse for the mouse database") + } + + # Check for deprecated category arguments + if (lifecycle::is_present(category)) { + lifecycle::deprecate_warn("9.0.0", "msigdbr(category)", "msigdbr(collection)") + collection <- category + } + if (lifecycle::is_present(subcategory)) { + lifecycle::deprecate_warn("9.0.0", "msigdbr(subcategory)", "msigdbr(subcollection)") + subcollection <- subcategory } - genesets_subset <- msigdbr_genesets + # Get the gene sets table + mdb <- msigdbdf::msigdbdf(target_species = db_species) - # filter by category - if (is.character(category)) { - if (length(category) > 1) { - stop("please specify only one category at a time") + # Filter by collection + if (is.character(collection)) { + if (length(collection) > 1) { + stop("Please specify only one collection at a time.") } - if (category %in% genesets_subset$gs_cat) { - genesets_subset <- filter(genesets_subset, .data$gs_cat == category) + if (collection %in% mdb$gs_collection) { + mdb <- dplyr::filter(mdb, .data$gs_collection == collection) } else { - stop("unknown category") + stop("Unknown collection. 
Use `msigdbr_collections()` to see the available collections.") } } - if (is.character(subcategory)) { - if (length(subcategory) > 1) { - stop("please specify only one subcategory at a time") + # Filter by sub-collection + if (is.character(subcollection)) { + if (length(subcollection) > 1) { + stop("Please specify only one subcollection at a time.") } - if (subcategory %in% genesets_subset$gs_subcat) { - genesets_subset <- filter(genesets_subset, .data$gs_subcat == subcategory) - } else if (subcategory %in% gsub(".*:", "", genesets_subset$gs_subcat)) { - genesets_subset <- filter(genesets_subset, gsub(".*:", "", .data$gs_subcat) == subcategory) + if (subcollection %in% mdb$gs_subcollection) { + mdb <- dplyr::filter(mdb, .data$gs_subcollection == subcollection) + } else if (subcollection %in% gsub(".*:", "", mdb$gs_subcollection)) { + mdb <- dplyr::filter(mdb, gsub(".*:", "", .data$gs_subcollection) == subcollection) } else { - stop("unknown subcategory") + stop("Unknown subcollection.") } } - # combine gene sets and genes - genesets_subset <- inner_join(genesets_subset, msigdbr_geneset_genes, by = "gs_id") - genesets_subset <- inner_join(genesets_subset, msigdbr_genes, by = "gene_id", relationship = "many-to-many") - genesets_subset <- select(genesets_subset, !any_of(c("gene_id"))) + # Create a fake orthologs table for cases when orthologs are not needed + species_genes <- dplyr::select( + mdb, + gene_symbol = "db_gene_symbol", + ncbi_gene = "db_ncbi_gene", + ensembl_gene = "db_ensembl_gene" + ) + species_genes <- dplyr::mutate( + species_genes, + db_ensembl_gene = .data$ensembl_gene + ) - # retrieve orthologs - if (species %in% c("Homo sapiens", "human")) { - orthologs_subset <- - genesets_subset %>% - select( - "human_ensembl_gene", - gene_symbol = "human_gene_symbol", - entrez_gene = "human_entrez_gene" - ) %>% - mutate(ensembl_gene = .data$human_ensembl_gene) %>% - distinct() - } else { - orthologs_subset <- - orthologs(genes = 
genesets_subset$human_ensembl_gene, species = species) %>% - select(!any_of(c("human_symbol", "human_entrez"))) %>% - rename( - human_ensembl_gene = "human_ensembl", - gene_symbol = "symbol", - entrez_gene = "entrez", - ensembl_gene = "ensembl", - ortholog_sources = "support", - num_ortholog_sources = "support_n" - ) + # Retrieve orthologs for the non-human species for the human database + if (db_species == "HS" && !(species %in% c("Homo sapiens", "human"))) { + species_genes <- babelgene::orthologs( + genes = unique(mdb$db_ensembl_gene), + species = species + ) + species_genes <- dplyr::select( + species_genes, + db_ensembl_gene = "human_ensembl", + gene_symbol = "symbol", + ncbi_gene = "entrez", + ensembl_gene = "ensembl", + ortholog_taxon_id = "taxon_id", + ortholog_sources = "support", + num_ortholog_sources = "support_n", + !tidyselect::any_of(c("human_symbol", "human_entrez")) + ) } - # combine gene sets and orthologs - genesets_subset <- inner_join(genesets_subset, orthologs_subset, by = "human_ensembl_gene", relationship = "many-to-many") - genesets_subset <- arrange(genesets_subset, .data$gs_name, .data$human_gene_symbol, .data$gene_symbol) - genesets_subset <- select( - genesets_subset, - "gs_cat", - "gs_subcat", - "gs_name", + # Remove duplicate entries + species_genes <- dplyr::distinct(species_genes) + + # Combine gene sets and orthologs + mdb <- dplyr::inner_join( + mdb, + species_genes, + by = "db_ensembl_gene", + relationship = "many-to-many" + ) + + # Reorder columns for better readability + mdb <- dplyr::select( + mdb, "gene_symbol", - "entrez_gene", + "ncbi_gene", "ensembl_gene", - "human_gene_symbol", - "human_entrez_gene", - "human_ensembl_gene", + "db_gene_symbol", + "db_ncbi_gene", + "db_ensembl_gene", + "source_gene", + "gs_id", + "gs_name", + "gs_collection", + "gs_subcollection", everything() ) + mdb <- dplyr::arrange(mdb, .data$gs_name, .data$db_gene_symbol, .data$gene_symbol) + + # Add columns from the old msigdbr output if old arguments 
are present + if (lifecycle::is_present(category) | lifecycle::is_present(subcategory)) { + mdb <- dplyr::mutate( + mdb, + entrez_gene = .data$ncbi_gene, + gs_cat = .data$gs_collection, + gs_subcat = .data$gs_subcollection, + ) + } - return(genesets_subset) + return(mdb) } diff --git a/R/sysdata.rda b/R/sysdata.rda deleted file mode 100644 index 7349daa..0000000 Binary files a/R/sysdata.rda and /dev/null differ diff --git a/R/utils-pipe.R b/R/utils-pipe.R deleted file mode 100644 index fd0b1d1..0000000 --- a/R/utils-pipe.R +++ /dev/null @@ -1,14 +0,0 @@ -#' Pipe operator -#' -#' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. -#' -#' @name %>% -#' @rdname pipe -#' @keywords internal -#' @export -#' @importFrom magrittr %>% -#' @usage lhs \%>\% rhs -#' @param lhs A value or the magrittr placeholder. -#' @param rhs A function call using the magrittr semantics. -#' @return The result of calling `rhs(lhs)`. -NULL diff --git a/data-raw/msigdbr-prepare.R b/data-raw/msigdbr-prepare.R deleted file mode 100644 index 3f3867e..0000000 --- a/data-raw/msigdbr-prepare.R +++ /dev/null @@ -1,282 +0,0 @@ -library(dplyr) -library(tidyr) -library(purrr) -library(readr) -library(stringr) -library(glue) -library(xml2) -library(usethis) -options(pillar.print_max = 100) - -# Import MSigDB gene sets ----- - -# Set MSigDB version -mdb_version <- "2023.1.Hs" - -# Set HGNC version (last quarterly release before MSigDB release) -hgnc_version <- "2023-01-01" - -# Set MSigDB file paths -mdb_xml <- glue("msigdb_v{mdb_version}.xml") -mdb_xml_zip <- str_glue("{mdb_xml}.zip") -mdb_url_base <- "https://data.broadinstitute.org/gsea-msigdb/msigdb" -mdb_zip_url <- glue("{mdb_url_base}/release/{mdb_version}/{mdb_xml_zip}") - -# Download and unzip the MSigDB XML file -options(timeout = 300) -download.file(url = mdb_zip_url, destfile = mdb_xml_zip) -unzip(mdb_xml_zip, exdir = ".") -file.remove(mdb_xml_zip) - -# Check MSigDB XML file size in bytes 
-utils:::format.object_size(file.size(mdb_xml), units = "auto") - -# Import the MSigDB XML file (fails if loaded directly from URL) -mdb_doc <- read_xml(mdb_xml) - -# Delete the MSigDB XML file and its contents since they are no longer needed -file.remove(mdb_xml) - -# Extract the XML attributes and convert into a tibble -# https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/mdb_XML_description -# GENESET record attributes: -# * STANDARD_NAME: gene set name -# * SYSTEMATIC_NAME: gene set name for internal indexing purposes -# * CATEGORY_CODE: gene set collection code, e.g., C2 -# * SUB_CATEGORY_CODE: gene set subcategory code, e.g., CGP -# * PMID: PubMed ID for the source publication -# * GEOID: GEO or ArrayExpress ID for the raw microarray data in GEO or ArrayExpress repository -# * EXACT_SOURCE: exact source of the set, usually a specific figure or table in the publication -# * GENESET_LISTING_URL: URL of the original source that listed the gene set members (all blank) -# * EXTERNAL_DETAILS_URL: URL of the original source page of the gene set -# * DESCRIPTION_BRIEF: brief description of the gene set -# * MEMBERS: list of gene set members as they originally appeared in the source -# * MEMBERS_SYMBOLIZED: list of gene set members in the form of human gene symbols -# * MEMBERS_EZID: list of gene set members in the form of human Entrez Gene IDs -# * MEMBERS_MAPPING: pipe-separated list of in the form of: MEMBERS, MEMBERS_SYMBOLIZED, MEMBERS_EZID -mdb_gs_ns <- xml_find_all(mdb_doc, xpath = ".//GENESET") -mdb_tbl <- - tibble( - gs_name = xml_attr(mdb_gs_ns, attr = "STANDARD_NAME"), - gs_id = xml_attr(mdb_gs_ns, attr = "SYSTEMATIC_NAME"), - gs_cat = xml_attr(mdb_gs_ns, attr = "CATEGORY_CODE"), - gs_subcat = xml_attr(mdb_gs_ns, attr = "SUB_CATEGORY_CODE"), - gs_pmid = xml_attr(mdb_gs_ns, attr = "PMID"), - gs_geoid = xml_attr(mdb_gs_ns, attr = "GEOID"), - gs_exact_source = xml_attr(mdb_gs_ns, attr = "EXACT_SOURCE"), - gs_url = xml_attr(mdb_gs_ns, attr 
= "EXTERNAL_DETAILS_URL"), - gs_description = xml_attr(mdb_gs_ns, attr = "DESCRIPTION_BRIEF"), - gs_members = xml_attr(mdb_gs_ns, attr = "MEMBERS_MAPPING") - ) %>% - filter(gs_cat != "ARCHIVED") - -# Get the number of gene sets per collection (for testing) -mdb_category_genesets <- mdb_tbl %>% - distinct(gs_cat, gs_subcat, gs_id) %>% - count(gs_cat, gs_subcat, name = "n_genesets") -mdb_category_genesets - -# Import MSigDB Ensembl mappings ----- - -# Download MSigDB Ensembl mappings -# Should include all MSigDB genes -ensembl_url <- glue("{mdb_url_base}/annotations/human/Human_Ensembl_Gene_ID_MSigDB.v{mdb_version}.chip") -ensembl_tbl <- read_tsv(ensembl_url, progress = FALSE, show_col_types = FALSE) -ensembl_tbl <- distinct(ensembl_tbl, human_ensembl_gene = `Probe Set ID`, human_gene_symbol = `Gene Symbol`) -ensembl_tbl <- filter(ensembl_tbl, str_detect(human_ensembl_gene, "^ENSG000")) -ensembl_tbl <- arrange(ensembl_tbl, human_ensembl_gene) - -# Check for multi-mappers (should be many) -count(ensembl_tbl, human_ensembl_gene, sort = TRUE) -count(ensembl_tbl, human_gene_symbol, sort = TRUE) - -# Import HGNC mappings ----- - -# Download HGNC mappings -# May not include all MSigDB genes, but there is usually one Ensembl ID per gene -hgnc_url <- str_glue("https://storage.googleapis.com/public-download-files/hgnc/archive/archive/quarterly/tsv/hgnc_complete_set_{hgnc_version}.txt") -hgnc_tbl <- read_tsv(hgnc_url, progress = FALSE, show_col_types = FALSE, guess_max = 10000) -hgnc_tbl <- distinct(hgnc_tbl, human_ensembl_gene = ensembl_gene_id, human_entrez_gene = entrez_id) -hgnc_tbl <- mutate(hgnc_tbl, human_entrez_gene = as.integer(human_entrez_gene)) - -# Keep only MSigDB Ensembl IDs -setdiff(hgnc_tbl$human_ensembl_gene, ensembl_tbl$human_ensembl_gene) %>% length() -hgnc_tbl <- filter(hgnc_tbl, human_ensembl_gene %in% ensembl_tbl$human_ensembl_gene) -hgnc_tbl <- arrange(hgnc_tbl, human_ensembl_gene) - -# Check for multi-mappers (should be few) -count(hgnc_tbl, 
human_ensembl_gene, sort = TRUE) -count(hgnc_tbl, human_entrez_gene, sort = TRUE) - -# Generate a gene sets table ----- - -# Create a table for gene sets -msigdbr_genesets <- mdb_tbl %>% - select(!gs_members) %>% - distinct() %>% - arrange(gs_name, gs_id) - -if (nrow(msigdbr_genesets) != sum(mdb_category_genesets$n_genesets)) stop() - -# Extract gene set members ----- - -# Create a table for genes in a tidy/long format (one gene per row) -geneset_genes <- select(mdb_tbl, gs_id, gs_members) -geneset_genes <- mutate(geneset_genes, gs_members_split = strsplit(gs_members, "|", fixed = TRUE)) -geneset_genes <- unnest(geneset_genes, cols = gs_members_split, names_repair = "minimal") -nrow(geneset_genes) %>% prettyNum(big.mark = ",") - -# Remove genes that do not have comma-separated parts (not a proper source gene) -geneset_genes <- filter(geneset_genes, str_detect(gs_members_split, fixed(","))) -nrow(geneset_genes) %>% prettyNum(big.mark = ",") - -# Split member details into separate columns -geneset_genes <- geneset_genes %>% - separate( - col = gs_members_split, - into = c("source_gene", "human_gene_symbol", "human_entrez_gene"), - sep = "," - ) %>% - mutate(human_entrez_gene = as.integer(human_entrez_gene)) -nrow(geneset_genes) %>% prettyNum(big.mark = ",") - -# Check for any strange patterns -count(geneset_genes, source_gene, sort = TRUE) -count(geneset_genes, human_gene_symbol, human_entrez_gene, sort = TRUE) - -# Get the number of members per gene set (for testing) -# Not all members map to unique genes -mdb_geneset_members <- geneset_genes %>% count(gs_id, name = "n_members") -mdb_geneset_members - -# Confirm that gene set sizes are reasonable -if (min(mdb_geneset_members$n_members) < 5) stop() -if (max(mdb_geneset_members$n_members) > 3000) stop() -if (min(geneset_genes$human_entrez_gene, na.rm = TRUE) < 1) stop() - -# Skip genes without an Entrez or Ensembl ID -geneset_genes <- geneset_genes %>% - filter(human_entrez_gene > 0 | str_detect(source_gene, 
"^ENSG000")) -nrow(geneset_genes) %>% prettyNum(big.mark = ",") - -# Keep only the relevant fields -geneset_genes <- geneset_genes %>% - distinct(gs_id, source_gene, human_entrez_gene, human_gene_symbol) -nrow(geneset_genes) %>% prettyNum(big.mark = ",") - -# Add Ensembl IDs to genes without them ----- - -# Split genes based on if they include Ensembl IDs -# Starting with MSigDB 7.0, Ensembl is the platform annotation authority -# Add internal gene ID to track both Entrez and Ensembl genes -# Using Ensembl IDs as IDs for all genes resulted in a larger data file -geneset_genes_entrez <- geneset_genes %>% - filter(str_detect(source_gene, "^ENSG000", negate = TRUE)) %>% - distinct(gs_id, human_entrez_gene, human_gene_symbol) -geneset_genes_ensembl <- geneset_genes %>% - filter(str_detect(source_gene, "^ENSG000")) %>% - select(gs_id, human_entrez_gene, human_ensembl_gene = source_gene, human_gene_symbol) %>% - mutate(human_gene_symbol = if_else(human_gene_symbol == "", human_ensembl_gene, human_gene_symbol)) - -# Very few gene sets should have only some source genes as Ensembl IDs -intersect(geneset_genes_entrez$gs_id, geneset_genes_ensembl$gs_id) - -# Check the number of genes -nrow(geneset_genes_entrez) %>% prettyNum(big.mark = ",") -n_distinct(geneset_genes_entrez$human_gene_symbol) -n_distinct(geneset_genes_entrez$human_entrez_gene) -nrow(geneset_genes_ensembl) %>% prettyNum(big.mark = ",") -n_distinct(geneset_genes_ensembl$human_gene_symbol) -n_distinct(geneset_genes_ensembl$human_ensembl_gene) - -if (length(setdiff(geneset_genes_entrez$human_gene_symbol, ensembl_tbl$human_gene_symbol))) stop() - -# Further split genes without Ensembl IDs based on HGNC Ensembl IDs -geneset_genes_entrez_hgnc <- geneset_genes_entrez %>% - filter(human_entrez_gene %in% hgnc_tbl$human_entrez_gene) -geneset_genes_entrez_ensembl <- geneset_genes_entrez %>% - filter(!human_entrez_gene %in% hgnc_tbl$human_entrez_gene) - -# Add Ensembl IDs to genes without them -geneset_genes_entrez_hgnc 
<- left_join(geneset_genes_entrez_hgnc, hgnc_tbl, by = "human_entrez_gene") -geneset_genes_entrez_ensembl <- left_join(geneset_genes_entrez_ensembl, ensembl_tbl, by = "human_gene_symbol") - -# Check the number of genes -nrow(geneset_genes_entrez_hgnc) %>% prettyNum(big.mark = ",") -n_distinct(geneset_genes_entrez_hgnc$human_entrez_gene) -n_distinct(geneset_genes_entrez_hgnc$human_ensembl_gene) -nrow(geneset_genes_entrez_ensembl) %>% prettyNum(big.mark = ",") -n_distinct(geneset_genes_entrez_ensembl$human_entrez_gene) -n_distinct(geneset_genes_entrez_ensembl$human_ensembl_gene) - -# Combine different types of genes into a single table -geneset_genes_clean <- - bind_rows(geneset_genes_entrez_hgnc, geneset_genes_entrez_ensembl, geneset_genes_ensembl) %>% - mutate(gene_id = str_remove(human_ensembl_gene, "ENSG000")) %>% - mutate(gene_id = as.integer(gene_id)) %>% - distinct() %>% - arrange(gs_id, gene_id) -nrow(geneset_genes_clean) %>% prettyNum(big.mark = ",") - -# Make internal IDs consecutive -geneset_genes_clean$gene_id <- dense_rank(geneset_genes_clean$gene_id) -geneset_genes_clean %>% - count(human_gene_symbol, gene_id) %>% - arrange(human_gene_symbol) -geneset_genes_clean %>% - count(human_ensembl_gene, gene_id) %>% - arrange(human_ensembl_gene) - -# Generate a gene set members table ----- - -# Combine Entrez and Ensembl genes into a single table -msigdbr_geneset_genes <- geneset_genes_clean %>% - distinct(gs_id, gene_id) %>% - arrange(gs_id, gene_id) - -# Check gene numbers -nrow(geneset_genes) %>% prettyNum(big.mark = ",") -nrow(msigdbr_geneset_genes) %>% prettyNum(big.mark = ",") - -# Check that all the original gene sets are present -if (length(setdiff(mdb_geneset_members$gs_id, msigdbr_geneset_genes$gs_id)) > 0) stop() - -# Check that most of the original gene set members converted to genes -if (nrow(msigdbr_geneset_genes) < (sum(mdb_geneset_members$n_members) * 0.85)) stop() -genes_members_ratio <- full_join(mdb_geneset_members, 
count(msigdbr_geneset_genes, gs_id, name = "n_genes"), by = "gs_id") -genes_members_ratio$ratio <- genes_members_ratio$n_genes / genes_members_ratio$n_members -if (min(genes_members_ratio$n_genes) < 5) stop() -if (max(genes_members_ratio$n_genes) > 2300) stop() -if (max(genes_members_ratio$ratio) > 2.2) stop() -if (quantile(genes_members_ratio$ratio, 0.99) > 1) stop() -if (quantile(genes_members_ratio$ratio, 0.001) < 0.3) stop() -if (quantile(genes_members_ratio$ratio, 0.1) < 0.7) stop() -if (quantile(genes_members_ratio$ratio, 0.2) < 0.9) stop() -if (quantile(genes_members_ratio$ratio, 0.3) < 0.99) stop() - -# Generate a genes table ----- - -# Extract the unique genes -msigdbr_genes <- geneset_genes_clean %>% - distinct(gene_id, human_gene_symbol, human_entrez_gene, human_ensembl_gene) %>% - arrange(human_gene_symbol, gene_id) - -# Check the total number of genes -nrow(msigdbr_genes) %>% prettyNum(big.mark = ",") - -# Prepare package ----- - -# Check the size of final tables -format(object.size(msigdbr_genesets), units = "Mb") -format(object.size(msigdbr_geneset_genes), units = "Mb") -format(object.size(msigdbr_genes), units = "Mb") - -# Create package data -use_data( - msigdbr_genesets, - msigdbr_geneset_genes, - msigdbr_genes, - internal = TRUE, - overwrite = TRUE, - compress = "xz" -) diff --git a/man/figures/lifecycle-deprecated.svg b/man/figures/lifecycle-deprecated.svg new file mode 100644 index 0000000..b61c57c --- /dev/null +++ b/man/figures/lifecycle-deprecated.svg @@ -0,0 +1,21 @@ + + lifecycle: deprecated + + + + + + + + + + + + + + + lifecycle + + deprecated + + diff --git a/man/figures/lifecycle-experimental.svg b/man/figures/lifecycle-experimental.svg new file mode 100644 index 0000000..5d88fc2 --- /dev/null +++ b/man/figures/lifecycle-experimental.svg @@ -0,0 +1,21 @@ + + lifecycle: experimental + + + + + + + + + + + + + + + lifecycle + + experimental + + diff --git a/man/figures/lifecycle-stable.svg b/man/figures/lifecycle-stable.svg new file mode 
100644 index 0000000..9bf21e7 --- /dev/null +++ b/man/figures/lifecycle-stable.svg @@ -0,0 +1,29 @@ + + lifecycle: stable + + + + + + + + + + + + + + + + lifecycle + + + + stable + + + diff --git a/man/figures/lifecycle-superseded.svg b/man/figures/lifecycle-superseded.svg new file mode 100644 index 0000000..db8d757 --- /dev/null +++ b/man/figures/lifecycle-superseded.svg @@ -0,0 +1,21 @@ + + lifecycle: superseded + + + + + + + + + + + + + + + lifecycle + + superseded + + diff --git a/man/msigdbr-package.Rd b/man/msigdbr-package.Rd new file mode 100644 index 0000000..6cb6e09 --- /dev/null +++ b/man/msigdbr-package.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/msigdbr-package.R +\docType{package} +\name{msigdbr-package} +\alias{msigdbr-package} +\title{msigdbr: MSigDB Gene Sets for Multiple Organisms in a Tidy Data Format} +\description{ +Provides the 'Molecular Signatures Database' (MSigDB) gene sets typically used with the 'Gene Set Enrichment Analysis' (GSEA) software (Subramanian et al. 2005 \doi{10.1073/pnas.0506580102}, Liberzon et al. 2015 \doi{10.1016/j.cels.2015.12.004}, Castanza et al. 2023 \doi{10.1038/s41592-023-02014-7}) as an R data frame. The package includes the human genes as listed in MSigDB as well as the corresponding symbols and IDs for frequently studied model organisms such as mouse, rat, pig, fly, and yeast. 
+} +\seealso{ +Useful links: +\itemize{ + \item \url{https://igordot.github.io/msigdbr/} + \item Report bugs at \url{https://github.com/igordot/msigdbr/issues} +} + +} +\author{ +\strong{Maintainer}: Igor Dolgalev \email{igor.dolgalev@nyumc.org} (\href{https://orcid.org/0000-0003-4451-126X}{ORCID}) + +} +\keyword{internal} diff --git a/man/msigdbr.Rd b/man/msigdbr.Rd index 646bc93..ea623b8 100644 --- a/man/msigdbr.Rd +++ b/man/msigdbr.Rd @@ -4,21 +4,44 @@ \alias{msigdbr} \title{Retrieve the gene sets data frame} \usage{ -msigdbr(species = "Homo sapiens", category = NULL, subcategory = NULL) +msigdbr( + species = "Homo sapiens", + db_species = "HS", + collection = NULL, + subcollection = NULL, + category = deprecated(), + subcategory = deprecated() +) } \arguments{ -\item{species}{Species name, such as Homo sapiens or Mus musculus.} +\item{species}{Species name for output genes, such as \code{"Homo sapiens"} or \code{"Mus musculus"}. Use \code{msigdbr_species()} for available options.} -\item{category}{MSigDB collection abbreviation, such as H or C1.} +\item{db_species}{Species abbreviation for the human or mouse databases (\code{"HS"} or \code{"MM"}).} -\item{subcategory}{MSigDB sub-collection abbreviation, such as CGP or BP.} +\item{collection}{Collection abbreviation, such as \code{"H"} or \code{"C1"}. Use \code{msigdbr_collections()} for the available options.} + +\item{subcollection}{Sub-collection abbreviation, such as \code{"CGP"} or \code{"BP"}. 
Use \code{msigdbr_collections()} for the available options.} + +\item{category}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} use the \code{collection} argument} + +\item{subcategory}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} use the \code{subcollection} argument} } \value{ A data frame of gene sets with one gene per row. } \description{ Retrieve a data frame of gene sets and their member genes. -The available species and collections can be checked with \code{msigdbr_species()} and \code{msigdbr_collections()}. +The original human genes can be converted into their corresponding counterparts in various model organisms, including mouse, rat, pig, zebrafish, fly, and yeast. +The output includes gene symbols along with NCBI and Ensembl IDs. +} +\details{ +Historically, the MSigDB resource has been tailored to the analysis of human-specific datasets, with gene sets exclusively aligned to the human genome. +Starting with release 2022.1, MSigDB incorporated a database of mouse-native gene sets and was split into human and mouse divisions ("Hs" and "Mm"). +Each one is provided in the approved gene symbols of its respective species. +The versioning convention of MSigDB is in the format \code{Year.Release.Species}. +The genes within each gene set may originate from a species different from the database target species, indicated by the \code{gs_source_species} and \code{db_target_species} fields. + +Mouse MSigDB includes gene sets curated from mouse-centric datasets and specified in native mouse gene identifiers, eliminating the need for ortholog mapping. 
} \examples{ # get all human gene sets @@ -28,9 +51,9 @@ msigdbr(species = "Homo sapiens") # get mouse C2 (curated) CGP (chemical and genetic perturbations) gene sets \donttest{ -msigdbr(species = "Mus musculus", category = "C2", subcategory = "CGP") +msigdbr(species = "Mus musculus", collection = "C2", subcollection = "CGP") } } \references{ -\url{https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp} +\url{https://www.gsea-msigdb.org/gsea/msigdb/index.jsp} } diff --git a/man/msigdbr_check_data.Rd b/man/msigdbr_check_data.Rd new file mode 100644 index 0000000..32e2e9f --- /dev/null +++ b/man/msigdbr_check_data.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/msigdbr-check-data.R +\name{msigdbr_check_data} +\alias{msigdbr_check_data} +\title{Check that the data package is installed} +\usage{ +msigdbr_check_data() +} +\description{ +Check that the 'msigdbdf' data package is installed. +If not, provide instructions for installation. +A dependency listed in DESCRIPTION Suggests is not guaranteed to be installed. +} diff --git a/man/msigdbr_collections.Rd b/man/msigdbr_collections.Rd index 017a553..a1f2959 100644 --- a/man/msigdbr_collections.Rd +++ b/man/msigdbr_collections.Rd @@ -4,7 +4,10 @@ \alias{msigdbr_collections} \title{List the collections available in the msigdbr package} \usage{ -msigdbr_collections() +msigdbr_collections(db_species = "Hs") +} +\arguments{ +\item{db_species}{Species abbreviation for the human or mouse databases (\code{"Hs"} or \code{"Mm"}).} } \value{ A data frame of the available collections. 
diff --git a/man/msigdbr_show_species.Rd b/man/msigdbr_show_species.Rd deleted file mode 100644 index dfc6097..0000000 --- a/man/msigdbr_show_species.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/msigdbr-species.R -\name{msigdbr_show_species} -\alias{msigdbr_show_species} -\title{List the species available in the msigdbr package} -\usage{ -msigdbr_show_species() -} -\value{ -A vector of possible species. -} -\description{ -This function is being deprecated and replaced by \code{msigdbr_species()}. -} diff --git a/man/pipe.Rd b/man/pipe.Rd deleted file mode 100644 index a648c29..0000000 --- a/man/pipe.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-pipe.R -\name{\%>\%} -\alias{\%>\%} -\title{Pipe operator} -\usage{ -lhs \%>\% rhs -} -\arguments{ -\item{lhs}{A value or the magrittr placeholder.} - -\item{rhs}{A function call using the magrittr semantics.} -} -\value{ -The result of calling \code{rhs(lhs)}. -} -\description{ -See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 
-} -\keyword{internal} diff --git a/tests/testthat/test-msigdbr-collections.R b/tests/testthat/test-msigdbr-collections.R index f08577a..4c6ec79 100644 --- a/tests/testthat/test-msigdbr-collections.R +++ b/tests/testthat/test-msigdbr-collections.R @@ -1,16 +1,25 @@ -library(msigdbr) - test_that("msigdbr_collections()", { collections <- msigdbr_collections() expect_s3_class(collections, "tbl_df") - expect_gt(nrow(collections), 20) - expect_lt(nrow(collections), 25) - expect_match(collections$gs_cat, "H", fixed = TRUE, all = FALSE) - expect_match(collections$gs_cat, "C2", fixed = TRUE, all = FALSE) - expect_match(collections$gs_cat, "C7", fixed = TRUE, all = FALSE) - expect_match(collections$gs_cat, "C8", fixed = TRUE, all = FALSE) - expect_match(collections$gs_subcat, "CGP", fixed = TRUE, all = FALSE) - expect_match(collections$gs_subcat, "CP:REACTOME", fixed = TRUE, all = FALSE) - expect_match(collections$gs_subcat, "GO:BP", fixed = TRUE, all = FALSE) - expect_match(collections$gs_subcat, "HPO", fixed = TRUE, all = FALSE) + expect_gt(nrow(collections), 10) + expect_lt(nrow(collections), 30) + expect_match(collections$gs_collection, "H", fixed = TRUE, all = FALSE) + expect_match(collections$gs_collection, "C2", fixed = TRUE, all = FALSE) + expect_match(collections$gs_collection, "C7", fixed = TRUE, all = FALSE) + expect_match(collections$gs_collection, "C8", fixed = TRUE, all = FALSE) +}) + +test_that("msigdbr_collections() db_species", { + chs <- msigdbr_collections(db_species = "Hs") + expect_s3_class(chs, "tbl_df") + expect_gt(nrow(chs), 10) + expect_lt(nrow(chs), 30) + expect_match(chs$gs_collection, "H", fixed = TRUE, all = FALSE) + expect_match(chs$gs_collection, "C8", fixed = TRUE, all = FALSE) + cmm <- msigdbr_collections(db_species = "Mm") + expect_s3_class(cmm, "tbl_df") + expect_gt(nrow(cmm), 10) + expect_lt(nrow(cmm), 30) + expect_match(cmm$gs_collection, "MH", fixed = TRUE, all = FALSE) + expect_match(cmm$gs_collection, "M8", fixed = TRUE, all = FALSE) 
}) diff --git a/tests/testthat/test-msigdbr-species.R b/tests/testthat/test-msigdbr-species.R index d1c9561..6554c6c 100644 --- a/tests/testthat/test-msigdbr-species.R +++ b/tests/testthat/test-msigdbr-species.R @@ -1,5 +1,3 @@ -library(msigdbr) - test_that("msigdbr_species()", { species <- msigdbr_species() expect_s3_class(species, "tbl_df") @@ -9,7 +7,3 @@ test_that("msigdbr_species()", { expect_match(species$species_name, "Mus musculus", fixed = TRUE, all = FALSE) expect_match(species$species_name, "Drosophila melanogaster", fixed = TRUE, all = FALSE) }) - -test_that("msigdbr_show_species()", { - expect_warning(msigdbr_show_species()) -}) diff --git a/tests/testthat/test-msigdbr.R b/tests/testthat/test-msigdbr.R index 032ec41..7df8265 100644 --- a/tests/testthat/test-msigdbr.R +++ b/tests/testthat/test-msigdbr.R @@ -1,192 +1,163 @@ -library(msigdbr) library(dplyr) -test_that("human gene sets overall stats", { - msigdbr_hs <- msigdbr() - expect_s3_class(msigdbr_hs, "tbl_df") - expect_identical(msigdbr_hs, msigdbr(species = "human")) - expect_gt(nrow(msigdbr_hs), 3900000) - expect_identical(colnames(msigdbr_hs)[1:6], c("gs_cat", "gs_subcat", "gs_name", "gene_symbol", "entrez_gene", "ensembl_gene")) - expect_gt(n_distinct(msigdbr_hs$gs_id), 33000) - expect_gt(n_distinct(msigdbr_hs$gene_symbol), 40000) - expect_gt(n_distinct(msigdbr_hs$entrez_gene), 40000) - expect_gt(n_distinct(msigdbr_hs$ensembl_gene), 40000) - # msigdbr_hs %>% count(gs_id) %>% arrange(n) - expect_equal(min(table(msigdbr_hs$gs_id)), 5) - # msigdbr_hs %>% count(gs_id) %>% arrange(desc(n)) - expect_lt(max(table(msigdbr_hs$gs_id)), 2400) - expect_lt(quantile(table(msigdbr_hs$gs_id), 0.999), 2000) - expect_lt(quantile(table(msigdbr_hs$gs_id), 0.98), 1000) - expect_lt(quantile(table(msigdbr_hs$gs_id), 0.9), 250) - expect_gt(quantile(table(msigdbr_hs$gs_id), 0.9), 200) - expect_gt(quantile(table(msigdbr_hs$gs_id), 0.5), 40) - expect_gt(quantile(table(msigdbr_hs$gs_id), 0.2), 10) - msigdbr_hs_symbol <- 
distinct(msigdbr_hs, gs_id, gene_symbol) - # msigdbr_hs_symbol %>% count(gs_id) %>% arrange(desc(n)) - expect_gt(nrow(msigdbr_hs_symbol), 3800000) - expect_lt(max(table(msigdbr_hs_symbol$gs_id)), 2300) - msigdbr_hs_entrez <- distinct(msigdbr_hs, gs_id, entrez_gene) - # msigdbr_hs_entrez %>% count(gs_id) %>% arrange(desc(n)) - expect_gt(nrow(msigdbr_hs_entrez), 3800000) - expect_gt(max(table(msigdbr_hs_entrez$gs_id)), 1990) - expect_lt(max(table(msigdbr_hs_entrez$gs_id)), 2000) +test_that("species variations", { + # human genes + m_hs_hs <- msigdbr() + expect_s3_class(m_hs_hs, "tbl_df") + expect_identical(m_hs_hs, msigdbr(species = "Homo sapiens")) + expect_identical(m_hs_hs, msigdbr(db_species = "hs", species = "human")) + # mouse genes + m_hs_mm <- msigdbr(species = "Mus musculus") + expect_s3_class(m_hs_mm, "tbl_df") + expect_identical(m_hs_mm, msigdbr(db_species = "hs", species = "mouse")) + # rat genes + m_hs_rn <- msigdbr(species = "Rattus norvegicus") + expect_s3_class(m_hs_rn, "tbl_df") + # mouse database and genes + m_mm_mm <- msigdbr(db_species = "mm", species = "Mus musculus") + expect_s3_class(m_mm_mm, "tbl_df") + # column names should be identical (extra output with orthologs) + expect_identical(names(m_hs_hs)[1:19], names(m_hs_mm)[1:19]) + expect_identical(names(m_hs_mm)[1:19], names(m_hs_rn)[1:19]) + expect_identical(names(m_mm_mm)[1:19], names(m_hs_mm)[1:19]) + # ortholog conversion should not reduce the database size substantially + expect_gt(nrow(m_hs_mm), nrow(m_hs_hs) * 0.9) + expect_gt(nrow(m_hs_rn), nrow(m_hs_hs) * 0.9) + # non-supported combinations + expect_error(msigdbr(db_species = "mm", species = "Homo sapiens")) + expect_error(msigdbr(db_species = "mm", species = "human")) + expect_error(msigdbr(db_species = "mm", species = "Rattus norvegicus")) }) -test_that("mouse gene sets overall stats", { - msigdbr_mm <- msigdbr(species = "Mus musculus") - expect_s3_class(msigdbr_mm, "tbl_df") - expect_identical(msigdbr_mm, msigdbr(species = 
"mouse")) - expect_gt(nrow(msigdbr_mm), 3700000) - expect_identical(colnames(msigdbr_mm)[1:6], c("gs_cat", "gs_subcat", "gs_name", "gene_symbol", "entrez_gene", "ensembl_gene")) - expect_gt(n_distinct(msigdbr_mm$gs_id), 33000) - expect_gt(n_distinct(msigdbr_mm$human_gene_symbol), 18000) - expect_gt(n_distinct(msigdbr_mm$gene_symbol), 17000) - expect_gt(n_distinct(msigdbr_mm$entrez_gene), 17000) - expect_gt(n_distinct(msigdbr_mm$ensembl_gene), 17000) - expect_equal(max(msigdbr_mm$num_ortholog_sources), 12) - expect_gt(min(table(msigdbr_mm$gs_id)), 0) - expect_lt(max(table(msigdbr_mm$gs_id)), 2500) - expect_gt(quantile(table(msigdbr_mm$gs_id), 0.8), 100) - expect_gt(quantile(table(msigdbr_mm$gs_id), 0.5), 40) +test_that("human db human genes", { + m_hs <- msigdbr() + expect_s3_class(m_hs, "tbl_df") + expect_identical(m_hs, msigdbr(species = "human")) + expect_identical(m_hs, msigdbr(db_species = "hs", species = "human")) + expect_gt(nrow(m_hs), 1000000) + expect_identical(names(m_hs)[1:3], c("gene_symbol", "ncbi_gene", "ensembl_gene")) + expect_identical(names(m_hs)[4:8], c("db_gene_symbol", "db_ncbi_gene", "db_ensembl_gene", "source_gene", "gs_id")) + expect_gt(n_distinct(m_hs$gs_id), 30000) + expect_gt(n_distinct(m_hs$gene_symbol), 40000) + expect_gt(n_distinct(m_hs$ncbi_gene), 40000) + expect_gt(n_distinct(m_hs$ensembl_gene), 40000) + expect_equal(min(table(m_hs$gs_id)), 5) + m_hs_sym <- distinct(m_hs, gs_id, gene_symbol) + expect_gt(nrow(m_hs_sym), 1000000) }) -test_that("rat gene sets overall stats", { - msigdbr_rn <- msigdbr(species = "Rattus norvegicus") - expect_s3_class(msigdbr_rn, "tbl_df") - expect_identical(msigdbr_rn, msigdbr(species = "rat")) - expect_gt(nrow(msigdbr_rn), 3600000) - expect_gt(n_distinct(msigdbr_rn$gs_id), 33000) - expect_gt(n_distinct(msigdbr_rn$human_gene_symbol), 15000) - expect_gt(n_distinct(msigdbr_rn$gene_symbol), 15000) - expect_equal(max(msigdbr_rn$num_ortholog_sources), 10) - expect_gt(min(table(msigdbr_rn$gs_id)), 0) - 
expect_lt(max(table(msigdbr_rn$gs_id)), 2000) - expect_gt(quantile(table(msigdbr_rn$gs_id), 0.8), 100) - expect_gt(quantile(table(msigdbr_rn$gs_id), 0.5), 40) +test_that("human db mouse genes", { + m_mm <- msigdbr(species = "Mus musculus") + expect_s3_class(m_mm, "tbl_df") + expect_identical(m_mm, msigdbr(species = "mouse")) + expect_gt(nrow(m_mm), 1000000) + expect_gt(n_distinct(m_mm$gs_id), 30000) + expect_gt(n_distinct(m_mm$gene_symbol), 15000) + expect_gt(n_distinct(m_mm$ncbi_gene), 15000) + expect_gt(n_distinct(m_mm$ensembl_gene), 15000) + expect_equal(max(m_mm$num_ortholog_sources), 12) }) -test_that("human hallmark category", { - msigdbr_hs_h <- msigdbr(species = "Homo sapiens", category = "H") - expect_s3_class(msigdbr_hs_h, "tbl_df") - expect_gt(nrow(msigdbr_hs_h), 5000) - expect_equal(n_distinct(msigdbr_hs_h$gs_cat), 1) - expect_equal(n_distinct(msigdbr_hs_h$gs_subcat), 1) - expect_equal(n_distinct(msigdbr_hs_h$gs_id), 50) - expect_gt(min(table(msigdbr_hs_h$gs_id)), 30) - expect_lt(max(table(msigdbr_hs_h$gs_id)), 350) - msigdbr_hs_h_entrez <- distinct(msigdbr_hs_h, gs_id, entrez_gene) - expect_gt(min(table(msigdbr_hs_h_entrez$gs_id)), 30) - expect_equal(max(table(msigdbr_hs_h_entrez$gs_id)), 200) - msigdbr_hs_h_symbol <- distinct(msigdbr_hs_h, gs_id, gene_symbol) - expect_gt(min(table(msigdbr_hs_h_symbol$gs_id)), 30) - expect_equal(max(table(msigdbr_hs_h_symbol$gs_id)), 200) +test_that("human db rat genes", { + m_rn <- msigdbr(species = "Rattus norvegicus") + expect_s3_class(m_rn, "tbl_df") + expect_identical(m_rn, msigdbr(species = "rat")) + expect_gt(nrow(m_rn), 1000000) + expect_gt(n_distinct(m_rn$gs_id), 30000) + expect_gt(n_distinct(m_rn$gene_symbol), 15000) + expect_gt(n_distinct(m_rn$ncbi_gene), 15000) + expect_gt(n_distinct(m_rn$ensembl_gene), 15000) + expect_equal(max(m_rn$num_ortholog_sources), 10) }) -test_that("mouse hallmark category", { - msigdbr_mm_h <- msigdbr(species = "Mus musculus", category = "H") - expect_s3_class(msigdbr_mm_h, 
"tbl_df") - expect_gt(nrow(msigdbr_mm_h), 5000) - expect_equal(n_distinct(msigdbr_mm_h$gs_cat), 1) - expect_equal(n_distinct(msigdbr_mm_h$gs_subcat), 1) - expect_equal(n_distinct(msigdbr_mm_h$gs_id), 50) - expect_gt(min(table(msigdbr_mm_h$gs_id)), 30) - expect_lt(max(table(msigdbr_mm_h$gs_id)), 250) - msigdbr_mm_h_entrez <- distinct(msigdbr_mm_h, gs_id, entrez_gene) - expect_gt(min(table(msigdbr_mm_h_entrez$gs_id)), 30) - expect_lt(max(table(msigdbr_mm_h_entrez$gs_id)), 220) - msigdbr_mm_h_symbol <- distinct(msigdbr_mm_h, gs_id, gene_symbol) - expect_gt(min(table(msigdbr_mm_h_symbol$gs_id)), 30) - expect_lt(max(table(msigdbr_mm_h_symbol$gs_id)), 220) +test_that("human hallmark category", { + m_hs_h <- msigdbr(species = "Homo sapiens", collection = "H") + expect_s3_class(m_hs_h, "tbl_df") + expect_gt(nrow(m_hs_h), 5000) + expect_equal(n_distinct(m_hs_h$gs_collection), 1) + expect_equal(n_distinct(m_hs_h$gs_subcollection), 1) + expect_equal(n_distinct(m_hs_h$gs_id), 50) + expect_gt(min(table(m_hs_h$gs_id)), 30) + expect_lt(max(table(m_hs_h$gs_id)), 350) + m_hs_h_ncbi <- distinct(m_hs_h, gs_id, ncbi_gene) + expect_gt(min(table(m_hs_h_ncbi$gs_id)), 30) + expect_equal(max(table(m_hs_h_ncbi$gs_id)), 200) + m_hs_h_sym <- distinct(m_hs_h, gs_id, gene_symbol) + expect_gt(min(table(m_hs_h_sym$gs_id)), 30) + expect_equal(max(table(m_hs_h_sym$gs_id)), 200) }) -test_that("human CGP subcategory", { - msigdbr_hs_cgp <- msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CGP") - expect_s3_class(msigdbr_hs_cgp, "tbl_df") - expect_gt(nrow(msigdbr_hs_cgp), 100000) - expect_equal(n_distinct(msigdbr_hs_cgp$gs_cat), 1) - expect_equal(n_distinct(msigdbr_hs_cgp$gs_subcat), 1) - expect_gt(n_distinct(msigdbr_hs_cgp$gs_id), 3000) - expect_lt(n_distinct(msigdbr_hs_cgp$gs_id), 5000) -}) +# test_that("mouse hallmark category", { +# msigdbr_mm_h <- msigdbr(species = "Mus musculus", category = "H") +# expect_s3_class(msigdbr_mm_h, "tbl_df") +# expect_gt(nrow(msigdbr_mm_h), 5000) +# 
expect_equal(n_distinct(msigdbr_mm_h$gs_cat), 1) +# expect_equal(n_distinct(msigdbr_mm_h$gs_subcat), 1) +# expect_equal(n_distinct(msigdbr_mm_h$gs_id), 50) +# expect_gt(min(table(msigdbr_mm_h$gs_id)), 30) +# expect_lt(max(table(msigdbr_mm_h$gs_id)), 250) +# msigdbr_mm_h_entrez <- distinct(msigdbr_mm_h, gs_id, entrez_gene) +# expect_gt(min(table(msigdbr_mm_h_entrez$gs_id)), 30) +# expect_lt(max(table(msigdbr_mm_h_entrez$gs_id)), 220) +# msigdbr_mm_h_symbol <- distinct(msigdbr_mm_h, gs_id, gene_symbol) +# expect_gt(min(table(msigdbr_mm_h_symbol$gs_id)), 30) +# expect_lt(max(table(msigdbr_mm_h_symbol$gs_id)), 220) +# }) -test_that("human BP subcategory", { - msigdbr_hs_bp <- msigdbr(species = "Homo sapiens", category = "C5", subcategory = "BP") - expect_s3_class(msigdbr_hs_bp, "tbl_df") - expect_gt(nrow(msigdbr_hs_bp), 100000) - expect_equal(n_distinct(msigdbr_hs_bp$gs_cat), 1) - expect_equal(n_distinct(msigdbr_hs_bp$gs_subcat), 1) - expect_gt(n_distinct(msigdbr_hs_bp$gs_id), 7000) - expect_lt(n_distinct(msigdbr_hs_bp$gs_id), 9000) -}) +# test_that("human CGP subcategory", { +# msigdbr_hs_cgp <- msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CGP") +# expect_s3_class(msigdbr_hs_cgp, "tbl_df") +# expect_gt(nrow(msigdbr_hs_cgp), 100000) +# expect_equal(n_distinct(msigdbr_hs_cgp$gs_cat), 1) +# expect_equal(n_distinct(msigdbr_hs_cgp$gs_subcat), 1) +# expect_gt(n_distinct(msigdbr_hs_cgp$gs_id), 3000) +# expect_lt(n_distinct(msigdbr_hs_cgp$gs_id), 5000) +# }) -test_that("rat BP subcategory", { - msigdbr_rn_bp <- msigdbr(species = "Rattus norvegicus", category = "C5", subcategory = "BP") - expect_s3_class(msigdbr_rn_bp, "tbl_df") - expect_gt(nrow(msigdbr_rn_bp), 500000) - expect_equal(n_distinct(msigdbr_rn_bp$gs_cat), 1) - expect_equal(n_distinct(msigdbr_rn_bp$gs_subcat), 1) - expect_gt(n_distinct(msigdbr_rn_bp$gs_id), 7000) - expect_lt(n_distinct(msigdbr_rn_bp$gs_id), 9000) -}) +# test_that("human BP subcategory", { +# msigdbr_hs_bp <- msigdbr(species = 
"Homo sapiens", category = "C5", subcategory = "BP") +# expect_s3_class(msigdbr_hs_bp, "tbl_df") +# expect_gt(nrow(msigdbr_hs_bp), 100000) +# expect_equal(n_distinct(msigdbr_hs_bp$gs_cat), 1) +# expect_equal(n_distinct(msigdbr_hs_bp$gs_subcat), 1) +# expect_gt(n_distinct(msigdbr_hs_bp$gs_id), 7000) +# expect_lt(n_distinct(msigdbr_hs_bp$gs_id), 9000) +# }) -test_that("subcategory partial match", { - msigdbr_mm_gomf <- msigdbr(species = "Mus musculus", category = "C5", subcategory = "GO:MF") - expect_s3_class(msigdbr_mm_gomf, "tbl_df") - msigdbr_mm_mf <- msigdbr(species = "Mus musculus", category = "C5", subcategory = "MF") - expect_s3_class(msigdbr_mm_mf, "tbl_df") - expect_equal(nrow(msigdbr_mm_gomf), nrow(msigdbr_mm_mf)) - expect_identical(msigdbr_mm_gomf, msigdbr_mm_mf) -}) - -test_that("specific genes present in specific gene sets", { - msigdbr_hs <- msigdbr() - expect_gt(nrow(filter(msigdbr_hs, gene_symbol == "NRAS")), 100) - expect_gt(nrow(filter(msigdbr_hs, gene_symbol == "PIK3CA")), 100) - expect_equal(nrow(filter(msigdbr_hs, gs_id == "M30055", gene_symbol == "FOS")), 1) - expect_equal(nrow(filter(msigdbr_hs, gs_id == "M30055", entrez_gene == 2353)), 1) - expect_equal(nrow(filter(msigdbr_hs, gs_id == "M30055", ensembl_gene == "ENSG00000170345")), 1) - expect_equal(nrow(filter(msigdbr_hs, gs_id == "M40827", gene_symbol == "ABCA11P")), 1) - expect_equal(nrow(filter(msigdbr_hs, gs_id == "M40827", entrez_gene == 79963)), 1) - expect_equal(nrow(filter(msigdbr_hs, gs_id == "M40827", ensembl_gene == "ENSG00000251595")), 1) - expect_equal(nrow(filter(msigdbr_hs, gs_id == "M8918", gene_symbol == "NEPNP")), 1) - expect_equal(nrow(filter(msigdbr_hs, gs_id == "M8918", entrez_gene == 442253)), 1) - expect_equal(nrow(filter(msigdbr_hs, gs_id == "M8918", ensembl_gene == "ENSG00000218233")), 1) -}) +# test_that("rat BP subcategory", { +# msigdbr_rn_bp <- msigdbr(species = "Rattus norvegicus", category = "C5", subcategory = "BP") +# expect_s3_class(msigdbr_rn_bp, "tbl_df") 
+# expect_gt(nrow(msigdbr_rn_bp), 500000) +# expect_equal(n_distinct(msigdbr_rn_bp$gs_cat), 1) +# expect_equal(n_distinct(msigdbr_rn_bp$gs_subcat), 1) +# expect_gt(n_distinct(msigdbr_rn_bp$gs_id), 7000) +# expect_lt(n_distinct(msigdbr_rn_bp$gs_id), 9000) +# }) -test_that("number of genes in specific gene sets", { - msigdbr_hs <- msigdbr() - msigdbr_hs_sym <- distinct(msigdbr_hs, gs_id, gene_symbol) - msigdbr_mm <- msigdbr(species = "Mus musculus") - # H: HALLMARK_APOPTOSIS - expect_equal(nrow(filter(msigdbr_hs_sym, gs_id == "M5902")), 161) - expect_equal(nrow(filter(msigdbr_mm, gs_id == "M5902")), 160) - expect_equal(nrow(filter(msigdbr_hs_sym, gs_id == "M5903")), 32) - expect_equal(nrow(filter(msigdbr_mm, gs_id == "M5903")), 32) - # C8: HAY_BONE_MARROW_PRE_DENDRITIC - expect_equal(nrow(filter(msigdbr_hs_sym, gs_id == "M39207")), 5) - expect_equal(nrow(filter(msigdbr_mm, gs_id == "M39207")), 5) - expect_equal(nrow(filter(msigdbr_hs_sym, gs_id == "M40020")), 12) - expect_equal(nrow(filter(msigdbr_mm, gs_id == "M40020")), 12) - # C2: REACTOME_PYRUVATE_METABOLISM_AND_CITRIC_ACID_TCA_CYCLE - expect_equal(nrow(filter(msigdbr_hs, gs_id == "M490")), 57) - expect_equal(nrow(filter(msigdbr_hs_sym, gs_id == "M490")), 55) - expect_gt(nrow(filter(msigdbr_mm, gs_id == "M490")), 50) - expect_lt(nrow(filter(msigdbr_mm, gs_id == "M490")), 60) - # C8: DESCARTES_FETAL_EYE_STROMAL_CELLS - expect_equal(nrow(filter(msigdbr_hs, gs_id == "M40180")), 95) - expect_equal(nrow(filter(msigdbr_hs_sym, gs_id == "M40180")), 95) - expect_gt(nrow(filter(msigdbr_mm, gs_id == "M40180")), 80) - expect_lt(nrow(filter(msigdbr_mm, gs_id == "M40180")), 95) -}) +# test_that("subcategory partial match", { +# msigdbr_mm_gomf <- msigdbr(species = "Mus musculus", category = "C5", subcategory = "GO:MF") +# expect_s3_class(msigdbr_mm_gomf, "tbl_df") +# msigdbr_mm_mf <- msigdbr(species = "Mus musculus", category = "C5", subcategory = "MF") +# expect_s3_class(msigdbr_mm_mf, "tbl_df") +# 
expect_equal(nrow(msigdbr_mm_gomf), nrow(msigdbr_mm_mf)) +# expect_identical(msigdbr_mm_gomf, msigdbr_mm_mf) +# }) test_that("wrong parameters", { + expect_error(msigdbr(db_species = "X")) + expect_error(msigdbr(db_species = "RN")) expect_error(msigdbr(species = "test")) expect_error(msigdbr(species = c("Homo sapiens", "Mus musculus"))) expect_error(msigdbr(species = "")) expect_error(msigdbr(species = NA)) - expect_error(msigdbr(species = "Homo sapiens", category = "X")) - expect_error(msigdbr(species = "Homo sapiens", category = "X", subcategory = "X")) - expect_error(msigdbr(species = "Homo sapiens", category = "H", subcategory = "H")) - expect_error(msigdbr(species = "Homo sapiens", category = c("C1", "C2"))) - expect_error(msigdbr(species = "Homo sapiens", category = "C2", subcategory = c("CGP", "CP"))) + expect_error(msigdbr(species = "Homo sapiens", collection = "X")) + expect_error(msigdbr(species = "Homo sapiens", collection = "X", subcollection = "X")) + expect_error(msigdbr(species = "Homo sapiens", collection = "H", subcollection = "H")) + expect_error(msigdbr(species = "Homo sapiens", collection = c("C1", "C2"))) + expect_error(msigdbr(species = "Homo sapiens", collection = "C2", subcollection = c("CGP", "CP"))) + # deprecated parameters + expect_warning(msigdbr(species = "Homo sapiens", category = "H")) + expect_warning(msigdbr(species = "Homo sapiens", subcategory = "CGP")) + m_hs <- msigdbr(species = "Homo sapiens", category = "H") + expect_contains(colnames(m_hs), c("gene_symbol", "entrez_gene", "ensembl_gene", "gs_cat", "gs_subcat")) }) diff --git a/vignettes/msigdbr-intro.Rmd b/vignettes/msigdbr-intro.Rmd index 52c65a5..c5579e6 100644 --- a/vignettes/msigdbr-intro.Rmd +++ b/vignettes/msigdbr-intro.Rmd @@ -27,12 +27,12 @@ Depending on the tool, it may be necessary to import the pathways, translate gen The msigdbr R package provides Molecular Signatures Database (MSigDB) gene sets typically used with the Gene Set Enrichment Analysis (GSEA) 
software: -* in an R-friendly "[tidy](https://r4ds.had.co.nz/tidy-data.html)" format with one gene pair per row +* in an R-friendly "[tidy](https://r4ds.hadley.nz/data-tidy.html)" format with one gene pair per row * for multiple frequently studied model organisms, such as mouse, rat, pig, zebrafish, fly, and yeast, in addition to the original human genes * as gene symbols as well as NCBI Entrez and Ensembl IDs -* without accessing external resources and requiring an active internet connection +* without accessing external resources requiring an active internet connection -Please be aware that the homologs were computationally predicted for distinct genes. +Please be aware that the orthologs were computationally predicted at the gene level. The full pathways may not be well conserved across species. ## Installation @@ -58,26 +58,26 @@ all_gene_sets <- msigdbr(species = "Mus musculus") head(all_gene_sets) ``` -You can retrieve data just for a specific collection/category, such as the hallmark gene sets. +You can retrieve data just for a specific collection, such as the hallmark gene sets. ```{r msigdbr-mouse-h} -h_gene_sets <- msigdbr(species = "mouse", category = "H") +h_gene_sets <- msigdbr(species = "mouse", collection = "H") head(h_gene_sets) ``` -You can specify a sub-category, such as C2 (curated) CGP (chemical and genetic perturbations) gene sets. +You can specify a sub-collection, such as C2 (curated) CGP (chemical and genetic perturbations) gene sets. ```{r msigdbr-mouse-c2} -cgp_gene_sets <- msigdbr(species = "mouse", category = "C2", subcategory = "CGP") +cgp_gene_sets <- msigdbr(species = "mouse", collection = "C2", subcollection = "CGP") head(cgp_gene_sets) ``` -If you require more custom filtering, the `msigdbr()` function output is a data frame that can be manipulated using standard methods. -For example, you can subset to a specific collection/category using `dplyr::filter()`. 
+If you require more precise filtering, the `msigdbr()` function output is a data frame that can be manipulated using standard methods. +For example, you can subset to a specific collection using dplyr. ```{r filter-h} -all_gene_sets %>% - dplyr::filter(gs_cat == "H") %>% +all_gene_sets |> + dplyr::filter(gs_collection == "H") |> head() ``` @@ -85,7 +85,8 @@ all_gene_sets %>% There are `msigdbr_species()` and `msigdbr_collections()` helper functions to assist with setting the `msigdbr()` parameters. -You can check the available species with `msigdbr_species()`. Either scientific or common names are acceptable for the `msigdbr()` function. +You can check the available species with `msigdbr_species()`. +Both scientific and common names are acceptable for the `msigdbr()` function. ```{r species} msigdbr_species() @@ -101,11 +102,11 @@ msigdbr_collections() The msigdbr output can be used with various pathway analysis packages. -Use the gene sets data frame for [clusterProfiler](https://bioconductor.org/packages/clusterProfiler/) with genes as Entrez Gene IDs. +Use the gene sets data frame for [clusterProfiler](https://bioconductor.org/packages/clusterProfiler/) with genes as NCBI Entrez Gene IDs. -```{r cp-entrez, eval=FALSE} -msigdbr_t2g <- msigdbr_df %>% - dplyr::distinct(gs_name, entrez_gene) %>% +```{r cp-ncbi, eval=FALSE} +msigdbr_t2g <- msigdbr_df |> + dplyr::distinct(gs_name, ncbi_gene) |> as.data.frame() enricher(gene = gene_ids_vector, TERM2GENE = msigdbr_t2g, ...) ``` @@ -113,8 +114,8 @@ enricher(gene = gene_ids_vector, TERM2GENE = msigdbr_t2g, ...) Use the gene sets data frame for [clusterProfiler](https://bioconductor.org/packages/clusterProfiler/) with genes as gene symbols. ```{r cp-symbols, eval=FALSE} -msigdbr_t2g <- msigdbr_df %>% - dplyr::distinct(gs_name, gene_symbol) %>% +msigdbr_t2g <- msigdbr_df |> + dplyr::distinct(gs_name, gene_symbol) |> as.data.frame() enricher(gene = gene_symbols_vector, TERM2GENE = msigdbr_t2g, ...) 
``` @@ -133,37 +134,34 @@ msigdbr_list <- split(x = msigdbr_df$gene_symbol, f = msigdbr_df$gs_name) gsva(gset.idx.list = msigdbr_list, ...) ``` -## Potential questions or concerns +## Potential questions and concerns **Which version of MSigDB was used?** -This package was generated with MSigDB v2023.1.Hs. -The MSigDB version is used as the base of the msigdbr CRAN package version. -You can check the installed version with `packageVersion("msigdbr")`. +The MSigDB version is stored in the `db_version` column of the returned data frame. +You can check the version used with `unique(msigdbr_df$db_version)`. -**Can I download the gene sets directly from MSigDB instead of using this package?** +**Why use this package when I can download the gene sets directly from MSigDB?** -Yes. -You can then import the GMT files (with `getGmt()` from the `GSEABase` package, for example). -The GMTs only include the human genes, even for gene sets generated from mouse experiments. -If you are working with non-human data, you then have to convert the MSigDB genes to your organism or your genes to human. +This package makes it more convenient to work with MSigDB gene sets in R. +You can download the GMT files and import them (with `getGmt()` from the GSEABase package, for example). +You then have to format the output to be compatible with downstream tools. +If you are working with non-human data, you then have to convert the MSigDB genes to your organism. **Can I convert between human and mouse genes just by adjusting gene capitalization?** -That will work for most genes, but not all. +That will work for most, but not all, genes. **Can I convert human genes to any organism myself instead of using this package?** -Yes. -A popular method is using the `biomaRt` package. +One popular method is using the biomaRt package. You may still end up with dozens of homologs for some genes, so additional cleanup may be helpful. 
**Aren't there already other similar tools?** There are a few resources that provide some of the msigdbr functionality and served as an inspiration for this package. [WEHI](https://bioinf.wehi.edu.au/software/MSigDB/) provides MSigDB gene sets in R format for human and mouse. -[MSigDF](https://github.com/stephenturner/msigdf) relies on the WEHI resource, but is converted to a more tidyverse-friendly data frame. -There is a more recent [ToledoEM/msigdf](https://github.com/ToledoEM/msigdf) fork. +[MSigDF](https://github.com/stephenturner/msigdf) and a more recent [ToledoEM/msigdf](https://github.com/ToledoEM/msigdf) fork provide a tidyverse-friendly data frame. These are updated at varying frequencies and may not be based on the latest version of MSigDB. Since 2022, the GSEA/MSigDB team provides [collections that are natively mouse](https://www.gsea-msigdb.org/gsea/msigdb/mouse/collections.jsp) and don't require orthology conversion. @@ -174,7 +172,7 @@ You can submit feedback and report bugs on [GitHub](https://github.com/igordot/m ## Details The Molecular Signatures Database (MSigDB) is a collection of gene sets originally created for use with the Gene Set Enrichment Analysis (GSEA) software. -To cite use of the underlying MSigDB data, reference Subramanian, Tamayo, et al. (2005, PNAS) and one or more of the following as appropriate: Liberzon, et al. (2011, Bioinformatics), Liberzon, et al. (2015, Cell Systems), and also the source for the gene set. +To cite use of the underlying MSigDB data, reference Subramanian, Tamayo, et al. (2005, PNAS) and one or more of the following as appropriate: Liberzon, et al. (2011, Bioinformatics), Liberzon, et al. (2015, Cell Systems), Castanza, et al. (2023, Nature Methods) and also the source for the gene set. 
Gene homologs are provided by HUGO Gene Nomenclature Committee at the European Bioinformatics Institute which integrates the orthology assertions predicted for human genes by eggNOG, Ensembl Compara, HGNC, HomoloGene, Inparanoid, NCBI Gene Orthology, OMA, OrthoDB, OrthoMCL, Panther, PhylomeDB, TreeFam and ZFIN. For each human equivalent within each species, only the ortholog supported by the largest number of databases is used.