diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 8e9e284..45c8fce 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -18,9 +18,9 @@ jobs: fail-fast: false matrix: config: - - {os: macos-latest, r: 'release'} + # - {os: macos-latest, r: 'release'} #- {os: windows-latest, r: 'release'} - - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + # - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - {os: ubuntu-latest, r: 'release'} #- {os: ubuntu-latest, r: 'oldrel-1'} @@ -47,4 +47,5 @@ jobs: - uses: r-lib/actions/check-r-package@v2 with: upload-snapshots: true + error-on: '"error"' build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' diff --git a/.gitignore b/.gitignore index 086b3a0..a6d9e0e 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,12 @@ docs /doc/ /Meta/ .DS* +inst/rmarkdown/templates/rnaseq/skeleton/DE/Multiplicative_DGE_Analysis.Rmd +.Rdata +.httr-oauth +.DS_Store +.quarto +inst/templates/chipseq/QC/QC.html +*.html +*.csv +*.qs \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION old mode 100644 new mode 100755 index 40012fd..8442eee --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: bcbioR Type: Package Title: Templates and functions to guide downstream analysis and data interpretation -Version: 0.1.3 +Version: 0.3.0 Authors@R: person("Pantano", "Lorena", , "lorena.pantano@gmail.com", role = c("aut", "cre")) Description: Collaborative code repository at the Harvard Chan Bioinformatics Core. @@ -11,18 +11,23 @@ Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.1 Imports: - DESeq2, - stringr, - ggplot2, magrittr, - hues, - ggprism, grDevices, R.utils, readr, + withr, + usethis, fs, - withr + jsonlite, + yaml, + whisker, + rlang, + stringr, + utils Suggests: + hues, + ggprism, + ggplot2, knitr, rmarkdown, testthat (>= 3.0.0) @@ -30,4 +35,4 @@ VignetteBuilder: knitr URL: http://bcb.io/bcbioR/ Config/testthat/edition: 3 Depends: - R (>= 2.10) + R (>= 4.2.0) diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/LICENSE.md b/LICENSE.md old mode 100644 new mode 100755 diff --git a/NAMESPACE b/NAMESPACE old mode 100644 new mode 100755 index 3cc700a..5882394 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,19 +1,20 @@ # Generated by roxygen2: do not edit by hand export(bcbio_nfcore_check) -export(bcbio_set_project) export(bcbio_templates) export(cb_friendly_cols) export(cb_friendly_pal) export(list_cb_friendly_cols) export(scale_color_cb_friendly) export(scale_fill_cb_friendly) -import(DESeq2) import(R.utils) -import(ggplot2) -import(ggprism) -import(hues) +import(fs) +import(usethis) +import(whisker) +import(withr) importFrom(grDevices,colorRampPalette) importFrom(magrittr,"%>%") importFrom(readr,read_csv) importFrom(stringr,str_replace_all) +importFrom(utils,download.file) +importFrom(utils,unzip) diff --git a/NEWS.md b/NEWS.md old mode 100644 new mode 100755 index dd18251..4ab3378 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,22 @@ +# bcbioR 0.3.0 + +* re-structure templates +* Add text with best practices +* Reproducibility: + * test data for RNAseq, singlecell, CHIPseq +* Base project: + * Guidelines to create repo easily + * Example Rmd with headers and aesthetics +* RNASEQ + * Use provenance for FA in DE report + * Support multiple comparisons +* New templates: + * methylation - draft + * singcell cell QC and Inegration - stable + * scQC shiny app - stable + * chipseq QC and Diffbind - beta + * COSMX - draft + # bcbioR 0.1.3 * fix duplicated gene names diff --git a/R/bcbioR-package.R b/R/bcbioR-package.R index 64459df..2ddfc44 100644 --- a/R/bcbioR-package.R +++ b/R/bcbioR-package.R @@ -6,10 +6,12 @@ #' @importFrom magrittr %>% #' @importFrom readr read_csv #' @importFrom stringr str_replace_all +#' @importFrom utils download.file +#' @importFrom utils unzip ## usethis namespace: end -#' @import DESeq2 -#' @import ggplot2 -#' @import hues -#' @import ggprism +#' @import usethis +#' @import fs #' @import R.utils +#' @import withr +#' @import whisker NULL diff --git a/R/cb_friendly.R b/R/cb_friendly.R index 119d94c..90fdcdd 100644 --- a/R/cb_friendly.R +++ b/R/cb_friendly.R @@ -49,12 +49,13 @@ cb_friendly_palettes <- list( `cool` = cb_friendly_cols("blue", "dark_purple", "purple", "sky_blue"), `hot` = cb_friendly_cols("yellow", "light_orange", "dark_orange"), `grey` = cb_friendly_cols("black", "dark_grey", "blue_grey"), - `heatmap` = cb_friendly_cols("blue", "white", "brown") + `heatmap` = cb_friendly_cols("blue", "white", "brown"), + `white_to_blue` = cb_friendly_cols('white', 'blue') ) #' access cb friendly palette by name, reversing if necessary #' -#' @param palette name of the palette to be returned +#' @param palette name of the palette to be returned (main, cool, hot, grey, white_to_blue, or heatmap) #' @param reverse boolean, reverse order of colors in palette #' @param ... pass to ggplot #' @export @@ -66,7 +67,7 @@ cb_friendly_pal <-function(palette = 'main', reverse = F, ...){ #' use cb friendly colors as color aesthetic with ggplot #' -#' @param palette name of the palette to be returned +#' @param palette name of the palette to be returned (main, cool, hot, grey, white_to_blue, or heatmap) #' @param discrete boolean, whether to make palette discretely divided into colors or continuous #' @param reverse boolean, reverse order of colors in palette #' @param ... pass to ggplot @@ -83,7 +84,7 @@ scale_color_cb_friendly <- function(palette = "main", discrete = TRUE, reverse = #' use cb friendly colors as fill aesthetic with ggplot #' -#' @param palette name of the palette to be returned +#' @param palette name of the palette to be returned (main, cool, hot, grey, white_to_blue, or heatmap) #' @param discrete boolean, whether to make palette discretely divided into colors or continuous #' @param reverse boolean, reverse order of colors in palette #' @param ... pass to ggplot diff --git a/R/hello.R b/R/hello.R deleted file mode 100644 index aee6389..0000000 --- a/R/hello.R +++ /dev/null @@ -1,167 +0,0 @@ -.fix <- function(x){ - x <- tolower(x) %>% str_replace_all(., "[[:punct:]]", "_") - x <- str_replace_all(x, " ", "_") - return(x) -} - - -#' Function to check samplesheet for nf-core -#' -#' @param file path to CSV file for nf-core -#' @examples -#' -#' bcbio_nfcore_check(system.file("extdata", "rnaseq_good.csv", package = "bcbioR") ) -#' -#' @export -bcbio_nfcore_check <- function(file){ - required=c("sample","fastq_1","fastq_2","strandedness") - samplesheet=read_csv(file) - - if (!(all(required %in% colnames(samplesheet)))){ - print(colnames(samplesheet)) - stop("Missing required columns ", paste(required, collapse = " ")) - }else if (any(grepl("^[1-9]", samplesheet[["sample"]]))){ - stop("Avoid samples starting with numbers ") - }else if (any(is.na(samplesheet))){ - warning("Columns with missing values") - }else{ - message("All good.") - } -} - -#' Function to help deploy analysis folder inside a project folder -#' -#' This function contains Rmd, R, md, files that help to structure -#' an analysis following HCBC best-practices. -#' For rnaseq, it will deploy: QC and DE Rmd with additional files to help -#' to facilitate the analysis as needed. -#' -#' Normally these helper files are inside a report folder inside a -#' project folder. -#' -#' @param type string indicating the type of analysis, supported: -#' - base -#' - rnaseq, scrnaseq, -#' - teaseq -#' - cosmx -#' -#' @param outpath string path indicating where to copy all the files to -#' @examples -#' \dontrun{ -#' bcbio_templates("rnaseq", "path_to_projects/project1/reports") -#' } -#' @export -bcbio_templates <- function(type="rnaseq", outpath){ - switch(type, - base={ - fpath <- system.file("rmarkdown/templates/common", "skeleton", package="bcbioR") - copyDirectory(fpath, outpath) - }, - rnaseq={ - fpath <- system.file("rmarkdown/templates/rnaseq", "skeleton", package="bcbioR") - copyDirectory(fpath, outpath) - }, - scrnaseq={ - fpath <- system.file("rmarkdown/templates/singlecell", "skeleton", package="bcbioR") - copyDirectory(fpath, outpath) - }, - teaseq={ - fpath <- system.file("rmarkdown/templates/teaseq", "skeleton", package="bcbioR") - copyDirectory(fpath, outpath) - }, - cosmx={ - fpath <- system.file("rmarkdown/templates/cosmx", "skeleton", package="bcbioR") - copyDirectory(fpath, outpath) - }, - { - stop('project type not recognize, please choose: ', 'base', - 'rnaseq', 'scrnaseq', - 'teaseq', 'cosmx') - } - ) -} - -#' Function to help with project name used for parent folder -#' -#' This function will ask for user input about: -#' * numeric code -#' * PI full name -#' * technology -#' * tissue -#' * organism -#' * project description -#' -#' It removes special character with `_`. The output is a guideline to -#' what the folder used can be. -#' -#' @returns A string list with hbc_code, and project folder name -#' @export -bcbio_set_project <- function() { - hbc_code <- readline("What is the hbc code (only numbers):\n") - hbc_code <- paste0("hbc", hbc_code) - pi <- readline("What is PI last name:\n") - technology <- readline("What is the technology:\n") - tissue <- readline("What is the tissue:\n") - org <- readline("What is the organism:\n") - project <- readline("What is the project name:\n") - #dropbox <- readline("What is the dropbox name:\n") - #github_org <- readline("What is the github organization:\n") - #hbc_$technology_of_$pilastname_$intervention_on_$tissue_in_$organism_$hbccode - project_full <- paste(technology, .fix(pi), .fix(project), tissue, org, hbc_code, sep="_") - #github <- c(github_org,project_full) - opts <- list(code=hbc_code, project=project_full) - #dropbox=file.path(dropbox,project_full), - #github=github) - print(opts) - return(opts) -} - - -bcbio_start_project <- function(options) { - -} - -bcbio_gitignore <- function(options) { - -} - -# This function showcases how one might write a function to be used as an -# RStudio project template. This function will be called when the user invokes -# the New Project wizard using the project template defined in the template file -# at: -# -# inst/rstudio/templates/project/hello_world.dcf -# -# The function itself just echos its inputs and outputs to a file called INDEX, -# which is then opened by RStudio when the new project is opened. -rnaseq <- function(path, ...) { - - # ensure path exists - dir.create(path, recursive = TRUE, showWarnings = FALSE) - - # generate header - header <- c( - "# This file was generated by a call to 'ptexamples::hello_world()'.", - "# The following inputs were received:", - "" - ) - - # collect inputs - dots <- list(...) - text <- lapply(seq_along(dots), function(i) { - key <- names(dots)[[i]] - val <- dots[[i]] - paste0(key, ": ", val) - }) - - # collect into single text string - contents <- paste( - paste(header, collapse = "\n"), - paste(text, collapse = "\n"), - sep = "\n" - ) - - # write to index file - writeLines(contents, con = file.path(path, "README.md")) - -} diff --git a/R/helpers.R b/R/helpers.R new file mode 100644 index 0000000..d88209f --- /dev/null +++ b/R/helpers.R @@ -0,0 +1,378 @@ +.fix <- function(x){ + x <- tolower(x) %>% str_replace_all(., "[[:punct:]]", "_") + x <- str_replace_all(x, " ", "_") + return(x) +} + + +#' Function to check samplesheet for nf-core +#' +#' @param file path to CSV file for nf-core +#' @examples +#' +#' bcbio_nfcore_check(system.file("extdata", "rnaseq_good.csv", package = "bcbioR") ) +#' +#' @export +bcbio_nfcore_check <- function(file){ + required=c("sample","fastq_1","fastq_2","strandedness") + samplesheet=read_csv(file) + + if (!(all(required %in% colnames(samplesheet)))){ + print(colnames(samplesheet)) + stop("Missing required columns ", paste(required, collapse = " ")) + }else if (any(grepl("^[1-9]", samplesheet[["sample"]]))){ + stop("Avoid samples starting with numbers ") + }else if (any(grep("[^a-zA-Z0-9_]", samplesheet[["sample"]]))){ + stop("Sample names should contain only letters, numbers, and underscores") + }else if (any(is.na(samplesheet))){ + warning("Columns with missing values") + }else{ + message("All good.") + } +} + +#' Function to help deploy analysis folder inside a project folder +#' +#' This function contains Rmd, R, md, files that help to structure +#' an analysis following HCBC best-practices. +#' For rnaseq, it will deploy: QC and DE Rmd with additional files to help +#' to facilitate the analysis as needed. +#' +#' Normally these helper files are inside a report folder inside a +#' project folder. +#' +#' @param type string indicating the type of analysis, supported: rnaseq. +#' +#' @param outpath string path indicating where to copy all the files to +#' @param org string with the organization name. To deploy specific files. +#' @examples +#' \dontrun{ +#' path <- withr::local_tempdir() +#' bcbio_templates(type="base",outpath=path) +#' fs::dir_ls(path,all=T) +#' } +#' @export +bcbio_templates <- function(type="rnaseq", outpath=NULL, org=NULL){ + if (type=="all"){ + usethis::ui_info("Showing analysis:") + msg <- basename(fs::dir_ls(fs::path_package("bcbioR", "templates"))) + return(msg) + } + if (is.null(outpath)){ + usethis::ui_stop("outpath needs to be defined.") + } + fs::dir_create(outpath) + switch(type, + base={ + #file.copy(fpath, outpath, recursive = T) + copy_templates(outpath, "base", org) + }, + rnaseq={ + #file.copy(fpath, outpath, recursive = T) + copy_templates(outpath, "nf-core/rnaseq", org) + }, + singlecell={ + #file.copy(fpath, outpath, recursive = T) + copy_templates(outpath, "singlecell", org) + }, + singlecell_delux={ + #file.copy(fpath, outpath, recursive = T) + copy_templates(outpath, "singlecell_delux", org) + }, + spatial={ + #file.copy(fpath, outpath, recursive = T) + copy_templates(outpath, "spatial", org) + }, + chipseq={ + #file.copy(fpath, outpath, recursive = T) + copy_templates(outpath, "chipseq", org) + }, + multiomics={ + #file.copy(fpath, outpath, recursive = T) + copy_templates(outpath, "multiomics", org) + }, + { + stop(paste('project type not recognize, please choose: ', + 'rnaseq', 'chipseq', + 'singlecell','singlecell_delux','spatial')) + } + ) +} + +read_pipeline_info <- function(nfcore){ + # pipeline_info/params_2024-05-28_12-28-51.json + config <- fs::path_join(c(nfcore, "pipeline_info")) + params <- fs::dir_ls(config, regexp = "params") + metadata <- jsonlite::read_json(params)[["input"]] + # input + # tmp_rna/pipeline_info/software_versions.yml + software <- fs::path_join(c(nfcore, "pipeline_info", "software_versions.yml")) + software_txt <- yaml::read_yaml(software) + pipeline <- grep("nf-core", names(software_txt$Workflow), value = TRUE) + # Workflow: + # Nextflow: 24.04.1 + # nf-core/rnaseq: 3.14.0 + # check only rnaseq is supported + if (!(pipeline %in% c("nf-core/rnaseq"))){ + ui_stop("Sorry, we don't yet support {ui_value(pipeline)}") + } + list(metadata=metadata, pipeline=pipeline) +} + +render_rmd <- function(infile, outfile, ls_data){ + whisker.render(read_file(infile), + ls_data) %>% + write_file(outfile) +} + +bcbio_params <-function(nfcore_path, pipeline, metadata, copy){ + ui_info("Reading input files from {ui_value(nfcore_path)}") + if (pipeline=="nf-core/rnaseq"){ + if (!copy){ + ls_data<-list( + se_object =fs::path_join(c(nfcore_path, "star_salmon/salmon.merged.gene_counts.rds")), + metadata_fn = metadata, + counts_fn = fs::path_join(c(nfcore_path, "star_salmon/salmon.merged.gene_counts.tsv")), + multiqc_data_dir = fs::path_join(c(nfcore_path, "multiqc/star_salmon/multiqc-report-data/")), + gtf_fn = fs::path_join(c(nfcore_path, "genome/genome.filtered.gtf"))) + return(ls_data) + } + } + +} + +detect_gitignores <- function(path){ + gits <- fs::dir_ls(path, recurse = TRUE, regexp = 'gitignore') + sapply(gits, function(fn){ + hidden <- file.path(dirname(fn), paste0(".", basename(fn))) + fs::file_move(fn, hidden) + }) +} + +copy_files_in_folder<- function(origin, remote, is_org=FALSE){ + to_copy <- fs::dir_ls(origin,all = TRUE) + if (!is_org) { + to_copy <- grep("org", to_copy, + value = TRUE, invert = TRUE) + }else{ + # don't allow doc files + to_copy <- grep(".doc.*$", to_copy, + value = TRUE, invert = TRUE) + } + for (element in to_copy){ + full_new_path <- fs::path_join(c(remote, fs::path_file(element))) + + if (fs::is_dir(element)){ + if (!(fs::dir_exists(full_new_path)) | is_org) + fs::dir_copy(element, full_new_path, overwrite = is_org) + } + if (fs::is_file(element)){ + if (!(fs::file_exists(full_new_path)) | is_org) + fs::file_copy(element, full_new_path, overwrite = is_org) + } + } + detect_gitignores(remote) +} + +deploy_apps <- function(apps, path){ + fs::dir_create(file.path(path, "apps")) + sapply(names(apps), function(app){ + dest_file=file.path(path, "apps", paste0(app, ".zip")) + download.file(url = apps[[app]], + destfile = dest_file) + unzip(zipfile = dest_file, exdir = dirname(dest_file)) + fs::file_delete(dest_file) + }) +} + +copy_templates <- function(path, pipeline, org=NULL){ + apps=list() + base = c("bcbioR") + if (pipeline=="base"){ + parts = c("templates/base") + }else if(pipeline=="nf-core/rnaseq"){ + parts = c("templates/rnaseq") + }else if(pipeline=="singlecell"){ + parts = c("templates/singlecell") + apps=c(apps, scRNAseq_qc="https://github.com/hbc/scRNAseq_qc_app/archive/refs/heads/main.zip") + }else if(pipeline=="singlecell_delux"){ + parts = c("templates/singlecell_delux") + }else if(pipeline=="multiomics"){ + parts = c("templates/multiomics") + }else if(pipeline=="spatial"){ + parts = c("templates/spatial") + }else if(pipeline=="chipseq"){ + parts = c("templates/chipseq") + } + analysis_template <- fs::path_package(base, parts) + + ui_info("Getting templates from {ui_value(analysis_template)}") + # ls_files <- grep("org", list.files(analysis_template, full.names = TRUE), + # value = TRUE, invert = TRUE) + # ui_info("{ui_value(length(ls_files))} amount of files to copy") + copy_files_in_folder(analysis_template, path) + if (!is.null(org)){ + org_template <- fs::path_package(base, parts, "org", org) + if (fs::dir_exists(org_template)){ + ui_info("Getting templates from {ui_value(org_template)}") + copy_files_in_folder(org_template, path, is_org=TRUE) + } + } + + # check org folder is in there + # search for param + _README.md + # concat file to README.md + deploy_apps(apps, path) +} + +bcbio_render <- function(path, pipeline, data){ + + if (pipeline=="nf-core/rnaseq"){ + # analysis_template <- fs::path_package("bcbioR", "templates", "rnaseq", "qc") + # fs::dir_copy(analysis_template, fs::path_join(c(path, "reports", "qc")), overwrite=TRUE) + # analysis_template <- fs::path_package("bcbioR", "templates", "rnaseq", "de") + # fs::dir_copy(analysis_template, fs::path_join(c(path, "reports", "de")), overwrite=TRUE) + render_rmd( + fs::path_join(c(path, "reports", "qc", "QC_nf-core.Rmd")), + fs::path_join(c(path, "reports", "qc", "QC_nf-core.Rmd")), + data + ) + render_rmd( + fs::path_join(c(path, "reports", "de", "DEG.Rmd")), + fs::path_join(c(path, "reports", "de", "DEG.Rmd")), + data + ) + ui_info("Please, to start the analysis, modify these parameter in QC/QC.rmd") + ui_todo("set genome to hg38, mm10, mm39, or other") + ui_todo("set factor_of_interest to a column in your metadata") + }else{ + ui_warn("These are draft templates, are meant to show examples of specific analysis") + ui_todo("Please, read carefully and adapt to your data and question.") + } +} + +# help with bcbio analysis setup +use_bcbio_analysis <- function(path, pipeline, copy=TRUE, metadata=NULL){ + + if (copy){ + # deploy files + ui_info("Rmd templates will be copied but variables path won't be filled automatically.") + if (!is.null(metadata)){ + meta_path <- fs::path_join(c(path, "meta", fs::path_file(metadata))) + if (!(fs::file_exists(metadata))) + ui_stop("{ui_value(metadata)} doesn't exist.") + fs::file_copy(metadata, meta_path) + } + } + if (!is.null(pipeline) & fs::dir_exists(pipeline)){ + # ui_stop("{ui_value(nfcore)} doesn't exist. point to nfcore path or turn on copy mode.") + ui_info("Trying to guess nf-core pipeline at {ui_value(pipeline)}") + # guess analysis from pipeline file + information <- read_pipeline_info(pipeline) + fs::dir_create(fs::path_join(c(path, "meta"))) + meta_path <- fs::path_join(c(path, "meta", fs::path_file(information$metadata))) + pipeline <- information$pipeline + if (!is.null(metadata)){ + if (!(fs::file_exists(metadata))) + ui_stop("{ui_value(metadata)} doesn't exist.") + fs::file_copy(metadata, meta_path) + }else{ + if (!fs::file_exists(information$metadata)){ + ui_warn("{ui_value(metadata)} not found. We can only work with local filesytems right now.") + ui_todo("Please, copy {ui_value(metadata)} to {ui_value(meta_path)}.") + ui_warn("If this file isn't manually set up, the Rmd code will fail.") + }else{ + ui_info("Copy metadata to {ui_value(meta_path)}") + fs::file_copy(information$metadata, meta_path) + } + metadata <- meta_path + } + path_final <- fs::path_join(c(path, "final")) + ui_todo("Please, copy nf-core output directory to {ui_value(path_final)}") + } + # set all files from analysis + copy_templates(fs::path_join(c(path, "reports")), pipeline) + if (fs::dir_exists(pipeline)){ + data <- bcbio_params(nfcore, pipeline, metadata) + bcbio_render(path, pipeline, data) + } + + +} + +# Pilot to deploy full projects at once +# path <- withr::local_tempdir() +# use_bcbio_projects(path,pipeline="nf-core/rnaseq",copy=TRUE) +# fs::dir_ls(path) +use_bcbio_projects <- function(path, pipeline=NULL, metadata=NULL, + git=TRUE, gh=FALSE, org=NULL, copy=TRUE) { + + ui_info("Creating project at {ui_value(path)}") + if (!fs::dir_exists(path)) + fs::dir_create(path, mode = "u=xrw,g=xwr,o=r", recurse = TRUE) + + ui_info("Populating base project") + base_template <- fs::path_package("bcbioR", "templates", "base") + copy_files_in_folder(base_template, path) + + if (!is.null(pipeline)){ + ui_info("Using this pipeline templates {ui_value(pipeline)}") + use_bcbio_analysis(path, pipeline, copy = copy, metadata=metadata) + } + # is_nfcore_ready <- FALSE + # if (is.null(pipeline) && rlang::is_interactive()){ + # is_nfcore_ready <- ui_yeah("Have you already run nf-core pipeline?", + # n_yes=1, n_no =1) + # if (is_nfcore_ready && rlang::is_interactive()){ + # nfcore <- readline("? Enter path to nf-core output: ") + # }else{ + # ui_warn("Please, turn copy = TRUE to only deploy files or,") + # ui_stop("Please use {ui_code('use_bcbio_projects')} again when you have the nf-core output.") + # } + # use_bcbio_analysis(path, nfcore, copy, metadata) + # }else{ + # if (fs::dir_exists(nfcore)){ + # ui_info("Checking {ui_value(nfcore)} as nf-core output directory") + # use_bcbio_analysis(path, nfcore, copy, metadata) + # }else if (copy){ + # # deploy only files + # ui_info("Deploying only templates without pipeline information.") + # use_bcbio_analysis(path, nfcore, copy = TRUE, metadata=metadata) + # }else{ + # ui_warn("Please, provide nfcore working directory or") + # ui_warn("turn copy = TRUE to only deploy files.") + # } + # } + + # if (git){ + # ui_info("Create Git local repo at {ui_value(path)}") + # use_git() + # } + # if (gh){ + # ui_info("Create GitHub repo at {ui_value(path)}") + # whoami <- suppressMessages(gh::gh_whoami()) + # if (is.null(whoami)) { + # ui_warn(c( + # "x" = "Unable to discover a GitHub personal access token.", + # "i" = "A token is required in order to create and push to a new repo.", + # "_" = "Call {.run usethis::gh_token_help()} for help configuring a token." + # )) + # ui_todo("Try this later: use_github(organisation=org), private = TRUE") + # + # } + # use_github(organisation=org, private = TRUE) + # }else{ + # ui_info("You decided not to create a repo, please use this to push when ready") + # ui_todo("Try this later: use_github(organisation=org), private = TRUE") + # } + + answer <- FALSE + if (rlang::is_interactive()) + answer <- ui_yeah("Please, read the README.md file as the session starts.Are you ready?", + n_yes=1, n_no =1, shuffle=FALSE) + if (answer) + proj_activate(path) + if (!answer) + ui_info("Please use proj_activate({ui_value(path)})} to start this project.") + +} diff --git a/inst/rmarkdown/templates/common/skeleton/code/placeholder.R b/R/orgs.R similarity index 100% rename from inst/rmarkdown/templates/common/skeleton/code/placeholder.R rename to R/orgs.R diff --git a/R/test_data.R b/R/test_data.R new file mode 100644 index 0000000..4ba4787 --- /dev/null +++ b/R/test_data.R @@ -0,0 +1,35 @@ +#' Function to help download test data for QC chipseq report +#' +#' It downloads files from our testdata repository: [bcbio/bcbioR-test-data](https://github.com/bcbio/bcbioR-test-data/tree/main/chipseq) +#' +#' It downloads the `bowtie2/mergedLibrary/macs2/narrowPeak` output +#' +#' @export +bcbio_qc_chipseq_testdata <- function(){ + # if using example data to render report, download peaks from github + api_url <- "https://api.github.com/repos/bcbio/bcbioR-test-data/contents/chipseq/bowtie2/mergedLibrary/macs2/narrowPeak" + response <- GET(api_url) + + if (status_code(response) == 200) { + content <- content(response, as = "text") + files_info <- fromJSON(content) %>% filter(name != 'consensus') + + # Filter out file paths and construct raw URLs + file_paths <- files_info$path + raw_base_url <- "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/" + + raw_file_urls <- paste0(raw_base_url, file_paths) + + # Function to download a file from a URL + download_file <- function(url) { + file_name <- basename(url) + download.file(url, destfile = file_name, mode = "wb") + } + + # Download all files using the constructed raw URLs + for (url in raw_file_urls) { + download_file(url) + } + peaks_dir = '.' + } +} diff --git a/README.md b/README.md old mode 100644 new mode 100755 index 35161bb..45ea579 --- a/README.md +++ b/README.md @@ -20,15 +20,6 @@ devtools::install_github("bcbio/bcbioR",ref = "devel") ## Quick start -Use this code to generate a standard project name for all of your folders. **This code will not create any folders or files.** - -``` -library(bcbioR) -## basic example code -# will help you to build a folder name following HCBC naming rules -bcbio_set_project() -``` - ### Set base project use `setwd()` to set your current directory to the place where you want to work. The bcbioR functions will automatically write to whatever directory you have set. @@ -40,7 +31,10 @@ setwd("/path/to/analysis/folder") The following code will pop up a Rmd template will populate that folder with HCBC data structure guidelines ``` -bcbio_templates(type="base", outpath="/path/to/analysis/folder") +path="/path/to/analysis/folder" +bcbio_templates(type="base", outpath=path) +bcbio_templates(type="rnaseq", outpath=path) +bcbio_templates(type="singlecell", outpath=path) ``` ### Set RNAseq report folder @@ -53,7 +47,7 @@ bcbio_templates(type="rnaseq", outpath="/path/to/analysis/folder/reports") ### Discover more… -Go to the vignette to know more `vignette("bcbioR_quick_start",package="bcbioR")` +Go to the vignette to know more `vignette("bcbioR_quick_start", package="bcbioR")` ## How to Contribute @@ -87,3 +81,8 @@ Some best practices when developing: - James Billingsley - Zhu Zhuo - Elizabeth Partan +- Noor Sohail +- Meeta Mistry +- Will Gammerdinger +- Upen Bhattarai +- Shannan Ho Sui diff --git a/_pkgdown.yml b/_pkgdown.yml old mode 100644 new mode 100755 diff --git a/inst/apps/app.R b/inst/apps/app.R new file mode 100644 index 0000000..531206b --- /dev/null +++ b/inst/apps/app.R @@ -0,0 +1,51 @@ +# Global variables can go here +library(stringr) +.fix <- function(x){ + x <- tolower(x) + x <- str_replace_all(x, "[[:punct:]]", "_") + x <- str_replace_all(x, " ", "_") + return(x) +} + + +# Define the UI +ui <- fluidPage( + # Application title + titlePanel("Create project name"), + + sidebarLayout( + # Sidebar with a slider and selection inputs + sidebarPanel( + textInput('hbc', 'hbc-code (no letters)', value = "00000"), + textInput('pi', 'What is PI last name:', value = "lastname"), + textInput('scientist', 'What is the scientist last name:', value = "scientist"), + textInput('tech', 'What is the technology:', value = "rnaseq"), + textInput('tissue', 'What is the tissue:', value = "mix|cells|heart"), + textInput('org', 'What is the organism:', value = "mix|human"), + textInput('proj', 'What is the project name:', value = "this_analysis_is_cool"), + + ), + + # Show Word Cloud + mainPanel( + br("Suggested project name:"), + br(), + verbatimTextOutput('project') + ) + ) +) + + +# Define the server code +server <- function(input, output, session) { + output$project <- renderText({ + hbc_code <- paste0("hbc", input$hbc) + project_full <- paste(input$tech, .fix(input$pi), .fix(input$scientist), + .fix(input$proj), + input$tissue, input$org, hbc_code, sep="_") + project_full + }) +} + +# Return a Shiny app object +shinyApp(ui = ui, server = server) diff --git a/inst/orgs/hcbc.yml b/inst/orgs/hcbc.yml new file mode 100644 index 0000000..9f263a5 --- /dev/null +++ b/inst/orgs/hcbc.yml @@ -0,0 +1,4 @@ +name: HCBC +description: Variables specifics to HCBC +github_org: hbc + diff --git a/inst/rmarkdown/templates/common/skeleton/.gitignore b/inst/rmarkdown/templates/common/skeleton/.gitignore deleted file mode 100644 index 4cdfa50..0000000 --- a/inst/rmarkdown/templates/common/skeleton/.gitignore +++ /dev/null @@ -1,9 +0,0 @@ -*.Rproj* -.Rhistory -.Rproj.user -.Rhistory -.DS* -*._* -*placeholder* -data/* -final/* \ No newline at end of file diff --git a/inst/rmarkdown/templates/common/skeleton/README.md b/inst/rmarkdown/templates/common/skeleton/README.md deleted file mode 100644 index 6218884..0000000 --- a/inst/rmarkdown/templates/common/skeleton/README.md +++ /dev/null @@ -1,40 +0,0 @@ -# Guidelines - -## Set up work-space - -- [ ] Replace the title in this file to match the project's title -- [ ] Modify `information.R` with the right text for this project, it can be used to source in other `Rmd` files. The main `Rmd` file in this directory can be used to show general information of the project if needed. -- [ ] Use the same project name to create a folder in *Dropbox* and a repo in *GitHub* -- [ ] Use the function `bcbio_templates` to create templates inside `reports` for each type of analysis. For instance, for *RNAseq*: - - `bcbio_templates(type="rnaseq", outpath="reports")` or - - `bcbio_templates(type="rnaseq", outpath="reports/experiment1")` - - Then go to that folder and read the `README.md` - -## Folders - -- `meta` should contain the CSV/YAML files used by *bcbio* or *nextflow* -- `scripts` should contain `sbatch` scripts or any custom scripts used in this project -- `data` contains raw data, it can contains big data objects -- `reports` contains `Rmd` and `html` together with their files that will be added to *DropBox*. Each type of project have different guidelines. -- `final` contains the output of *nextflow/bcbio* -- `code` contains any other files that support custom analysis and don't generate a report -- For any relevant client files or papers use the `docs` folder on *DropBox* - -## Download - -- [ ] Download data to the `data` directory on O2. Check the md5 checksums if available. - -## Analysis - -- [ ] Make sure that final folder is copied from *scratch* or *S3* to `/n/data1/cores/bcbio/PIs/` - -## GitHub - -- [ ] Track in *Git* this `README` file -- [ ] Track in *Git* files in `scripts`, `meta`, and `reports` that belongs to these type: - - **Note** Git add `*.Rmd *.R *ipynb *.sh *.yaml`. (feel free use `.gitignore` if you use a GUI for non-tracked files). *DO NOT* use `git add *`. *DO NOT* track `html/csv/figures` -- [ ] Commit files and push to *Github* as necessary throughout the project, but especially when work is complete - -## Dropbox - -- [ ] Add to the *DropBox* folder all files in `reports` diff --git a/inst/rmarkdown/templates/common/skeleton/skeleton.Rmd b/inst/rmarkdown/templates/common/skeleton/skeleton.Rmd deleted file mode 100644 index 49c7ed5..0000000 --- a/inst/rmarkdown/templates/common/skeleton/skeleton.Rmd +++ /dev/null @@ -1,28 +0,0 @@ ---- -title: "General Project Information" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: true - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - params_file: information.R ---- - -```{r echo = F} -source(params$params_file) -``` - - - diff --git a/inst/rmarkdown/templates/common/template.yaml b/inst/rmarkdown/templates/common/template.yaml deleted file mode 100644 index 328d8b0..0000000 --- a/inst/rmarkdown/templates/common/template.yaml +++ /dev/null @@ -1,3 +0,0 @@ -name: bcbio base -description: Standard NGS down-stream analyses -create_dir: false diff --git a/inst/rmarkdown/templates/cosmx/template.yml b/inst/rmarkdown/templates/cosmx/template.yml deleted file mode 100644 index 12712ff..0000000 --- a/inst/rmarkdown/templates/cosmx/template.yml +++ /dev/null @@ -1,3 +0,0 @@ -name: bcbio CosMx -description: Standard CoxMx down-stream analyses -create_dir: false diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd deleted file mode 100644 index 5da794c..0000000 --- a/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd +++ /dev/null @@ -1,579 +0,0 @@ ---- -title: "Differential Expression" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: true - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - # Put hg38, mm10, mm39, or other - - ## Combatseq and ruv can both be false or ONLY ONE can be true - ## Both cannot be true - numerator: tumor - denominator: normal - column: sample_type - subset_column: null - subset_value: null - genome: hg38 - ruv: false - combatseq: false - params_file: params_de-example.R - project_file: ../information.R - functions_file: load_data.R ---- - - -```{r} -# This set up the working directory to this file so all files can be found -library(rstudioapi) -setwd(fs::path_dir(getSourceEditorContext()$path)) -``` - - -```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} -# 1. Set up input files in this R file (params_de.R) -source(params$params_file) -# 2. Set up project file (already done from QC probably) -source(params$project_file) -# 3. Load custom functions to load data from coldata/metrics/counts -source(params$functions_file) -# IMPORTANT set these values if you are not using the parameters in the header (lines 22-31) -genome=params$genome -column=params$column -numerator=params$numerator -denominator=params$denominator -subset_column=params$subset_column -subset_value=params$subset_value -run_ruv=params$ruv -run_combatseq=params$combatseq -factor_of_interest <- column -``` - - -```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} -library(rtracklayer) -library(DESeq2) -library(tidyverse) -library(stringr) -library(DEGreport) -library(ggpubr) -library(msigdbr) -library(fgsea) -library(org.Hs.eg.db) -library(knitr) -library(EnhancedVolcano) -library(bcbioR) -library(ggprism) -library(viridis) -library(pheatmap) -library(janitor) -library(ggforce) -library(vegan) - -colors=cb_friendly_cols(1:15) -ggplot2::theme_set(theme_prism(base_size = 14)) -opts_chunk[["set"]]( - cache = F, - cache.lazy = FALSE, - dev = c("png", "pdf"), - error = TRUE, - highlight = TRUE, - message = FALSE, - prompt = FALSE, - tidy = FALSE, - warning = FALSE, - echo = T, - fig.height = 4) - -# set seed for reproducibility -set.seed(1234567890L) -``` - -```{r sanitize_datatable} -sanitize_datatable = function(df, ...) { - # remove dashes which cause wrapping - DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), - colnames=gsub("-", "_", colnames(df))) -} -``` - - - -```{r load_data, message=F, warning=F} -# This code will load from bcbio or nf-core folder -# NOTE make sure to set numerator and denominator -coldata <- load_coldata(coldata_fn, column, - numerator, denominator, - subset_column, subset_value) -coldata$sample=row.names(coldata) - -counts <- load_counts(counts_fn) -counts <- counts[,colnames(counts) %in% coldata$sample] - -metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>% - left_join(coldata, by = c('sample')) %>% - as.data.frame() -rownames(metrics) <- metrics$sample -# if the names don't match in order or string check files names and coldata information -counts = counts[,rownames(metrics)] -coldata = coldata[rownames(metrics),] -stopifnot(all(names(counts) == rownames(metrics))) -``` - - - -# Overview - -- Project: `r project` -- PI: `r PI` -- Analyst: `r analyst` -- Experiment: `r experiment` -- Aim: `r aim` -- Comparison: `r ifelse(is.null(subset_value), paste0(numerator, ' vs. ', denominator), paste0(subset_value, ': ', numerator, ' vs. ', denominator))` - -```{r create_filenames} - -if (!is.null(subset_value) & !is.null(subset_value)){ - filenames = str_interp("${subset_value}_${numerator}_vs_${denominator}") -} else { - filenames = str_interp("${numerator}_vs_${denominator}") -} - -contrasts = c(column,numerator,denominator) -coef=paste0(column,"_",numerator,"_vs_",denominator) - -name_expression_fn=file.path( - basedir, - str_interp("${filenames}_expression.csv")) -name_deg_fn=file.path( - basedir, - str_interp("${filenames}_deg.csv")) -name_pathways_fn=file.path( - basedir, - str_interp("${filenames}_pathways.csv")) - -``` - -```{r load_counts_data} - -rdata = AnnotationDbi::select(org.Hs.eg.db, rownames(counts), 'SYMBOL', 'ENSEMBL') %>% - dplyr::select(gene_id = ENSEMBL, gene_name = SYMBOL) %>% - distinct(gene_id, .keep_all = TRUE) - -``` - -```{r setup_RUV} - -dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~1) - -vsd_before <- vst(dds_to_use) -norm_matrix = assay(vsd_before) -``` - - -# PCA and group level variance. - -**Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells).** - -Dispersion estimates are a key part of the DESEQ2 analysis. DESEQ2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per gene dispersions to match this distribution. If one group has higher variance than all others this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal volume enclosing ellipses using the Khachiyan algorithm. - -**It is best practice NOT to subset your data unless one group has significantly higher variance than the others. The best dispersion estimates are obtained with more data.** - -**This code automatically uses the column value from the header. You can also manually add a factor of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.** - -```{r set group, eval=FALSE, echo=FALSE} -## Example of creating a group covariate - -meta$group <- paste0(meta$sex,"_", meta$age,"_",meta$treatment) - -factor_of_interest <- "insert column name for covariate of interest" -``` - - -```{r PCA} -pca <- degPCA(norm_matrix, metrics, - condition = factor_of_interest, name = "sample", data = T) - -pca$plot + ggtitle(paste0("All samples", "\nPCA using ", nrow(vsd_before), " genes")) + - theme(plot.title=element_text(hjust=0.5)) + - geom_mark_ellipse(aes(color = sample_type)) + scale_color_cb_friendly() -``` - -## PERMDISP - -Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test. PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. [Source](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/) - -Here we apply this test to our variance stabilized data. We calculate distances between samples and then use the `betadisper()` function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the `anova()` function and the second from the `permutest()` function. We also get pairwise p-values for every group-group comparison. - -```{r PERMDISP} -vare.disa <- vegdist(t(assay(vsd_before))) - -mod = betadisper(vare.disa, metrics[[factor_of_interest]]) -anova(mod) -permutest(mod, pairwise = TRUE) - -``` - - - -# Covariate analysis - -Multiple factors related to the experimental design or quality of sequencing may influence the outcomes of a given RNA-seq experiment. To further determine whether any confounding covariate risks affecting the results of our differential expression analyses, it is useful to assess the correlation between covariates and principal component (PC) values. - -Here, we are using `DEGreport::degCovariates()` to explore potential correlations between variables provided in the metadata and all PCs that account for at least 5% of the variability in the data. If applicable, significant correlations (FDR < 0.1) are circled. **This diagnostic plot helps us determine which variables we may need to add to our DE model.** - - -```{r covariates, fig.height = 6, fig.width = 10} -degCovariates( - norm_matrix, - metrics, -) -``` - - - -```{r init_DESEQ} -formula <- as.formula(paste0("~ ", " + ", column)) -## Check if sample name matches -stopifnot(all(names(counts) == rownames(coldata))) - -dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = formula) - -vsd_before <- vst(dds_to_use) -norm_matrix = assay(vsd_before) -new_cdata <- coldata -``` - - -```{r, eval=F, echo=FALSE} -#### IF YOU ARE RUNNING RUV OR COMBATSEQ RUN THE CHUNKS BELOW OTHERWISE SKIP TO Differential Expression SECTION - -### RUV - LINES 261-296 -### COMBATSEQ - LINES 303-369 -``` - - - -```{r, eval=run_ruv, results='asis', echo=run_ruv} -cat("# Remove Unwanted Variability - -When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.") -``` - -```{r do_RUV, eval=run_ruv, echo=run_ruv} -library(RUVSeq) - -# If you want to skip the code, just set up formula to be your model in the next chunk of code -design <- coldata[[column]] -diffs <- makeGroups(design) -dat <- norm_matrix -# by default is running one variable, -# change K parameter to other number to find more unknown covariates -ruvset <- RUVs(dat, cIdx=rownames(dat), k=1, diffs, isLog = T, round = F) -vars <- ruvset$W - -new_cdata <- cbind(coldata, vars) - -formula <- as.formula(paste0("~ ", - paste0( - colnames(new_cdata)[grepl("W", colnames(new_cdata))], - collapse = " + " - ), " + ", column) -) -norm_matrix=ruvset$normalizedCounts -pca2 <- degPCA(norm_matrix, new_cdata, - condition = column) + ggtitle('After RUV') -pca2 + scale_color_cb_friendly() - -``` - -```{r after_RUV, eval=run_ruv} - -dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula) -vsd_to_use<- vst(dds_to_use, blind=FALSE) - -``` - -```{r combat-text , eval=run_combatseq, results='asis', echo=run_combatseq} -library(sva) - -cat("# Remove Batch Effects - -Here we apply Combat-seq (https://github.com/zhangyuqing/ComBat-seq) to try to remove batch effects so we can better tease out the effects of interest. - -Combat-seq uses a negative binomial regression to model batch effects, providing adjusted data by mapping the original data to an expected distribution if there were no batch effects. The adjusted data preserves the integer nature of counts, so that it is compatible with the assumptions of state-of-the-art differential expression software (e.g. edgeR, DESeq2, which specifically request untransformed count data).") - -``` - - -```{r set_variable_combatseq, eval=run_combatseq, echo=run_combatseq} - -## FILL OUT THIS CHUNK OF CODE IF YOU WANT TO RUN COMBATSEQ - -## Set your batch effect variable here this is the variable that combatseq will try to remove - -## Column name of your batch variable -to_remove = "batch" - -## Column name of of your variable(s) of interest - -to_keep = "sample_type" - - -coldata[[to_remove]] <- as.factor(coldata[[to_remove]]) -coldata[[to_keep]] <- as.factor(coldata[[to_keep]]) - - -batch = coldata[[to_remove]] -treatment = coldata[[to_keep]] - -## If you have multiple variables of interest you will need to cbind them into one variable - -#treatment1 = metrics[[to_keep]] -#treatment2 = metrics[[to_keep]] -#treatment3 = metrics[[to_keep]] - - -# imp = cbind(as.numeric(as.character(treatment1)),as.numeric(as.character(treatment2)), as.numeric(as.character(treatment3))) - -``` - - -```{r do_combatseq, eval=run_combatseq} -adjusted_counts <- ComBat_seq(as.matrix(counts), batch=batch, group = treatment) - -## For multiple variables of interest - -# adjusted_counts <- ComBat_seq(as.matrix(counts2), batch=batch, covar_mod = imp) - -``` - -```{r after_combatseq, eval=run_combatseq} -# NOTE: Make sure the formula doens't contain the covariates used in combatseq above -dds_to_use <- DESeqDataSetFromMatrix(adjusted_counts, coldata, design = formula) -vsd_combat<- vst(dds_to_use, blind=FALSE) - -combat_matrix = assay(vsd_combat) - -pca_combat <- degPCA(combat_matrix, coldata, - condition = column) + ggtitle('After Combatseq') -pca_combat + scale_color_cb_friendly() - -``` - - -# Differential Expression - -Differential gene expression analysis of count data was performed using the Bioconductor R package, DESeq2, which fits the count data to a negative binomial model. - -Before fitting the model, we often look at a metric called dispersion, which is a measure for variance which also takes into consideration mean expression. A dispersion value is estimated for each individual gene, then 'shrunken' to a more accurate value based on expected variation for a typical gene exhibiting that level of expression. Finally, the shrunken dispersion value is used in the final GLM fit. - -We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model. - -```{r DE} -de <- DESeq(dds_to_use) - -DESeq2::plotDispEsts(de) -``` - -Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated after the model is fit. We correct for this so that gene LFC is not dependent overall on basal gene expression level. - -```{r lfc_shrink} -# resultsNames(de) # check the order is right -resLFC = results(de, contrast=contrasts) -resLFCS <- lfcShrink(de, coef=coef, type="apeglm") - -res <- as.data.frame(resLFCS) %>% - rownames_to_column('gene_id') %>% left_join(rdata, by = 'gene_id') %>% - relocate(gene_name) %>% dplyr::rename(lfc = log2FoldChange) %>% - mutate(pi = abs(lfc) * -log10(padj)) %>% arrange(-pi) - -res_sig <- res %>% filter(padj < 0.05) %>% arrange(padj) %>% - mutate(gene_name = ifelse(is.na(gene_name), gene_id, gene_name)) - -res_mod <- res %>% mutate(lfc = replace(lfc, lfc < -5, -5)) %>% mutate(lfc = replace(lfc, lfc > 5, 5)) -show <- as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")]) - -degMA(as.DEGSet(resLFC)) + ggtitle('Before LFC Shrinking') -``` - -## MA plot - -```{r after_lfc_shrink} -degMA(as.DEGSet(resLFCS), limit = 2) + ggtitle('After LFC Shrinking') - -``` - -## Volcano plot - -This volcano plot shows the genes that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in red are genes that have padj < 0.05 and a log2-fold change > 1. Points in blue have a padj < 0.05 and a log2-fold change < 1 and points in green have a padj > 0.05 and a log2-fold change > 2. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2 foldchance and padj that we have chosen. - -```{r volcano_plot, fig.height=6} -# degVolcano(res_mod[,c('lfc', 'padj')], plot_text = show) -EnhancedVolcano(res_mod, - lab= res_mod$gene_name, - pCutoff = 0.05, - selectLab = c(res_sig$gene_name[1:15]), - FCcutoff = 0.5, - x = 'lfc', - y = 'padj', - title="Volcano Tumor vs. Normal", - col=as.vector(colors[c("dark_grey", "light_blue", - "purple", "purple")]), - subtitle = "", drawConnectors = T, max.overlaps = Inf) -``` - -## Heatmap - -```{r heapmap} -### Run pheatmap using the metadata data frame for the annotation -ma=norm_matrix[res_sig$gene_id,] -colma=coldata[,c(column), drop=FALSE] -colors=lapply(colnames(colma), function(c){ - l.col=colors[1:length(unique(colma[[c]]))] - names(l.col)=unique(colma[[c]]) - l.col -}) -names(colors)=colnames(colma) -pheatmap(ma, - color = inferno(10), - cluster_rows = T, - show_rownames = F, - annotation = colma, - annotation_colors = colors, - border_color = NA, - fontsize = 10, - scale = "row", - fontsize_row = 10, - height = 20) -``` - - -## Differentially Expressed Genes - -```{r sig_genes_table} -res_sig %>% sanitize_datatable -``` - -## Plot top 16 genes - -```{r top n DEGs, fig.height = 6, fig.width = 8} -n = 16 -top_n <- res_sig %>% slice_min(order_by = padj, n = n, with_ties = F) %>% - dplyr::select(gene_name, gene_id) -top_n_exp <- norm_matrix %>% as.data.frame() %>% - rownames_to_column('gene_id') %>% - # dplyr::select(-group, -group_name) %>% - pivot_longer(!gene_id, names_to = 'sample', values_to = 'log2_expression') %>% - right_join(top_n, relationship = "many-to-many") %>% - left_join(coldata, by = 'sample') - -ggplot(top_n_exp, aes_string(x = column, y = 'log2_expression')) + - geom_boxplot(outlier.shape = NA, linewidth=0.5, color="grey") + - geom_point() + - facet_wrap(~gene_name) + - ggtitle(str_interp('Expression of Top ${n} DEGs')) + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - -``` - -# Pathway Enrichment - -From the set of differentially expressed genes and using publicly available information about gene sets involved in biological processes and functions, we can calculate which biological processes and functions are significantly perturbed as a result of the treatment. - -```{r} -universe=res %>% - filter(!is.na(padj)) %>% pull(gene_id) -mapping = AnnotationDbi::select(org.Hs.eg.db, universe, 'ENTREZID', 'ENSEMBL') - -all_in_life=list( - msigdbr(species = "human", category = "H") %>% mutate(gs_subcat="Hallmark"), - msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"), - msigdbr(species = "human", category = "C2", subcategory = "CP:KEGG"), - msigdbr(species = "human", category = "C2", subcategory = "CP:PID"), - msigdbr(species = "human", category = "C5", subcategory = "GO:BP"), - msigdbr(species = "human", category = "C5", subcategory = "GO:MF"), - msigdbr(species = "human", category = "C5", subcategory = "HPO"), - msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"), - msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic") -) - -ora_input = res %>% filter(!is.na(padj), padj<0.01, abs(lfc)>0.3) %>% pull(gene_id) -input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) - -total_deg=length(unique(ora_input))/length(unique(mapping$ENTREZID)) -pathways_ora_all = lapply(all_in_life, function(p){ - pathway = split(x = p$entrez_gene, f = p$gs_name) - db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":") - respath <- fora(pathways = pathway, - genes = unique(input_entrezid$ENTREZID), - universe = unique(mapping$ENTREZID), - minSize = 15, - maxSize = 500) - coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1], - pathway, unique(input_entrezid$ENTREZID), unique(mapping$ENTREZID)) - as_tibble(respath[pathway %in% coll_respath$mainPathways]) %>% - mutate(database=db_name, NES=(overlap/size)/(total_deg)) -}) %>% bind_rows() %>% - mutate(analysis="ORA") - -ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>% - group_by(pathway) %>% - left_join(mapping, by =c("overlapGenes"="ENTREZID")) %>% - dplyr::select(pathway, padj, NES, ENSEMBL, analysis, - database) - -pathways_long = ora_tb - -``` - - -```{r pathaways_table} -pathways_ora_all %>% sanitize_datatable() -``` - - -```{r write-files} -counts_norm=norm_matrix %>% as.data.frame() %>% - rownames_to_column("gene_id") %>% - mutate(comparison = str_interp("${numerator}_vs_${denominator}")) - -res_for_writing <- res %>% - mutate(comparison = str_interp("${numerator}_vs_${denominator}")) - -pathways_for_writing <- pathways_long %>% - mutate(comparison = str_interp("${numerator}_vs_${denominator}")) - -if (!is.null(subset_value)){ - counts_norm <- counts_norm %>% - mutate(subset = subset_value) - res_for_writing <- res_for_writing %>% - mutate(subset = subset_value) - pathways_for_writing <- pathways_for_writing %>% - mutate(subset = subset_value) -} - -write_csv(counts_norm, name_expression_fn) -write_csv(res_for_writing, name_deg_fn) -write_csv(pathways_for_writing, name_pathways_fn) -``` - -# R session - -List and version of tools used for the DE report generation. - -```{r} -sessionInfo() -``` diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/PCA_variance_analysis.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/DE/PCA_variance_analysis.Rmd deleted file mode 100644 index 4074693..0000000 --- a/inst/rmarkdown/templates/rnaseq/skeleton/DE/PCA_variance_analysis.Rmd +++ /dev/null @@ -1,54 +0,0 @@ ---- -title: "PCA with variance analysis" -author: "Harvard Chan Bioinformatics Core" ---- - -Dispersion estimates are a key part of the DESEQ2 analysis. DESEQ2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per gene dispersions to match this distribution. If one group has higher variance than all others this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal volume enclosing ellipses using the Khachiyan algorithm. - - -**Manually add in your covariate of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.** - -```{r } -## Example of creating a group covariate - -# meta$group <- paste0(meta$sex,"_", meta$age,"_",meta$treatment) - -factor_of_interest <- "insert column name for covariate of interest" -``` - - -```{r } -library(DEGreport) -library(ggplot2) -library(ggforce) - -data("bcbio_vsd_data") - -colors=cb_friendly_cols(1:15) -ggplot2::theme_set(theme_prism(base_size = 14)) - -pca <- degPCA(assay(bcbio_vsd_data), colData(bcbio_vsd_data), - condition = factor_of_interest, name = "sample", data = T) - -pca$plot + ggtitle(paste0("All samples", "\nPCA using ", nrow(vst), " genes")) + - theme(plot.title=element_text(hjust=0.5)) + - geom_mark_ellipse(aes(color = sample_type)) -``` - -## PERMDISP - -Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test. PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. [Source](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/) - -Here we apply this test to our variance stabilized data. We calculate distances betwen samples and then use the betadisper() function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the anova() function and the second from the permutest() function. We also get pairwise p-values for every group-group comparison. - -```{r} -library(vegan) -vare.disa <- vegdist(t(assay(bcbio_vsd_data))) - -mod = betadisper(vare.disa, colData(bcbio_vsd_data)[['sample_type']]) -anova(mod) -permutest(mod, pairwise = TRUE) - -``` - - diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/README.md b/inst/rmarkdown/templates/rnaseq/skeleton/README.md deleted file mode 100644 index b04a5d5..0000000 --- a/inst/rmarkdown/templates/rnaseq/skeleton/README.md +++ /dev/null @@ -1,71 +0,0 @@ -# Guideline for RNAseq downstream analysis - -Make sure there is a project name for this. - -## Run data with nf-core rnaseq - -- Make sure you have access to our [Seqera WorkSpace](https://cloud.seqera.io/orgs/HBC/workspaces/core_production/launchpad) -- Transfer data to HCBC S3: Ask Alex/Lorena. Files will be at our S3 bucket `input/rawdata` folder -- Prepare the CSV file according this [instructions](https://nf-co.re/rnaseq/3.14.0/docs/usage#multiple-runs-of-the-same-sample). File should look like this: - -```csv -sample,fastq_1,fastq_2,strandedness -CONTROL_REP1,s3path/AEG588A1_S1_L002_R1_001.fastq.gz,s3path/AEG588A1_S1_L002_R2_001.fastq.gz,auto -CONTROL_REP1,s3path/AEG588A1_S1_L003_R1_001.fastq.gz,s3path/AEG588A1_S1_L003_R2_001.fastq.gz,auto -CONTROL_REP1,s3path/AEG588A1_S1_L004_R1_001.fastq.gz,s3path/AEG588A1_S1_L004_R2_001.fastq.gz,auto -``` - -Use `bcbio_nfcore_check(csv_file)` to check the file is correct. - -You can add more columns to this file with more metadata, and use this file as the `coldata` file in the templates. - -- Upload file to our `Datasets` in Seqera using the name of the project but starting with `rnaseq-pi_lastname-hbc_code` -- Go to `Launchpad`, select `nf-core_rnaseq` pipeline, and select the previous created `Datasets` in the `input` parameter after clicking in `Browser` - - Select an output directory with the same name used for the `Dataset` inside the `results` folder in S3 -- When pipeline is down, data will be copied to our on-premise HPC in the scratch system under `scratch/groups/hsph/hbc/bcbio/` folder - -## Downstream analysis - -Please, modify `information.R` with the right information. You can use this file with any other Rmd to include the project/analysis information. - -### QC - -`QC/QC.Rmd` is a template for QC metrics. Use `params_qc.R` for `bcbio` - or `QC/QC_nf-core.Rmd` `params_qc_nf-core.R` for `nf-core/rnaseq` outputs. - -Read instruction in the R and Rmd scripts to render it. - -### DE - -`DE/DEG.Rmd` is a template for two groups comparison. `params_de.R` has the information of the input files to load. You can point to `bcbio` or `nf-core/rnaseq` output files. - -On the `YAML` header file of the `Rmd` you can specify some parameters or just set them up in the first chunk of code of the template. This template has examples of: - -- sub-setting data -- two groups comparison -- volcano plot -- MA plot -- Pathway analysis -- Tables - -## DropBox - -- In `reports/QC` - - [ ] copy `bcbio-se.rds` and `tximport-counts.csv` - - [ ] copy QC `Rmd/R/html/figures` -- In `reports/DE` - - [ ] Normalized counts for all genes x all samples (csv format) -- In `reports/DE`, for *each analysis*: - - **Note** For multiple comparisons/analysis, do a single report/template if possible in the parent folder using parameters whenever possible. - - Create a folder with the comparison names in the files. Numbering by comparison (`01.1_DE_comp1`, `01.2_DE_comp2`, etc.). If you’re running multiple models for the same comparison, append `_M#`. Add the following files under each folder: - - [ ] Normalized count table with the samples used in this analysis/comparison. - - [ ] Full results `DESeq2` for all genes (csv format) with annotation columns appended. - - [ ] Significant genes results file (subset of annotated full results by chosen p-value and LFC). Separate files will be created for each individual contrast. - - [ ] Significant genes results file as described above, but additionally append columns containing normalized count values for each sample. - - Make sure to append the gene symbols to these tables so the researcher can interpret the results. - -## GitHub - -- [ ] Push all `*Rmd` `*R` files used for the *QC* and *DE* analysis respecting folder structure. - -Please, ignore `*html/figures/csv` and any output of the code. diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/skeleton.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/skeleton.Rmd deleted file mode 100644 index dc4bbf5..0000000 --- a/inst/rmarkdown/templates/rnaseq/skeleton/skeleton.Rmd +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: "General Project Information" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: true - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - params_file: information.R ---- - -```{r echo = F} -source(params$params_file) -``` diff --git a/inst/rmarkdown/templates/rnaseq/template.yaml b/inst/rmarkdown/templates/rnaseq/template.yaml deleted file mode 100644 index 17cd50f..0000000 --- a/inst/rmarkdown/templates/rnaseq/template.yaml +++ /dev/null @@ -1,3 +0,0 @@ -name: bcbio RNAseq -description: Standard RNAseq down-stream analyses -create_dir: false diff --git a/inst/rmarkdown/templates/singlecell/skeleton/Integration/helpers.R b/inst/rmarkdown/templates/singlecell/skeleton/Integration/helpers.R deleted file mode 100644 index c0fc928..0000000 --- a/inst/rmarkdown/templates/singlecell/skeleton/Integration/helpers.R +++ /dev/null @@ -1,61 +0,0 @@ -# some code showing integrating two seurat objects -library(Seurat) -library(tidyverse) -library(harmony) - -# We assume exp1 and exp2 has a Group column with naming the sample groups -# We create a batch annotation for each batch -exp1=readRDS("data/exp1.rds") -exp1$batch="n10" -exp2=readRDS("data/exp2.rds") -exp2$batch="n6" - -# Normalize ---- -exp = SCTransform(exp, verbose = FALSE,conserve.memory=TRUE, - variable.features.n = 3000) -exp <- RunPCA(exp) -ElbowPlot(exp,ndims=40) -end_dimension=35 -resolution=0.5 - -# Clustering for each batch ---- -exp <- FindNeighbors(exp, dims = 1:end_dimension, reduction = "pca") -exp <- FindClusters(exp, resolution = resolution, cluster.name = "unintegrated_clusters") -exp <- RunUMAP(exp, dims = 1:end_dimension, reduction = "pca", reduction.name = "umap.unintegrated") -saveRDS(exp, file="data/merged_umap.rds") - -## Plot by batch---- -DimPlot(exp, reduction = "umap.unintegrated", split.by = c("Group"), - group.by = c("batch")) - -# Integration ---- -exp <- IntegrateLayers( - object = exp, method = HarmonyIntegration, - orig.reduction = "pca", new.reduction = "harmony", - verbose = FALSE -) -exp[["RNA"]] <- JoinLayers(exp[["RNA"]]) -end_dimension=35 -resolution=0.5 -exp <- FindNeighbors(exp, reduction = "harmony", dims = 1:end_dimension) -exp <- FindClusters(exp, resolution = resolution, cluster.name = "harmony_clusters") -exp <- RunUMAP(exp, reduction = "harmony", dims = 1:end_dimension, reduction.name = "umap.harmony") -saveRDS(exp, file="data/integrated_harmony.rds") - -## Plot by Group and Cluster ---- -DimPlot( - exp, - reduction = "umap.harmony", - split.by = c("Group"), - group.by = c("batch"), - combine = TRUE, label.size = 2 -) - -DimPlot( - exp, - reduction = "umap.harmony", - split.by = c("Group"), - group.by = c("ident"), - combine = TRUE, label.size = 2 -) - diff --git a/inst/rmarkdown/templates/singlecell/skeleton/README.md b/inst/rmarkdown/templates/singlecell/skeleton/README.md deleted file mode 100644 index 0a6a5b2..0000000 --- a/inst/rmarkdown/templates/singlecell/skeleton/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# Tipical steps for scRNAseq downstream analysis - -# DropBox - -- In `reports/QC` - - [ ] copy QC `Rmd/R/html/figures` -- In `reports/Clusters` - - [ ] the analysis of `SCTransform`, ,`RunPCA` ,`FindNeighbors`, ,`FindClusters`, `RunUMAP` - - [ ] the analysis of `FindMarkers` and `Cell Identification` -- In `reports/DE`, for *each analysis*: - - TBD - -# GitHub - -- [ ] Push all `*Rmd` `*R` files used for the *QC* and *DE* analysis respecting folder structure. - -Please, ignore `*html/figures/csv` and any output of the code. diff --git a/inst/rmarkdown/templates/singlecell/skeleton/skeleton.Rmd b/inst/rmarkdown/templates/singlecell/skeleton/skeleton.Rmd deleted file mode 100644 index dc4bbf5..0000000 --- a/inst/rmarkdown/templates/singlecell/skeleton/skeleton.Rmd +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: "General Project Information" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: true - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - params_file: information.R ---- - -```{r echo = F} -source(params$params_file) -``` diff --git a/inst/rmarkdown/templates/singlecell/template.yml b/inst/rmarkdown/templates/singlecell/template.yml deleted file mode 100644 index f6b1119..0000000 --- a/inst/rmarkdown/templates/singlecell/template.yml +++ /dev/null @@ -1,3 +0,0 @@ -name: bcbio scRNAseq -description: Standard scRNAseq down-stream analyses -create_dir: false diff --git a/inst/rmarkdown/templates/teaseq/skeleton/information.R b/inst/rmarkdown/templates/teaseq/skeleton/information.R deleted file mode 100644 index 3444196..0000000 --- a/inst/rmarkdown/templates/teaseq/skeleton/information.R +++ /dev/null @@ -1,8 +0,0 @@ -# project params -root = "../" -date = "YYYYMMDD" -column = "treatment" -subset_column = 'cell' -metadata_fn = "../meta/samplesheet.csv" -counts_fn = '../data/tximport-counts.csv' -basedir <- 'reports' diff --git a/inst/rmarkdown/templates/teaseq/skeleton/skeleton.Rmd b/inst/rmarkdown/templates/teaseq/skeleton/skeleton.Rmd deleted file mode 100644 index dc4bbf5..0000000 --- a/inst/rmarkdown/templates/teaseq/skeleton/skeleton.Rmd +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: "General Project Information" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: true - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - params_file: information.R ---- - -```{r echo = F} -source(params$params_file) -``` diff --git a/inst/rmarkdown/templates/teaseq/template.yml b/inst/rmarkdown/templates/teaseq/template.yml deleted file mode 100644 index 6838f13..0000000 --- a/inst/rmarkdown/templates/teaseq/template.yml +++ /dev/null @@ -1,3 +0,0 @@ -name: bcbio TEAseq -description: Standard TEAseq down-stream analyses -create_dir: false diff --git a/inst/templates/base/README.md b/inst/templates/base/README.md new file mode 100644 index 0000000..8d34a89 --- /dev/null +++ b/inst/templates/base/README.md @@ -0,0 +1,49 @@ +# Guidelines + +## Set Repository + +- Start a git repository: `usethis::use_git()` +- Push this project to GitHub, follow these steps: + +* Only once every 30 days, set up your github credentials: `usethis::gh_token_help()` + * **NOTE** You may want to run this first (one time) to keep this token working in future sessions: `git config --global credential.helper store` + +- Push repository to HBC github as private: `usethis::use_github(org="hbc",private=TRUE)` + +## Set up work-space + +- [ ] Replace the title in this file to match the project's title +- [ ] Modify `information.R` with the right text for this project, it can be used to source in other `Rmd` files. The main `Rmd` file in this directory can be used to show general information of the project if needed. +- [ ] If you didn't provide the pipeline when creating this project: + Use the function `bcbio_templates` to create templates inside `reports` for each type of analysis. For instance, for *RNAseq*: + - `bcbio_templates(type="rnaseq", outpath=path)` or + - `bcbio_templates(type="singlecell", outpath=path)` + - Then go to that folder and read the `README.md` + +## Folders + +- `meta` should contain the CSV/YAML files used by *nextflow* or your pipelines +- `scripts` should contain `sbatch` scripts or any custom scripts used in this project +- `data` contains raw data, it can contains big data objects +- `reports` contains `Rmd` and `html` together with their files that will be published and shared. +- `final` contains the output of *nextflow* +- `code` contains any other files that support custom analysis and don't generate a report +- For any relevant client files or papers use the `docs` folder + +## Guidelines for file naming + +[AnalysisType]_[BiologicalQuestion]_[YYYYMMDD].[extension] + +Some examples: + +- `TwoGroupsDE_ADrelated_20241011.html` +- `singlecellClustering_SkinTissue_20242011.html` +- `singlecellClustering_SkinTissue-GenePositiveOnly_20242011.html` +- `DiffBind-TreatedvsHealthy_20242011.html` + +## GitHub + +- [ ] Track in *Git* this `README` file +- [ ] Track in *Git* files in `scripts`, `meta`, and `reports` that belongs to these type: + - **Note** Git add `*.Rmd *.R *ipynb *.sh *.yaml`. (feel free use `.gitignore` if you use a GUI for non-tracked files). *DO NOT* use `git add *`. *DO NOT* track `html/csv/figures`. *DO NOT* track files that you did not use for this project (i.e. irrelevant templates, placeholders) +- [ ] Commit files and push to *Github* as necessary throughout the project, but especially when work is complete diff --git a/inst/rmarkdown/templates/common/skeleton/meta/placeholder.R b/inst/templates/base/code/placeholder.R similarity index 100% rename from inst/rmarkdown/templates/common/skeleton/meta/placeholder.R rename to inst/templates/base/code/placeholder.R diff --git a/inst/rmarkdown/templates/common/skeleton/data/readme b/inst/templates/base/data/readme similarity index 100% rename from inst/rmarkdown/templates/common/skeleton/data/readme rename to inst/templates/base/data/readme diff --git a/inst/templates/base/gitignore b/inst/templates/base/gitignore new file mode 100644 index 0000000..5f86d62 --- /dev/null +++ b/inst/templates/base/gitignore @@ -0,0 +1,18 @@ +*.Rproj* +.Rhistory +.Rproj.user +.Rhistory +.DS* +*._* +*placeholder* +data/* +final/* +.Rproj.user +data/* +docs/* +**/*html +**/*rds +**/*rda +**/*csv +**/*tsv +**/*app.R diff --git a/inst/rmarkdown/templates/common/skeleton/information.R b/inst/templates/base/information.R similarity index 100% rename from inst/rmarkdown/templates/common/skeleton/information.R rename to inst/templates/base/information.R diff --git a/inst/rmarkdown/templates/common/skeleton/scripts/placeholder b/inst/templates/base/meta/placeholder.R similarity index 100% rename from inst/rmarkdown/templates/common/skeleton/scripts/placeholder rename to inst/templates/base/meta/placeholder.R diff --git a/inst/templates/base/org/hcbc/DataManagement-Checklist.docx b/inst/templates/base/org/hcbc/DataManagement-Checklist.docx new file mode 100644 index 0000000..76d49ff Binary files /dev/null and b/inst/templates/base/org/hcbc/DataManagement-Checklist.docx differ diff --git a/inst/templates/base/org/hcbc/DataManagement-Checklist.pdf b/inst/templates/base/org/hcbc/DataManagement-Checklist.pdf new file mode 100644 index 0000000..e455651 Binary files /dev/null and b/inst/templates/base/org/hcbc/DataManagement-Checklist.pdf differ diff --git a/inst/templates/base/reports/example.Rmd b/inst/templates/base/reports/example.Rmd new file mode 100644 index 0000000..772e8dd --- /dev/null +++ b/inst/templates/base/reports/example.Rmd @@ -0,0 +1,78 @@ +--- +title: "Example" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + project_file: ../information.R +--- + +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-stable-green) revision. + +```{r source_params, cache = FALSE, message = FALSE, warning=FALSE} +source(params$project_file) +``` + +```{r} +knitr::opts_chunk$set(echo = TRUE) +# Load libraries +library(knitr) +library(rmarkdown) +library(DT) +library(ggprism) +library(grafify) + +ggplot2::theme_set(theme_prism(base_size = 12)) +# https://grafify-vignettes.netlify.app/colour_palettes.html +# NOTE change colors here if you wish +scale_colour_discrete <- function(...) + scale_colour_manual(..., values = as.vector(grafify:::graf_palettes[["kelly"]])) + +#options(ggplot2.discrete.colour= ) + +# Set seed for reproducibility +set.seed(1454944673L) +opts_chunk[["set"]]( + audodep = TRUE, + cache = FALSE, + cache.lazy = FALSE, + error = TRUE, + echo = TRUE, + fig.height = 5L, + fig.retina = 2L, + fig.width = 9.6, + message = FALSE, + tidy = TRUE, + warning = TRUE) +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/placeholder b/inst/templates/base/reports/placeholder similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/QC/placeholder rename to inst/templates/base/reports/placeholder diff --git a/inst/templates/base/scripts/placeholder b/inst/templates/base/scripts/placeholder new file mode 100644 index 0000000..e69de29 diff --git a/inst/templates/chipseq/QC/QC.Rmd b/inst/templates/chipseq/QC/QC.Rmd new file mode 100644 index 0000000..4a7dbef --- /dev/null +++ b/inst/templates/chipseq/QC/QC.Rmd @@ -0,0 +1,426 @@ +--- +title: "Quality Control" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + # Fill this file with the right paths to nfcore output + # params_file: params_qc_nf-core-example.R # example data + params_file: params_qc-example.R + project_file: ../information.R + functions_file: ../libs/load_data.R + factor_of_interest: genotype +--- + +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) + +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")>=0) # requires >=4.3.3 +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. + + +```{r source_params, cache = FALSE, message = FALSE, warning=FALSE} +# 1. set up factor_of_interest parameter from parameter above or manually +# this is used to color plots, it needs to be part of the metadata +# 2. Set input files in this file +source(params$params_file) +# 3. If you set up this file, project information will be printed below and +#. it can be reused for other Rmd files. +source(params$project_file) +# 4. Load custom functions to load data from coldata/metrics/counts +source(params$functions_file) +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` + + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(tidyverse) +library(knitr) +library(rtracklayer) +library(DESeq2) +library(DEGreport) +library(ggrepel) +# library(RColorBrewer) +library(DT) +library(pheatmap) +library(bcbioR) +library(janitor) +library(ChIPpeakAnno) +library(UpSetR) +library(httr) +library(jsonlite) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_light(base_size = 14)) +opts_chunk[["set"]]( + cache = FALSE, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + fig.height = 4) +``` + + +```{r sanitize-datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + +# Samples and metadata + +```{r example_data, message=F, warning=F, eval = params$params_file == 'params_qc-example.R'} +bcbio_qc_chipseq_testdata() +``` + +```{r load_data, message=F, warning=F} +# This code will load from bcbio or nf-core folder +coldata <- load_coldata(coldata_fn) +coldata$sample=row.names(coldata) + +metrics <- load_metrics(multiqc_data_dir) + +metrics <- full_join(coldata, metrics) +rownames(metrics) <- metrics$sample +dds <- load_counts(counts_fn) + +coldata_for_dds = metrics[colnames(dds),] +stopifnot(all(colnames(dds) == rownames(coldata_for_dds))) + +peaks <- load_peaks(peaks_dir) %>% left_join(coldata) +``` + +```{r show_metadata} +metrics_lite <- metrics %>% dplyr::select(sample, total_reads, mapped_reads_pct, frip, peak_count) +full_join(coldata, metrics_lite) %>% sanitize_datatable() +``` + +# Read metrics {.tabset} + +## Total reads + +Here, we want to see consistency and a minimum of 20 million reads (the grey line). + +```{r plot_total_reads} +metrics %>% + ggplot(aes(x = sample, + y = total_reads, + fill = antibody)) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Total reads")+ + geom_hline(yintercept=20000000, color = "grey", linewidth=2) + +``` + +## Mapping rate + +```{r calc_min_max_pct_mapped} +#get min percent mapped reads for reference +min_pct_mapped <- round(min(metrics$mapped_reads/metrics$total_reads)*100,1) +max_pct_mapped <- round(max(metrics$mapped_reads/metrics$total_reads)*100,1) +``` + +The genomic mapping rate represents the percentage of reads mapping to the reference genome. We want to see consistent mapping rates between samples and over 70% mapping (the grey line). These samples have mapping rates: `r min_pct_mapped` - `r max_pct_mapped`%. + +```{r plot_mapping_rate} +metrics %>% + ggplot(aes(x = sample, + y = mapped_reads_pct, + fill = antibody)) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "% reads mapped") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Mapping rate") + xlab("") + + geom_hline(yintercept=70, color = "grey", linewidth=2) + +``` + +## Mapped Reads + +```{r plot_mapped_reads} +metrics %>% + ggplot(aes(x = sample, + y = mapped_reads, + fill = antibody)) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Mapped reads") + +``` + +## NSC + +```{r plot_nsc} +metrics %>% + ggplot(aes(x = sample, + y = nsc, + fill = antibody)) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "NSC coefficient") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Normalized Strand Cross-Correlation") + +``` + + +## RSC + +```{r plot_rsc} +metrics %>% + ggplot(aes(x = sample, + y = rsc, + fill = antibody)) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "RSC coefficient") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Relative Strand Cross-Correlation") + +``` + + +## Fraction of reads in peaks + +This figure shows what percentage of reads are mapping to regions within peaks called by macs2. The expected fraction of reads in peaks will vary by protein. + +```{r plot_frip} +metrics %>% filter(!is.na(frip)) %>% + ggplot(aes(x = sample, + y = frip, + fill = antibody)) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "FRiP") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Fraction of reads in peaks") + +``` + +## Number of peaks + +Ideally, we will see a similar number of peaks between replicates. + +```{r plot_peak_count} +metrics %>% filter(!is.na(peak_count)) %>% + ggplot(aes(x = sample, + y = peak_count, + fill = antibody)) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "Number of Peaks") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Number of Peaks") + +``` + +## Non-Redundant Fraction + +The NRF is the number of uniquely mapping reads divided by the total number of reads. The ENCODE website also sets out standardized thresholds for this as well and those are summarized in the table below. + +```{r nrf table} +NRF <- c("NRF < 0.5", "0.5 < NRF < 0.8", "0.8 < NRF < 0.9", "NRF > 0.9") +NRF_level <- c("Concerning", "Acceptable", "Compliant", "Ideal") + +NRF_df <- data.frame(NRF, NRF_level) + +colnames(NRF_df) <- c("NRF", "NRF Level") +NRF_df %>% sanitize_datatable() + +``` + +```{r plot_nrf} +metrics %>% + ggplot(aes(x = sample, + y = nrf, + fill = antibody)) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "Non-Redundant Fraction") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Non-Redundant Fraction")+ + geom_hline(yintercept = 0.9, linetype = "dashed", color="green") + + geom_hline(yintercept = 0.8, linetype = "dashed", color="orange") + + geom_hline(yintercept = 0.5, linetype = "dashed", color="red") + +``` + +# Correlation Heatmap + +Inter-correlation analysis (ICA) is another way to look at how well samples +cluster by plotting the correlation between the peak regions of the +samples. + +```{r clustering fig, fig.width = 10, fig.asp = .62} +vst_cor <- cor(assays(dds)$vst) + +colma=coldata_for_dds %>% as.data.frame() +rownames(colma) <- colma$sample +colma <- colma[rownames(vst_cor), ] +colma <- colma %>% dplyr::select(.data[[params$factor_of_interest]]) +anno_colors=lapply(colnames(colma), function(c){ + l.col=cb_friendly_pal('grey')(length(unique(colma[[c]]))) + names(l.col)=unique(colma[[c]]) + l.col +}) +names(anno_colors)=colnames(colma) + +p <- pheatmap(vst_cor, + annotation = colma, + annotation_colors = anno_colors, + show_rownames = T, + show_colnames = T, + color = cb_friendly_pal('heatmap')(15) + ) +p +``` + +# PCA + +We can run PCA to evaluate the variation amongst our samples and whether or not the greatest sources of variation in the data (PC1 and PC2) can be attributed to the factors of interest in this experiment. + +```{r PCA1:5 summary, all, unlabeled, fig.width= 7, fig.height = 5} + +pca1 <- degPCA(assays(dds)$vst, coldata_for_dds, + condition = params$factor_of_interest, data = T)[["plot"]] +pca2 <- degPCA(assays(dds)$vst, coldata_for_dds, + condition = params$factor_of_interest, data = T, pc1="PC3", pc2="PC4")[["plot"]] + +pca1 + scale_color_cb_friendly() +pca2 + scale_color_cb_friendly() + +``` + +# Peak Signal Concordance {.tabset} + +## Peak enrichment vs. Peak rank + +In this plot, we are looking at each individual replicates to evaluate what number of peaks we would retain if threshholding by peak enrichment. It is also valuable to see how this differs between replicates within a sample group. + +```{r peak enrichment vs rank} +ggplot(peaks, aes(x = peak_rank, y = peak_enrichment, color = sample)) + + geom_line() + + scale_color_cb_friendly() + + xlab("Peak rank") + ylab("Peak enrichment") + +``` + +## Peak signal distribution + +Here, we plot a histogram of peak signal values for each sample. This plot can be used to help determine a minimum value for peak enrichment that can be used for filtering. + +```{r peak signal distribution} +ggplot(peaks, aes(x = peak_enrichment, fill = .data[[params$factor_of_interest]])) + + geom_histogram(aes(peak_enrichment)) + + scale_fill_cb_friendly() + + xlab("Peak enrichment") + +``` + +# Peak Overlap {.tabset} + +We examine the amount of overlap between peaks in replicates of the same experimental condition. + +``` {r peak overlap, results = 'asis', fig.width = 8, fig.height = 6} +for (current_sample_group in unique(peaks$sample_group)){ + cat("## ", current_sample_group, "\n") + + peaks_sample_group <- peaks %>% filter(sample_group == current_sample_group) + + peaks_sample_group_granges <- sapply( + unique(peaks_sample_group$sample), + function(current_sample) { + ChIPpeakAnno::toGRanges( + peaks_sample_group %>% filter(sample == current_sample), + format = ifelse(grepl('broadPeak', peaks_dir), 'broadPeak', 'narrowPeak') + ) + } + ) + + # maxgap defaults to -1 which means that two peaks will be merged if they overlap by at least 1 bp + # connectedpeaks examples (https://support.bioconductor.org/p/133486/#133603), if 5 peaks in group1 overlap with 2 peaks in group 2, setting connectedPeaks to "merge" will add 1 to the overlapping counts + overlaps <- findOverlapsOfPeaks(peaks_sample_group_granges, connectedPeaks = 'merge') + + n_samples <- length(names(overlaps$overlappingPeaks)) + + if (n_samples > 3){ + set_counts <- overlaps$venn_cnt[, colnames(overlaps$venn_cnt)] %>% + as.data.frame() %>% + mutate(group_number = row_number()) %>% + pivot_longer(!Counts & !group_number, names_to = 'sample', values_to = 'member') %>% + filter(member > 0) %>% + group_by(Counts, group_number) %>% + summarize(group = paste(sample, collapse = '&')) + + set_counts_upset <- set_counts$Counts + names(set_counts_upset) <- set_counts$group + + p <- upset(fromExpression(set_counts_upset), order.by = "freq", text.scale = 1.5) + print(p) + } else{ + venn_sample_names <- gsub(paste0(current_sample_group, '_'), '', names(overlaps$all.peaks)) + invisible(capture.output(makeVennDiagram(overlaps, connectedPeaks = "merge", fill = colors[1:n_samples], + NameOfPeaks = venn_sample_names))) + } + + cat('\n\n') + +} + +``` + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/templates/chipseq/QC/params_qc-example.R b/inst/templates/chipseq/QC/params_qc-example.R new file mode 100644 index 0000000..67f2d0e --- /dev/null +++ b/inst/templates/chipseq/QC/params_qc-example.R @@ -0,0 +1,14 @@ +# info params +# Example data + +# this is the samplesheet used to run nf-core, with additional columns containing covariates of interest +coldata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/refs/heads/main/chipseq/chipseq_peakanalysis_H3K27Ac.csv' + +# This folder is in the output directory inside multiqc folder +multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/refs/heads/main/chipseq/multiqc/narrowPeak/multiqc_data/' + +# This folder is in the output directory +# peaks_dir = "https://api.github.com/repos/bcbio/bcbioR-test-data/contents/chipseq/bowtie2/mergedLibrary/macs2/narrowPeak" + +# This folder is in the output directory +counts_fn = url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/refs/heads/main/chipseq/bowtie2/mergedLibrary/macs2/narrowPeak/consensus/H3K27ac/deseq2/H3K27ac.consensus_peaks_small.rds') diff --git a/inst/templates/chipseq/QC/params_qc.R b/inst/templates/chipseq/QC/params_qc.R new file mode 100644 index 0000000..125baf5 --- /dev/null +++ b/inst/templates/chipseq/QC/params_qc.R @@ -0,0 +1,10 @@ +# info params + + +coldata_fn='/path/to/nf-core/samplesheet.csv' +# This folder is in the nf-core output directory inside multiqc folder +multiqc_data_dir='/path/to/nf-core/output/multiqc/narrowPeak/multiqc_data/' +# This folder is in the nf-core output directory, maybe is broadPeak instead of narrowPeak +peaks_dir = '/path/to/nf-core/output/bowtie2/mergedLibrary/macs2/narrowPeak/' +# This folder is in the nf-core output directory, maybe is broadPeak instead of narrowPeak, also includes antibody name +counts_fn = '/path/to/nf-core/output/bowtie2/mergedLibrary/macs2/narrowPeak/consensus/antibody/deseq2/antibody.consensus_peaks.rds' diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd new file mode 100644 index 0000000..71f028f --- /dev/null +++ b/inst/templates/chipseq/diffbind/diffbind.Rmd @@ -0,0 +1,456 @@ +--- +title: "ChIPSeq DiffBind" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + # Fill this file with the right paths to nfcore output + # .qs file name for saving DiffBind Counts object + # species = mouse or human + params_file: params_diffbind-example.R + project_file: ../information.R + functions_file: ../libs/load_data.R + condition_of_interest: genotype + numerator: cKO + denominator: WT + species: mouse + counts_csv_fn: diffbind_counts.csv + results_sig_anno_fn: diffbind_results_anno.csv +--- +Template developed with materials from https://hbctraining.github.io/main/. + +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) + +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")>=0) # requires >=4.3.3 +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. + + +```{r source_params, cache = FALSE, message = FALSE, warning=FALSE} +# 1. set up condition_of_interest parameter from parameter above or manually +# this is used to color plots, it needs to be part of the metadata +# 2. Set input files in this file +source(params$params_file) +# 3. If you set up this file, project information will be printed below and +#. it can be reused for other Rmd files. +source(params$project_file) +# 4. Load custom functions to load data from coldata/metrics/counts +source(params$functions_file) +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` + +# Methodology + +[DiffBind](https://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf) +is an R Bioconductor package which provides functions for processing +DNA data enriched for genomic loci, including ChIPseq data enriched for sites +where specific protein/DNA binding occurs or histone marks are enriched. + +DiffBind is mainly used for identifying sites that are differentially enriched +between two or more sample groups. It works primarily with sets of peak calls +('peaksets'), which are sets of genomic intervals representing candidate protein +binding sites for each sample. It includes functions that support the processing +of peaksets, including overlapping and merging peak sets across an entire dataset, +counting sequencing reads in overlapping intervals in peak sets, and identifying +statistically significantly differentially bound sites based on evidence of +binding affinity (measured by differences in read densities). To this end it uses +statistical routines developed in an RNA-Seq context (primarily the Bioconductor packages [edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html) and [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)). + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(tidyverse) +library(knitr) +# library(rtracklayer) +library(DESeq2) +library(DEGreport) +library(ggrepel) +# library(RColorBrewer) +library(DT) +library(pheatmap) +library(bcbioR) +library(janitor) +library(ChIPpeakAnno) +library(UpSetR) +library(DiffBind) +library(qs) +library(EnhancedVolcano) +library(ggprism) +library(ChIPseeker) +library(msigdbr) +library(fgsea) + +if (params$species == 'mouse'){ + library(TxDb.Mmusculus.UCSC.mm10.knownGene) + txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene + anno_db <- 'org.Mm.eg.db' + library(org.Mm.eg.db) +} else if (params$species == human){ + library(TxDb.Hsapiens.UCSC.hg38.knownGene) + txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene + anno_db <- 'org.Hs.eg.db' + library(org.Hs.eg.db) +} + + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = FALSE, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) + +``` + + +```{r sanitize-datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + +```{r download example data, eval = params$params_file == 'params_diffbind-example.R'} +# if using example data, download it from github +new_samplesheet_fn <- paste('download', basename(diffbind_samplesheet_fn), sep = '_') +download.file(diffbind_samplesheet_fn, destfile = new_samplesheet_fn, mode = "wb") +diffbind_samplesheet_fn <- new_samplesheet_fn + +new_counts_fn <- paste('download', basename(diffbind_counts_fn), sep = '_') +download.file(diffbind_counts_fn, destfile = new_counts_fn, mode = "wb") +diffbind_counts_fn <- new_counts_fn +``` + +# Samples and metadata +```{r load_coldata, message=F, warning=F} +coldata <- load_coldata(coldata_fn) +``` + +```{r make samplesheet, message = F, warning =F, eval = !file.exists(diffbind_samplesheet_fn)} +# DiffBind requires a very specific samplesheet in order to create the peak counts object, see https://www.rdocumentation.org/packages/DiffBind/versions/2.0.2/topics/dba for further details + +# make_diffbind_samplesheet is a function provided by bcbioR to help assemble DiffBind's samplesheet +# using the nf-core samplesheet and output. In the resulting DiffBind counts object, it +# encodes your condition of interest as "Condition" and the antibody as "Factor" + +samplesheet <- make_diffbind_samplesheet(coldata, bam_dir, peaks_dir, params$condition_of_interest) +write_csv(samplesheet, diffbind_samplesheet_fn) + +# if necessary, one additional covariate of interest can be encoded as "Tissue" +``` + +```{r show_metadata} +samplesheet <- read_csv(diffbind_samplesheet_fn) +samplesheet %>% dplyr::select(SampleID, Replicate, Condition, Factor, ControlID) %>% sanitize_datatable() +``` + +# Calculate counts matrix + +The first step is to read in a set of peaksets and associated metadata. +This is done using the DiffBind sample sheet. Once the peaksets are read in, +a merging function finds all overlapping peaks and derives a single set of unique +genomic intervals covering all the supplied peaks (a consensus peakset for +the experiment). A region is considered for the consensus set if it appears in +more than two of the samples. This consensus set represents the overall set of +candidate binding sites to be used in further analysis. + +The next step is to take the alignment files and compute count information for +each of the peaks/regions in the consensus set. In this step, for each of the +consensus regions, DiffBind uses the number of aligned reads in the ChIP sample +and the input sample to compute a normalized read count for each sample at every +potential binding site. The peaks in the consensus peakset may be re-centered and +trimmed based on calculating their summits (point of greatest read overlap) in +order to provide more standardized peak intervals. + +We then normalize the count matrix to adjust for varying +library size, and we use the normalized counts for further analysis including PCA. + +```{r create diffbind counts object, eval = !file.exists(diffbind_counts_fn)} +diffbind_obj <- dba(sampleSheet = samplesheet, scoreCol = 5) + +# This command may take several minutes. Recommend using multiple cores and lots of memory +diffbind_counts <- dba.count(diffbind_obj, bUseSummarizeOverlaps = TRUE, bParallel = T) + +# save object when time-intensive command is finished, so that this cell only need run once +qsave(diffbind_counts, diffbind_counts_fn) +``` + +# PCA + +Principal Component Analysis (PCA) is a statistical technique used to simplify +high-dimensional data by identifying patterns and reducing the number of variables. +In the context of ChIPseq, PCA helps analyze large datasets containing information +about thousands of binding locations across different samples (e.g., tissues, cells). + +```{r PCA} +diffbind_counts <- qread(diffbind_counts_fn) + +diffbind_norm <- dba.normalize(diffbind_counts) + +norm_counts <- dba.peakset(diffbind_norm, bRetrieve=TRUE, DataType=DBA_DATA_FRAME) %>% + mutate(peak = paste(CHR, START, END, sep = '_')) %>% + dplyr::select(-CHR, -START, -END) +rownames(norm_counts) <- norm_counts$peak +norm_counts <- norm_counts %>% dplyr::select(-peak) %>% as.matrix() +norm_counts_log <- log2(norm_counts + 1) +norm_counts_log_df <- norm_counts_log %>% as.data.frame() %>% + rownames_to_column('peak') + +write_csv(norm_counts_log_df, params$counts_csv_fn) + +coldata_for_pca <- coldata[colnames(norm_counts), ] + +stopifnot(all(colnames(norm_counts) == rownames(coldata_for_pca))) + +degPCA(norm_counts_log, coldata_for_pca, condition = params$condition_of_interest) + + scale_color_cb_friendly() +``` + + +# Differentially Bound Peaks + +A standardized differential analysis is performed using DiffBind and the DESeq2 package, +including estimation of size factors and dispersions, fitting and testing the +model, evaluating the supplied contrast, and shrinking the LFCs. A p-value and FDR +is assigned to each candidate binding site indicating confidence that they are differentially bound. + +We use [ChIPpeakAnno](https://bioconductor.org/packages/release/bioc/html/ChIPpeakAnno.html) +to identify any gene features within 1000 bp of a differentially bound site. + + +```{r DB analysis} +diffbind_norm <- dba.contrast(diffbind_norm, contrast = c('Condition', params$numerator, params$denominator)) +results_obj <- dba.analyze(diffbind_norm, + bBlacklist = F, # Use TRUE with your data + bGreylist = F) + +results_report <- dba.report(results_obj, th = 1) +results_report_sig <- dba.report(results_obj) + +results <- results_report %>% as.data.frame() + +``` + +```{r annotate DB peaks} + +anno_data <- toGRanges(txdb, feature = 'gene') +results_anno_batch <- annotatePeakInBatch(results_report, + AnnotationData = anno_data, + output = 'overlapping', + maxgap = 1000) + +results_anno_batch_df <- results_anno_batch %>% as.data.frame() + +if(params$species == 'mouse'){ + entrez_to_symbol <- AnnotationDbi::select(org.Mm.eg.db, results_anno_batch_df$feature, + "ENTREZID", columns = 'SYMBOL') %>% + filter(!is.na(ENTREZID)) %>% distinct() +} else if (params$species == 'human'){ + entrez_to_symbol <- AnnotationDbi::select(org.Hs.eg.db, results_anno_batch_df$feature, + "ENTREZID", columns = 'SYMBOL') %>% + filter(!is.na(ENTREZID)) %>% distinct() +} + +results_anno_batch_df <- results_anno_batch_df %>% + left_join(entrez_to_symbol %>% dplyr::select(feature = ENTREZID, gene_name = SYMBOL)) + +write_csv(results_anno_batch_df, params$results_sig_anno_fn) + +``` + + +## MA plot + +This plot can help to: +- Identify Differential Binding: Sites that show a significant log-fold change (M value away from 0) indicate changes in binding between conditions. +- Assess Data Quality: The plot can help in identifying biases or systematic errors in the data. Ideally, most points should scatter around the M=0 line, indicating that there is no significant systematic difference between the conditions. +- Visualize data dispersion: The distribution of points along the A-axis gives a sense of the spread of binding levels and any patterns or anomalies in the dataset. + +```{r MA plot} +results_for_ma <- results_anno_batch_df%>% + mutate(peak = paste(seqnames, start, end, sep = '_')) %>% + mutate(t = 0) %>% + dplyr::select(peak, AveExpr = Conc, logFC = Fold, P.Value = p.value, adj.P.Val = FDR, t) +degMA(as.DEGSet(results_for_ma, contrast = paste(params$numerator, params$denominator, sep = ' vs. '))) + +``` + +## Table of differentially bound peaks + +```{r DB table} + +results_sig_anno_batch_df <- results_anno_batch_df %>% filter(FDR < 0.05) +results_sig_anno_batch_df %>% dplyr::select(names(results), feature, gene_name) %>% + sanitize_datatable() + +``` + + +## Volcano plot + +This volcano plot shows the binding sites that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in purple are sites that have padj < 0.05 and a log2-fold change magnitude > 0.5. Points in blue have a padj > 0.05 and a log2-fold change magnitude > 0.5. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2-fold change and padj that we have chosen. + +```{r volcano, fig.height = 8} +results_mod <- results_sig_anno_batch_df %>% + mutate(Fold = replace(Fold, Fold < -5, -5)) %>% + mutate(Fold = replace(Fold, Fold > 5, 5)) %>% + mutate(peak = paste(seqnames, start, end, sep = '_')) +# show <- as.data.frame(results_mod[1:6, c("Fold", "FDR", "gene_name")]) + +show <- results_mod %>% filter(!is.na(gene_name)) %>% slice_min(n = 6, order_by = FDR) + +results_mod <- results_mod %>% mutate(gene_name = ifelse(peak %in% show$peak , gene_name, NA)) +EnhancedVolcano(results_mod, + lab= results_mod$gene_name, + pCutoff = 0.05, + selectLab = c(show$gene_name), + FCcutoff = 0.5, + x = 'Fold', + y = 'FDR', + title = paste(params$condition_of_interest, ':', params$numerator, 'vs', params$denominator), + col=as.vector(colors[c("dark_grey", "light_blue", + "purple", "purple")]), + subtitle = "", drawConnectors = T, max.overlaps = Inf) + +``` + +## Plot top peaks + +We visualize the log2 normalized read counts at a few of the most differentially +bound sites. +```{r plot top peaks, fig.width = 8, fig.height = 6} +norm_counts_log_long <- norm_counts_log %>% as.data.frame() %>% + rownames_to_column('peak') %>% + pivot_longer(!peak, names_to = 'sample', values_to = 'norm_counts_log2') %>% + left_join(coldata) + +norm_counts_log_long_top <- norm_counts_log_long %>% filter(peak %in% show$peak) + +ggplot(norm_counts_log_long_top, aes(x = .data[[params$condition_of_interest]], y = norm_counts_log2)) + + facet_wrap(~peak, scale = 'free_y') + geom_boxplot() +``` + +## Annotate DB peaks + +We use the [ChIPseeker](https://www.bioconductor.org/packages/release/bioc/html/ChIPseeker.html) +package to determine the genomic context of the differentially bound peaks and +visualize these annotations. We consider the promoter region to be within 2000 bp in either direction of the TSS. + +```{r annotate, echo = F} + +results_sig_anno <- annotatePeak(results_report_sig, + tssRegion = c(-2000, 2000), + TxDb = txdb, + annoDb = anno_db, + verbose = F) +results_sig_anno_df <- results_sig_anno %>% as.data.frame() + +plotAnnoPie(results_sig_anno) + +plotDistToTSS(results_sig_anno) + +``` + +# Functional Enrichment + +Over-Representation Analysis (ORA) is a statistical method used to determine whether a predefined set of genes (e.g., genes belonging to a specific biological pathway or function) is over-represented (or enriched) among a list of differentially bound genes (DEGs) from ChIP-seq. Adventages of ORA: + +- Simplicity: Easy to perform and interpret. +- Biological Insight: Helps to identify pathways and processes that are significantly affected in the condition studied. +- Prior Knowledge Integration: Utilizes existing biological knowledge through predefined gene sets. + +```{r get databases} +if(params$species == 'human'){ + all_in_life=get_databases() +} else if (params$species == 'mouse'){ + all_in_life = get_databases('Mus musculus') +} +``` + +```{r ora} + +universe_mapping = results_anno_batch_df %>% + filter(!is.na(FDR), !is.na(feature)) %>% + dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct() + +ora_input = results_anno_batch_df %>% + filter(!is.na(FDR), FDR < 0.01, abs(Fold) > 0.3, !is.na(feature)) %>% + dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct() +all = run_fora(ora_input, universe_mapping, all_in_life) + +ora_input = results_anno_batch_df %>% + filter(!is.na(FDR), FDR < 0.01, Fold > 0.3, !is.na(feature)) %>% + dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct() +up = run_fora(ora_input, universe_mapping, all_in_life) + +ora_input = results_anno_batch_df %>% + filter(!is.na(FDR), FDR < 0.01, Fold < -0.3, !is.na(feature)) %>% + dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct() +down = run_fora(ora_input, universe_mapping, all_in_life) + +``` + + +## Significant pathways using all DB genes + +```{r all pathways} +all %>% sanitize_datatable() +``` + + +## Significant pathways using increased DB genes + +```{r up pathways} +up %>% sanitize_datatable() +``` + + +## Significant pathways using decreased DB genes + +```{r down pathways, results='asis'} +down %>% sanitize_datatable() +``` + +# R session + +List and version of tools used for the report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/templates/chipseq/diffbind/params_diffbind-example.R b/inst/templates/chipseq/diffbind/params_diffbind-example.R new file mode 100644 index 0000000..0394d39 --- /dev/null +++ b/inst/templates/chipseq/diffbind/params_diffbind-example.R @@ -0,0 +1,14 @@ +# info params +# Example data + +# this is the samplesheet used to run nf-core, with additional columns containing covariates of interest +coldata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/refs/heads/main/chipseq/chipseq_peakanalysis_H3K27Ac.csv' + +# example data doesn't need this but this folder is in the nf-core output directory, maybe is broadPeak instead of narrowPeak +# peaks_dir = '/path/to/nf-core/output/bowtie2/mergedLibrary/macs2/narrowPeak/' + +# example data doesn't need this but this folder is in the nf-core output directory +# bam_dir = '/path/to/nf-core/output/bowtie2/mergedLibrary/' + +diffbind_samplesheet_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/refs/heads/main/chipseq/diffbind_samplesheet.csv' +diffbind_counts_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/refs/heads/main/chipseq/diffbind_counts.qs' diff --git a/inst/templates/chipseq/diffbind/params_diffbind.R b/inst/templates/chipseq/diffbind/params_diffbind.R new file mode 100644 index 0000000..ade763b --- /dev/null +++ b/inst/templates/chipseq/diffbind/params_diffbind.R @@ -0,0 +1,15 @@ +# info params + +coldata_fn='/path/to/nf-core/samplesheet.csv' + +# This folder is in the nf-core output directory, maybe is broadPeak instead of narrowPeak +peaks_dir = '/path/to/nf-core/output/bowtie2/mergedLibrary/macs2/narrowPeak/' + +# This folder is in the nf-core output directory +bam_dir = '/path/to/nf-core/output/bowtie2/mergedLibrary/' + +# this will be the file that the diffbind samplesheet is eventually saved in +diffbind_samplesheet_fn = 'diffbind_samplesheet.csv' + +# This will be the file that the diffbind counts matrix is eventually saved in +diffbind_counts_fn = 'diffbind_counts.qs' \ No newline at end of file diff --git a/inst/rmarkdown/templates/cosmx/skeleton/information.R b/inst/templates/chipseq/information.R old mode 100644 new mode 100755 similarity index 100% rename from inst/rmarkdown/templates/cosmx/skeleton/information.R rename to inst/templates/chipseq/information.R diff --git a/inst/templates/chipseq/libs/load_data.R b/inst/templates/chipseq/libs/load_data.R new file mode 100755 index 0000000..324e808 --- /dev/null +++ b/inst/templates/chipseq/libs/load_data.R @@ -0,0 +1,160 @@ +library(tidyverse) +library(SummarizedExperiment) +library(janitor) + +load_metrics <- function(multiqc_data_dir){ + + fastqc <- read_tsv(file.path(multiqc_data_dir, 'multiqc_fastqc.txt')) %>% clean_names() %>% + dplyr::select(sample, total_reads = total_sequences) %>% + mutate(new_sample = gsub('_T[0-9]+', '', sample)) %>% + group_by(new_sample) %>% + summarize(new_total_reads = sum(total_reads)) %>% + dplyr::select(sample = new_sample, total_reads = new_total_reads) + samtools <- read_tsv(file.path(multiqc_data_dir, 'multiqc_samtools_stats.txt')) %>% clean_names() %>% + dplyr::select(sample, mapped_reads = reads_mapped) %>% + mutate(new_sample = gsub('_T[0-9]+', '', sample)) %>% + group_by(new_sample) %>% + summarize(new_mapped_reads = sum(mapped_reads)) %>% + dplyr::select(sample = new_sample, mapped_reads = new_mapped_reads) + + phantom <- read_tsv(file.path(multiqc_data_dir, 'multiqc_phantompeakqualtools.txt')) %>% clean_names() %>% + dplyr::select(sample, nsc, rsc) + frip <- read_tsv(file.path(multiqc_data_dir, 'multiqc_frip_score-plot.txt')) %>% dplyr::select(-Sample) %>% + pivot_longer(everything(), names_to = 'sample', values_to = 'frip') %>% filter(!is.na(frip)) + peak_count <- read_tsv(file.path(multiqc_data_dir, 'multiqc_peak_count-plot.txt')) %>% dplyr::select(-Sample) %>% + pivot_longer(everything(), names_to = 'sample', values_to = 'peak_count') %>% filter(!is.na(peak_count)) + nrf <- read_tsv(file.path(multiqc_data_dir, 'mqc_picard_deduplication_1.txt')) %>% clean_names() %>% + mutate(nrf = unique_unpaired / (unique_unpaired + duplicate_unpaired)) %>% + dplyr::select(sample, nrf) + + metrics <- full_join(fastqc, samtools) %>% full_join(phantom) %>% full_join(frip) %>% + full_join(peak_count) %>% full_join(nrf) %>% + mutate(mapped_reads_pct = round(mapped_reads/total_reads*100,1)) + + metrics$sample <- make.names(metrics$sample) + rownames(metrics) <- metrics$sample + return(metrics) +} + +load_coldata <- function(coldata_fn, column=NULL, numerator=NULL, denominator=NULL, subset_column = NULL, subset_value = NULL){ + coldata=read.csv(coldata_fn) %>% + dplyr::distinct(sample, .keep_all = T) %>% + dplyr::select(!matches("fastq")) %>% + distinct() + if('description' %in% names(coldata)){ + coldata$sample <- tolower(coldata$description) + } + coldata <- coldata %>% distinct(sample, .keep_all = T) + if (!is.null(column)) + stopifnot(column %in% names(coldata)) + + # use only some samples, by default use all + if (!is.null(subset_column)){ + coldata <- coldata[coldata[[paste(subset_column)]] == subset_value, ] + } + #coldata <- coldata[coldata[[paste(column)]] %in% c(numerator, denominator), ] + #browser() + coldata$sample <- make.names(coldata$sample) + rownames(coldata) <- coldata$sample + coldata$description <- coldata$sample + coldata$antibody <- ifelse(coldata$antibody == '', 'input', coldata$antibody) + coldata$type <- ifelse(coldata$antibody == 'input', 'input', 'chip') + + if (!is.null(denominator)) + coldata[[column]] = relevel(as.factor(coldata[[column]]), denominator) + + return(coldata) +} + +load_counts <- function(counts_fn){ + + counts <- readRDS(counts_fn) + return(counts) + +} + +load_peaks <- function(peaks_dir){ + if(grepl('broadPeak', peaks_dir)){ + peaks_fns <- list.files(peaks_dir, pattern = '_peaks.broadPeak') + names(peaks_fns) <- gsub('_peaks.broadPeak', '', peaks_fns) + } else { + peaks_fns <- list.files(peaks_dir, pattern = '_peaks.narrowPeak') + names(peaks_fns) <- gsub('_peaks.narrowPeak', '', peaks_fns) + } + peaks_all <- lapply(peaks_fns, function(fn) { + peaks <- read_delim(file.path(peaks_dir, fn), col_names = F) + peaks_df <- data.frame(seqnames = peaks$X1, start = peaks$X2, end = peaks$X3, + peak_enrichment = peaks$X7, peak_rank = rank(dplyr::desc(peaks$X7))) %>% + dplyr::arrange(peak_rank) + return(peaks_df) + }) %>% bind_rows(.id = 'sample') + peaks_all$sample_group <- gsub('_REP[0-9]+', '', peaks_all$sample) + + return(peaks_all) +} + +make_diffbind_samplesheet <- function(coldata, bam_dir, peaks_dir, column = NULL){ + bam_files <- data.frame(bam = list.files(bam_dir, pattern = '.bam$', full.names = T)) %>% + mutate(sample = sub("\\..*", "",basename(bam))) + + peak_files <- data.frame(Peaks = list.files(peaks_dir, pattern = 'Peak$', full.names = T)) %>% + mutate(SampleID = sub("\\..*", "",basename(Peaks))) %>% + mutate(SampleID = gsub('_peaks', '', SampleID)) + + coldata_for_diffbind <- coldata %>% + filter(!is.na(control) & control != '') %>% + dplyr::rename(ControlID = control, SampleID = sample, Factor = antibody) %>% + separate(SampleID, into = c('sample', 'Replicate'), remove = F, sep = '_REP') + coldata_for_diffbind$Condition <- coldata_for_diffbind[[column]] + + samplesheet <- coldata_for_diffbind %>% + left_join(bam_files %>% dplyr::select(SampleID = sample, bamReads = bam), by = 'SampleID') %>% + left_join(bam_files %>% dplyr::select(ControlID = sample, bamControl = bam), by = 'ControlID') %>% + left_join(peak_files, by = 'SampleID') + + return(samplesheet) +} + +get_databases=function(sps="human"){ + all_in_life=list( + msigdbr(species = sps, category = "H") %>% mutate(gs_subcat="Hallmark"), + # msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"), + msigdbr(species = sps, category = "C2", subcategory = "CP:KEGG"), + # msigdbr(species = "human", category = "C2", subcategory = "CP:PID"), + msigdbr(species = sps, category = "C5", subcategory = "GO:BP"), + msigdbr(species = sps, category = "C5", subcategory = "GO:MF") + # msigdbr(species = "human", category = "C5", subcategory = "HPO"), + # msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"), + # msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic") + ) + all_in_life +} + +run_fora=function(input, uni,all_in_life){ + # browser() + total_deg=length(unique(input))/length(unique(uni$ENTREZID)) + pathways_ora_all = lapply(all_in_life, function(p){ + pathway = split(x = p$entrez_gene, f = p$gs_name) + db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":") + respath <- fora(pathways = pathway, + genes = unique(input$ENTREZID), + universe = unique(uni$ENTREZID), + minSize = 15, + maxSize = 500) + # coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1], + # pathway, unique(input$ENTREZID), unique(uni$ENTREZID)) + as_tibble(respath) %>% + mutate(database=db_name, NES=(overlap/size)/(total_deg)) + }) %>% bind_rows() %>% + mutate(analysis="ORA") + ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>% + group_by(pathway) %>% + left_join(uni, by =c("overlapGenes"="ENTREZID")) %>% + dplyr::select(pathway, padj, NES, SYMBOL, analysis, + database) %>% + group_by(pathway,padj,NES,database,analysis) %>% + summarise(genes=paste(SYMBOL,collapse = ",")) + ora_tb + +} + diff --git a/inst/templates/chipseq/readme.md b/inst/templates/chipseq/readme.md new file mode 100755 index 0000000..645b56a --- /dev/null +++ b/inst/templates/chipseq/readme.md @@ -0,0 +1,25 @@ +# Guidelines for ChIPSeq analysis + +Make sure there is a valid project name, and modify `information.R` with the right information for your project. You can use this file with any other Rmd to include the project/analysis information. + +## Run data with nf-core rnaseq + +This templates assume data has been processed by [nf-core/chipseq](https://nf-co.re/chipseq/2.1.0/docs/usage/). +We recommend to use the samplesheet.csv used with nf-core as metadata file, where other relevant columns can be there even if they are not used by the pipeline. + +## QC + +`QC/QC.Rmd` is a template for QC metrics. It includes basic read-level statistics, peak quality information, sample correlation analysis, and PCA that it produces using the above samplesheet and output from the nf-core pipeline. Use `params_qc.R` to provide the required input files. + +## DiffBind + +`diffbind/diffbind.Rmd` is a template for comparing peak binding betweeen two groups. Use `params_diffbind.R` to provide the required input files. + +On the YAML header file of the Rmd you can specify some parameters including the conditions to be compared, the genome used, and the desired output file names. This template has examples of: +* calculating a peak counts matrix +* PCA +* differential binding analysis +* peak annotation +* functional analysis (coming soon) + +This template writes to CSV a log2 normalized counts matrix of peaks x samples as well as the annotated significant results of the differential binding analysis. diff --git a/inst/templates/methylation/QC/QC.Rmd b/inst/templates/methylation/QC/QC.Rmd new file mode 100644 index 0000000..bb912a0 --- /dev/null +++ b/inst/templates/methylation/QC/QC.Rmd @@ -0,0 +1,613 @@ +--- +title: "Quality Control" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + params_file: ../information.R + meta_fn: ../meta/methylation_mucci_hbc04926.csv +--- + +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-draft-grey) revision. + +```{r echo = F} +source(params$params_file) +``` + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE, echo=FALSE,} + +library(tidyverse) +library(bcbioR) +library(ggprism) +library(knitr) +library(tools) +library(qs) +library(janitor) +library(DEGreport) +library(ggrepel) +library(pheatmap) +library(minfi) +library(IlluminaHumanMethylationEPICv2manifest) +library(IlluminaHumanMethylationEPICv2anno.20a1.hg38) +library(methylclock) +options(stringsAsFactors = FALSE) +options(future.globals.maxSize= 891289600) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) + +``` + +```{r sanitize_datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df)), + filter = 'top') +} +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` +- Aim: `r aim` + +The workflow followed in this report is descriped in the documentation [here](https://nbis-workshop-epigenomics.readthedocs.io/en/latest/content/tutorials/methylationArray/Array_Tutorial.html) + +# Stratify samples by DNA quantity + + +```{r examine_sample_quant} + +metadata <- read.csv(params$meta_fn, row.names = 1) %>% clean_names() + +# TODO remove from HERE to next TODO if quantile classification not relevant +thirds_blood <- quantile( + metadata %>% filter(tissue == 'blood') %>% pull(total_ng), + c(0.33, 0.67) +) + +thirds_tissue <- quantile( + metadata %>% filter(tissue != 'blood') %>% pull(ng_dna), + c(0.33, 0.67) +) + +``` + +For blood samples, input DNA quantity was considered low if <= `r round(thirds_blood[1], 2)` ng, high if it was >= `r round(thirds_blood[2], 2)` ng, and medium if in between. + +For tissue samples, input DNA quantity was considered low if <= `r round(thirds_tissue[1], 2)` ng, high if it was >= `r round(thirds_tissue[2], 2)` ng, and medium if in between. + +```{r plot_sample_quant} + +ggplot(metadata, aes(x = ng_dna, fill = tissue)) + + geom_histogram() + + geom_vline(aes(xintercept = thirds_tissue[1])) + + geom_vline(aes(xintercept = thirds_tissue[2])) + + scale_fill_cb_friendly() + +ggplot(metadata, aes(x = total_ng, fill = tissue)) + + geom_histogram() + + geom_vline(aes(xintercept = thirds_blood[1])) + + geom_vline(aes(xintercept = thirds_blood[2])) + + scale_fill_cb_friendly() + + +``` + + +## Metadata by quantity classification + +```{r stratify_samples} + +metadata <- metadata %>% + mutate(dna_quant_level = case_when( + tissue == 'blood' ~ case_when( + total_ng <= thirds_blood[1] ~ 'Low', + total_ng >= thirds_blood[2] ~ 'High', + TRUE ~ 'Medium' + ), + TRUE ~ case_when( + ng_dna <= thirds_tissue[1] ~ 'Low', + ng_dna >= thirds_tissue[2] ~ 'High', + TRUE ~ 'Medium' + ) + )) %>% + mutate(subject_id = ifelse(is.na(subject_id), id, subject_id)) + +# TODO + +metadata <- metadata %>% group_by(dna_quant_level, subject_id) %>% + mutate(sample_name = paste('ID', subject_id, dna_quant_level, row_number(), sep = '_')) +names_vec <- metadata %>% + pull(sample_name) +names(names_vec) <- metadata$ch_ip_id + +metadata %>% sanitize_datatable() +``` + +# Minfi QC {.tabset} + +```{r load methyl data} + +rgSet <- read.metharray.exp(base = "../data/", recursive = TRUE) +colnames(rgSet) <- names_vec[colnames(rgSet)] +rgSet@annotation <- c(array = "IlluminaHumanMethylationEPICv2", annotation = "20a1.hg38") +rownames(metadata) <- metadata$sample_name +rgSet@colData <- DataFrame(metadata) + +MSet <- preprocessRaw(rgSet) +ratioSet <- ratioConvert(MSet, what = "both", keepCN = TRUE) +gset <- mapToGenome(ratioSet) + +beta <- getBeta(gset) +m <- getM(gset) + +``` + +## Intensity + +From the documentation: "minfi provides a simple quality control plot that uses the log median intensity in both the methylated (M) and unmethylated (U) channels. When plotting these two medians against each other, good samples tend to cluster together, while failed samples tend to separate and have lower median intensities" + +```{r intensity} +qc <- getQC(MSet) +# plotQC(qc) +badSampleCutoff <- 10.5 +qc_df <- as.data.frame(qc) %>% rownames_to_column('sample_name') %>% + mutate(sample_qual = ifelse((mMed + uMed) / 2 < badSampleCutoff, 'bad', 'good')) + + +ggplot(qc_df, aes(x = mMed, y = uMed, color = sample_qual)) + + geom_point() + + geom_text_repel(data = qc_df %>% filter(sample_qual == 'bad'), + aes(x = mMed, y = uMed, color = sample_qual, label = sample_name)) + + xlab('Meth median intensity (log2)') + ylab('Unmeth median intensity (log2)') + + xlim(c(8,14)) + ylim(c(8,14)) + scale_color_cb_friendly() +``` + +## Detection + +From the documentation: "We can additionally look at the detection p-values for every CpG in every sample, which is indicative of the quality of the signal. The method used by minfi to calculate detection p-values compares the total signal (M+U) for each probe to the background signal level, which is estimated from the negative control probes. Very small p-values are indicative of a reliable signal whilst large p-values generally indicate a poor quality signal." + + +```{r detection, fig.width=10, fig.height = 6} + +detP <- detectionP(rgSet) +# barplot(colMeans(detP), las=2, cex.names=0.8, ylab="Mean detection p-values") +# abline(h=0.05,col="red") + +det_df <- data.frame(sample_name = colnames(detP), detection_p_val = colMeans(detP)) %>% + left_join(metadata) +ggplot(det_df, aes(x = sample_name, y = detection_p_val, fill = tissue)) + geom_col() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + + geom_hline(yintercept = 0.01) + scale_fill_cb_friendly() + +det_frac_df <- data.frame(sample_name = colnames(detP), frac_probes_detected = 1 - colSums(detP > 0.05) / nrow(detP)) %>% + left_join(metadata) +ggplot(det_frac_df, aes(x = sample_name, y = frac_probes_detected, fill = tissue)) + geom_col() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + scale_fill_cb_friendly() + +``` + +## Control Probes {.tabset} + +From the documentation: "The array contains several internal control probes that can be used to assess the quality control of different sample preparation steps (bisulfite conversion, hybridization, etc.)" + + +```{r control probes, results = 'asis', fig.height = 8} + +control_probes <- c("BISULFITE CONVERSION I", "BISULFITE CONVERSION II", "HYBRIDIZATION", "NON-POLYMORPHIC", "SPECIFICITY I", "SPECIFICITY II", "TARGET REMOVAL", "NEGATIVE") + +for (probe in control_probes){ + cat('\n') + cat('### ', probe, '\n') + controlStripPlot(rgSet, controls=probe) + cat('\n') +} +``` + +# Minfi Normalization (Funnorm) {.tabset} + +From the documentation: "If there exist global biological methylation differences between your samples, as for instance a dataset with cancer and normal samples, or a dataset with different tissues/cell types, use the preprocessFunnorm function as it is aimed for such datasets" + +## Between Arrays + +From the documentation: "The overall density distribution of Beta values for each sample is another useful metric to determine sample quality. Usually, one would expect to see most Beta values to be either close to 0 or 1, indicating most of the CpG sites in the sample are unmethylated or methylated" + + +```{r funnorm between} + +ggdat=as.data.frame(getBeta(MSet)) %>% rownames_to_column("cpgs") %>% + pivot_longer(cols = !matches("cpgs")) %>% + inner_join(metadata[,c("sample_name","tissue")], by=c("name"="sample_name")) + +MSet_funnorm <- preprocessFunnorm(rgSet) + +ggdat_funnorm=as.data.frame(getBeta(MSet_funnorm)) %>% rownames_to_column("cpgs") %>% + pivot_longer(cols = !matches("cpgs")) %>% + inner_join(metadata[,c("sample_name","tissue")], by=c("name"="sample_name")) + +par(mfrow=c(1,2)) +ggplot(ggdat,aes(value, group=name,color=tissue))+ + geom_density(alpha=0.6, size=1) + + scale_color_cb_friendly() + ggtitle('Raw') +ggplot(ggdat_funnorm, aes(value, group=name,color=tissue))+ + geom_density(alpha=0.6, size=1) + + scale_color_cb_friendly() + ggtitle('Funnorm') + +``` + +## Within Arrays + +From the documentation: "A comparison of the Beta distributions for the different probe designs. This will give an indication of the effectiveness of the within-array normalization." + + +```{r funnorm within, message = F, echo = F, results = 'hide'} + +typeI <- getProbeInfo(MSet, type = "I")[, c("Name","nCpG")] +typeII <- getProbeInfo(MSet, type = "II")[, c("Name","nCpG")] +probeTypes <- rbind(typeI, typeII) +probeTypes$Type <- rep(x = c("I", "II"), times = c(nrow(typeI), nrow(typeII))) + +lapply(colnames(MSet), function(sample){ + par(mfrow=c(1,2)) + plotBetasByType(MSet[, sample], main = paste('Raw', sample)) + plotBetasByType(getBeta(MSet_funnorm)[, sample], probeTypes = probeTypes, + main = paste('Funnorm', sample)) +}) + +``` + +# Sample Clustering {.tabset} + +Post-normalization, we are interested to look at the similarities or differences between various samples. One way to do this is by creating PCA plots, which graphically represent the relationships between objects in multidimensional space into 2 dimensional space, where the two dimensions are chosen so that they capture the greatest sources of variation in the data. + +Another way to do this is by calculating the correlation between beta values for samples, clustering the samples based on these correlations, and visualizing the information in a heatmap. + + +## All samples + +```{r pca_all_samples} +beta_funnorm <- getBeta(MSet_funnorm) +metadata <- as.data.frame(metadata) +rownames(metadata) <- metadata$sample_name +degPCA(beta_funnorm, metadata, condition = 'tissue', shape = 'dna_quant_level') + + scale_color_cb_friendly() + +``` + +```{r heatmap_all_samples, fig.width = 12, fig.height = 10} +beta_funnorm_cor <- cor(beta_funnorm) + +colma <- metadata %>% select(dna_quant_level, tissue) + +anno_colors=lapply(colnames(colma), function(c){ + l.col=c(cb_friendly_pal('grey')(length(unique(colma[[c]])) - 1), 'white') + names(l.col)=unique(colma[[c]]) + l.col +}) +names(anno_colors)=colnames(colma) +p <- pheatmap(beta_funnorm_cor, + annotation = colma, + annotation_colors = anno_colors, + # show_rownames = F, + # show_colnames = F, + color = cb_friendly_pal('heatmap')(15) +) +p +``` + +## Blood +```{r pca_blood_samples, fig.height = 7} + +# TODO remove from here to next TODO if PCA per tissue type is not needed +metadata_blood <- metadata %>% filter(tissue == 'blood') +beta_funnorm_blood <- beta_funnorm[, colnames(beta_funnorm) %in% rownames(metadata_blood)] + +degPCA(beta_funnorm_blood, metadata_blood, condition = 'subject_id', shape = 'dna_quant_level') + + scale_color_cb_friendly() + + +``` + +```{r heatmap_blood_samples, fig.width = 10, fig.height = 8} +beta_funnorm_cor_blood <- cor(beta_funnorm_blood) + +colma_blood <- metadata_blood %>% select(dna_quant_level) + +anno_colors_blood=lapply(colnames(colma_blood), function(c){ + l.col=c(cb_friendly_pal('grey')(length(unique(colma_blood[[c]])) - 1), 'white') + names(l.col)=unique(colma_blood[[c]]) + l.col +}) +names(anno_colors_blood)=colnames(colma_blood) +p <- pheatmap(beta_funnorm_cor_blood, + annotation = colma_blood, + annotation_colors = anno_colors_blood, + # show_rownames = F, + # show_colnames = F, + color = cb_friendly_pal('heatmap')(15) +) +p + +``` + +## Prostate +```{r pca_prostate_samples} +metadata_prostate <- metadata %>% filter(tissue != 'blood') +beta_funnorm_prostate <- beta_funnorm[, colnames(beta_funnorm) %in% rownames(metadata_prostate)] + +degPCA(beta_funnorm_prostate, metadata_prostate, condition = 'subject_id', shape = 'dna_quant_level') + + scale_color_cb_friendly() + +``` + +```{r heatmap_prostate_samples, fig.width = 10, fig.height = 8} +beta_funnorm_cor_prostate <- cor(beta_funnorm_prostate) + +colma_prostate <- metadata_prostate %>% select(dna_quant_level) + +anno_colors_prostate=lapply(colnames(colma_prostate), function(c){ + l.col=c(cb_friendly_pal('grey')(length(unique(colma_prostate[[c]])) - 1), 'white') + names(l.col)=unique(colma_prostate[[c]]) + l.col +}) +names(anno_colors_prostate)=colnames(colma_prostate) +p <- pheatmap(beta_funnorm_cor_prostate, + annotation = colma_prostate, + annotation_colors = anno_colors_prostate, + # show_rownames = F, + # show_colnames = F, + color = cb_friendly_pal('heatmap')(15) +) +p + +# TODO +``` + + +# Filtered Sample Clustering {.tabset} + +Poor performing probes as well as probes with SNPs were filtered from the data. + +```{r filter probes} + +detP <- detectionP(rgSet) +detP <- detP[match(featureNames(MSet_funnorm), rownames(detP)),] +keep <- rowSums(detP < 0.01) == ncol(MSet_funnorm) +MSet_funnorm_filt <- MSet_funnorm[keep,] +MSet_funnorm_filt <- dropLociWithSnps(MSet_funnorm_filt) + +``` + +## All samples + +```{r pca_all_samples_filt} +beta_funnorm_filt <- getBeta(MSet_funnorm_filt) +metadata <- as.data.frame(metadata) +rownames(metadata) <- metadata$sample_name +degPCA(beta_funnorm_filt, metadata, condition = 'tissue', shape = 'dna_quant_level') + + scale_color_cb_friendly() + +``` + +```{r heatmap_all_samples_filt, fig.width = 12, fig.height = 10} +beta_funnorm_filt_cor <- cor(beta_funnorm_filt) + +colma <- metadata %>% select(dna_quant_level, tissue) + +anno_colors=lapply(colnames(colma), function(c){ + l.col=c(cb_friendly_pal('grey')(length(unique(colma[[c]])) - 1), 'white') + names(l.col)=unique(colma[[c]]) + l.col +}) +names(anno_colors)=colnames(colma) +p <- pheatmap(beta_funnorm_filt_cor, + annotation = colma, + annotation_colors = anno_colors, + # show_rownames = F, + # show_colnames = F, + color = cb_friendly_pal('heatmap')(15) +) +p +``` + +## Blood +```{r pca_blood_samples_filt, fig.height = 7} + +# TODO remove from here to next TODO if PCA per tissue type is not relevant + +metadata_blood <- metadata %>% filter(tissue == 'blood') +beta_funnorm_filt_blood <- beta_funnorm_filt[, colnames(beta_funnorm_filt) %in% rownames(metadata_blood)] + +degPCA(beta_funnorm_filt_blood, metadata_blood, condition = 'subject_id', shape = 'dna_quant_level') + + scale_color_cb_friendly() + + +``` + +```{r heatmap_blood_samples_filt, fig.width = 10, fig.height = 8} +beta_funnorm_filt_cor_blood <- cor(beta_funnorm_filt_blood) + +colma_blood <- metadata_blood %>% select(dna_quant_level) + +anno_colors_blood=lapply(colnames(colma_blood), function(c){ + l.col=c(cb_friendly_pal('grey')(length(unique(colma_blood[[c]])) - 1), 'white') + names(l.col)=unique(colma_blood[[c]]) + l.col +}) +names(anno_colors_blood)=colnames(colma_blood) +p <- pheatmap(beta_funnorm_filt_cor_blood, + annotation = colma_blood, + annotation_colors = anno_colors_blood, + # show_rownames = F, + # show_colnames = F, + color = cb_friendly_pal('heatmap')(15) +) +p + +``` + +## Prostate +```{r pca_prostate_samples_filt} +metadata_prostate <- metadata %>% filter(tissue != 'blood') +beta_funnorm_filt_prostate <- beta_funnorm_filt[, colnames(beta_funnorm_filt) %in% rownames(metadata_prostate)] + +degPCA(beta_funnorm_filt_prostate, metadata_prostate, condition = 'subject_id', shape = 'dna_quant_level') + + scale_color_cb_friendly() + +``` + +```{r heatmap_prostate_samples_filt, fig.width = 10, fig.height = 8} +beta_funnorm_filt_cor_prostate <- cor(beta_funnorm_filt_prostate) + +colma_prostate <- metadata_prostate %>% select(dna_quant_level) + +anno_colors_prostate=lapply(colnames(colma_prostate), function(c){ + l.col=c(cb_friendly_pal('grey')(length(unique(colma_prostate[[c]])) - 1), 'white') + names(l.col)=unique(colma_prostate[[c]]) + l.col +}) +names(anno_colors_prostate)=colnames(colma_prostate) +p <- pheatmap(beta_funnorm_filt_cor_prostate, + annotation = colma_prostate, + annotation_colors = anno_colors_prostate, + # show_rownames = F, + # show_colnames = F, + color = cb_friendly_pal('heatmap')(15) +) +p + +# TODO + +``` + +# Epigenetic Clocks {.tabset} + + +```{r calculate epi age} +clock_rownames <- gsub('_[TB]+C[12]+1$', '', rownames(MSet_funnorm_filt)) +dups <- clock_rownames[duplicated(clock_rownames)] +keep <- !(clock_rownames %in% dups) + +MSet_funnorm_clock <- MSet_funnorm_filt[keep, ] +rownames(MSet_funnorm_clock) <- gsub('_[TB]+C[12]+1$', '', rownames(MSet_funnorm_clock)) +# cpgs.missing <- checkClocks(MSet_funnorm_clock) + + +age <- DNAmAge(MSet_funnorm_clock) +age_merged <- age %>% left_join(metadata, by = c('id' = 'sample_name')) %>% + mutate(age = ifelse(is.na(age), agedx, age)) + +# plotDNAmAge(age_merged$Horvath, age_merged$age) +``` + +## Horvath +```{r horvath} +ggplot(age_merged, aes(x = age, y = Horvath, color = tissue, shape = dna_quant_level)) + + geom_point() + + geom_text_repel(data = age_merged %>% filter(Horvath < 50), + aes(x = age, y = Horvath, color = tissue, label = id)) + + scale_color_cb_friendly() + +``` + +## Levine +```{r levine} +ggplot(age_merged, aes(x = age, y = Levine, color = tissue, shape = dna_quant_level)) + + geom_point() + + geom_text_repel(data = age_merged %>% filter(Levine < 40), + aes(x = age, y = Levine, color = tissue, label = id)) + + scale_color_cb_friendly() + +``` + +# Markers + +We observe that the CpG sites closest to the canonical TSS of GSTP1 are more methylated in malignant samples than other samples, although we see the opposite effect or little difference for sites farther from the TSS. + +```{r markers, fig.width = 12} + +# TODO replace with markers of interest, or remove this chunk if not relevant +annotation <- getAnnotation(MSet_funnorm_filt) + +gstp1 <- annotation %>% as.data.frame() %>% + filter(chr == 'chr11', pos < 67590000, pos > 67580000, strand == '+') + +gstp1_beta <- beta_funnorm_filt %>% as.data.frame() %>% rownames_to_column('Name') %>% + filter(Name %in% gstp1$Name) %>% + pivot_longer(!Name, names_to = 'sample_name', values_to = 'beta') %>% + left_join(metadata) %>% + left_join(gstp1) %>% + mutate(dist_from_TSS = pos - 67583812) + +ggplot(gstp1_beta, aes(x = as.factor(dist_from_TSS), y = beta, fill = tissue)) + geom_boxplot() + + facet_wrap(~tissue) + scale_fill_cb_friendly() + xlab('distance from TSS') + ggtitle('GSTP1 probes') + +``` + + +```{r, eval = F} +cn <- getCN(MSet_funnorm_filt) + +cn_sample <- cn %>% as.data.frame() %>% rownames_to_column('Name') %>% + select(Name, `ID_170-11-002_Medium_1`) %>% + left_join(annotation %>% as.data.frame() %>% select(Name, chr, strand, pos)) %>% + arrange(chr, pos) %>% mutate(xpos = row_number()) + +ggplot(cn_sample, aes(x = pos, y = `ID_170-11-002_Medium_1`)) + geom_point() + facet_wrap(~chr) +``` + +# Conclusions + +TODO write here + +# R session + +List and version of tools used for the report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/information.R b/inst/templates/methylation/information.R similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/information.R rename to inst/templates/methylation/information.R diff --git a/inst/rmarkdown/templates/teaseq/skeleton/QC/QC-01-load_data.R b/inst/templates/multiomics/teaseq/QC/QC-01-load_data.R similarity index 100% rename from inst/rmarkdown/templates/teaseq/skeleton/QC/QC-01-load_data.R rename to inst/templates/multiomics/teaseq/QC/QC-01-load_data.R diff --git a/inst/rmarkdown/templates/teaseq/skeleton/QC/QC-02-run_analysis.R b/inst/templates/multiomics/teaseq/QC/QC-02-run_analysis.R similarity index 100% rename from inst/rmarkdown/templates/teaseq/skeleton/QC/QC-02-run_analysis.R rename to inst/templates/multiomics/teaseq/QC/QC-02-run_analysis.R diff --git a/inst/rmarkdown/templates/teaseq/skeleton/QC/QC.Rmd b/inst/templates/multiomics/teaseq/QC/QC.Rmd similarity index 97% rename from inst/rmarkdown/templates/teaseq/skeleton/QC/QC.Rmd rename to inst/templates/multiomics/teaseq/QC/QC.Rmd index 9f1a302..85d0e44 100644 --- a/inst/rmarkdown/templates/teaseq/skeleton/QC/QC.Rmd +++ b/inst/templates/multiomics/teaseq/QC/QC.Rmd @@ -70,6 +70,19 @@ opts_chunk[["set"]]( ) ``` +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-draft-grey) revision. + ```{r setup, cache=FALSE, message=FALSE} library(Seurat) library(tidyverse) diff --git a/inst/rmarkdown/templates/teaseq/skeleton/README.md b/inst/templates/multiomics/teaseq/README.md similarity index 100% rename from inst/rmarkdown/templates/teaseq/skeleton/README.md rename to inst/templates/multiomics/teaseq/README.md diff --git a/inst/rmarkdown/templates/singlecell/skeleton/information.R b/inst/templates/multiomics/teaseq/information.R similarity index 100% rename from inst/rmarkdown/templates/singlecell/skeleton/information.R rename to inst/templates/multiomics/teaseq/information.R diff --git a/inst/rmarkdown/templates/teaseq/skeleton/scripts/fix_filenames.R b/inst/templates/multiomics/teaseq/scripts/fix_filenames.R similarity index 100% rename from inst/rmarkdown/templates/teaseq/skeleton/scripts/fix_filenames.R rename to inst/templates/multiomics/teaseq/scripts/fix_filenames.R diff --git a/inst/rmarkdown/templates/teaseq/skeleton/scripts/gex_adt_hto.sbatch b/inst/templates/multiomics/teaseq/scripts/gex_adt_hto.sbatch similarity index 100% rename from inst/rmarkdown/templates/teaseq/skeleton/scripts/gex_adt_hto.sbatch rename to inst/templates/multiomics/teaseq/scripts/gex_adt_hto.sbatch diff --git a/inst/rmarkdown/templates/teaseq/skeleton/scripts/gex_atac.sbatch b/inst/templates/multiomics/teaseq/scripts/gex_atac.sbatch similarity index 100% rename from inst/rmarkdown/templates/teaseq/skeleton/scripts/gex_atac.sbatch rename to inst/templates/multiomics/teaseq/scripts/gex_atac.sbatch diff --git a/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd b/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd new file mode 100644 index 0000000..3a6674d --- /dev/null +++ b/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd @@ -0,0 +1,244 @@ +--- +title: "Comparing DE Results - Pairwise" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + project_file: ../information.R +--- +```{r, message=FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +library(tidyverse) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(rtracklayer) +library(tidyverse) +library(stringr) +library(ggpubr) +library(knitr) +library(bcbioR) +library(ggprism) +library(viridis) +library(pheatmap) +library(janitor) +library(ggvenn) +library(ggplot2) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) +``` + +# Compare two differential expression analysis + +```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} +# 1. Set up input files in this R file (params_pairwisecomp.R) +## Full results file (all genes) for contrastt 1 +comp1_fn <- 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group1.csv.gz' +## Name of contrast 1. This will be displayed on the figures +comp1_name <- "DMSO vs. Group1" +## Full results file (all genes) for contrast 2 +comp2_fn <- 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group2.csv.gz' +## Name of contrast 2. This will be displayed on the figures +comp2_name <- "DMSO vs. Group2" +## Adjusted P-value used for significance +padj_co <- 0.05 +## Log2FC used for significance. If no cutoff used put 0 +LFC <- 0.5 + +comp1 <- read_csv(comp1_fn) %>% + dplyr::filter(padj <= 1) +comp2 <- read_csv(comp2_fn) %>% + dplyr::filter(padj <= 1) +``` + +## Load Data + +We load our dataset + +```{r load_data} +# this code will load from bcbio or nf-core folder +# NOTE make sure to set numerator and denominator + +comp1_sig <- comp1 %>% + dplyr::filter(padj < padj_co, abs(lfc) > LFC) + +comp2_sig <- comp2 %>% + dplyr::filter(padj < padj_co, abs(lfc) > LFC) +``` + + +# Comparisons + +We start with a venn diagram looking at the overlap between our two contrasts + +```{r, fig.height=8, fig.width=8} +name1 <- rlang::ensym(comp1_name) +name2 <- rlang::ensym(comp2_name) +names <- c(name1, name2) + +full <- list(comp1_sig$gene_id,comp2_sig$gene_id) +names(full) <-names + +ggvenn(full, show_percentage = F) + +``` + +## Compare effect sizes and direction + +We plot Log2FC for our contrasts and color points by whether or not they are significant in our contrasts. The black line is 1:1. + + +```{r fig.height=6, fig.width=8} +# Edit based on the data you are using + +#make sure to only use genes present in both results files +test_intersect <- intersect(comp1$gene_id, comp2$gene_id) +comp1_sub <- subset(comp1, comp1$gene_id %in% test_intersect) +comp2_sub <- subset(comp2, comp2$gene_id %in% test_intersect) + +## Check that gene names match +all(comp1_sub$gene_id== comp2_sub$gene_id) + +## Gather necessary data +lfc <- data.frame(comp1_sub$gene_id, comp1_sub$gene_name, comp1_sub$lfc, comp2_sub$lfc) +colnames(lfc) <- c("gene_id","gene_name", "comp1", "comp2") + +# subset to only include genes in both datasets and color by grouping +DE_comp1 <- setdiff(comp1_sig$gene_id, comp2_sig$gene_id) +DE_comp2 <- setdiff(comp2_sig$gene_id, comp1_sig$gene_id) +DE_both <- intersect(comp2_sig$gene_id, comp1_sig$gene_id) +not_sig <- comp1_sub$gene_id[!(comp1_sub$gene_id %in% c(DE_comp1,DE_comp2,DE_both))] + + +col <- rep(4, nrow(lfc)) +col[lfc$gene_id %in% not_sig] <- 1 +col[lfc$gene_id %in% DE_comp1] <- 2 +col[lfc$gene_id %in% DE_comp2] <- 3 +col[lfc$gene_id %in% DE_both] <- 4 + + +lfc$col <- lfc %>% + dplyr::mutate(color = case_when( + gene_id %in% DE_both ~ 3, + gene_id %in% DE_comp1 ~ 1, + gene_id %in% DE_comp2 ~ 2, + gene_id %in% not_sig ~ 8 + )) %>% pull(color) +lfc$col <- as.factor(lfc$col) + + +ggplot(lfc, aes(x=comp1, y=comp2, color=col)) + geom_point() + + labs(color="Group") + + scale_color_discrete(name = "Group", labels = c(paste0("Only DE in ",paste0(comp1_name)), paste0("Only DE in ",paste0(comp2_name)),"DE in both comparisons", "Not Significant")) + + geom_abline(intercept=0, slope=1) + + geom_hline(aes(yintercept=0)) + + geom_vline(aes(xintercept=0)) + + scale_color_cb_friendly() + + xlab(paste0("Log2FC in ",paste0(comp1_name))) + + ylab(paste0("Log2FC in ",paste0(comp2_name))) + +``` + + +## Compare ajusted P-values + +We plot adjusted P-values for our contrasts and color points by whether or not they are significant in our contrasts. The black line is 1:1. + + +```{r fig.height=6, fig.width=8} +# Edit based on the data you are using + +#make sure to only use genes present in both results files +test_intersect <- intersect(comp1$gene_id, comp2$gene_id) +comp1_sub <- subset(comp1, comp1$gene_id %in% test_intersect) +comp2_sub <- subset(comp2, comp2$gene_id %in% test_intersect) + +## Check that gene names match +all(comp1_sub$gene_id== comp2_sub$gene_id) + +## Gather necessary data +lfc <- data.frame(comp1_sub$gene_id, comp1_sub$gene_name, comp1_sub$padj, comp2_sub$padj) +colnames(lfc) <- c("gene_id","gene_name", "comp1", "comp2") + +# subset to only include genes in both datasets and color by grouping +DE_comp1 <- setdiff(comp1_sig$gene_id, comp2_sig$gene_id) +DE_comp2 <- setdiff(comp2_sig$gene_id, comp1_sig$gene_id) +DE_both <- intersect(comp2_sig$gene_id, comp1_sig$gene_id) +not_sig <- comp1_sub$gene_id[!(comp1_sub$gene_id %in% c(DE_comp1,DE_comp2,DE_both))] + + +col <- rep(4, nrow(lfc)) +col[lfc$gene_id %in% not_sig] <- 1 +col[lfc$gene_id %in% DE_comp1] <- 2 +col[lfc$gene_id %in% DE_comp2] <- 3 +col[lfc$gene_id %in% DE_both] <- 4 + + +lfc$col <- lfc %>% + dplyr::mutate(color = case_when( + gene_id %in% DE_both ~ 3, + gene_id %in% DE_comp1 ~ 1, + gene_id %in% DE_comp2 ~ 2, + gene_id %in% not_sig ~ 8 + )) %>% pull(color) +lfc$col <- as.factor(lfc$col) + + +ggplot(lfc, aes(x=-log10(comp1), y=-log10(comp2), color=col)) + + geom_point() + labs(color="Group") + + scale_color_discrete(name = "Group", labels = c(paste0("-Log10 adjusted p-value ",paste0(comp1_name)), paste0("-Log10 adjusted p-value ",paste0(comp2_name)),"DE in both comparisons", "Not Significant")) + + geom_abline(intercept=0, slope=1) + + geom_hline(aes(yintercept=0)) + + geom_vline(aes(xintercept=0)) + + scale_color_cb_friendly() + + xlab(paste0("Log2FC in ",paste0(comp1_name))) + + ylab(paste0("Log2FC in ",paste0(comp2_name))) + +``` + + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/templates/rnaseq/DE/DEG.Rmd b/inst/templates/rnaseq/DE/DEG.Rmd new file mode 100644 index 0000000..098a49d --- /dev/null +++ b/inst/templates/rnaseq/DE/DEG.Rmd @@ -0,0 +1,709 @@ +--- +title: "Differential Expression" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: inline +params: + ## Combatseq and ruv can both be false or ONLY ONE can be true + # numerator: tumor + # denominator: normal + column: "sample_type" + contrasts: !r list(c("sample_type", "tumor", "normal")) + subset_column: null + subset_value: null + genome: hg38 + ruv: true + combatseq: false + params_file: params_de-example.R + project_file: ../information.R + functions_file: ../libs +--- + +Template developed with materials from https://hbctraining.github.io/main/. + +```{r} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-stable-green) revision. + +```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} +library(tidyverse) +# 1. Set up input files in this R file (params_de.R) +source(params$params_file) +# 2. Set up project file (already done from QC probably) +source(params$project_file) +# 3. Load custom functions to load data from coldata/metrics/counts +map(list.files(params$functions_file,pattern = "*.R$",full.names = T), source) %>% invisible() +# IMPORTANT set these values if you are not using the parameters in the header (lines 22-31) +genome=params$genome +column=params$column +contrasts=params$contrasts +# numerator=params$numerator +# denominator=params$denominator +subset_column=params$subset_column +subset_value=params$subset_value +run_ruv=params$ruv +run_combatseq=params$combatseq +run_rmv=run_ruv | run_combatseq +factor_of_interest <- column +``` + + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(rtracklayer) +library(DESeq2) +library(tidyverse) +library(stringr) +library(DEGreport) +library(ggpubr) +library(msigdbr) +library(fgsea) +library(org.Hs.eg.db) +library(knitr) +library(EnhancedVolcano) +library(bcbioR) +library(ggprism) +library(viridis) +library(pheatmap) +library(janitor) +library(ggforce) +library(vegan) +library(htmltools) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) +``` + +```{r sanitize_datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + +```{r load_data, message=F, warning=F} +# This code will load from bcbio or nf-core folder +# NOTE make sure to set numerator and denominator +coldata <- load_coldata(coldata_fn, column, + subset_column, subset_value) +coldata$sample=row.names(coldata) + +counts <- load_counts(counts_fn) +counts <- counts[,colnames(counts) %in% coldata$sample] + +metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>% + left_join(coldata, by = c('sample')) %>% + as.data.frame() +rownames(metrics) <- metrics$sample +# if the names don't match in order or string check files names and coldata information +counts = counts[,rownames(metrics)] +coldata = coldata[rownames(metrics),] +coldata[[contrasts[[1]][1]]] = relevel(as.factor(coldata[[contrasts[[1]][1]]]), contrasts[[1]][3]) +stopifnot(all(names(counts) == rownames(metrics))) +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` +- Aim: `r aim` + +```{r load_counts_data} + +rdata = AnnotationDbi::select(org.Hs.eg.db, rownames(counts), 'SYMBOL', 'ENSEMBL') %>% + dplyr::select(gene_id = ENSEMBL, gene_name = SYMBOL) %>% + distinct(gene_id, .keep_all = TRUE) + +``` + +# Set up + +We recommend not to filter before DESeq2 since it will be handling by it. There are cases where pre-filtering could be good: + +- large number of drop-outs, to reduce computation +- large number of samples, to reduce computation +- unbalanced groups, many less samples for one group than another, maybe filtering by group is an option. + + +```{r setup_RUV} +dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~1) + +vsd_before <- vst(dds_to_use) +norm_matrix = assay(vsd_before) +``` + + +# PCA and group level variance. + +**Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells).** + +Dispersion estimates are a key part of the DESEQ2 analysis. DESEQ2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per gene dispersions to match this distribution. If one group has higher variance than all others this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal volume enclosing ellipses using the Khachiyan algorithm. + +**It is best practice NOT to subset your data unless one group has significantly higher variance than the others. The best dispersion estimates are obtained with more data.** + +**This code automatically uses the column value from the header. You can also manually add a factor of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.** + +```{r set group, eval=FALSE, echo=FALSE} +## Example of creating a group covariate + +meta$group <- paste0(meta$sex,"_", meta$age,"_",meta$treatment) + +factor_of_interest <- "insert column name for covariate of interest" +``` + + +```{r PCA} +pca <- degPCA(norm_matrix, metrics, + condition = factor_of_interest, name = "sample", data = T) + +pca$plot + ggtitle(paste0("All samples", "\nPCA using ", nrow(vsd_before), " genes")) + + theme(plot.title=element_text(hjust=0.5)) + + geom_mark_ellipse(aes(color = sample_type)) + scale_color_cb_friendly() +``` + +## Analysis of the variance by group + +Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test. PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. [Source](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/) + +Here we apply this test to our variance stabilized data. We calculate distances between samples and then use the `betadisper()` function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the `anova()` function and the second from the `permutest()` function. We also get pairwise p-values for every group-group comparison. + +```{r PERMDISP} +vare.disa <- vegdist(t(assay(vsd_before))) + +mod = betadisper(vare.disa, metrics[[factor_of_interest]]) +anova(mod) +permutest(mod, pairwise = TRUE) + +``` + + +# Covariate analysis + +Multiple factors related to the experimental design or quality of sequencing may influence the outcomes of a given RNA-seq experiment. To further determine whether any confounding covariate risks affecting the results of our differential expression analyses, it is useful to assess the correlation between covariates and principal component (PC) values. + +Here, we are using `DEGreport::degCovariates()` to explore potential correlations between variables provided in the metadata and all PCs that account for at least 5% of the variability in the data. If applicable, significant correlations (FDR < 0.1) are circled. **This diagnostic plot helps us determine which variables we may need to add to our DE model.** + + +```{r covariates, fig.height = 6, fig.width = 10} +degCovariates( + norm_matrix, + metrics, +) +``` + +# Data modeling + + +```{r init_DESEQ} +formula <- as.formula(paste0("~ ", " + ", column)) +## Check if sample name matches +stopifnot(all(names(counts) == rownames(coldata))) + +dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = formula) + +vsd_before <- vst(dds_to_use) +norm_matrix = assay(vsd_before) +new_cdata <- coldata +``` + +For this study, this formula is recommended: `r as.character(formula)` + +```{r, eval=F, echo=FALSE} +#### IF YOU ARE RUNNING RUV OR COMBATSEQ RUN THE CHUNKS BELOW OTHERWISE SKIP TO Differential Expression SECTION or remove this section +``` + +## Remove unwanted variation + +Removing unwanted variation from RNA-seq analysis is essential to ensure that the results reflect biological rather than technical differences. Methods like ComBat, RUVseq, or surrogate variable analysis (SVA) can be applied to adjust for batch effects, library preparation discrepancies, or other confounders. These techniques model and subtract the unwanted variation, enhancing the ability to detect true biological signals in the data. Proper normalization and careful experimental design are also crucial steps to mitigate such unwanted variation. + +### Assessing unknown factors + +```{r, results='asis'} +if (run_ruv){ + cat("When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.") +}else{ + cat("There is no need to assess unknown factor for this study.") +} +``` + +```{r do_RUV, eval=run_ruv, echo=run_ruv} +# TOFIX Add to template: check correlation of dummy variables produced by ruvseq with existing covariates in metadata +# NOTE ruvseq (used when you don’t know where the unwanted variation is coming from. Package utilizes dummy variable(s), 1-5 used, start with 1, look at PCA, decide if you want more separation) Add any known-created RUV variables to DESeq2 formula. Normalized matrix produced – only for visualization, not for input into DESeq2 +library(RUVSeq) + +# If you want to skip the code, just set up formula to be your model in the next chunk of code +design <- coldata[[column]] +diffs <- makeGroups(design) +dat <- norm_matrix +# by default is running one variable, +# change K parameter to other number to find more unknown covariates +ruvset <- RUVs(dat, cIdx=rownames(dat), k=1, diffs, isLog = T, round = F) +vars <- ruvset$W + +new_cdata <- cbind(coldata, vars) + +formula <- as.formula(paste0("~ ", + paste0( + colnames(new_cdata)[grepl("W", colnames(new_cdata))], + collapse = " + " + ), " + ", column) +) +norm_matrix=ruvset$normalizedCounts # NOTE use this for visualization +pca2 <- degPCA(norm_matrix, new_cdata, + condition = column) + ggtitle('After RUV') +pca2 + scale_color_cb_friendly() + +``` + +```{r after_RUV, eval=run_ruv} +dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula) +vsd_to_use<- vst(dds_to_use, blind=FALSE) +``` + +### Remove Batch Effects + +```{r combat-text , eval=run_combatseq, results='asis', echo=run_combatseq} +# NOTE Combatseq (part of the SVA package) - corrected count, removing the effects while retaining the structure of the data. Used in a scenario where you know what covariate/batch is. Do not add know-removed known covariates to DESeq2 formula. Also, don’t attempt to remove biological effect (e.g. donor), this is not conceptually valid; best for technical variation. +library(sva) + +cat("Here we apply Combat-seq (https://github.com/zhangyuqing/ComBat-seq) to try to remove batch effects so we can better tease out the effects of interest. +Combat-seq uses a negative binomial regression to model batch effects, providing adjusted data by mapping the original data to an expected distribution if there were no batch effects. The adjusted data preserves the integer nature of counts, so that it is compatible with the assumptions of state-of-the-art differential expression software (e.g. edgeR, DESeq2, which specifically request untransformed count data).") + +``` + +```{r, eval=!run_combatseq, results='asis', echo=run_combatseq} +cat("There is no need to remove known factors like batch effect in this study.") +``` + +```{r set_variable_combatseq, eval=run_combatseq, echo=run_combatseq} +# NOTE work on this code if you need to run combatseq + +# Set your batch effect variable here this is the variable that combatseq will try to remove + +## Column name of your batch variable +to_remove = "batch" + +## Column name of of your variable(s) of interest + +to_keep = "sample_type" + +coldata[[to_remove]] <- as.factor(coldata[[to_remove]]) +coldata[[to_keep]] <- as.factor(coldata[[to_keep]]) + +batch = coldata[[to_remove]] +treatment = coldata[[to_keep]] + +## If you have multiple variables of interest you will need to cbind them into one variable + +#treatment1 = metrics[[to_keep]] +#treatment2 = metrics[[to_keep]] +#treatment3 = metrics[[to_keep]] + +# imp = cbind(as.numeric(as.character(treatment1)),as.numeric(as.character(treatment2)), as.numeric(as.character(treatment3))) +``` + +```{r do_combatseq, eval=run_combatseq} +adjusted_counts <- ComBat_seq(as.matrix(counts), batch=batch, group = treatment) + +#NOTE For multiple variables of interest + +# adjusted_counts <- ComBat_seq(as.matrix(counts2), batch=batch, covar_mod = imp) +``` + +```{r after_combatseq, eval=run_combatseq} +# NOTE: Make sure the formula doens't contain the covariates used in combatseq above +dds_to_use <- DESeqDataSetFromMatrix(adjusted_counts, coldata, design = formula) +vsd_combat<- vst(dds_to_use, blind=FALSE) + +norm_matrix = assay(vsd_combat) + +pca_combat <- degPCA(norm_matrix, coldata, + condition = column) + ggtitle('After Combatseq') +pca_combat + scale_color_cb_friendly() +``` + +# Differential Expression + +Differential gene expression analysis of count data was performed using the Bioconductor R package, DESeq2, which fits the count data to a negative binomial model. + +Before fitting the model, we often look at a metric called dispersion, which is a measure for variance which also takes into consideration mean expression. A dispersion value is estimated for each individual gene, then 'shrunken' to a more accurate value based on expected variation for a typical gene exhibiting that level of expression. Finally, the shrunken dispersion value is used in the final GLM fit. + +We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model. + +```{r DE} +# NOTE Note VST won’t regress out this when normalizing +de <- DESeq(dds_to_use) +DESeq2::plotDispEsts(de) +``` + +Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated after the model is fit. We correct for this so that gene LFC is not dependent overall on basal gene expression level. + +In cases there are multiple groups and conditions across groups is recommended to use dummy variables instead of interaction terms: https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#interactions. + +The LRT is useful for testing multiple terms at once, for example testing 3 or more levels of a factor at once, or all interactions between two variables. The LRT for count data is conceptually similar to an analysis of variance (ANOVA) calculation in linear regression, except that in the case of the Negative Binomial GLM, we use an analysis of deviance (ANODEV), where the deviance captures the difference in likelihood between a full and a reduced model. + +```{r lfc_shrink} +# NOTE As a note: Use `ashr` for comparisons with many groups to be able to pull out all the contrasts; otherwise `apeglm` is fine. It shrinks less. + +# NOTE We recommend LRT for time series + +# resultsNames(de) # check the order is right +names_to_use=lapply(contrasts, function(contrast){coef = paste0(contrast[1], "_", contrast[2], "_vs_", contrast[3])}) +names(contrasts)=names_to_use +de_list=lapply(contrasts, function(contrast){ + resLFC = results(de, contrast=contrast) + coef = paste0(contrast[1], "_", contrast[2], "_vs_", contrast[3]) + resLFCS <- lfcShrink(de, coef=coef, type="apeglm") + # resLFCS <- lfcShrink(de, contrast=contrast, type="ash") + + res <- as.data.frame(resLFCS) %>% + rownames_to_column('gene_id') %>% left_join(rdata, by = 'gene_id') %>% + relocate(gene_name) %>% dplyr::rename(lfc = log2FoldChange) %>% + mutate(pi = abs(lfc) * -log10(padj)) %>% arrange(-pi) + + ## Filter out genes that have no expression or were filtered out by DESEQ2 + res <- res[res$baseMean>0,] %>% drop_na(padj) %>% drop_na(pvalue) + + res_sig <- res %>% filter(padj < 0.05) %>% arrange(padj) %>% + mutate(gene_name = ifelse(is.na(gene_name), gene_id, gene_name)) + results=list(lfc=resLFC, lfcs=resLFCS, all=res, sig=res_sig) + return(results) +}) + +# NOTE if you add manually any other comparison to the list with the following variables, +# the code below will make the plots for those as wells: +# de_list=c(de_list, new_comparison=list(lfc=resLFC, lfcs=resLFCS, all=res, sig=res_sig)) +``` + +## MA plot {.tabset} + +This plot can help to: +- Identify Differential Expression: Genes that show a significant log-fold change (M value away from 0) indicate changes in expression between conditions. +- Assess Data Quality: The plot can help in identifying biases or systematic errors in the data. Ideally, most points should scatter around the M=0 line, indicating that there is no significant systematic difference between the conditions. +- Visualize data dispersion: The distribution of points along the A-axis gives a sense of the spread of expression levels and any patterns or anomalies in the dataset. + +```{r after_lfc_shrink, results='asis', message=F, warning=F} +for (contrast in names(de_list)){ + cat("### ", contrast, "\n\n") + p1=degMA(as.DEGSet(de_list[[contrast]]$lfc)) + ggtitle('Before LFC Shrinking') + print(p1) + p2=degMA(as.DEGSet(de_list[[contrast]]$lfcs), limit = 2) + ggtitle('After LFC Shrinking') + print(p2) + cat("\n\n") +} +``` + +## Volcano plot {.tabset} + +This volcano plot shows the genes that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in red are genes that have padj < 0.05 and a log2-fold change > 1. Points in blue have a padj < 0.05 and a log2-fold change < 1 and points in green have a padj > 0.05 and a log2-fold change > 2. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2 foldchance and padj that we have chosen. + +```{r volcano_plot, fig.height=6, results='asis'} +# degVolcano(res_mod[,c('lfc', 'padj')], plot_text = show) +for (contrast in names(de_list)){ + cat("### ", contrast, "\n\n") + res <- de_list[[contrast]][["all"]] + res_mod <- res %>% mutate(lfc = replace(lfc, lfc < -5, -5)) %>% mutate(lfc = replace(lfc, lfc > 5, 5)) + show <- as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")]) + p1=EnhancedVolcano(res_mod, + lab= res_mod$gene_name, + pCutoff = 0.05, + selectLab = c(show$gene_name), + FCcutoff = 0.5, + x = 'lfc', + y = 'padj', + title=contrast, + col=as.vector(colors[c("dark_grey", "light_blue", + "purple", "purple")]), + subtitle = "", drawConnectors = T, max.overlaps = Inf) + print(p1) + cat("\n\n") +} +``` + +## Heatmap {.tabset} + +```{r heapmap, results='asis'} +### Run pheatmap using the metadata data frame for the annotation +for (contrast in names(de_list)){ + cat("### ", contrast, "\n\n") + res_sig = de_list[[contrast]][["sig"]] + ma=norm_matrix[res_sig$gene_id,] + colma=coldata[,c(column), drop=FALSE] + ma_colors=lapply(colnames(colma), function(c){ + l.col=colors[1:length(unique(colma[[c]]))] + names(l.col)=unique(colma[[c]]) + l.col + }) + names(ma_colors)=colnames(colma) + p1=pheatmap(ma, + color = inferno(10), + cluster_rows = T, + show_rownames = F, + annotation = colma, + annotation_colors = ma_colors, + border_color = NA, + fontsize = 10, + scale = "row", + fontsize_row = 10, + height = 20) + print(p1) + cat("\n\n") +} +``` + + +## Differentially Expressed Genes {.tabset} + +```{r sig_genes_table, results='asis'} +dt_list=list() +for (contrast in names(de_list)){ + res_sig=de_list[[contrast]][["sig"]] + dt_list=c(dt_list, + list(h3(contrast)), + list(DT::datatable(res_sig))) +} +tagList(dt_list) +``` + +## Plot top 16 genes {.tabset} + +```{r top n DEGs, fig.height = 6, fig.width = 8, results='asis'} +n = 16 + +for (contrast in names(de_list)){ + cat("### ", contrast, "\n\n") + res_sig = de_list[[contrast]][["sig"]] + top_n <- res_sig %>% slice_min(order_by = padj, n = n, with_ties = F) %>% + dplyr::select(gene_name, gene_id) + top_n_exp <- norm_matrix %>% as.data.frame() %>% + rownames_to_column('gene_id') %>% + # dplyr::select(-group, -group_name) %>% + pivot_longer(!gene_id, names_to = 'sample', values_to = 'log2_expression') %>% + right_join(top_n, relationship = "many-to-many") %>% + left_join(coldata, by = 'sample') + + p1=ggplot(top_n_exp, aes_string(x = column, y = 'log2_expression')) + + geom_boxplot(outlier.shape = NA, linewidth=0.5, color="grey") + + geom_point() + + facet_wrap(~gene_name) + + ggtitle(str_interp('Expression of Top ${n} DEGs')) + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + print(p1) + cat("\n\n") +} +``` + +# Pathway Enrichment + +From the set of differentially expressed genes and using publicly available information about gene sets involved in biological processes and functions, we can calculate which biological processes and functions are significantly perturbed as a result of the treatment. + +```{r} +all_in_life=get_databases() +all_in_life=get_databases_v2() +``` + +# Pathway Analysis- GSEA + +Gene Set Enrichment Analysis (GSEA) is a computational method used to determine whether a predefined set of genes shows statistically significant, concordant differences between two biological states (e.g., disease vs. normal) in RNA-seq data or other types of gene expression data. Advantages of GSEA. + +- Biological Insight: Helps in understanding the underlying biological processes and pathways affected, rather than focusing on individual genes. +- Incorporation of Prior Knowledge: Utilizes predefined gene sets, allowing integration of existing biological knowledge. +- Contextual Relevance: Can reveal subtle but coordinated changes in biologically meaningful gene sets that might not be apparent when looking at individual genes. + +```{r, warning=F, message=F} +fa_gsea_list=lapply(de_list,function(contrast){ + + res=contrast[["all"]] + gsea_input = res %>% filter(!is.na(padj)) %>% dplyr::select(gene_id, lfc) + #change to the right species + input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, gsea_input$gene_id, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) + input_entrezid <- inner_join(gsea_input, input_entrezid, by=c("gene_id"="ENSEMBL")) %>% + filter(!is.na(ENTREZID)) %>% + distinct(ENTREZID, .keep_all=TRUE) + + tb = run_fgsea_v2(input_entrezid, all_in_life) + tb %>% filter(padj<0.05) + }) +``` + +```{r, results='asis'} +# NOTE DT::datatables doesn't work with tabset and for loops +# You can use the following code to print dynamically or call manually sanitize_datatable() +# multiple times +dt_list=list() +for (contrast in names(de_list)){ + res_sig=fa_gsea_list[[contrast]] + dt_list=c(dt_list, + list(h3(contrast)), + list(sanitize_datatable(res_sig))) +} +tagList(dt_list) +``` + +# Pathway Analysis- Over-representation + +Over-Representation Analysis (ORA) is a statistical method used to determine whether a predefined set of genes (e.g., genes belonging to a specific biological pathway or function) is over-represented (or enriched) among a list of differentially expressed genes (DEGs) from RNA-seq data. Adventages of ORA: + +- Simplicity: Easy to perform and interpret. +- Biological Insight: Helps to identify pathways and processes that are significantly affected in the condition studied. +- Prior Knowledge Integration: Utilizes existing biological knowledge through predefined gene sets. + +```{r, warning=F, message=F} +fa_list=lapply(de_list,function(contrast){ + + res=contrast[["all"]] + universe=res %>% + filter(!is.na(padj)) %>% pull(gene_id) + universe_mapping = AnnotationDbi::select(org.Hs.eg.db, universe, 'ENSEMBL', columns=c('ENTREZID', 'SYMBOL')) + + ora_input = res %>% filter(!is.na(padj), padj<0.01, abs(lfc)> 0.3) %>% pull(gene_id) + #change to the right species + input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) + all=run_fora_v2(input_entrezid, universe_mapping,all_in_life) + + ora_input = res %>% filter(!is.na(padj), padj<0.01, lfc> 0.3) %>% pull(gene_id) + #change to the right species + input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) + up=run_fora_v2(input_entrezid, universe_mapping,all_in_life) + + ora_input = res %>% filter(!is.na(padj), padj<0.01, lfc< -0.3) %>% pull(gene_id) + #change to the right species + input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) + down=run_fora_v2(input_entrezid, universe_mapping,all_in_life) + + list(all=all,up=up,down=down) +}) +``` + +## All significant genes {.tabset} + +```{r, results='asis'} +# NOTE DT::datatables doesn't work with tabset and for loops +# You can use the following code to print dynamically or call manually sanitize_datatable() +# multiple times +dt_list=list() +for (contrast in names(de_list)){ + res_sig=fa_list[[contrast]][["all"]] + dt_list=c(dt_list, + list(h3(contrast)), + list(sanitize_datatable(res_sig))) +} +tagList(dt_list) +``` + + +## Down-regulated genes {.tabset} + +```{r, results='asis'} +dt_list=list() +for (contrast in names(de_list)){ + res_sig=fa_list[[contrast]][["down"]] + dt_list=c(dt_list, + list(h3(contrast)), + list(sanitize_datatable(res_sig))) +} +tagList(dt_list) +``` + + +## Up-regulated genes {.tabset} + +```{r, results='asis'} +dt_list=list() +for (contrast in names(de_list)){ + res_sig=fa_list[[contrast]][["up"]] + dt_list=c(dt_list, + list(h3(contrast)), + list(sanitize_datatable(res_sig))) +} +tagList(dt_list) +``` + + +# Save files + +```{r write_files} +if (!is.null(subset_value) & !is.null(subset_value)){ + filenames = str_interp("${subset_value}") +} else { + filenames = "full" +} +for (contrast in names(contrasts)){ + filenames = paste0(filenames, "_", contrast) + name_expression_fn=file.path( + basedir, + str_interp("${filenames}_expression.csv")) + + name_deg_fn=file.path( + basedir, + str_interp("${filenames}_deg.csv")) + + name_pathways_fn=file.path( + basedir, + str_interp("${filenames}_pathways.csv")) + + counts_norm=norm_matrix %>% as.data.frame() %>% + rownames_to_column("gene_id") %>% + mutate(comparison = contrast) + + res_for_writing <- de_list[[contrast]][["all"]] %>% + mutate(comparison = contrast) + + # NOTE choose what pathway to save, all, down or up, or everything, just + #. need to add more lines + pathways_for_writing <- fa_list[[contrast]][["all"]] %>% + mutate(comparison = contrast) + + write_csv(counts_norm, name_expression_fn) + write_csv(res_for_writing, name_deg_fn) + write_csv(pathways_for_writing, name_pathways_fn) +} + +``` + +# R session + +List and version of tools used for the DE report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/templates/rnaseq/DE/GSVA.Rmd b/inst/templates/rnaseq/DE/GSVA.Rmd new file mode 100644 index 0000000..c7028a3 --- /dev/null +++ b/inst/templates/rnaseq/DE/GSVA.Rmd @@ -0,0 +1,241 @@ +--- +title: "GSVA" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: inline +params: + # set column name and contrasts to be factors of interest + column: "sample_type" + contrasts: !r list(c("sample_type", "tumor", "normal")) + project_file: ../information.R + params_file: params_de-example.R + functions_file: ../libs + # select from gene set repository at https://github.com/bcbio/resources/tree/main/gene_sets/gene_sets + # choose geneset, click "Raw', and copy url + geneset_fn: https://raw.githubusercontent.com/bcbio/resources/main/gene_sets/gene_sets/20240904/human/h.all.v2024.1.Hs.entrez.gmt +--- + +```{r, message=FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +library(tidyverse) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. + +```{r libraries, message = FALSE, warning=FALSE} +# path to libraries if working on O2 +# .libPaths("/n/app/bcbio/R4.3.1_rnaseq/") + +## load libraries +library(GSVA) +library(GSEABase) +library(reshape2) +library(ChIPpeakAnno) +library(org.Hs.eg.db) +# library(org.Mm.eg.db) +library(AnnotationDbi) +library(DESeq2) +library(limma) +library(gridExtra) +library(bcbioR) +library(ggprism) +library(knitr) +library(rstudioapi) +library(tidyverse) +library(clusterProfiler) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) +``` + + +```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} +source(params$project_file) +source(params$params_file) +map(list.files(params$functions_file,pattern = "*.R$",full.names = T),source) %>% invisible() +column=params$column +contrasts=params$contrasts +subset_column=params$subset_column +subset_value=params$subset_value + +``` + +```{r sanitize_datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` +- Aim: `r aim` + +```{r load_data} + +coldata <- load_coldata(coldata_fn, column, + subset_column, subset_value) +coldata[[contrasts[[1]][1]]] = relevel(as.factor(coldata[[contrasts[[1]][1]]]), contrasts[[1]][3]) +coldata$sample=row.names(coldata) + +counts <- load_counts(counts_fn) +counts <- counts[,colnames(counts) %in% coldata$sample] + +``` + +# Method + +Gene Set Variation Analysis (GSVA) is a non-parametric, unsupervised method for estimating variation of gene set enrichment through the samples of a expression data set. GSVA performs a change in coordinate systems, transforming the data from a gene by sample matrix to a gene-set by sample matrix, thereby allowing the evaluation of pathway enrichment for each sample. This new matrix of GSVA enrichment scores facilitates applying standard analytical methods like functional enrichment, survival analysis, clustering, CNV-pathway analysis or cross-tissue pathway analysis, in a pathway-centric manner. More info in the vignette [here](https://bioconductor.org/packages/release/bioc/vignettes/GSVA/inst/doc/GSVA.html). + +Hänzelmann S, Castelo R, Guinney J (2013). “GSVA: gene set variation analysis for microarray and RNA-Seq data.” BMC Bioinformatics, 14, 7. doi:10.1186/1471-2105-14-7, [https://doi.org/10.1186/1471-2105-14-7](https://doi.org/10.1186/1471-2105-14-7) + +# Data + +```{r show_coldata} +coldata %>% sanitize_datatable() +``` + +```{r normalize_data} +dds <- DESeqDataSetFromMatrix(counts, + colData = coldata, + design = ~ 1) + +dds <- DESeq(dds) +norm_counts <- counts(dds, normalized=TRUE) + +``` + + +```{r ensembl_to_entrez} +## convert ensembl to entrez + +entrezIDs_all = convert2EntrezID(IDs=rownames(norm_counts), orgAnn="org.Hs.eg.db", + ID_type="ensembl_gene_id") + +entrezid <- mapIds(org.Hs.eg.db, keys = rownames(norm_counts), keytype="ENSEMBL", column = "ENTREZID") + +counts_entrez <- norm_counts +stopifnot(nrow(counts_entrez) == length(entrezid)) +rownames(counts_entrez) <- entrezid + +``` + + +# Prep and run GSVA + +```{r load_genesets} + +# gene_sets = read_table(params$geneset_fn, col_names = F) +gene_sets <- read.gmt(params$geneset_fn) + +genes_by_pathway <- split(gene_sets$gene, gene_sets$term) + +``` + + +```{r GSVA, message = F, warning = F} + +gsvaPar <- GSVA::gsvaParam(counts_entrez, genes_by_pathway, kcdf = "Poisson") + +gsva.es <- gsva(gsvaPar, verbose = F) + +``` + +## Test for Significance + +```{r limma} + +mod <- model.matrix(~ factor(coldata[[column]])) +fit <- lmFit(gsva.es, mod) +fit <- eBayes(fit) +res <- topTable(fit, coef=paste0("factor(coldata[[column]])",contrasts[[1]][2]),number=Inf,sort.by="P") + +res %>% sanitize_datatable() +``` + +## Graph top 5 pathways{.tabset} + +```{r graph pathways, results = 'asis'} +scores <- t(gsva.es) + +sig <- subset(res, res$adj.P.Val < 0.1) + +if(nrow(sig) >= 5){ + pathways <- rownames(sig)[1:5] +} else if(nrow(sig) == 0) { + pathways <- c() +} else { + pathways <- rownames(sig) +} + +if (length(pathways) > 0){ + to_graph = data.frame(scores[,pathways]) %>% rownames_to_column('sample') %>% + pivot_longer(!sample, names_to = 'pathway', values_to = 'enrichment_score') + to_graph <- left_join(to_graph, coldata) + + for (single_pathway in pathways) { + + cat('### ', single_pathway, '\n') + + to_graph_single_pathway <- to_graph %>% filter(pathway == single_pathway) + p <- ggplot(to_graph_single_pathway, aes(x = .data[[column]], y = enrichment_score)) + geom_boxplot() + + geom_point(alpha=0.5) + ggtitle(single_pathway) + print(p) + + cat('\n\n') + + } +} else { + cat('No pathways were detected as significantly enriched') +} + +``` + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` + diff --git a/inst/templates/rnaseq/DE/Intersections.Rmd b/inst/templates/rnaseq/DE/Intersections.Rmd new file mode 100644 index 0000000..fc1778b --- /dev/null +++ b/inst/templates/rnaseq/DE/Intersections.Rmd @@ -0,0 +1,261 @@ +--- +title: "Comparing DE Results - Multiple Contrasts" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + project_file: ../information.R +--- + +```{r, message=FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +library(tidyverse) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(rtracklayer) +library(tidyverse) +library(stringr) +library(ggpubr) +library(knitr) +library(bcbioR) +library(ggprism) +library(viridis) +library(pheatmap) +library(janitor) +library(ggvenn) +library(ggplot2) +library(UpSetR) +library(ggprism) +#library(org.Ce.eg.db) +library(org.Hs.eg.db) +#library(org.Mm.eg.db) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) +``` + + +```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} +## Adjusted P-value used for significance +padj_co <- 0.05 +## Log2FC used for significance. If no cutoff used put 0 +LFC <- 0.5 +## Normalized counts for ALL samples +# norm <- "/Users/emb016/Documents/comparisons_templates/norm_counts.csv" +# Load the count data, for this example it is the last columns of the DE table +norm_counts <- read.csv("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/norm_counts.csv.gz", + row.names = 1) + +# Load the meta data, here we are making one for the exmaple +metadata <- read_csv("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/meta.csv.gz") %>% as.data.frame() + +## Full results file (all genes) for contrast 1 +files=c("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group1.csv.gz", + "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group2.csv.gz", + "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group3.csv.gz") + +``` + +# Load Data + +We load our dataset + +```{r load_data} + +## Name of contrast. This will be displayed on the figures. +# you can manually indicate a list of names as comp_names=c("name1","name2"...) +comp_names = basename(files) %>% + str_remove_all("all_results_|.csv|.gz") %>% + str_replace_all("_", " ") +names(files)=comp_names +N=length(files) +stopifnot(length(files)==length(comp_names)) + +## Make sure you have set up N above +all_genes=lapply(names(files), function(name){ + data <- read_csv(files[name]) %>% + dplyr::filter(padj <= 1) +}) +sign_genes=lapply(names(files), function(name){ + data <- read_csv(files[name]) %>% + dplyr::filter(padj <= 1) + data %>% + dplyr::filter(padj < padj_co, abs(lfc) > LFC) +}) +``` + + +# Make list of comparisons + +```{r, fig.height=8, fig.width=8, warning=FALSE, error=FALSE, message=FALSE} +de=lapply(sign_genes, function(x){ + x$gene_id +}) +names(de) <- comp_names +``` + +## Make an upset plot + +Because we have done so many tests venn diagrams no longer work for our data. Instead we will use upset plots. *These plots are relatively intuitive for 2 or 3 categories, but can tend to get more complex for >3 categories. In all cases, you will find the categories being compared and their size listed below the bar plots on the left. As you look to the right (directly below each bar) there are dots with connecting lines that denote which categories the overlap is between, or if there is no overlap (just a dot). The numbers at the top of the bars denote the size of the overlap.* + +```{r, fig.height=8, fig.width=12} +upset(fromList(de), order.by = "freq", nsets=N) +``` + +## Pull intersect(s) of interest + +After identifying intersect(s) of interest we can determine which genes are found in which intersections + +```{r, warning=FALSE, error=FALSE, message=FALSE} +## Grab intersection +gene_names <- data.frame(gene=unique(unlist(de))) + +df1 <- lapply(de,function(x){ + data.frame(gene = x) +}) %>% + bind_rows(.id = "path") + +df_int <- lapply(gene_names$gene,function(x){ + # pull the name of the intersections + intersection <- df1 %>% + dplyr::filter(gene==x) %>% + arrange(path) %>% + pull("path") %>% + paste0(collapse = "|") + # build the dataframe + data.frame(gene = x,int = intersection) +}) %>% bind_rows() +``` + +```{r, eval=F} +## Run this code to find the name of your intersect of interest. You will use this in the next code chunk +table(df_int$int) +``` + +```{r, warning=FALSE, error=FALSE, message=FALSE} +## NOTE: subset interaction of interest replace the intersect name with the name of the intersect from above. You can copy and paste the below commands to grab multiple intersects. + +Intersect1 <- subset(df_int, df_int$int=="DMSO vs Group2|DMSO vs Group3") +``` + +## Get annotation data + +```{r, warning=FALSE, error=FALSE, message=FALSE} +# NOTE: edit this to be the correct organism. One set of annotations per intersect. +# rdata = AnnotationDbi::select(org.Hs.eg.db, Intersect1$gene, 'SYMBOL', 'ENSEMBL') %>% +# dplyr::select(gene_id = ENSEMBL, gene_name = SYMBOL) %>% distinct(gene_id, .keep_all = T) + +# NOTE: following code is only for test data, use the above with real data +rdata=data.frame(gene_id=row.names(norm_counts), gene_name=row.names(norm_counts)) +``` + +## Heatmap of intersect + +We generate a heatmap with all samples to see the patterns contained in this intersect. + +```{r, fig.height=6, warning=FALSE, error=FALSE, message=FALSE} +## NOTE: Assign factors of interest. These need to correspond to columns in your metadata. +factor1 <- "Treatment" +factor2 <- "Cell_line" + +# Extract significant genes +stopifnot(all(Intersect1$gene %in% row.names(norm_counts))) +sigGenes <- Intersect1$gene + +### Extract normalized expression for significant genes +norm_sig <- norm_counts[sigGenes,] +meta <- data.frame(metadata[,print(factor1)],metadata[,print(factor1)]) +colnames(meta) <- c(print(factor1),print(factor2)) +rownames(meta) <- colnames(norm_sig) +### Set a color palette +heat_colors <- lapply(colnames(norm_sig), function(c){ + l.col=colors[1:length(unique(norm_sig[[c]]))] + names(l.col)=unique(norm_sig[[c]]) + l.col +}) + +### Run pheatmap using the metadata data frame for the annotation (11 x 5) +pheatmap(norm_sig, + color = inferno(10), + cluster_rows = T, + show_rownames = F, + annotation = meta, + annotation_colors = heat_colors, + border_color = NA, + fontsize = 10, + scale = "row", + fontsize_row = 10, + height = 20) +``` + +## Graph genes in intersect + +```{r, warning=FALSE, error=FALSE, message=FALSE} +Intersect1_annot <- Intersect1 %>% left_join(rdata, by=c("gene"="gene_id")) +# REMOVE to plot all +Intersect1_annot <- Intersect1_annot[1:10,] + +graphs <- length(Intersect1_annot$gene) +to_test <- t(norm_counts) +rna = Intersect1_annot$gene +names = Intersect1_annot$gene_name + +to_graph = data.frame(to_test[,rna]) +to_graph = to_graph[Intersect1_annot$gene] +to_graph$Factor1 <- metadata[,factor1] +to_graph$Factor2 <- metadata[,factor2] + +#out <- vector("list", length = graphs) +for (i in seq(1,graphs)) { + to_graph$temp=to_graph[[i]] + print(ggplot(to_graph,aes(x=Factor1,y=temp,color=Factor2)) + + geom_boxplot() + geom_point(alpha=0.5, position = position_dodge(width = .75)) + + ylab(paste0(names[[i]])) + xlab(factor1) + scale_color_discrete(name = "Covariate")) +} +``` + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/params_de-example.R b/inst/templates/rnaseq/DE/params_de-example.R similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/DE/params_de-example.R rename to inst/templates/rnaseq/DE/params_de-example.R diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/params_de.R b/inst/templates/rnaseq/DE/params_de.R similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/DE/params_de.R rename to inst/templates/rnaseq/DE/params_de.R diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/run_markdown.R b/inst/templates/rnaseq/DE/run_markdown.R similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/DE/run_markdown.R rename to inst/templates/rnaseq/DE/run_markdown.R diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC.Rmd b/inst/templates/rnaseq/QC/QC-bcbio.Rmd similarity index 98% rename from inst/rmarkdown/templates/rnaseq/skeleton/QC/QC.Rmd rename to inst/templates/rnaseq/QC/QC-bcbio.Rmd index 48afe76..aca91c1 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC.Rmd +++ b/inst/templates/rnaseq/QC/QC-bcbio.Rmd @@ -18,13 +18,12 @@ editor_options: chunk_output_type: console params: params_file: params_qc.R - project_file: ../information.R --- ```{r source_params, echo = F} -source(params$params_file) -source(params$project_file) +metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv' +se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") ``` ```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC_nf-core.Rmd b/inst/templates/rnaseq/QC/QC_nf-core.Rmd similarity index 71% rename from inst/rmarkdown/templates/rnaseq/skeleton/QC/QC_nf-core.Rmd rename to inst/templates/rnaseq/QC/QC_nf-core.Rmd index dee4712..a60e4e4 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC_nf-core.Rmd +++ b/inst/templates/rnaseq/QC/QC_nf-core.Rmd @@ -17,23 +17,32 @@ output: editor_options: chunk_output_type: console params: - # params_file: params_qc_nf-core-example.R # example data # Fill this file with the right paths to nfcore output - params_file: params_qc_nf-core.R # Put hg38, mm10, mm39, or other + # params_file: params_qc_nf-core-example.R # example data + params_file: params_qc_nf-core-example.R genome: hg38 project_file: ../information.R + functions_file: ../libs/load_data.R factor_of_interest: sample_type --- -```{r} +Template developed with materials from https://hbctraining.github.io/main/. + +```{r, cache = FALSE, message = FALSE, warning=FALSE} # This set up the working directory to this file so all files can be found library(rstudioapi) setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) ``` +This code is in this ![](https://img.shields.io/badge/status-stable-green) revision. -```{r source_params, echo = F} +```{r source_params, cache = FALSE, message = FALSE, warning=FALSE} # 1. set up factor_of_interest parameter from parameter above or manually # this is used to color plots, it needs to be part of the metadata factor_of_interest=params$factor_of_interest @@ -43,6 +52,8 @@ source(params$params_file) # 3. If you set up this file, project information will be printed below and #. it can be reused for other Rmd files. source(params$project_file) +# 4. Load custom functions to load data from coldata/metrics/counts +source(params$functions_file) ``` # Overview @@ -80,50 +91,6 @@ opts_chunk[["set"]]( ``` -```{r subchunkify, echo=FALSE, eval=FALSE} -#' Create sub-chunks for plots -#' -#' taken from: https://stackoverflow.com/questions/15365829/dynamic-height-and-width-for-knitr-plots -#' -#' @param pl a plot object -#' @param fig.height figure height -#' @param fig.width figure width -#' @param chunk_name name of the chunk -#' -#' @author Andreas Scharmueller \email{andschar@@protonmail.com} -#' -subchunkify = function(pl, - fig.height = 7, - fig.width = 5, - chunk_name = 'plot') { - pl_deparsed = paste0(deparse(function() { - pl - }), collapse = '') - - sub_chunk = paste0( - "```{r ", - chunk_name, - ", fig.height=", - fig.height, - ", fig.width=", - fig.width, - ", dpi=72", - ", echo=FALSE, message=FALSE, warning=FALSE, fig.align='center'}", - "\n(", - pl_deparsed, - ")()", - "\n```" - ) - - cat(knitr::knit( - text = knitr::knit_expand(text = sub_chunk), - quiet = TRUE - )) -} - -``` - - ```{r sanitize-datatable} sanitize_datatable = function(df, ...) { # remove dashes which cause wrapping @@ -135,16 +102,29 @@ sanitize_datatable = function(df, ...) { # Samples and metadata +```{r load_data, message=F, warning=F} +# This code will load from bcbio or nf-core folder +# NOTE make sure to set numerator and denominator +coldata <- load_coldata(coldata_fn) +coldata$sample=row.names(coldata) + +counts <- load_counts(counts_fn) +counts <- counts[,colnames(counts) %in% coldata$sample] + +metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>% + left_join(coldata, by = c('sample')) %>% + as.data.frame() +# TODO: change order as needed +order <- unique(metrics[["sample"]]) +rownames(metrics) <- metrics$sample +# if the names don't match in order or string check files names and coldata information +counts = counts[,rownames(metrics)] +coldata = coldata[rownames(metrics),] +stopifnot(all(names(counts) == rownames(metrics))) +``` ```{r load_metadata} - -meta_df=read_csv(metadata_fn) %>% - arrange(.data[[factor_of_interest]]) %>% - distinct(sample, .keep_all = T) %>% - dplyr::select(!matches("fastq"), !matches("strandness")) -meta_df$sample <- make.names(meta_df$sample) -order <- meta_df$sample - +meta_df=coldata ggplot(meta_df, aes(.data[[factor_of_interest]], fill = .data[[factor_of_interest]])) + geom_bar() + ylab("") + xlab("") + ylab("# of samples") + @@ -152,112 +132,11 @@ ggplot(meta_df, aes(.data[[factor_of_interest]], ``` -```{r load_data} -# read counts from SE object -se <- readRDS(se_object) -raw_counts <- assays(se)[["counts"]] %>% round() %>% - as.matrix() -raw_counts=raw_counts[rowSums(raw_counts)!=0,] -``` - -```{r prepare_metrics} -# Get metrics from nf-core into bcbio like table -# many metrics are already in the General Table of MultiQC, this reads the file -metrics <- read_tsv(file.path(multiqc_data_dir, 'multiqc_general_stats.txt')) - -# we get some more metrics from Qualimap and rename columns -metrics_qualimap <- read_tsv(file.path(multiqc_data_dir, 'mqc_qualimap_genomic_origin_1.txt')) -metrics <- metrics %>% full_join(metrics_qualimap) -metrics <- metrics %>% - clean_names() %>% - dplyr::rename_with(~gsub('.*mqc_generalstats_', '', .)) - -# This uses the fastqc metrics to get total reads -total_reads <- metrics %>% - dplyr::filter(!is.na(fastqc_raw_total_sequences)) %>% - remove_empty(which = 'cols') %>% - dplyr::rename(single_sample = sample) %>% - mutate(sample = gsub('_[12]+$', '', single_sample)) %>% - group_by(sample) %>% - summarize(total_reads = sum(fastqc_raw_total_sequences)) - -# This renames to user-friendly names the metrics columns -metrics <- metrics %>% - dplyr::filter(is.na(fastqc_raw_total_sequences)) %>% - remove_empty(which = 'cols') %>% - full_join(total_reads) %>% - mutate(mapped_reads = samtools_reads_mapped) %>% - mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>% - mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>% - mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>% - mutate(x5_3_bias = qualimap_5_3_bias) - -# Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case -gtf <- NULL -if (genome =="other"){ - gtf <- gtf_fn -}else{ - if (genome == "hg38") { - gtf <- "hg38.rna.gtf.gz" - } else if (genome == "mm10") { - gtf <- "mm10.rna.gtf.gz" - } else if (genome == "mm39") { - gtf <- "mm39.rna.gtf.gz" - } - gtf <- system.file("extdata", "annotation", - gtf, - package="bcbioR") -} -if (is.null(gtf)) { - print("No genome provided! Please add it at the top of this Rmd") -} - -gtf=rtracklayer::import(gtf) - -one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE) -another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE) -biotype=NULL -if(length(one)==1){ - biotype=one -}else if(length(another)==1){ - biotype=another -}else{ - warning("No gene biotype founded") -} - -if (!is.null(biotype)){ - annotation=as.data.frame(gtf) %>% .[,c("gene_id", biotype)] - rRNA=grepl("rRNA|tRNA",annotation[[biotype]]) - genes=intersect(annotation[rRNA,"gene_id"],row.names(raw_counts)) - ratio=data.frame(sample=colnames(raw_counts), - r_and_t_rna_rate=colSums(raw_counts[genes,])/colSums(raw_counts)) - metrics = left_join(metrics, ratio, by="sample") -}else{ - metrics[["r_and_t_rna_rate"]] <- NA -} - -# if ("custom_content_biotype_counts_percent_r_rna" %in% colnames(metrics)){ -# metrics <- mutate(metrics, r_rna_rate = custom_content_biotype_counts_percent_r_rna) -# }else{ -# metrics[["r_rna_rate"]] <- NA -# } -metrics=metrics[,c("sample","mapped_reads","exonic_rate","intronic_rate", - "total_reads", - "x5_3_bias", "r_and_t_rna_rate","intergenic_rate")] -metrics$sample <- make.names(metrics$sample) -metrics <- metrics %>% - full_join(meta_df , by = c("sample" = "sample")) %>% - dplyr::select(where(~!all(is.na(.)))) - -``` - ```{r show_metadata} meta_sm <- meta_df %>% - as.data.frame() %>% - column_to_rownames("sample") + as.data.frame() meta_sm %>% sanitize_datatable() - ``` # Read metrics {.tabset} @@ -321,7 +200,7 @@ metrics %>% The number of genes represented in every sample is expected to be consistent and over 20K (grey line). ```{r calc_genes_detected} -genes_detected <- colSums(assays(se)[["counts"]] > 0) %>% enframe() +genes_detected <- colSums(counts > 0) %>% enframe() sample_names <- metrics[,c("sample"), drop=F] genes_detected <- left_join(genes_detected, sample_names, by = c("name" = "sample")) genes_detected <- genes_detected %>% group_by(name) @@ -359,8 +238,6 @@ metrics %>% ``` - - ## Gene detection saturation This plot shows how complex the samples are. We expect samples with more reads to detect more genes. @@ -484,15 +361,15 @@ We expect consistency in the box plots here between the samples, i.e. the distri metrics_small <- metrics %>% dplyr::select(sample, .data[[factor_of_interest]]) metrics_small <- left_join(sample_names, metrics_small) -counts <- - assays(se)[["counts"]] %>% +counts_lng <- + counts %>% as_tibble() %>% filter(rowSums(.)!=0) %>% gather(name, counts) -counts <- left_join(counts, metrics_small, by = c("name" = "sample")) +counts_lng <- left_join(counts_lng, metrics_small, by = c("name" = "sample")) -ggplot(counts, aes(factor(name, level = order), +ggplot(counts_lng, aes(factor(name, level = order), log2(counts+1), fill = .data[[factor_of_interest]])) + geom_boxplot() + @@ -515,11 +392,11 @@ Principal Component Analysis (PCA) is a statistical technique used to simplify h ```{r PCA1:5 summary, all, unlabeled, fig.width= 7, fig.height = 5} -vst <- vst(raw_counts) +vst <- vst(counts) coldat_for_pca <- as.data.frame(metrics) rownames(coldat_for_pca) <- coldat_for_pca$sample -coldat_for_pca <- coldat_for_pca[colnames(raw_counts),] +coldat_for_pca <- coldat_for_pca[colnames(counts),] pca1 <- degPCA(vst, coldat_for_pca, condition = factor_of_interest, data = T)[["plot"]] pca2 <- degPCA(vst, coldat_for_pca, @@ -552,10 +429,9 @@ cluster by plotting the correlation between the expression profiles of the samples. ```{r clustering fig, fig.width = 10, fig.asp = .62} - vst_cor <- cor(vst) -colma=meta_df %>% as.data.frame() +colma=coldata %>% as.data.frame() rownames(colma) <- colma$sample colma <- colma[rownames(vst_cor), ] colma <- colma %>% dplyr::select(.data[[factor_of_interest]]) diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc.R b/inst/templates/rnaseq/QC/params_qc.R similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc.R rename to inst/templates/rnaseq/QC/params_qc.R diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core-example.R b/inst/templates/rnaseq/QC/params_qc_nf-core-example.R similarity index 66% rename from inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core-example.R rename to inst/templates/rnaseq/QC/params_qc_nf-core-example.R index dae62ce..b364967 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core-example.R +++ b/inst/templates/rnaseq/QC/params_qc_nf-core-example.R @@ -1,9 +1,12 @@ # info params -# Example data: COMMENT THESE LINE IF YOU ARE USING YOUR DATA -metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv' + +# Example data +coldata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv' +counts_fn=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.tsv') se_object=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.rds') # This folder is in the output directory inside multiqc folder multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/' # This file is inside the genome folder in the output directory gtf_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/devel/nf-core/genome/genome.filtered.gtf.gz' +se_object = NA diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R b/inst/templates/rnaseq/QC/params_qc_nf-core.R similarity index 78% rename from inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R rename to inst/templates/rnaseq/QC/params_qc_nf-core.R index 08b3ec0..897b6b0 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R +++ b/inst/templates/rnaseq/QC/params_qc_nf-core.R @@ -2,10 +2,11 @@ # Your data # This is the file used to run nf-core or compatible to that -metadata_fn='/Path/to/metadata/meta.csv' +coldata_fn='/Path/to/metadata/meta.csv' # This file is inside star_salmon/ folder -se_object='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.rds' +counts_fn='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.tsv' # This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder multiqc_data_dir='/path/to/nf-core/output/multiqc/star_salmon/multiqc_report_data' # This file is inside the genome folder in the output directory, use this only for non-model organism # gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf' +se_object = NA diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/run_markdown.R b/inst/templates/rnaseq/QC/run_markdown.R similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/QC/run_markdown.R rename to inst/templates/rnaseq/QC/run_markdown.R diff --git a/inst/templates/rnaseq/README.md b/inst/templates/rnaseq/README.md new file mode 100644 index 0000000..9714054 --- /dev/null +++ b/inst/templates/rnaseq/README.md @@ -0,0 +1,40 @@ +# Guideline for RNAseq downstream analysis + +Make sure there is a valid project name, and modify `information.R` with the right information for your project. You can use this file with any other Rmd to include the project/analysis information. + +## Run data with nf-core rnaseq + +This templates assume data has been processed by [nf-core/rnaseq](https://nf-co.re/rnaseq/3.14.0/docs/usage). +We recommend to use the samplesheet.csv used with nf-core as metadata file, where other relevant columns can be there even if they are not used by the pipeline. + +## Downstream analysis + +Modify `information.R` with the right information. You can use this file with any other Rmd to include the project/analysis information. + +### QC + +`QC/QC.Rmd` is a template for QC metrics. Use `params_qc.R` for `bcbio` + or `QC/QC_nf-core.Rmd` `params_qc_nf-core.R` for `nf-core/rnaseq` outputs. + +Read instruction in the R and Rmd scripts to render it. + +### DE + +`DE/DEG.Rmd` is a template for comparison between two groups. `params_de.R` has the information for the input files to load. You can point to `bcbio` or `nf-core/rnaseq` output files. + +On the `YAML` header file of the `Rmd` you can specify some parameters or just set them up in the first chunk of code of the template. This template has examples of: + +- sub-setting data +- two groups comparison +- volcano plot +- MA plot +- Pathway analysis +- Tables + +### Other templates + +- `DE/GSVA.Rmd` shows an example on how to use [GSVA package](https://bioconductor.org/packages/release/bioc/html/GSVA.html) for estimating variation of gene set enrichment through the samples of a expression data set +- `DE/Cross-comparison-analysis.Rmd` shows an exmaple on how to compare two differential expression analysis from the `DEG.Rmd` template. +- `DE/Intersections.Rmd` shows an example on how to compare multiple differential expression analyses from `DE/DEG.Rmd` and find intersections. + + diff --git a/inst/templates/rnaseq/information.R b/inst/templates/rnaseq/information.R new file mode 100644 index 0000000..6e15eef --- /dev/null +++ b/inst/templates/rnaseq/information.R @@ -0,0 +1,6 @@ +# info params +project = "name_hbcXXXXX" +PI = 'person name' +experiment = 'short description' +aim = 'short description' +analyst = 'person in the core' diff --git a/inst/templates/rnaseq/libs/FA.R b/inst/templates/rnaseq/libs/FA.R new file mode 100644 index 0000000..7d09d0e --- /dev/null +++ b/inst/templates/rnaseq/libs/FA.R @@ -0,0 +1,151 @@ +library(msigdbr) +library(clusterProfiler) +source <- "https://github.com/bcbio/resources/raw/refs/heads/main/gene_sets/gene_sets/20240904" +get_databases_v2=function(sps="human"){ + gmt.files=list(human=c("h.all.v2024.1.Hs.entrez.gmt", + "c5.go.v2024.1.Hs.entrez.gmt", + "c5.go.mf.v2024.1.Hs.entrez.gmt", + "c5.go.cc.v2024.1.Hs.entrez.gmt", + "c5.go.bp.v2024.1.Hs.entrez.gmt", + "c2.cp.reactome.v2024.1.Hs.entrez.gmt", + "c2.cp.kegg_legacy.v2024.1.Hs.entrez.gmt"), + mouse=c("mh.all.v2024.1.Mm.entrez.gmt", + "m5.go.v2024.1.Mm.entrez.gmt", + "m5.go.mf.v2024.1.Mm.entrez.gmt", + "m5.go.cc.v2024.1.Mm.entrez.gmt", + "m5.go.bp.v2024.1.Mm.entrez.gmt", + "m2.cp.reactome.v2024.1.Mm.entrez.gmt", + "m2.cp.kegg_legacy.v2024.1.Mm.entrez.gmt")) + all_in_life=lapply(gmt.files[[sps]], function(gmt){ + df=read.gmt(file.path(source,sps,gmt)) + names(df)=c("gs_name", "entrez_gene") + df + }) + names(all_in_life) = str_remove(gmt.files[[sps]], ".v2024.*$") + all_in_life +} + +get_databases=function(sps="human"){ + all_in_life=list( + msigdbr(species = sps, category = "H") %>% mutate(gs_subcat="Hallmark"), + # msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"), + msigdbr(species = sps, category = "C2", subcategory = "CP:KEGG"), + # msigdbr(species = "human", category = "C2", subcategory = "CP:PID"), + msigdbr(species = sps, category = "C5", subcategory = "GO:BP"), + msigdbr(species = sps, category = "C5", subcategory = "GO:MF") + # msigdbr(species = "human", category = "C5", subcategory = "HPO"), + # msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"), + # msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic") + ) + all_in_life +} + +run_fora_v2=function(input, uni, all_in_life){ + total_deg=length(unique(input$ENTREZID))/length(unique(uni$ENTREZID)) + pathways_ora_all = lapply(names(all_in_life), function(database){ + p = all_in_life[[database]] + #browser() + pathway = split(x = p$entrez_gene, f = p$gs_name) + db_name = database + respath <- fora(pathways = pathway, + genes = unique(input$ENTREZID), + universe = unique(uni$ENTREZID), + minSize = 15, + maxSize = 500) + respath %>% + mutate(database=db_name, NES=(overlap/size)/(total_deg)) + }) %>% bind_rows() %>% + mutate(analysis="ORA") + ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>% + group_by(pathway) %>% + left_join(uni, by =c("overlapGenes"="ENTREZID")) %>% + dplyr::select(pathway, padj, NES, SYMBOL, analysis, + database) %>% + group_by(pathway,padj,NES,database,analysis) %>% + summarise(genes=paste(SYMBOL,collapse = ",")) + ora_tb + +} + +run_fgsea_v2=function(input, all_in_life){ + # browser() + input_gsea <- input$lfc + names(input_gsea) <- input$ENTREZID + pathways_all = lapply(names(all_in_life), function(database){ + p = all_in_life[[database]] + pathway = split(x = p$entrez_gene, f = p$gs_name) + db_name = database + respath <- fgsea(pathways = pathway, + stats = input_gsea, + minSize = 15, + maxSize = 500) + + as_tibble(respath) %>% + mutate(database=db_name) + }) %>% bind_rows() %>% + mutate(analysis="GSEA") + tb = pathways_all %>% unnest(leadingEdge) %>% + group_by(pathway) %>% + left_join(input, by =c("leadingEdge"="ENTREZID")) %>% + dplyr::select(pathway, padj, size, NES, SYMBOL, analysis, + database) %>% + group_by(pathway, padj, size, NES, database, analysis) %>% + summarise(genes=paste(SYMBOL,collapse = ",")) + tb + +} + +run_fora=function(input, uni,all_in_life){ + # browser() + total_deg=length(unique(input))/length(unique(uni$ENTREZID)) + pathways_ora_all = lapply(all_in_life, function(p){ + pathway = split(x = p$entrez_gene, f = p$gs_name) + db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":") + respath <- fora(pathways = pathway, + genes = unique(input$ENTREZID), + universe = unique(uni$ENTREZID), + minSize = 15, + maxSize = 500) + # coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1], + # pathway, unique(input$ENTREZID), unique(uni$ENTREZID)) + as_tibble(respath) %>% + mutate(database=db_name, NES=(overlap/size)/(total_deg)) + }) %>% bind_rows() %>% + mutate(analysis="ORA") + ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>% + group_by(pathway) %>% + left_join(uni, by =c("overlapGenes"="ENTREZID")) %>% + dplyr::select(pathway, padj, NES, SYMBOL, analysis, + database) %>% + group_by(pathway,padj,NES,database,analysis) %>% + summarise(genes=paste(SYMBOL,collapse = ",")) + ora_tb + +} + +run_fgsea=function(input, all_in_life){ + # browser() + input_gsea <- input$lfc + names(input_gsea) <- input$ENTREZID + pathways_all = lapply(all_in_life, function(p){ + pathway = split(x = p$entrez_gene, f = p$gs_name) + db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":") + respath <- fgsea(pathways = pathway, + stats = input_gsea, + minSize = 15, + maxSize = 500) + + as_tibble(respath) %>% + mutate(database=db_name) + }) %>% bind_rows() %>% + mutate(analysis="GSEA") + tb = pathways_all %>% unnest(leadingEdge) %>% + group_by(pathway) %>% + left_join(input, by =c("leadingEdge"="ENTREZID")) %>% + dplyr::select(pathway, padj, size, NES, SYMBOL, analysis, + database) %>% + group_by(pathway, padj, size, NES, database, analysis) %>% + summarise(genes=paste(SYMBOL,collapse = ",")) + tb + +} diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/load_data.R b/inst/templates/rnaseq/libs/load_data.R similarity index 91% rename from inst/rmarkdown/templates/rnaseq/skeleton/DE/load_data.R rename to inst/templates/rnaseq/libs/load_data.R index 8a1d297..aaa32af 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/DE/load_data.R +++ b/inst/templates/rnaseq/libs/load_data.R @@ -101,15 +101,17 @@ load_metrics <- function(se_object, multiqc_data_dir, gtf_fn, counts){ return(metrics) } -load_coldata <- function(coldata_fn, column, numerator, denominator, subset_column = NULL, subset_value = NULL){ +load_coldata <- function(coldata_fn, column=NULL, subset_column = NULL, subset_value = NULL){ coldata=read.csv(coldata_fn) %>% - dplyr::select(!matches("fastq") & !matches("strandness")) %>% + dplyr::distinct(sample, .keep_all = T) %>% + dplyr::select(!matches("fastq"), !matches("strandness")) %>% distinct() if('description' %in% names(coldata)){ coldata$sample <- tolower(coldata$description) } coldata <- coldata %>% distinct(sample, .keep_all = T) - stopifnot(column %in% names(coldata)) + if (!is.null(column)) + stopifnot(column %in% names(coldata)) # use only some samples, by default use all if (!is.null(subset_column)){ @@ -121,7 +123,8 @@ load_coldata <- function(coldata_fn, column, numerator, denominator, subset_colu rownames(coldata) <- coldata$sample coldata$description <- coldata$sample - coldata[[column]] = relevel(as.factor(coldata[[column]]), denominator) + # if (!is.null(denominator)) + # coldata[[column]] = relevel(as.factor(coldata[[column]]), denominator) return(coldata) } @@ -138,8 +141,8 @@ load_counts <- function(counts_fn){ } else { # nf-core input counts <- read_tsv(counts_fn) %>% dplyr::select(-gene_name) %>% mutate(gene_id = str_replace(gene_id, pattern = "\\.[0-9]+$", "")) %>% - column_to_rownames('gene_id') %>% round - + column_to_rownames('gene_id') %>% round %>% as.matrix() + counts=counts[rowSums(counts)!=0,] return(counts) } diff --git a/inst/templates/rnaseq/org/hcbc/DataManagement-Checklist-BulkRNA.docx b/inst/templates/rnaseq/org/hcbc/DataManagement-Checklist-BulkRNA.docx new file mode 100644 index 0000000..38f76bc Binary files /dev/null and b/inst/templates/rnaseq/org/hcbc/DataManagement-Checklist-BulkRNA.docx differ diff --git a/inst/templates/rnaseq/org/hcbc/DataManagement-Checklist-BulkRNA.pdf b/inst/templates/rnaseq/org/hcbc/DataManagement-Checklist-BulkRNA.pdf new file mode 100644 index 0000000..2aae411 Binary files /dev/null and b/inst/templates/rnaseq/org/hcbc/DataManagement-Checklist-BulkRNA.pdf differ diff --git a/inst/templates/rnaseq/org/hcbc/hcbc_README.md b/inst/templates/rnaseq/org/hcbc/hcbc_README.md new file mode 100644 index 0000000..d21e19e --- /dev/null +++ b/inst/templates/rnaseq/org/hcbc/hcbc_README.md @@ -0,0 +1,3 @@ +# Guideline for RNAseq downstream analysis + +Please follow the DataManagement-Checklist-BulkRNA.pdf diff --git a/inst/templates/singlecell/Integration/norm_integration.rmd b/inst/templates/singlecell/Integration/norm_integration.rmd new file mode 100644 index 0000000..68ab9d4 --- /dev/null +++ b/inst/templates/singlecell/Integration/norm_integration.rmd @@ -0,0 +1,654 @@ +--- +title: "scRNA normalization and clustering" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: inline +params: + project_file: ../information.R +--- + +Template developed with materials from https://hbctraining.github.io/main/. + +```{r, message=FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +library(tidyverse) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) +stopifnot(compareVersion(as.character(packageVersion("Seurat")), "5.0.0")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. + +```{r} +# parameters +## Cell cycle markers for c.elegans, human, mouse, D. rerio, and D. melanogaster can be found here: https://github.com/hbc/tinyatlas/tree/1e2136a35e773f14d97ae9cbdb6c375327b2dd2b/cell_cycle +## This files needs gene_name and phase columns to work with this template + +cell_cycle_file="https://github.com/bcbio/resources/raw/refs/heads/main/singlecell/human_cell_cycle.csv" +seurat_obj="https://github.com/bcbio/bcbioR-test-data/raw/refs/heads/main/singlecell/tiny.rds" +seurat_output="/tmp/seurat_clust.rds" + +source(params$project_file) +``` + +```{r setup, cache=FALSE, message=FALSE, warning=FALSE, echo=FALSE} +knitr::opts_chunk$set(echo = TRUE) +# Load libraries +library(Seurat) +library(harmony) +library(knitr) +library(rmarkdown) +library(data.table) +library(DT) +library(patchwork) +library(clustree) +library(ggprism) +library(grafify) +library(R.utils) +#library(future) + +ggplot2::theme_set(theme_prism(base_size = 12)) +# https://grafify-vignettes.netlify.app/colour_palettes.html +# NOTE change colors here if you wish +scale_colour_discrete <- function(...) + scale_colour_manual(..., values = as.vector(grafify:::graf_palettes[["kelly"]])) + +#options(ggplot2.discrete.colour= ) + +# Set seed for reproducibility +set.seed(1454944673L) +opts_chunk[["set"]]( + audodep = TRUE, + cache = FALSE, + cache.lazy = FALSE, + error = TRUE, + echo = TRUE, + fig.height = 5L, + fig.retina = 2L, + fig.width = 9.6, + message = FALSE, + tidy = TRUE, + warning = TRUE +) +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` +- Aim: `r aim` + +## Dataset + +The Seurat object used as input for this report was prepared with the thresholds detailed below applied. + +-nGenes > `nFeature_RNA_cutoff` +-nUMI > `nCount_RNA_cutoff` +-complexity > `Log10GenesPerUMI_cutoff` +-percent mitochondrial reads < `mitoRatio_cutoff` + + +```{r load_data, cache = TRUE} +# Source cell cycle markers +cc_markers = read_csv(cell_cycle_file) +stopifnot(c("gene_name", "phase") %in% colnames(cc_markers)) + +# Loading QC'd object +if (isUrl(seurat_obj)){ + seurat_qc <- readRDS(url(seurat_obj)) +}else{ + seurat_qc <- readRDS(seurat_obj) +} + +DefaultAssay(seurat_qc) <- "RNA" + +# Define color scales for up to 24 clusters/samples +colsD <- RColorBrewer::brewer.pal(8, "Dark2") +colsM <- RColorBrewer::brewer.pal(8, "Set2") +colsL <- RColorBrewer::brewer.pal(8, "Pastel2") +# Stack same colors from dark to pastel +cols3 <- unlist(strsplit(paste(colsD, colsM, colsL, sep = "_"), "_")) +cols2 <- c(unlist(strsplit(paste(colsD, colsM, sep = "_"), "_")), "deepskyblue2") +``` + + +After filtering, each sample contributed the following number of cells to the analysis: + +```{r meta pre doub} +table(seurat_qc$orig.ident) +``` + + +# Sources of variability Log normalization {.tabset} + +In this section, we look at potential confounding variables in our (post-QC) dataset, to determine whether their effect needs to be accounted for before normalizing and integrating the data. + +To enable meaningful visualization of the data, we apply a minimal normalization to our raw data (log-normalization). We then identify the top 2000 most variable genes across the log-normalized data, i.e. those with the greatest variability in expression level from one cell to the next. Finally, we calculate principal components (PCs) based on these top 2000 most variable genes, and use the first 50 PCs to derive reduced UMAP (Uniform Manifold Approximation and Projection) components. + +**We start with log normalization because it is good to observe the data and any trends using a simple transformation. More complex methods like SCT can alter the data in a way that is not as intuitive to interpret.** + +```{r rna_norm0, warning=FALSE, message=FALSE} +#Normalize data +seurat_lognorm <- NormalizeData(seurat_qc, + normalization.method = "LogNormalize", + scale.factor = 10000) + +# Find variable genes (largest dispersion in expression across cells) +seurat_lognorm <- FindVariableFeatures(seurat_lognorm, nfeatures = 2000) + +# Scale and center data +seurat_lognorm <- ScaleData(seurat_lognorm, model.use = "linear") + +# Calculate PCs and UMAP +seurat_lognorm <- RunPCA(seurat_lognorm) +seurat_lognorm <- RunUMAP(seurat_lognorm, 1:40) +``` + +## Examine highly variable genes + +Highly variable gene selection is extremely important since many downstream steps are computed only on these genes. Seurat allows us to access the ranked highly variable genes with the VariableFeatures() function. We can additionally visualize the dispersion of all genes using Seurat’s VariableFeaturePlot(), which shows a gene’s average expression across all cells on the x-axis and variance on the y-axis. Ideally we want to use genes that have high variance since this can indicate a change in expression depending on populations of cells. Adding labels using the LabelPoints() helps us understand which genes will be driving shape of our data. + +```{r} +# Identify the 15 most highly variable genes +ranked_variable_genes <- VariableFeatures(seurat_lognorm) +top_genes <- ranked_variable_genes[1:15] + +# Plot the average expression and variance of these genes +# With labels to indicate which genes are in the top 15 +p <- VariableFeaturePlot(seurat_lognorm) +LabelPoints(plot = p, points = top_genes, repel = TRUE) +``` + +## Sample x covariates + +We then use the UMAP reduction to explore our dataset and assess how different variables influence cell clustering. Throughout this report, **UMAP representations are split by various covariates**, to enable checking for potential phenotype-specific clustering. + +```{r} +## Below is an example plot, change the group.by and split.by parameters to make plots with your own covariates. + +UMAPPlot(seurat_lognorm, group.by = "orig.ident") + ggtitle("UMAP") +``` + +## Cell cycle + +The phase of the cell cycle that cells are in at the time of sample preparation can introduce some variability in the transcriptome that we are not interested in exploring. + +To examine cell cycle variation in our data, we assign a score to each cell, derived from the overall expression level of known markers of the G2/M and S phase in that cell. We then display the cells, color-coded by inferred cell cycle phase, on our UMAP. + +Unless cells very strongly cluster by phase of the cell cycle (which is not the case here), we do not recommend to regress out the effect of the cell cycle. + +```{r cell_cycle_scoring, message=FALSE, warning=FALSE} +# Step 1 - Get cell cycle markers + +# Compute cell cycle score for each cell +## NOTE use the right column names for cc_markers if they are different than +# external_gene_name and phase +seurat_lognorm <- CellCycleScoring(seurat_lognorm, + g2m.features = cc_markers$gene_name[cc_markers$phase=="G2/M"], + s.features = cc_markers$gene_name[cc_markers$phase=="S"]) + +## Plot cell cycle (grouped by) along with covariates (split.by). Add in your covariates of interest + +UMAPPlot(seurat_lognorm, group.by = "Phase") + ggtitle("UMAP") +``` + +## mitoRatio + +The mitochondrial to nuclear gene ratio (mitoRatio) is a marker of cellular stress and might also affect cell clustering. For this dataset, we have seen during QC that the fraction of mitochondrial genes was negligible (which is good). Therefore, we do not expect the need to regress out this variable for normalization purposes, but it's always good to check. + +```{r mito_ratio} +## This custom function by Amelie Jule creates great plots for looking at different QC parameters across the UMAP + +signaturePlot <- function(seurat_object, + gene_signature, + reduction = "umap", + split_var = NULL, + pt_size = 0.5) { + + g1 <- FeaturePlot(seurat_object, + features = gene_signature, + reduction = reduction, + split.by = split_var, + order = TRUE, + pt.size = pt_size, + combine = FALSE) + + min_val <- min(pull(seurat_object@meta.data, gene_signature)) + max_val <- max(pull(seurat_object@meta.data, gene_signature)) + fix_params <- scale_color_gradientn(colours = c("grey80", "blue"), + limits = c(min_val, max_val)) + + g2 <- lapply(g1, function (x) { x + fix_params + + theme_minimal() + # theme_void() + theme(legend.position = "bottom", + plot.title = element_text(hjust = 0.5)) + + ggtitle("") }) + + g2 + +} + +### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] + +g <- signaturePlot(seurat_lognorm, + gene_signature = "mitoRatio", + split_var = "orig.ident") +g[[1]] + ggtitle("S1") | g[[2]]+ ggtitle("S2") +``` + +## nUMIs (nCount) + +```{r} +### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] + +g <- signaturePlot(seurat_lognorm, + gene_signature = "nCount_RNA", + split_var = "subj") +g[[1]] + ggtitle("S1") | g[[2]]+ ggtitle("S2") +``` + +## nGenes (nFeature) + +```{r} +### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] + +g <- signaturePlot(seurat_lognorm, + gene_signature = "nFeature_RNA", + split_var = "orig.ident") +g[[1]] + ggtitle("S1") | g[[2]]+ ggtitle("S2") +``` + +## Complexity + +```{r} +### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] + +g <- signaturePlot(seurat_lognorm, + gene_signature = "Log10GenesPerUMI", + split_var = "orig.ident") +g[[1]] + ggtitle("S1") | g[[2]]+ ggtitle("S2") +``` + +# SCT Normalization {.tabset} + +Now that we have established which effects are observed in our data, we can use the SCTransform method to regress out these effects. The SCTransform method was proposed as a better alternative to the log transform normalization method that we used for exploring sources of unwanted variation. The method not only normalizes data, but it also performs a variance stabilization and allows for additional covariates to be regressed out. + +All genes cannot be treated the same, as such, the SCTransform method constructs a generalized linear model (GLM) for each gene with UMI counts as the response and sequencing depth as the explanatory variable. Information is pooled across genes with similar abundances, to regularize parameter estimates and obtain residuals which represent effectively normalized data values which are no longer correlated with sequencing depth. + +We searched for the top 3000 genes with the largest variability in expression level from cell to cell after SCT-normalization, and re-calculated our principal and UMAP components based on the SCT-normalized data for these top genes. + +**We keep each sample separate for SCT normalization.** + +```{r, eval=file.exists("seurat_sctnorm.rds")} +#NOTE run the chunck below to create this object, and loading will be used while +# knitting to speed up the rendering +seurat_sctnorm = readRDS("seurat_sctnorm.rds") +``` + +```{r eval=!exists("seurat_sctnorm"), warning=FALSE, message=FALSE} +#NOTE: this should be ran previous rendering to prepare the object +## Note that this single command replaces NormalizeData(), ScaleData(), and FindVariableFeatures() + +## SCT can be run with and without regressing out variables. Generally we do not regress out covariates. However, we provide both options below. +seurat_lognorm[["RNA"]] <- split(seurat_lognorm[["RNA"]], + f = seurat_lognorm$orig.ident) + +seurat_sctnorm <- SCTransform(seurat_lognorm, + vst.flavor = "v2", + # vars.to.regress = c("mitoRatio") + variable.features.n = 3000) +saveRDS(seurat_sctnorm, file="seurat_sctnorm.rds") +``` + +### Look at UMAPs post SCT + +The plots below show the same variables as before, this time **displayed on the UMAP calculated after applying SCT-normalization**. + +We qualitatively reviewed the "structure" in our normalized data projection . We were particularly interested in seeing whether similar cell populations across samples clustered together (i.e. overlapped on the UMAP). + +```{r, fig.width=10} +DefaultAssay(seurat_sctnorm) <- "SCT" +seurat_sctnorm <- RunPCA(seurat_sctnorm) +UMAPPlot(seurat_sctnorm, group.by = "orig.ident") + ggtitle("UMAP") +``` + +## Cell cycle + +The phase of the cell cycle that cells are in at the time of sample preparation can introduce some variability in the transcriptome that we are not interested in exploring. + +To examine cell cycle variation in our data, we assign a score to each cell, derived from the overall expression level of known markers of the G2/M and S phase in that cell. We then display the cells, color-coded by inferred cell cycle phase, on our UMAP. + +Unless cells very strongly cluster by phase of the cell cycle (which is not the case here), we do not recommend to regress out the effect of the cell cycle. + +```{r cell_cycle_scoring2, message=FALSE, warning=FALSE} +# Step 1 - Get cell cycle markers +## Cell cycle markers for c.elegans, human, mouse, D. rerio, and D. melanogaster can be found here: https://github.com/hbc/tinyatlas/tree/1e2136a35e773f14d97ae9cbdb6c375327b2dd2b/cell_cycle +# Compute cell cycle score for each cell +seurat_sctnorm <- CellCycleScoring(seurat_sctnorm, + g2m.features = cc_markers$gene_name[cc_markers$phase=="G2/M"], + s.features = cc_markers$gene_name[cc_markers$phase=="S"]) + +## Plot cell cycle (grouped by) along with covariates (split.by). Add in your covariates of interest +UMAPPlot(seurat_sctnorm, group.by = "Phase", split.by = "orig.ident") + ggtitle("UMAP") +``` + +## mitoRatio + +The mitochondrial to nuclear gene ratio (mitoRatio) is a marker of cellular stress and might also affect cell clustering. For this dataset, we have seen during QC that the fraction of mitochondrial genes was negligible (which is good). Therefore, we do not expect the need to regress out this variable for normalization purposes, but it's always good to check. + +```{r mito_ratio2} +## This custom function by Amelie Jule creates great plots for looking at different QC parameters across the UMAP + +signaturePlot <- function(seurat_object, + gene_signature, + reduction = "umap", + split_var = NULL, + pt_size = 0.5) { + + g1 <- FeaturePlot(seurat_object, + features = gene_signature, + reduction = reduction, + split.by = split_var, + order = TRUE, + pt.size = pt_size, + combine = FALSE) + + min_val <- min(pull(seurat_object@meta.data, gene_signature)) + max_val <- max(pull(seurat_object@meta.data, gene_signature)) + fix_params <- scale_color_gradientn(colours = c("grey80", "blue"), + limits = c(min_val, max_val)) + + g2 <- lapply(g1, function (x) { x + fix_params + + theme_minimal() + # theme_void() + theme(legend.position = "bottom", + plot.title = element_text(hjust = 0.5)) + + ggtitle("") }) + + g2 + +} + +### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] + +g <- signaturePlot(seurat_sctnorm, + gene_signature = "mitoRatio", + split_var = "orig.ident") +g[[1]] + ggtitle("S1") | g[[2]] + ggtitle("S2") +``` + +## nUMIs (nCount) + +```{r} +### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] + +g <- signaturePlot(seurat_sctnorm, + gene_signature = "nCount_RNA", + split_var = "orig.ident") +g[[1]] + ggtitle("S1") | g[[2]] + ggtitle("S2") +``` + +## nGenes (nFeature) + +```{r} +### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] + +g <- signaturePlot(seurat_sctnorm, + gene_signature = "nFeature_RNA", + split_var = "orig.ident") +g[[1]] + ggtitle("S1") | g[[2]] + ggtitle("S2") +``` + +## Complexity + +```{r} + +### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] + +g <- signaturePlot(seurat_sctnorm, + gene_signature = "Log10GenesPerUMI", + split_var = "orig.ident") +g[[1]] + ggtitle("S1") | g[[2]] + ggtitle("S2") +``` + + +# Integration + +## CCA integration + +```{r, eval=file.exists("seurat_cca.rds")} +#NOTE run the chunck below to create this object, and loading will be used while +# knitting to speed up the rendering +seurat_cca= readRDS("seurat_cca.rds") +``` + +```{r rna_cca, warning=FALSE, message=FALSE, eval=!exists("seurat_cca")} +#NOTE: this should be ran previous rendering to prepare the object +## Note that this single command replaces NormalizeData(), ScaleData(), and FindVariableFeatures() + +## SCT can be run with and without regressing out variables. Generally we do not regress out covariates. However, we provide both options below. + +## To properly integrate with harmony we split our object by sample first. +split_sctnorm <- SplitObject(seurat_lognorm, split.by = "orig.ident") +# NOTE If we have a large dataset, then we might need to adjust the limit for allowable object sizes within R +# options(future.globals.maxSize = 4000 * 1024^2) +for (i in 1:length(split_sctnorm)) { + split_sctnorm[[i]] <- SCTransform(split_sctnorm[[i]], + vst.flavor = "v2", + # vars.to.regress = c("mitoRatio") + variable.features.n = 3000) +} + +integ_features <- SelectIntegrationFeatures(object.list = split_sctnorm, + nfeatures = 3000) +split_sctnorm <- PrepSCTIntegration(object.list = split_sctnorm, + anchor.features = integ_features) + +# Find best buddies - can take a while to run +integ_anchors <- FindIntegrationAnchors(object.list = split_sctnorm, + normalization.method = "SCT", + anchor.features = integ_features) +# Integrate across conditions +seurat_cca <- IntegrateData(anchorset = integ_anchors, + normalization.method = "SCT") + +# Rejoin the layers in the RNA assay that we split earlier +seurat_cca[["RNA"]] <- JoinLayers(seurat_cca[["RNA"]]) + +# Run PCA +seurat_cca <- RunPCA(object = seurat_cca) + +# Run UMAP +seurat_cca <- RunUMAP(seurat_cca, + reduction.name = "umap.cca", + dims = 1:40) + +saveRDS(seurat_cca, file="seurat_cca.rds") +``` + +```{r} +p1 <- DimPlot(seurat_lognorm, group.by = "orig.ident", + reduction = "umap") + + theme(legend.position = "bottom") + + ggtitle("pre-integration") +p2 <- DimPlot(seurat_cca, group.by = "orig.ident", + reduction = "umap.cca") + + theme(legend.position = "bottom") + + ggtitle("post-integration") + +p1 | p2 +``` + +## Harmony + +If cells cluster by sample, condition, batch, dataset, modality, this integration step can greatly improve the clustering and the downstream analyses. + +To integrate, we will use the shared highly variable genes (identified using SCTransform) from each group, then, we will “integrate” or “harmonize” the groups to overlay cells that are similar or have a “common set of biological features” between groups. + +We use [`Harmony`](https://portals.broadinstitute.org/harmony/articles/quickstart.html), which is based on a transformation of principal components (PCs) to find similarities across datasets. Here we group samples by the original sample id. + +```{r rna_hrmny, eval=file.exists("seurat_harmony.rds")} +#NOTE run the chunck below to create this object, and loading will be used while +# knitting to speed up the rendering +seurat_harmony = readRDS("seurat_harmony.rds") +``` + +```{r, eval=!exists("seurat_harmony"), warning=FALSE, message=FALSE} +## Here seurat will integrate on the level of sample id. If you want to integrate on other aspects the SCT normalization will need to be done with all of the data together. +# seurat_sctnorm[["RNA"]] <- split(seurat_sctnorm[["RNA"]], f = seurat_sctnorm$orig.ident) +seurat_harmony <- IntegrateLayers(object = seurat_sctnorm, + method = HarmonyIntegration, + orig.reduction = "pca", + new.reduction = 'harmony', + assay = "SCT", verbose = FALSE) +seurat_harmony <- RunPCA(seurat_harmony) +seurat_harmony <- RunUMAP(seurat_harmony, reduction = "harmony", + dims = 1:40, + reduction.name = "umap.harmony") +saveRDS(seurat_harmony, file="seurat_harmony.rds") +``` + +## Pre vs. Post integration + +```{r dimplot_both all, echo=FALSE} +p1 <- DimPlot(seurat_sctnorm, group.by = "orig.ident", + reduction = "umap") + + theme(legend.position = "bottom") + + ggtitle("pre-integration") +p2 <- DimPlot(seurat_cca, group.by = "orig.ident", + reduction = "umap.cca") + + theme(legend.position = "bottom") + + ggtitle("post-integration CCA") +p3 <- DimPlot(seurat_harmony, group.by = "orig.ident", + reduction = "umap.harmony") + + theme(legend.position = "bottom") + + ggtitle("post-integration Harmony") + +p1 | p2 | p3 +``` + +## Clustering + +For single-modality scRNA-seq analysis, `Seurat` clusters the cells using a Louvain clustering approach. First, a K-nearest neighbor (KNN) graph is built, where cells are connected if they have a similar transcriptome, as determined from their scores on the first PCs. Then, the graph is partitioned into "communities" or "clusters" of interconnected cells that are more tightly connected with each other than with cells outside of the corresponding cluster. + +A limitation of this approach is that the number of identified clusters depends on the chosen resolution, a parameter that must be set by the user and does not necessarily reflect the underlying biology of the dataset. For most single-cell datasets, a resolution of 0.1 to 1 will provide a reasonable number of clusters. Complex datasets with multiple cell types may require a larger resolution, and vice versa. + +```{r find_neighbors all, echo=TRUE} +# NOTE use seurat_harmony or seurat_cca +# seurat_clust <- FindNeighbors(seurat_harmony, assay = "SCT", +# reduction = "harmony", dims = 1:40) +seurat_clust <- FindNeighbors(seurat_cca, assay = "SCT", dims = 1:40) +# check graph names names(seurat_harmony@graphs) +# DefaultAssay(object = seurat_harmony[["pca"]]) +seurat_clust <- FindClusters(object = seurat_clust, + resolution = c(0.1, 0.2, 0.4, 0.6, 0.8, 1.0), + verbose = FALSE) +``` + +## Clustering Tree + +We build a clustering tree using the [clustree](https://lazappi.github.io/clustree/articles/clustree.html) package to show how cells move as the clustering resolution is increased. Each cluster forms a node in the tree and edges are constructed by considering the cells in a cluster at a lower resolution (say 𝑘=2) that end up in a cluster at the next highest resolution (say 𝑘=3). By connecting clusters in this way we can see how clusters are related to each other, which are clearly distinct and which are unstable. The size of each node is related to the number of samples in each cluster and the color indicates the clustering resolution. Edges are colored according to the number of samples they represent and the transparency shows the incoming node proportion, the number of samples in the edge divided by the number of samples in the node it points to. + +```{r, fig.height=10, fig.width=8} +library(clustree) + +meta = seurat_clust@meta.data +meta = na.omit(meta) + +## Change the prefix to match your clusters +# NOTE: this if you have run HARMONY +# prefix_clu <- "SNN_res." +# show_this <- "umap.harmony" +# NOTE: this if you have run CCA +prefix_clu <- "integrated_snn_res." +show_this <- "umap.cca" +clustree(meta, prefix = prefix_clu) +``` + +## Visualize clusters {.tabset} + +We take a look at how the clusters look at resolutions 0.1, 0.2,0.4, and 0.6 + +### 0.1 + +```{r umap_0.1} +cluster_res <- 0.2 +Idents(object = seurat_clust) <- paste0(prefix_clu, cluster_res) +DimPlot(seurat_clust, + reduction = show_this, + split.by = "orig.ident", + label = TRUE) +``` + +* * * + +### 0.2 + +```{r umap_0.2} +cluster_res <- 0.2 +Idents(object = seurat_clust) <- paste0(prefix_clu, cluster_res) +DimPlot(seurat_clust, + reduction = show_this, + split.by = "orig.ident", + label = TRUE) +``` + +* * * + +### 0.4 + +```{r umap_0.4} +cluster_res <- 0.4 +Idents(object = seurat_clust) <- paste0(prefix_clu, cluster_res) +DimPlot(seurat_clust, + reduction = show_this, + split.by = "orig.ident", + label = TRUE) +``` + +* * * +### 0.6 + +```{r umap_0.6} +cluster_res <- 0.6 +Idents(object = seurat_clust) <- paste0(prefix_clu, cluster_res) +DimPlot(seurat_clust, + reduction = show_this, + split.by = "orig.ident", + label = TRUE) +``` + +* * * + + +```{r} +saveRDS(seurat_clust, file=seurat_output) +``` + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` + diff --git a/inst/templates/singlecell/QC/QC.rmd b/inst/templates/singlecell/QC/QC.rmd new file mode 100644 index 0000000..6c24e4c --- /dev/null +++ b/inst/templates/singlecell/QC/QC.rmd @@ -0,0 +1,431 @@ +--- +title: "scRNA QC" +output: html_document +date: "`r Sys.Date()`" +params: + ## If you have Ribosomal ratio in your raw seurat object put this as TRUE otherwise leave as FALSE + ribosomal: FALSE + params_file: https://github.com/bcbio/bcbioR-test-data/raw/refs/heads/main/singlecell/parameters.R + project_file: ../information.R +--- + +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) +stopifnot(compareVersion(as.character(packageVersion("Seurat")), "5.0.0")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. + +```{r, eval=FALSE} +### READ ME FIRST + +# This is a template for scRNA QC to present to your client. The actual QC can be done using our rshiny app: + +# After you have decided on your QC metrics load your raw object (i.e. right after you first read data into seurat) and put the parameters.R file you got from the shiny app in the same folder as this rmd. + +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` + +```{r setup, include=FALSE} +library(Seurat) +library(tidyverse) +library(ggplot2) + +# 1. Set up input files in this R file (params_de.R) +# Loading QC'd object +parameters = params$params_file +if (isUrl(seurat_obj)){ + source(url(parameters)) +}else{ + source(parameters) +} + +seurat_obj = "~/Downloads/filtered_gene_bc_matrices/tiny.rds" + +knitr::opts_chunk[["set"]]( + cache = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + fig.height = 4, + echo=FALSE) +``` + + + +```{r load and filter no ribo} +## Load data +seurat_raw <- readRDS(seurat_obj) + +## Creat QC object USE METRICS YOU CHOSE IN THE RSHINY APP + +seurat_qc <- subset(x = seurat_raw, + subset = (nCount_RNA >= nCount_RNA_cutoff) + & (nFeature_RNA >= nFeature_RNA_cutoff) + & (mitoRatio < mitoRatio_cutoff) + ## & (riboRatio < riboRatio_cutoff) + & (Log10GenesPerUMI > Log10GenesPerUMI_cutoff) + ) + + +``` + + +```{r load and filter ribo, eval=ribosomal, warning=FALSE, results='asis'} + +seurat_qc <- subset(x = seurat_raw, + subset = (nCount_RNA >= nCount_RNA_cutoff) + & (nFeature_RNA >= nFeature_RNA_cutoff) + & (mitoRatio < mitoRatio_cutoff) + & (riboRatio < riboRatio_cutoff) + & (Log10GenesPerUMI > Log10GenesPerUMI_cutoff) + ) + +``` + + +```{r} + +## Save QC object +saveRDS(seurat_qc, file = "seurat_post-QC.rds") +``` + + +```{r prep-info} +## Prep information for plotting +metadata0 <- seurat_raw@meta.data +metadata0 = metadata0 %>% dplyr::rename(nUMI = nCount_RNA, + nGene = nFeature_RNA) +metadata1 <- seurat_qc@meta.data +metadata1 = metadata1 %>% dplyr::rename(nUMI = nCount_RNA, + nGene = nFeature_RNA) +``` + + +# QC metrics: raw data {.tabset} + +In this section, we review quality control (QC) metrics for the **raw feature matrices** generated by `Cellranger`. Only a low level filter excluding cells with <100 nUMIs (= number of unique molecular identifiers, or sequenced reads per cell) was applied when uploading the data into `R`. + + +## Cells per sample + +```{r cells raw} +table(metadata0$orig.ident) + +``` + + +## UMIs per cell + +Here, we look at the distribution of UMIs (unique molecular identifiers, or sequenced reads) per cell (droplet) in the dataset. Before QC, we expect a biomodal distribution with a first *small* peak at low numbers of UMIs (<250) corresponding to droplets that encapsulated background/dying cells, and a second higher peak centered at >1000. The line is at 250. + + +```{r raw_nUMIs} +metadata0 %>% + ggplot(aes(x = nUMI, color = orig.ident, fill = orig.ident)) + + geom_density(alpha = 0.2) + + theme_classic() + + ylab("Cell density") + scale_x_log10() + + geom_vline(xintercept = 250) + + facet_wrap(. ~ orig.ident) + + ggtitle("UMIs per cell in raw dataset") +``` + + + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata0 %>% + ggplot(aes(x=orig.ident, y=log10(nUMI), fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + geom_hline(yintercept = c(log10(1000), log10(50000))) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) +``` + +## Genes per cell + +Here, we look at the number of different genes that were detected in each cell. By "detected", we mean genes with a non-zero read count measurement. Gene detection in the range of 500 to 5000 is normal for most single-cell experiments. The line is at 750. + +```{r raw_nGene} +# Visualize the distribution of genes detected per cell (histogram) +metadata0 %>% + ggplot(aes(x = nGene, color = orig.ident, fill = orig.ident)) + + geom_density(alpha = 0.2) + + theme_classic() + + scale_x_log10() + + geom_vline(xintercept = c(700)) + + facet_wrap(. ~ orig.ident) + + ggtitle("Detected genes per cell in raw dataset") +``` + + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata0 %>% + ggplot(aes(x=orig.ident, y=log10(nGene), fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + geom_hline(yintercept = c(log10(700))) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) +``` + + +## Mitochondrial ratio + +We evaluate overall mitochondrial gene expression as a biomarker of cellular stress during sample preparation. Typically, we expect mitochondrial genes to account for <20% of overall transcripts in each cell. The line indicates 10%. + +```{r raw_mito, warning=FALSE} +# Visualize the distribution of mitochondrial gene expression detected per cell +metadata0 %>% + ggplot(aes(color = orig.ident, x = mitoRatio, fill = orig.ident)) + + geom_density(alpha = 0.2) + + scale_x_log10() + + theme_classic() + + geom_vline(xintercept = c(0.1)) + + facet_wrap(. ~ surgery) + + ggtitle("Percentage of mitochondrial gene expression per cell in raw dataset") +``` + + + +```{r raw_ribo, eval=ribosomal, warning=FALSE, results='asis'} + +cat("## Ribosomal ratio \n") + +cat("We evaluate overall ribosomal gene expression. The line indicates 5%. \n" +) +# Visualize the distribution of mitochondrial gene expression detected per cell +metadata0 %>% + ggplot(aes(color = orig.ident, x = riboRatio, fill = orig.ident)) + + geom_density(alpha = 0.2) + + scale_x_log10() + + theme_classic() + + geom_vline(xintercept = c(0.05)) + + facet_wrap(. ~ orig.ident) + + ggtitle("Percentage of ribosomal gene expression per cell in raw dataset") +``` + + +## UMIs vs. Genes + +By plotting the number of UMIs per cell (x-axis) vs. the number of genes per cell (y-axis), we can visually assess whether there is a large proportion of low quality cells with low read counts and/or gene detection (bottom left quadrant of the plot). In the following representation, cells are further color-coded based on the percentage of mitochondrial genes found among total detected genes. The line for nUMI is at 1000 and the line for nGene is at 700. + +```{r raw_gene_by_umi, fig.height=12, fig.width=15, warning=FALSE} +# Visualize the correlation between genes detected and number of UMIs and determine whether strong presence of cells with low numbers of genes/UMIs +metadata0 %>% + ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) + + geom_point() + + stat_smooth(method=lm) + + scale_x_log10() + + scale_y_log10() + + theme_classic() + + geom_vline(xintercept = 1000) + + geom_hline(yintercept = 700) + + ggtitle("Genes vs. nUMIs in raw dataset") + + facet_wrap(~orig.ident) +``` + + +## Complexity + +Another way to assess the quality and purity of a single-cell dataset is to look for cells that have fewer detected genes per UMI than others. Typical values for this metric are >0.8 for most cells. Cells with lower diversity in the genes they express may be low-complexity cell types such as red blood cells. With sorted populations, we expect high purity and a very similar complexity distribution across samples. + +```{r raw_novelty} +# Visualize the overall novelty of the gene expression by visualizing the genes detected per UMI +metadata0 %>% + ggplot(aes(x = Log10GenesPerUMI, color = orig.ident, fill = orig.ident)) + + geom_density(alpha = 0.2) + + theme_classic() + + geom_vline(xintercept = c(0.85)) + + facet_wrap(. ~ orig.ident) + + ggtitle("log10(Genes per UMI) in raw dataset") +``` + + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata0 %>% + ggplot(aes(x=orig.ident, Log10GenesPerUMI, fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + geom_hline(yintercept = c(0.8, 0.85)) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) +``` + + +# QC metrics: Filtered data {.tabset} + +Based on the above QC metrics, we filtered the dataset to isolate cells passing the following thresholds: >`nCount_RNA_cutoff` UMIs, >`nFeature_RNA_cutoff` genes, <`mitoRatio_cutoff` mitochondrial gene ratio, and >`Log10GenesPerUMI_cutoff` complexity. + +In this section, we review QC metrics for our filtered dataset. + +## Cells per sample + +```{r cells filtered} +table(metadata1$orig.ident) + +``` + + +## UMIs per cell + +The line is at 1000 + +```{r qc1_nUMIs} +metadata1 %>% + ggplot(aes(color = orig.ident, x = nUMI, fill = orig.ident)) + + geom_density(alpha = 0.2) + + scale_x_log10() + + theme_classic() + + ylab("Cell density") + xlab("nUMI") + + geom_vline(xintercept = c(1000)) + + facet_wrap(. ~ orig.ident) +``` + + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata1 %>% + ggplot(aes(x=orig.ident, y=log10(nUMI), fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + geom_hline(yintercept = c(log10(1000))) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) +``` + + +## Genes detected + +The line is at 750 + +```{r qc1_genes} +# Visualize the distribution of genes detected per cell via histogram +metadata1 %>% + ggplot(aes(color = orig.ident, x = nGene, fill= orig.ident)) + + geom_density(alpha = 0.2) + + theme_classic() + + scale_x_log10() + xlab("nGene") + + facet_wrap(. ~ orig.ident) + + geom_vline(xintercept = c(750)) +``` + + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata1 %>% + ggplot(aes(x=orig.ident, y=log10(nGene), fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + geom_hline(yintercept = c(log10(7500))) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) + +``` + + +## Mitochondrial ratio + +The line is at 10%. + +```{r qc1_mitoratio, message=FALSE, warning=FALSE} +# Visualize the distribution of mitochondrial gene expression detected per cell +metadata1 %>% + ggplot(aes(color = orig.ident, x = mitoRatio, fill = orig.ident)) + + geom_density(alpha = 0.2) + + scale_x_log10() + + theme_classic() + + geom_vline(xintercept = 0.1) + + facet_wrap(. ~ surgery) +``` + + +```{r qc1_ribo, eval=ribosomal, warning=FALSE, results='asis'} + +cat("## Ribosomal ratio \n") + +cat("We evaluate overall ribosomal gene expression. The line indicates 10%. \n" +) +# Visualize the distribution of mitochondrial gene expression detected per cell +metadata0 %>% + ggplot(aes(color = orig.ident, x = riboRatio, fill = orig.ident)) + + geom_density(alpha = 0.2) + + scale_x_log10() + + theme_classic() + + geom_vline(xintercept = c(0.1)) + + facet_wrap(. ~ orig.ident) + + ggtitle("Percentage of ribosomal gene expression per cell in raw dataset") +``` + + +## UMIs vs. Genes + +Both the horizontal and vertical lines are at 1000. + +```{r qc1_genes_per_UMI, fig.height=12, fig.width=15, warning=FALSE} +# Visualize the correlation between genes detected and number of UMIs and determine whether strong presence of cells with low numbers of genes/UMIs +metadata1 %>% + ggplot(aes(x = nUMI, y = nGene, color = mitoRatio)) + + geom_point() + + stat_smooth(method=lm) + + scale_x_log10() + + scale_y_log10() + + theme_classic() + + geom_vline(xintercept = c(1000)) + + geom_hline(yintercept = c(1000)) + + ggtitle("Genes vs. nUMIs in raw dataset") + + xlab("nUMI") + ylab("nGene") + + facet_wrap(~orig.ident) +``` + + +## Complexity + +```{r qc1_complexity} +# Visualize the overall novelty of the gene expression by visualizing the genes detected per UMI +metadata1 %>% + ggplot(aes(x = Log10GenesPerUMI, color = orig.ident, fill = orig.ident)) + + geom_density(alpha = 0.2) + + theme_classic() + + #geom_vline(xintercept = c(0.85)) + + facet_wrap(. ~ orig.ident) +``` + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata1 %>% + ggplot(aes(x=orig.ident, Log10GenesPerUMI, fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + #geom_hline(yintercept = c(0.85)) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) +``` + + +# R session + +```{r} +sessionInfo() +``` diff --git a/inst/templates/singlecell/README.md b/inst/templates/singlecell/README.md new file mode 100644 index 0000000..c950a4e --- /dev/null +++ b/inst/templates/singlecell/README.md @@ -0,0 +1,16 @@ +# Guideline for scRNAseq analysis + +Make sure there is a valid project name, and modify `information.R` with the right information for your project. You can use this file with any other Rmd to include the project/analysis information. + +# cell-ranger + +`pre-process-w-cellranger.md` contains step by step guidelines on how to run cellranger and load data into R. This `scripts/seurat_init.R` script contains all the pieces to go from cellranger output to Seurat obj. It is assuming a mouse genome. + +# QC + +Currently we are working on deploying a shiny app to inspect the single cell object and find the best cut-offs for filtering. The Rmd that helps to visualize the before and after is `QC.Rmd`. + +# Integration + +`Integration/norm_integration.rmd` is a template with guidelines on how to work with multiple samples. It compares log2norm vs SCT, work with SCT by samples to remove batch biases better, provide options for integration between CCA and Harmony. As last step, it contains cell type clustering and visualization to help decide the best parameters. + diff --git a/inst/templates/singlecell/information.R b/inst/templates/singlecell/information.R new file mode 100644 index 0000000..6e15eef --- /dev/null +++ b/inst/templates/singlecell/information.R @@ -0,0 +1,6 @@ +# info params +project = "name_hbcXXXXX" +PI = 'person name' +experiment = 'short description' +aim = 'short description' +analyst = 'person in the core' diff --git a/inst/templates/singlecell/pre-process-w-cellranger.md b/inst/templates/singlecell/pre-process-w-cellranger.md new file mode 100644 index 0000000..05a57e4 --- /dev/null +++ b/inst/templates/singlecell/pre-process-w-cellranger.md @@ -0,0 +1,275 @@ +# Tipical steps for scRNAseq downstream analysis +--- +title: "From raw data to Seurat" +--- + + +# Overview + +This tutorial assumes that you are starting with 10x genomic data that has not yet been run through cellranger. If you have output files from cellranger (raw_feature_bc_matrix.h5) files skip to step 2. + +# Step 1 running cellranger + +## Set up + +Here are the steps that need to be completed prior to running cellranger. + +### Locate or create your genome + +#### I have mouse or human + +We have prebuilt references for mouse and human located here: + +#### I have another genome + +It is easy to generate a cellranger reference for any genome. All you need as input are a fasta file and a gtf file. Here is some [information](https://kb.10xgenomics.com/hc/en-us/articles/115003327112-How-can-we-add-genes-to-a-reference-package-for-Cell-Ranger) about what is required for the gtf file. + +**Note: what is listed as "gene_id" (required in gtf) or "gene_name" (if used will be preferred) will be your row names (i.e. gene names). Make sure this is something useful or can be connected to information on what these genes are.** + +Below is an example script for a non-model reference + +``` +#!/bin/sh +#SBATCH --partition=short +#SBATCH -o run.o +#SBATCH -e run.e +#SBATCH -t 0-2:30 +#SBATCH -c 1 +#SBATCH --mem=48G + +module load cellranger/7.1.0 + +cellranger mkref \ + --genome=my_nonmodel_genome \ + --fasta=/path/to/my/genome/fasta/file/my_nonmodel_genome.fasta \ + --genes=/path/to/my/genome/annotation/file/my_nonmodel_genome.gtf + +``` + + +### Fastq data + +You should have a number of output files from each sample. These should look like those below: + +``` +sample1_I1_001.fastq.gz +sample1_I2_001.fastq.gz +sample1_R1_001.fastq.gz +sample1_R2_001.fastq.gz +``` + +Cellranger will be looking for both the I and R files. If you do not have both you may have to run demultiplexing, [See here](https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/inputs/cr-mkfastq). + +**NOTE: you may have multiple lanes per sample. there is no need to concatentate these prior to running cellranger.** + + +It is best to create one folder per sample with that sample name and put all of the files there. + +**Cellranger expects 1 folder per sample** + +Here is an example file structure + +``` +fastq_files +├── sample1 +│   ├── sample1_I1_001.fastq.gz +│   ├── sample1_I2_001.fastq.gz +│   ├── sample1_R1_001.fastq.gz +│   ├── sample1_R2_001.fastq.gz +├── sample2 +│   ├── sample2_I1_001.fastq.gz +``` + +## Run Cellranger + +The easiest way to run cellranger is using the array feature on O2. [Here](https://github.com/hbc/knowledgebase/blob/master/rc/arrays_in_slurm.md) is a tutorial on arrays. + +To run cellranger as an array you will need one extra file. This file called `samples.txt` will have the name of each sample on its own line. + +``` +sample1 +sample2 +sample3 +... +sampleN +``` + +for ease `samples.txt` should be in the same directory as your sbatch script. + +Here is an example sbatch script for running cellranger as an array + +```(bash) +#!/bin/bash + +#SBATCH --job-name=CellRangerCount3 # Job name +#SBATCH --partition=short # Partition name +#SBATCH --time=0-05:00 # Runtime in D-HH:MM format +#SBATCH --nodes=1 # Number of nodes (keep at 1) +#SBATCH --ntasks=1 # Number of tasks per node (keep at 1) +#SBATCH --cpus-per-task=16 # CPU cores requested per task (change for threaded jobs) +#SBATCH --mem=128G # Memory needed per node (total) +#SBATCH --error=jobid_%j.err # File to which STDERR will be written, including job ID +#SBATCH --output=jobid_%j.out # File to which STDOUT will be written, including job ID + + +samp=$(awk -v awkvar="${SLURM_ARRAY_TASK_ID}" 'NR==awkvar' samples.txt) ### This line will take the numeric slurm array task id and find the corresponding line number in samples.txt. The sample name is made into a variable called samp. + +module load cellranger/7.1.0 + +cellranger count \ + --id=${samp} \ ## This is what your output folders will be named + --fastqs=/path/to/your/folder/of/fastq/folders/${samp} \ + --transcriptome=/path/to/your/genome \ + --localcores=16 \ + --localmem=128 + +``` + +This script can be run depending on the number of samples you have. Here we will call it N: + +``` +sbatch --array=1-N run_cellranger.sh +``` + +For example if you have 9 samples to run you can use: + +``` +sbatch --array=1-9 run_cellranger.sh +``` + +Arrays are also handy if you need to re-run just a single sample. Let's say you need to re-run the 1st and 9th sample in samples.txt + +``` +sbatch --array=1,9 run_cellranger.sh +``` + +# Step 2 - going from cellranger output to Seurat + +All of your output is now in the output folders you denoted in your cellranger script. These files are almost identical inside. The bit we want to keep is going to always be the same + +``` +sample1/out/raw_feature_bc_matrix.h5 +``` + +**Note: Always use the raw matrix and not filtered_feature_bc_matrix.h5. We will do our own QC downstream.** + +In this part of the tutorial we will go through the parts of creating the seurat object piece by piece. At the bottom is the entire script that you can copy and paste and edit. + +## Part 1 - reading all the files in and creating initial seurat object + +This should all be done in an interactive session on O2 with at least 96G of memory. + + +```(R) + +library(Seurat) +library(data.table) +library(hdf5r) + +### Set up run information +data_dir <- "/path/to/cellranger/output/folders/" + +samples <- c("sample1", "sample2", "sample3") + +### Make individual seurat objects for each sample + +for (i in 1:length(samples)){ + seurat_data <- Read10X_h5(paste(c(data_dir,samples[i],"/outs/raw_feature_bc_matrix.h5"),sep="",collapse = "")) + seurat_obj <- CreateSeuratObject(counts = seurat_data, + min.features = 100, ## only keep cells with at least 100 genes + project = samples[i]) + assign(paste0(samples[i], "_seurat"), + seurat_obj) # stores Seurat object in variable of corresponding sample name +} + +### Merge all seurat objects + +seurat_ID <- paste0(samples, "_seurat") # get names of all objects + + +u <- get(seurat_ID[2]) +for (i in 3:length(seurat_ID)) { + u <- c(u, get(seurat_ID[i])) +} ## makes a list of all seurat objects + +seurat_merge <- merge(x = get(seurat_ID[1]), + y = u, + add.cell.ids = all_samples, + project = "my_scRNA_project") + +``` + +## Part 2 - Add mitochondrial information + +This code is for using mouse mitochondrial genes: + + +```(R) +# Mitochondrial genes for mouse genome +idx <- grep("^mt-", rownames(GetAssay(seurat_merge, "RNA"))) +rownames(GetAssay(seurat_merge, "RNA"))[idx] +# Mitochondrial genes vs. nuclear genes ratio +seurat_merge$mitoRatio <- PercentageFeatureSet(object = seurat_merge, pattern = "^mt-") +seurat_merge$mitoRatio <- seurat_merge@meta.data$mitoRatio/100 # Divide by 100 for Ratio instead of Percentage +``` + +This code is for using human mitochondrial genes: + +```(R) +# Mitochondrial genes for human genome +idx <- grep("^MT-", rownames(GetAssay(seurat_obj, "RNA"))) +rownames(GetAssay(seurat_merge, "RNA"))[idx] +# Mitochondrial genes vs. nuclear genes ratio +seurat_merge$mitoRatio <- PercentageFeatureSet(object = seurat_merge, pattern = "^mt-") +seurat_merge$mitoRatio <- seurat_merge@meta.data$mitoRatio/100 # Divide by 100 for Ratio instead of Percentage +``` + +Below is an example code from the siberian hamster. Here I all of the mitochondrial genes were found manually a list was created. + +```(R) +mito_genes <- c("ND1", "ND2","COX1","COX2","ATP8","ATP6","COX3","ND3","ND4L","ND4","ND5","ND6","CYTB") +idx <- (rownames(GetAssay(seurat_merge, "RNA")) %in% mito_genes) +rownames(GetAssay(seurat_merge, "RNA"))[idx] +# Mitochondrial genes vs. nuclear genes ratio +seurat_merge$mitoRatio <- PercentageFeatureSet(object = seurat_merge, features = mito_genes) +seurat_merge$mitoRatio <- seurat_merge@meta.data$mitoRatio/100 # +``` + +## Part 3 - Add additional metadata + +Here we add some additonal metrics and sample metadata from the client. + +```(R) +# Number of genes per UMI for each cell +seurat_merge$Log10GenesPerUMI <- log10(seurat_merge$nFeature_RNA) / log10(seurat_merge$nCount_RNA) + +# Import experimental metadata +metaexp <- read.csv("/path/to/experimental/metadata/meta.csv") + +# Check matching of IDs +all(metaexp$sample %in% metadata$orig.ident) +all(metadata$orig.ident %in% metaexp$sample) + +#change headings to match +colnames(metaexp)[1] <- "orig.ident" + +metafull <- plyr::join(metadata, metaexp, + by = c("orig.ident")) + +# Replace seurat object metadata +if(all(metafull$barcode == rownames(seurat_merge@meta.data))) { + rownames(metafull) <- metafull$barcode + seurat_merge@meta.data <- metafull +} +``` + +## Part 4 - Save object + +```(R) +## Join layers (each sample is a separate layer) +seurat_merge[["RNA"]] <- JoinLayers(seurat_merge[["RNA"]]) + +### Save Seurat object for future processing +saveRDS(seurat_merge, file = "seurat_pre-filtered.rds") +write.csv(seurat_merge@meta.data, file = "metadata_pre-filtered.csv") +``` diff --git a/inst/templates/singlecell/scripts/demux_HTOs.R b/inst/templates/singlecell/scripts/demux_HTOs.R new file mode 100644 index 0000000..70ba210 --- /dev/null +++ b/inst/templates/singlecell/scripts/demux_HTOs.R @@ -0,0 +1,44 @@ +library(Seurat) +library(qs) + +############################ perform demultiplexing ############################ + +# replace these paths with ones to your data (cellranger outputs) +sample_matrix <- ReadMtx('data/tumor_4068_ovarian_cdki/processed_data/matrix.mtx.gz', + cells = 'data/tumor_4068_ovarian_cdki/processed_data/barcodes.tsv.gz', + features = 'data/tumor_4068_ovarian_cdki/processed_data/features.tsv.gz') + +# create two matrices of counts: one of for hashtag oligo counts, and one for counts for actual genes +# include in the HTO count matrix only those HTOs that are actually assigned to samples in your dataset (in this case, Hashtag1 and Hashtag2) +hto_matrix <- full_matrix[grepl('Hashtag[12]+', rownames(full_matrix)), ] +expression_matrix <- full_matrix[!grepl('Hashtag', rownames(full_matrix)), ] + +# create a Seurat object from the raw data, including a slot for HTO counts +sample_seurat <- CreateSeuratObject( + counts = Matrix::Matrix(as.matrix(expression_matrix), sparse = T)) +sample_seurat[["HTO"]] <- CreateAssayObject(counts = hto_matrix) + +# normalize both slots in the Seurat object +sample_seurat <- NormalizeData(sample_seurat) +sample_seurat <- NormalizeData(sample_seurat, assay = "HTO", normalization.method = "CLR") + +# perform demultiplexing. adjust positive.quantile as necessary to call more/fewer cells as hashtag-positive +sample_seurat <- HTODemux(sample_seurat, assay = "HTO", positive.quantile = 0.99) + +qsave(sample_seurat, 'data/processed/hto_demux_seurat.qs') +# saveRDS(sample_seurat, 'data/processed/hto_demux_seurat.rds') + + +################## evaluate demultiplexing performance ######################### + +# distributions of expression of hashtags should make sense considering hashtag assigned +RidgePlot(sample_seurat, assay = "HTO", features = c("Hashtag1", 'Hashtag2'), ncol = 2) + +# evaluate expression of hashtags vs calls for singlet, doublet, and unassigned +FeatureScatter(sample_seurat, feature1 = "Hashtag1", feature2 = "Hashtag2") +HTOHeatmap(sample_seurat, assay = "HTO") + +# evaluate nCount_RNA of cells classified as doublets +Idents(sample_seurat) <- "HTO_classification.global" +VlnPlot(sample_seurat, features = "nCount_RNA", pt.size = 0.1, log = TRUE) + diff --git a/inst/templates/singlecell/scripts/seurat_init.R b/inst/templates/singlecell/scripts/seurat_init.R new file mode 100644 index 0000000..b2b64c2 --- /dev/null +++ b/inst/templates/singlecell/scripts/seurat_init.R @@ -0,0 +1,86 @@ +library(rstudioapi) +library(tidyverse) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) + +library(Seurat) +library(data.table) +library(hdf5r) + +### Set up run information +data_dir <- "/path/to/cellranger/output/folders/" + +samples <- c("sample1", "sample2", "sample3") + +### Make individual seurat objects for each sample + +for (i in 1:length(samples)){ + seurat_data <- Read10X_h5(paste(c(data_dir,samples[i],"/outs/raw_feature_bc_matrix.h5"),sep="",collapse = "")) + seurat_obj <- CreateSeuratObject(counts = seurat_data, + min.features = 100, ## only keep cells with at least 100 genes + project = samples[i]) + assign(paste0(samples[i], "_seurat"), + seurat_obj) # stores Seurat object in variable of corresponding sample name +} + +### Merge all seurat objects + +seurat_ID <- paste0(samples, "_seurat") # get names of all objects + + +u <- get(seurat_ID[2]) +for (i in 3:length(seurat_ID)) { + u <- c(u, get(seurat_ID[i])) +} ## makes a list of all seurat objects + +seurat_merge <- merge(x = get(seurat_ID[1]), + y = u, + add.cell.ids = all_samples, + project = "my_scRNA_project") + + +# Mitochondrial genes for mouse genome +idx <- grep("^mt-", rownames(GetAssay(seurat_merge, "RNA"))) +rownames(GetAssay(seurat_merge, "RNA"))[idx] +# Mitochondrial genes vs. nuclear genes ratio +seurat_merge$mitoRatio <- PercentageFeatureSet(object = seurat_merge, pattern = "^mt-") +seurat_merge$mitoRatio <- seurat_merge@meta.data$mitoRatio/100 # Divide by 100 for Ratio instead of Percentage + +# Number of genes per UMI for each cell +seurat_merge$Log10GenesPerUMI <- log10(seurat_merge$nFeature_RNA) / log10(seurat_merge$nCount_RNA) + +# Extract cell level metadata +metadata <- seurat_merge@meta.data +metadata$barcode <- rownames(metadata) + +# Import experimental metadata +metaexp <- read.csv("/path/to/experimental/metadata/meta.csv") + +# Check matching of IDs +all(metaexp$sample %in% metadata$orig.ident) +all(metadata$orig.ident %in% metaexp$sample) + +#change headings to match +colnames(metaexp)[1] <- "orig.ident" + +metafull <- plyr::join(metadata, metaexp, + by = c("orig.ident")) + +# Replace seurat object metadata +if(all(metafull$barcode == rownames(seurat_merge@meta.data))) { + rownames(metafull) <- metafull$barcode + seurat_merge@meta.data <- metafull +} + + +## Join layers (each sample is a separate layer) +seurat_merge[["RNA"]] <- JoinLayers(seurat_merge[["RNA"]]) + +### Save Seurat object for future processing +saveRDS(seurat_merge, file = "seurat_pre-filtered.rds") +write.csv(seurat_merge@meta.data, file = "metadata_pre-filtered.csv") diff --git a/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd b/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd new file mode 100644 index 0000000..6e49644 --- /dev/null +++ b/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd @@ -0,0 +1,456 @@ +--- +title: "CellChat" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: false + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + project_file: information.R + seurat_fn: ../data/fDat_sn_RC.rds + cellchat_fn: ../data/snrna_cellchat.qs + cellchat_grade2_fn: ../data/snrna_cellchat_grade2.qs + cellchat_grade0_fn: ../data/snrna_cellchat_grade0.qs +--- + +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) +stopifnot(compareVersion(as.character(packageVersion("Seurat")), "5.0.0")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-draft-grey) revision. + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE, echo=FALSE} +# NOTE change or remove according your environment +reticulate::use_virtualenv("/n/app/bcbio/R4.3.1_python_cellchat") +reticulate::py_config() # should show v3.9.14 +Sys.getenv("PYTHONPATH") # should be empty + +current_libs <- .libPaths() +.libPaths(c('/n/app/bcbio/R4.3.1_cellchat/', current_libs)) +``` + +```{r} +library(CellChat) + +library(tidyverse) +library(Seurat) +library(bcbioR) +library(ggprism) +library(knitr) +library(tools) +library(qs) +library(patchwork) +library(ComplexHeatmap) + +options(stringsAsFactors = FALSE) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) + +cellchat_ran <- file.exists(params$cellchat_fn) +cellchat_rejection_ran <- file.exists(params$cellchat_grade2_fn) & file.exists(params$cellchat_grade0_fn) +``` + +# Clustering + +```{r load_data } + +snrna <- readRDS(params$seurat_fn) + +# in this case, Chris_annot = cell_type +DimPlot(snrna, reduction = 'umap', group.by = 'Chris_annot') + +``` + +```{r prep cellchat inputs, eval = !cellchat_ran } + +# need to use normalized counts as input +data.input <- snrna[["SCT"]]@data +labels <- snrna$Chris_annot +meta <- data.frame(labels = labels, row.names = names(labels), samples = snrna$orig.ident) + +``` + +```{r create cellchat object, eval = !cellchat_ran } +cellchat <- createCellChat(object = data.input, meta = meta, group.by = "labels") + +``` + +```{r set cellchat db, eval = !cellchat_ran} +CellChatDB <- CellChatDB.human +CellChatDB.use <- subsetDB(CellChatDB) +cellchat@DB <- CellChatDB.use + +``` + +```{r subset and preprocess data, eval = !cellchat_ran } + +cellchat <- subsetData(cellchat) +cellchat <- updateCellChat(cellchat) +future::plan("multisession", workers = 8) # recommend running with at 8-16 cores +cellchat <- identifyOverExpressedGenes(cellchat) # may take a couple minutes +cellchat <- identifyOverExpressedInteractions(cellchat) # may take a couple minutes + +``` + +```{r compute communication prob, eval = !cellchat_ran} + +# Not recommended: project gene expression data onto protein-protein interaction network. +# Useful with shallow sequencing depth but introduces many weak communications. +# If used, must set raw.use = FALSE when running computeCommunProb +# cellchat <- projectData(cellchat, PPI.human) + + +# this next command takes 0.5-2+ hours +# can choose various methods for caculating average gene exp per group, +# 'triMean' allegedly produces fewer but stronger interactions +cellchat <- computeCommunProb(cellchat, type = "triMean") + +# filter out the cell-cell communication if < 50 cells per group +cellchat <- filterCommunication(cellchat, min.cells = 50) + +qsave(cellchat, '../data/snrna_cellchat.qs', preset = 'fast') + +``` + +# Overall Results + +```{r load cellchat, eval = cellchat_ran} +cellchat <- qread(params$cellchat_fn) + +df.net <- subsetCommunication(cellchat) %>% dplyr::arrange(pval) +df.net %>% sanitize_datatable() + +``` + +## Top interactions {.tabset} +```{r check pairs, results = 'asis', fig.width = 8, fig.height = 12} + +top_ints <- (df.net %>% pull(interaction_name) %>% unique)[1:10] +for (interaction in top_ints){ + cat('\n') + cat('### ', as.character(interaction), '\n') + interactors <- unlist(strsplit(as.character(interaction), '_')) + p1 <- VlnPlot(snrna, features = interactors, group.by = 'Chris_annot', + pt.size = 0.1, log = T, ncol = 1) + print(p1) + cat('\n') +} + +``` + +```{r compute pathway communication probs, eval = cellchat_ran} +cellchat <- computeCommunProbPathway(cellchat) +cellchat <- aggregateNet(cellchat) + +``` + +## Visualize Cell-Cell Communication Networks + +```{r chord plots, fig.width = 10, fig.height = 8, eval = cellchat_ran} + +groupSize <- as.numeric(table(cellchat@idents)) +par(mfrow = c(1,2), xpd=TRUE) +netVisual_circle(cellchat@net$count, vertex.weight = rowSums(cellchat@net$count), + weight.scale = T, label.edge= F, title.name = "Number of interactions") +netVisual_circle(cellchat@net$weight, vertex.weight = rowSums(cellchat@net$weight), + weight.scale = T, label.edge= F, title.name = "Interaction weights/strength") + +``` + +```{r heatmaps, eval = cellchat_ran} + +netVisual_heatmap(cellchat, measure = "count", color.heatmap = "Blues") +netVisual_heatmap(cellchat, measure = "weight", color.heatmap = "Blues") + +``` + +# Comparison Results + +Here we run the CellChat analysis twice, once on the Grade 2 rejection samples and once on the Grade 0 rejection samples. We compare the significant signaling interactions and investigate changes in them between rejection grades. + +```{r prep inputs rejection, eval=!cellchat_rejection_ran} + +grade2 <- subset(snrna, orig.ident %in% c('BRI-2396', 'BRI-2402')) +grade0 <- subset(snrna, orig.ident %in% c('BRI-2395', 'BRI-2411')) + +data.input_grade2 <- grade2[["SCT"]]@data +labels_grade2 <- grade2$Chris_annot +meta_grade2 <- data.frame(labels = labels_grade2, row.names = names(labels_grade2), samples = grade2$orig.ident) + +data.input_grade0 <- grade0[["SCT"]]@data +labels_grade0 <- grade0$Chris_annot +meta_grade0 <- data.frame(labels = labels_grade0, row.names = names(labels_grade0), samples = grade0$orig.ident) + +``` + +```{r create cellchat object rejection, eval=!cellchat_rejection_ran} +cellchat_grade2 <- createCellChat(object = data.input_grade2, meta = meta_grade2, group.by = "labels") +cellchat_grade0 <- createCellChat(object = data.input_grade0, meta = meta_grade0, group.by = "labels") + +``` + +```{r subset and preprocess data rejection, eval=!cellchat_rejection_ran} + +future::plan("multisession", workers = 8) # recommend running with at 8-16 cores + +cellchat_grade2@DB <- CellChatDB.use +cellchat_grade0@DB <- CellChatDB.use + +cellchat_grade2 <- subsetData(cellchat_grade2) +cellchat_grade2 <- updateCellChat(cellchat_grade2) +cellchat_grade2 <- identifyOverExpressedGenes(cellchat_grade2) # may take a couple minutes +cellchat_grade2 <- identifyOverExpressedInteractions(cellchat_grade2) # may take a couple minutes + +cellchat_grade0 <- subsetData(cellchat_grade0) +cellchat_grade0 <- updateCellChat(cellchat_grade0) +cellchat_grade0 <- identifyOverExpressedGenes(cellchat_grade0) # may take a couple minutes +cellchat_grade0 <- identifyOverExpressedInteractions(cellchat_grade0) # may take a couple minutes + +``` + +```{r compute communication prob rejection, eval=!cellchat_rejection_ran} +cellchat_grade2 <- computeCommunProb(cellchat_grade2, type = "triMean") # command takes 0.5-2+ hours +cellchat_grade2 <- filterCommunication(cellchat_grade2, min.cells = 50) +qsave(cellchat_grade2, params$cellchat_grade2_fn, preset = 'fast') + +cellchat_grade0 <- computeCommunProb(cellchat_grade0, type = "triMean") # command takes 0.5-2+ hours +cellchat_grade0 <- filterCommunication(cellchat_grade0, min.cells = 50) +qsave(cellchat_grade0, params$cellchat_grade0_fn, preset = 'fast') + +``` + +```{r load cellchat rejection, eval = cellchat_rejection_ran} + +cellchat_grade2 <- qread(params$cellchat_grade2_fn) +cellchat_grade0 <- qread(params$cellchat_grade0_fn) + +cellchat_grade2 <- filterCommunication(cellchat_grade2, min.cells = 50) +cellchat_grade0 <- filterCommunication(cellchat_grade0, min.cells = 50) + +df.net_grade2 <- subsetCommunication(cellchat_grade2)%>% dplyr::arrange(pval) +df.net_grade0 <- subsetCommunication(cellchat_grade0)%>% dplyr::arrange(pval) + +``` + +## Grade 2 + +```{r datatable grade 2, eval = cellchat_rejection_ran} +df.net_grade2 %>% sanitize_datatable() + +``` + +### Top interactions {.tabset} +```{r check pairs grade 2, results = 'asis', fig.width = 8, fig.height = 12} + +top_ints <- (df.net_grade2 %>% pull(interaction_name) %>% unique)[1:10] +for (interaction in top_ints){ + cat('\n') + cat('#### ', as.character(interaction), '\n') + interactors <- unlist(strsplit(as.character(interaction), '_')) + p1 <- VlnPlot(snrna, features = interactors, group.by = 'Chris_annot', pt.size = 0.1, log = T, ncol = 1) + print(p1) + cat('\n') +} + +``` + + +## Grade 0 + +```{r datatable grade 0, eval = cellchat_rejection_ran} +df.net_grade0 %>% sanitize_datatable() + +``` + +### Top interactions {.tabset} +```{r check pairs grade 0, results = 'asis', fig.width = 8, fig.height = 12} + +top_ints <- (df.net_grade0 %>% pull(interaction_name) %>% unique)[1:10] +for (interaction in top_ints){ + cat('\n') + cat('#### ', as.character(interaction), '\n') + interactors <- unlist(strsplit(as.character(interaction), '_')) + p1 <- VlnPlot(snrna, features = interactors, group.by = 'Chris_annot', pt.size = 0.1, log = T, ncol = 1) + print(p1) + cat('\n') +} + +``` + +```{r merge rejection objects, eval = cellchat_rejection_ran} + +cellchat_grade2 <- computeCommunProbPathway(cellchat_grade2) +cellchat_grade2 <- aggregateNet(cellchat_grade2) +cellchat_grade2 <- netAnalysis_computeCentrality(cellchat_grade2) +cellchat_grade0 <- computeCommunProbPathway(cellchat_grade0) +cellchat_grade0 <- aggregateNet(cellchat_grade0) +cellchat_grade0 <- netAnalysis_computeCentrality(cellchat_grade0) + +object.list <- list(grade0 = cellchat_grade0, grade2 = cellchat_grade2) +cellchat_merged <- mergeCellChat(object.list, add.names = names(object.list)) + +df.net_merged <- subsetCommunication(cellchat_merged) + +``` + +## Compare Interactions/Interaction Strength + +```{r compare interactions, eval = cellchat_rejection_ran} + +gg1 <- compareInteractions(cellchat_merged, show.legend = F, group = c(1,2)) +gg2 <- compareInteractions(cellchat_merged, show.legend = F, group = c(1,2), measure = "weight") +gg1 + gg2 + +``` + +```{r chord plots merged, eval = cellchat_rejection_ran, fig.width = 10, fig.height = 8} +par(mfrow = c(1,2), xpd=TRUE) +netVisual_diffInteraction(cellchat_merged, weight.scale = T) +netVisual_diffInteraction(cellchat_merged, weight.scale = T, measure = "weight") + +``` + +```{r heatmaps merged, eval = cellchat_rejection_ran, fig.width = 10, fig.height = 8} + +gg1 <- netVisual_heatmap(cellchat_merged) +gg2 <- netVisual_heatmap(cellchat_merged, measure = "weight") +gg1 + gg2 + +``` + +## Compare Major Pathway Sources and Targets + +From the CellChat documentation: "Comparing the outgoing and incoming interaction strength in a 2D space allows ready identification of the cell populations with significant changes in sending or receiving signals between different datasets." + +```{r compare send/receive changes, eval = cellchat_rejection_ran, fig.width = 10, fig.height = 6} + +num.link <- sapply(object.list, function(x) {rowSums(x@net$count) + colSums(x@net$count)-diag(x@net$count)}) +weight.MinMax <- c(min(num.link), max(num.link)) # control the dot size in the different datasets +gg <- list() +for (i in 1:length(object.list)) { + gg[[i]] <- netAnalysis_signalingRole_scatter(object.list[[i]], title = names(object.list)[i], weight.MinMax = weight.MinMax) +} +patchwork::wrap_plots(plots = gg) +``` + + +```{r identify signaling changes, eval = cellchat_rejection_ran, fig.width = 12, fig.height = 12} +gg1 <- netAnalysis_signalingChanges_scatter(cellchat_merged, idents.use = "Vascular_EC") +gg2 <- netAnalysis_signalingChanges_scatter(cellchat_merged, idents.use = "Lymphatic_EC") +gg3 <- netAnalysis_signalingChanges_scatter(cellchat_merged, idents.use = "Pericyte") +patchwork::wrap_plots(plots = list(gg1,gg2,gg3), nrow = 3, ncol = 1) + +``` + +## Cluster Altered Signaling Interactions + +From the CellChat documentation: "CellChat performs joint manifold learning and classification of the inferred communication networks based on their functional and topological similarity across different conditions. + +By quantifying the similarity between the cellular communication networks of signaling pathways across conditions, this analysis highlights the potentially altered signaling pathways. CellChat adopts the concept of network rewiring from network biology and hypothesized that the difference between different communication networks may affect biological processes across conditions. UMAP is used for visualizing signaling relationship and interpreting our signaling outputs in an intuitive way without involving the classification of conditions. + +Functional similarity: High degree of functional similarity indicates major senders and receivers are similar, and it can be interpreted as the two signaling pathways or two ligand-receptor pairs exhibit similar and/or redundant roles. + +Structural similarity: A structural similarity was used to compare their signaling network structure, without considering the similarity of senders and receivers." + + +### Based on Functional Similarity + +```{r identify signaling groups functional, eval = cellchat_rejection_ran} + +cellchat_merged <- computeNetSimilarityPairwise(cellchat_merged, type = "functional") +cellchat_merged <- netEmbedding(cellchat_merged, type = "functional") +cellchat_merged <- netClustering(cellchat_merged, type = "functional") +netVisual_embeddingPairwise(cellchat_merged, type = "functional", label.size = 3.5) + +``` + +### Based on Structural Similarity + +```{r identify signaling groups structural, eval = cellchat_rejection_ran} +cellchat_merged <- computeNetSimilarityPairwise(cellchat_merged, type = "structural") +cellchat_merged <- netEmbedding(cellchat_merged, type = "structural") +cellchat_merged <- netClustering(cellchat_merged, type = "structural") +netVisual_embeddingPairwise(cellchat_merged, type = "structural", label.size = 3.5) +``` + +## Compare Overall Signaling Information Flow + +"CellChat can identify the conserved and context-specific signaling pathways by simply comparing the information flow for each signaling pathway, which is defined by the sum of communication probability among all pairs of cell groups in the inferred network (i.e., the total weights in the network)." + +```{r info flow, fig.height = 9, eval = cellchat_rejection_ran} + +rankNet(cellchat_merged, mode = "comparison", measure = "weight", sources.use = NULL, targets.use = NULL, stacked = F, do.stat = TRUE) + +``` + +## Compare Signaling Patterns Across Cell Populations + +"In this heatmap, colobar represents the relative signaling strength of a signaling pathway across cell groups (Note that values are row-scaled). The top colored bar plot shows the total signaling strength of a cell group by summarizing all signaling pathways displayed in the heatmap. The right grey bar plot shows the total signaling strength of a signaling pathway by summarizing all cell groups displayed in the heatmap." + + +```{r outgoing signaling, fig.height = 9, eval = cellchat_rejection_ran} + +i = 1 +pathway.union <- union(object.list[[i]]@netP$pathways, object.list[[i+1]]@netP$pathways) +ht1 = netAnalysis_signalingRole_heatmap(object.list[[i]], pattern = "outgoing", signaling = pathway.union, title = names(object.list)[i], width = 5, height = 16, cluster.cols = T) +ht2 = netAnalysis_signalingRole_heatmap(object.list[[i+1]], pattern = "outgoing", signaling = pathway.union, title = names(object.list)[i+1], width = 5, height = 16, cluster.cols = T) +draw(ht1 + ht2, ht_gap = unit(0.5, "cm")) +``` + +```{r incoming signaling, fig.height = 9, eval = cellchat_rejection_ran} +ht1 = netAnalysis_signalingRole_heatmap(object.list[[i]], pattern = "incoming", signaling = pathway.union, title = names(object.list)[i], width = 5, height = 16, cluster.cols = T) +ht2 = netAnalysis_signalingRole_heatmap(object.list[[i+1]], pattern = "incoming", signaling = pathway.union, title = names(object.list)[i+1], width = 5, height = 16, cluster.cols = T) +draw(ht1 + ht2, ht_gap = unit(0.5, "cm")) +``` + +## Identify Dysfunctional Interaction Signaling Using Communication Probabilities + +"CellChat can identify the up-regulated (increased) and down-regulated (decreased) signaling ligand-receptor pairs in one dataset compared to the other dataset by comparing the communication probability between two datasets for each L-R pair and each pair of cell groups" + +```{r compare signaling, fig.height = 12, fig.width = 8, eval = cellchat_rejection_ran} + +gg1 <- netVisual_bubble(cellchat_merged, + # sources.use = c('Vascular_EC', 'Lymphatic_EC', 'Pericyte'), + # targets.use = c('Vascular_EC', 'Lymphatic_EC', 'Pericyte'), + comparison = c(1, 2), + max.dataset = 2, + title.name = "Increased signaling in Grade 2", + angle.x = 45, + remove.isolate = T) +gg1 +signaling.grade2_increased = gg1$data diff --git a/inst/rmarkdown/templates/cosmx/skeleton/QC/QC.Rmd b/inst/templates/spatial/cosmx/QC/QC.Rmd similarity index 93% rename from inst/rmarkdown/templates/cosmx/skeleton/QC/QC.Rmd rename to inst/templates/spatial/cosmx/QC/QC.Rmd index 4271822..bff6628 100644 --- a/inst/rmarkdown/templates/cosmx/skeleton/QC/QC.Rmd +++ b/inst/templates/spatial/cosmx/QC/QC.Rmd @@ -26,6 +26,20 @@ params: umap_dim: approximateumap_8c6f278e.b9f4.4535.aeca.8955c1dff614_1 --- +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-draft-grey) revision. + ```{r load_params, echo = F} source(params$project_file) ``` @@ -75,7 +89,6 @@ sanitize_datatable = function(df, ...) { - Analyst: `r analyst` - Experiment: `r experiment` - Aim: `r aim` -- Sample: `r params$seurat_fn` ```{r read rds} @@ -314,4 +327,4 @@ rownames(seurat_filtered@meta.data) <- seurat_filtered@meta.data$cell_id ImageDimPlot(seurat_filtered, group.by = 'cell_type', axes = TRUE, crop = TRUE) -``` \ No newline at end of file +``` diff --git a/inst/rmarkdown/templates/cosmx/skeleton/QC/run_markdown.R b/inst/templates/spatial/cosmx/QC/run_markdown.R similarity index 100% rename from inst/rmarkdown/templates/cosmx/skeleton/QC/run_markdown.R rename to inst/templates/spatial/cosmx/QC/run_markdown.R diff --git a/inst/templates/spatial/cosmx/information.R b/inst/templates/spatial/cosmx/information.R new file mode 100644 index 0000000..6e15eef --- /dev/null +++ b/inst/templates/spatial/cosmx/information.R @@ -0,0 +1,6 @@ +# info params +project = "name_hbcXXXXX" +PI = 'person name' +experiment = 'short description' +aim = 'short description' +analyst = 'person in the core' diff --git a/inst/templates/spatial/cosmx/readme.md b/inst/templates/spatial/cosmx/readme.md new file mode 100644 index 0000000..637c16b --- /dev/null +++ b/inst/templates/spatial/cosmx/readme.md @@ -0,0 +1,16 @@ +# Guidelines for analysis + +Make sure there is a valid project name, and modify `information.R` with the right information for your project. You can use this file with any other Rmd to include the project/analysis information. + +## QC + +`QC/QC.Rmd` is a template for QC metrics. It plots the locations of cells on the slide, filters cells using the number of genes and AtoMX quality flags, and normalizes the data. It also provides sample code for clustering and cell type identification. + +Read instruction in the R and Rmd scripts to render it. + +Note that future versions of this template will include code for building your own RDS object out of .csv.gz files produced by the AtoMX software instead of loading an RDS directly, as this allows the analyst to access information about transcript locations and cell sementation that is not available in the pre-made RDS objects from AtoMx or BWH. + +## DropBox + +- In `reports/QC` + - [ ] copy QC `Rmd/R/html/figures` diff --git a/man/bcbio_nfcore_check.Rd b/man/bcbio_nfcore_check.Rd index 76c954f..022c903 100644 --- a/man/bcbio_nfcore_check.Rd +++ b/man/bcbio_nfcore_check.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hello.R +% Please edit documentation in R/helpers.R \name{bcbio_nfcore_check} \alias{bcbio_nfcore_check} \title{Function to check samplesheet for nf-core} diff --git a/man/bcbio_set_project.Rd b/man/bcbio_set_project.Rd deleted file mode 100644 index b3c67ef..0000000 --- a/man/bcbio_set_project.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hello.R -\name{bcbio_set_project} -\alias{bcbio_set_project} -\title{Function to help with project name used for parent folder} -\usage{ -bcbio_set_project() -} -\value{ -A string list with hbc_code, and project folder name -} -\description{ -This function will ask for user input about: -\itemize{ -\item numeric code -\item PI full name -\item technology -\item tissue -\item organism -\item project description -} -} -\details{ -It removes special character with \verb{_}. The output is a guideline to -what the folder used can be. -} diff --git a/man/bcbio_templates.Rd b/man/bcbio_templates.Rd index 45c9ad9..e5e39e3 100644 --- a/man/bcbio_templates.Rd +++ b/man/bcbio_templates.Rd @@ -1,21 +1,17 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hello.R +% Please edit documentation in R/helpers.R \name{bcbio_templates} \alias{bcbio_templates} \title{Function to help deploy analysis folder inside a project folder} \usage{ -bcbio_templates(type = "rnaseq", outpath) +bcbio_templates(type = "rnaseq", outpath = NULL, org = NULL) } \arguments{ -\item{type}{string indicating the type of analysis, supported: -\itemize{ -\item base -\item rnaseq, scrnaseq, -\item teaseq -\item cosmx -}} +\item{type}{string indicating the type of analysis, supported: rnaseq.} \item{outpath}{string path indicating where to copy all the files to} + +\item{org}{string with the organization name. To deploy specific files.} } \description{ This function contains Rmd, R, md, files that help to structure @@ -29,6 +25,8 @@ project folder. } \examples{ \dontrun{ - bcbio_templates("rnaseq", "path_to_projects/project1/reports") + path <- withr::local_tempdir() + bcbio_templates(type="base",outpath=path) + fs::dir_ls(path,all=T) } } diff --git a/man/cb_friendly_pal.Rd b/man/cb_friendly_pal.Rd index d1d9181..09215b1 100644 --- a/man/cb_friendly_pal.Rd +++ b/man/cb_friendly_pal.Rd @@ -7,7 +7,7 @@ cb_friendly_pal(palette = "main", reverse = F, ...) } \arguments{ -\item{palette}{name of the palette to be returned} +\item{palette}{name of the palette to be returned (main, cool, hot, grey, white_to_blue, or heatmap)} \item{reverse}{boolean, reverse order of colors in palette} diff --git a/man/cb_friendly_palettes.Rd b/man/cb_friendly_palettes.Rd index 88ae911..d8bb831 100644 --- a/man/cb_friendly_palettes.Rd +++ b/man/cb_friendly_palettes.Rd @@ -5,7 +5,7 @@ \alias{cb_friendly_palettes} \title{define main colorblind-friendly palette as well as sub-palettes} \format{ -An object of class \code{list} of length 5. +An object of class \code{list} of length 6. } \usage{ cb_friendly_palettes diff --git a/man/scale_color_cb_friendly.Rd b/man/scale_color_cb_friendly.Rd index a89f1f8..34ab816 100644 --- a/man/scale_color_cb_friendly.Rd +++ b/man/scale_color_cb_friendly.Rd @@ -12,7 +12,7 @@ scale_color_cb_friendly( ) } \arguments{ -\item{palette}{name of the palette to be returned} +\item{palette}{name of the palette to be returned (main, cool, hot, grey, white_to_blue, or heatmap)} \item{discrete}{boolean, whether to make palette discretely divided into colors or continuous} diff --git a/man/scale_fill_cb_friendly.Rd b/man/scale_fill_cb_friendly.Rd index b48fc73..89ecf35 100644 --- a/man/scale_fill_cb_friendly.Rd +++ b/man/scale_fill_cb_friendly.Rd @@ -7,7 +7,7 @@ scale_fill_cb_friendly(palette = "main", discrete = TRUE, reverse = FALSE, ...) } \arguments{ -\item{palette}{name of the palette to be returned} +\item{palette}{name of the palette to be returned (main, cool, hot, grey, white_to_blue, or heatmap)} \item{discrete}{boolean, whether to make palette discretely divided into colors or continuous} diff --git a/tests/testthat/rnaseq.R b/tests/testthat/rnaseq.R index e30ebea..42dfd1f 100644 --- a/tests/testthat/rnaseq.R +++ b/tests/testthat/rnaseq.R @@ -1,29 +1,37 @@ library(bcbioR) -test_that("rnaseq testing", { +test_that("rnaseq deg",{ path <- withr::local_tempdir() print(path) bcbio_templates(type="rnaseq", outpath=path) - numerator="tumor" - denominator="normal" - subset_value=NA + fs::dir_ls(path,all=T) rmarkdown::render(input = file.path(path,"DE/DEG.Rmd"), output_dir = file.path(path,"DE"), output_format = "html_document", - output_file = ifelse(!is.na(subset_value), - paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), - paste0('DE_', numerator, '_vs_', denominator, '.html') - ), + clean = TRUE + # envir = new.env(), + ) + + # browseURL(file.path(path, "DE/DE_tumor_vs_normal.html")) + # usethis::proj_activate(path) +}) + +test_that("rnaseq qc",{ + path <- withr::local_tempdir() + print(path) + bcbio_templates(type="rnaseq", outpath=path) + fs::dir_ls(path,all=T) + rmarkdown::render(input = file.path(path,"QC/QC_nf-core.Rmd"), + output_dir = file.path(path,"QC"), + output_format = "html_document", clean = TRUE, - envir = new.env(), params = list( - subset_value = subset_value, - numerator = numerator, - denominator = denominator, - params_file = file.path(path,'DE/params_de-example.R'), + params_file = file.path(path,'QC/params_qc_nf-core-example.R'), project_file = file.path(path,'information.R'), - functions_file = file.path(path,'DE/load_data.R') + functions_file = file.path(path,'libs/load_data.R') ) ) + # browseURL(file.path(path, "QC/QC_nf-core.html")) + # usethis::proj_activate(path) }) diff --git a/tests/testthat/test-deploy.R b/tests/testthat/test-deploy.R new file mode 100644 index 0000000..049480a --- /dev/null +++ b/tests/testthat/test-deploy.R @@ -0,0 +1,75 @@ +library(bcbioR) + + +test_that("scrnaseq",{ + path <- withr::local_tempdir() + print(path) + copy_templates(path, "singlecell") + expect_length(fs::dir_ls(path,all=T),8) + expect_true(grepl("scRNAseq_qc_app", + fs::dir_ls(file.path(path, "apps"), recurse=T, all=T)[2])) +}) + +test_that("base copy",{ + path <- withr::local_tempdir() + print(path) + bcbio_templates(type="base", outpath=path) + expect_length(fs::dir_ls(path,all=T),10) + expect_true(file.exists(file.path(path,".gitignore"))) +}) + +test_that("rnaseq copy",{ + path <- withr::local_tempdir() + print(path) + bcbio_templates(type="rnaseq", outpath=path) + expect_length(fs::dir_ls(path,all=T),6) + # numerator="tumor" + # denominator="normal" + # subset_value=NA + # rmarkdown::render(input = file.path(path,"DE/DEG.Rmd"), + # output_dir = file.path(path,"DE"), + # output_format = "html_document", + # output_file = ifelse(!is.na(subset_value), + # paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), + # paste0('DE_', numerator, '_vs_', denominator, '.html') + # ), + # clean = TRUE, + # envir = new.env(), + # params = list( + # subset_value = subset_value, + # numerator = numerator, + # denominator = denominator, + # params_file = file.path(path,'DE/params_de-example.R'), + # project_file = file.path(path,'information.R'), + # functions_file = file.path(path,'DE/load_data.R') + # ) + # ) + # use_bcbio_projects(path, nfcore="nf-core/rnaseq", copy=TRUE, git=FALSE) +}) + +# test_that("rnaseq testing", { +# path <- withr::local_tempdir() +# print(path) +# bcbio_templates(type="rnaseq", outpath=path) +# numerator="tumor" +# denominator="normal" +# subset_value=NA +# rmarkdown::render(input = file.path(path,"DE/DEG.Rmd"), +# output_dir = file.path(path,"DE"), +# output_format = "html_document", +# output_file = ifelse(!is.na(subset_value), +# paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), +# paste0('DE_', numerator, '_vs_', denominator, '.html') +# ), +# clean = TRUE, +# envir = new.env(), +# params = list( +# subset_value = subset_value, +# numerator = numerator, +# denominator = denominator, +# params_file = file.path(path,'DE/params_de.R'), +# project_file = file.path(path,'information.R'), +# functions_file = file.path(path,'DE/load_data.R') +# ) +# ) +# }) diff --git a/tests/testthat/misc.R b/tests/testthat/test-misc.R similarity index 100% rename from tests/testthat/misc.R rename to tests/testthat/test-misc.R diff --git a/vignettes/bcbioR_quick_start.Rmd b/vignettes/bcbioR_quick_start.Rmd index a05670a..002003f 100644 --- a/vignettes/bcbioR_quick_start.Rmd +++ b/vignettes/bcbioR_quick_start.Rmd @@ -64,8 +64,7 @@ We support multiple analyses type: - RNAseq - scRNAseq -- TEAseq -- COSMX +- ChipPseq To get the example code for any of them you can use a similar command: