diff --git a/scripts/collate_deseq_results.R b/scripts/collate_deseq_results.R new file mode 100644 index 0000000..2ba5025 --- /dev/null +++ b/scripts/collate_deseq_results.R @@ -0,0 +1,89 @@ +# PiGx RNAseq Pipeline. +# +# Copyright © 2019 Bora Uyar +# +# This file is part of the PiGx RNAseq Pipeline. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + + + +# Collate DESeq results into one big matrix + +args <- commandArgs(trailingOnly = TRUE) +mapper <- args[1] +inpDir <- args[2] +outDir <- args[3] + +if (mapper=="hisat2"||mapper=="star") { + mapper <- "" +} + +strpat <- paste0(mapper, ".deseq_results.tsv$") +filenames <- dir(inpDir, pattern = strpat, full.names = TRUE) + +#filenames <- list.files(pattern=".deseq_results.tsv$") + +all_files <- lapply(filenames, function(x) { + file <- read.table(x) + sub_file <- subset(file, select = c(0,2,6)) + # Get the start of filename prefix + prefix = sub(".deseq_results.tsv", "", x) + inpstr = paste(inpDir, "/", sep = "") + prefix = sub(inpstr, "", prefix) + colnames(sub_file) <- paste(prefix, colnames(sub_file), sep='_') + return(sub_file) +}) + + +multimerge <- function (mylist) { + ## mimics a recursive merge or full outer join + + unames <- unique(unlist(lapply(mylist, rownames))) + + n <- length(unames) + + out <- lapply(mylist, function(df) { + + tmp <- matrix(nr = n, nc = ncol(df), dimnames = list(unames,colnames(df))) + tmp[rownames(df), ] <- as.matrix(df) + rm(df); gc() + + return(tmp) + }) + + stopifnot( all( sapply(out, function(x) identical(rownames(x), unames)) ) ) + + bigout <- do.call(cbind, out) + colnames(bigout) <- paste(rep(names(mylist), sapply(mylist, ncol)), unlist(sapply(mylist, colnames)), sep = "") + return(bigout) +} + + +collated_dataframe <- multimerge(all_files) + + +# save results to out file +if (mapper=="genes"||mapper=="transcripts") { + mapper <- paste0(".", mapper) +} +finalname <- paste("collated", mapper, ".deseq_results.tsv", sep="") + +collatedFile <- file.path(outDir, finalname) + +write.table(x = as.data.frame(collated_dataframe), file = collatedFile, + quote = FALSE, sep = '\t') + +#write.table(collated_dataframe, file ="collated.deseq_results.tsv", quote = FALSE, sep = '\t') diff --git a/snakefile.py b/snakefile.py old mode 100755 new mode 100644 index 4aa2a1b..43e6b5d --- a/snakefile.py +++ b/snakefile.py @@ -135,7 +135,7 @@ def lookup(column, predicate, fields=[]): 'final-report': { 'description': "Produce a comprehensive report. This is the default target.", 'files': - [os.path.join(OUTPUT_DIR, "input_annotation_stats.tsv"), + [os.path.join(OUTPUT_DIR, "input_annotation_stats.tsv"), os.path.join(MULTIQC_DIR, 'multiqc_report.html'), os.path.join(COUNTS_DIR, "raw_counts", "salmon", "counts_from_SALMON.transcripts.tsv"), os.path.join(COUNTS_DIR, "raw_counts", "salmon", "counts_from_SALMON.genes.tsv"), @@ -143,7 +143,10 @@ def lookup(column, predicate, fields=[]): os.path.join(COUNTS_DIR, "normalized", "salmon", "TPM_counts_from_SALMON.genes.tsv"), os.path.join(COUNTS_DIR, "raw_counts", MAPPER, "counts.tsv"), os.path.join(COUNTS_DIR, "normalized", MAPPER, "deseq_normalized_counts.tsv"), - os.path.join(COUNTS_DIR, "normalized", MAPPER, "deseq_size_factors.txt")] + + os.path.join(COUNTS_DIR, "normalized", MAPPER, "deseq_size_factors.txt"), + os.path.join(OUTPUT_DIR, "report", MAPPER, "collated.deseq_results.tsv"), + os.path.join(OUTPUT_DIR, "report", 'salmon', "collated.transcripts.deseq_results.tsv"), + os.path.join(OUTPUT_DIR, "report", 'salmon', "collated.genes.deseq_results.tsv")] + BIGWIG_OUTPUT + expand(os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) + expand(os.path.join(OUTPUT_DIR, "report", 'salmon', '{analysis}.salmon.transcripts.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) + @@ -152,22 +155,26 @@ def lookup(column, predicate, fields=[]): 'deseq_report_star': { 'description': "Produce one HTML report for each analysis based on STAR results.", 'files': - expand(os.path.join(OUTPUT_DIR, "report", 'star', '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) + expand(os.path.join(OUTPUT_DIR, "report", 'star', '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) + + [os.path.join(OUTPUT_DIR, "report", 'star', "collated.deseq_results.tsv")] }, 'deseq_report_hisat2': { 'description': "Produce one HTML report for each analysis based on Hisat2 results.", 'files': - expand(os.path.join(OUTPUT_DIR, "report", 'hisat2', '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) + expand(os.path.join(OUTPUT_DIR, "report", 'hisat2', '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) + + [os.path.join(OUTPUT_DIR, "report", 'hisat2', "collated.deseq_results.tsv")] }, 'deseq_report_salmon_transcripts': { 'description': "Produce one HTML report for each analysis based on SALMON results at transcript level.", 'files': - expand(os.path.join(OUTPUT_DIR, "report", 'salmon', '{analysis}.salmon.transcripts.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) + expand(os.path.join(OUTPUT_DIR, "report", 'salmon', '{analysis}.salmon.transcripts.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) + + [os.path.join(OUTPUT_DIR, "report", 'salmon', "collated.transcripts.deseq_results.tsv")] }, 'deseq_report_salmon_genes': { 'description': "Produce one HTML report for each analysis based on SALMON results at gene level.", 'files': - expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) + expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) + + [os.path.join(OUTPUT_DIR, "report", 'salmon', "collated.genes.deseq_results.tsv")] }, 'star_map' : { 'description': "Produce a STAR mapping results in BAM file format.", @@ -231,7 +238,8 @@ def lookup(column, predicate, fields=[]): OUTPUT_FILES.append(os.path.join(OUTPUT_DIR, "annotations.tgz")) rule all: - input: OUTPUT_FILES + input: + OUTPUT_FILES, rule check_annotation_files: input: @@ -561,7 +569,7 @@ def hisat2_file_arguments(args): "{RSCRIPT_EXEC} {params.script} {params.mapped_dir} {input.colDataFile} {output} >> {log} 2>&1" # create a normalized counts table including all samples -# using the median-of-ratios normalization procedure of +# using the median-of-ratios normalization procedure ofcollate_deseq_results.R # deseq2 rule norm_counts_deseq: input: @@ -596,12 +604,30 @@ def hisat2_file_arguments(args): logo = LOGO log: os.path.join(LOG_DIR, MAPPER, "{analysis}.report.log") output: - os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq.report.html') + os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq.report.html'), + os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq_results.tsv') resources: mem_mb = config['execution']['rules']['reports']['memory'] shell: "{RSCRIPT_EXEC} {params.reportR} --logo={params.logo} --prefix='{wildcards.analysis}' --reportFile={params.reportRmd} --countDataFile={input.counts} --colDataFile={input.coldata} --gtfFile={GTF_FILE} --caseSampleGroups='{params.case}' --controlSampleGroups='{params.control}' --covariates='{params.covariates}' --workdir={params.outdir} --organism='{ORGANISM}' --description='{params.description}' --selfContained='{params.selfContained}' >> {log} 2>&1" +rule deseq_collate_report1: + input: + html_reports=expand(os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()), + deseq_results=expand(os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq_results.tsv'), analysis = DE_ANALYSIS_LIST.keys()) + params: + mapper=MAPPER, + outdir=os.path.join(OUTPUT_DIR, "report", MAPPER), + inpdir=os.path.join(OUTPUT_DIR, "report", MAPPER), + script=os.path.join(SCRIPTS_DIR, "collate_deseq_results.R"), + log: os.path.join(LOG_DIR, MAPPER, "collate_deseq.report.log") + output: + os.path.join(OUTPUT_DIR, "report", MAPPER, 'collated.deseq_results.tsv') + resources: + mem_mb = config['execution']['rules']['reports']['memory'] + shell: + "{RSCRIPT_EXEC} {params.script} {params.mapper} {params.inpdir} {params.outdir} >> {log} 2>&1" + rule report2: input: counts=os.path.join(COUNTS_DIR, "raw_counts", "salmon", "counts_from_SALMON.transcripts.tsv"), @@ -618,11 +644,29 @@ def hisat2_file_arguments(args): logo = LOGO log: os.path.join(LOG_DIR, "salmon", "{analysis}.report.salmon.transcripts.log") output: - os.path.join(OUTPUT_DIR, "report", 'salmon', '{analysis}.salmon.transcripts.deseq.report.html') + os.path.join(OUTPUT_DIR, "report", 'salmon', '{analysis}.salmon.transcripts.deseq.report.html'), + os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.transcripts.deseq_results.tsv') resources: mem_mb = config['execution']['rules']['reports']['memory'] shell: "{RSCRIPT_EXEC} {params.reportR} --logo={params.logo} --prefix='{wildcards.analysis}.salmon.transcripts' --reportFile={params.reportRmd} --countDataFile={input.counts} --colDataFile={input.coldata} --gtfFile={GTF_FILE} --caseSampleGroups='{params.case}' --controlSampleGroups='{params.control}' --covariates='{params.covariates}' --workdir={params.outdir} --organism='{ORGANISM}' --description='{params.description}' --selfContained='{params.selfContained}' >> {log} 2>&1" +rule deseq_collate_report2: + input: + html_reports=expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.transcripts.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()), + deseq_results=expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.transcripts.deseq_results.tsv'), analysis = DE_ANALYSIS_LIST.keys()) + params: + mapper="transcripts", + outdir=os.path.join(OUTPUT_DIR, "report", 'salmon'), + inpdir=os.path.join(OUTPUT_DIR, "report", 'salmon'), + script=os.path.join(SCRIPTS_DIR, "collate_deseq_results.R"), + log: os.path.join(LOG_DIR, "salmon", "collate_transcripts_deseq.report.log") + output: + os.path.join(OUTPUT_DIR, "report", 'salmon', 'collated.transcripts.deseq_results.tsv') + resources: + mem_mb = config['execution']['rules']['reports']['memory'] + shell: + "{RSCRIPT_EXEC} {params.script} {params.mapper} {params.inpdir} {params.outdir} >> {log} 2>&1" + rule report3: input: counts=os.path.join(COUNTS_DIR, "raw_counts", "salmon", "counts_from_SALMON.genes.tsv"), @@ -639,7 +683,25 @@ def hisat2_file_arguments(args): logo = LOGO log: os.path.join(LOG_DIR, "salmon", "{analysis}.report.salmon.genes.log") output: - os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq.report.html') + os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq.report.html'), + os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq_results.tsv') resources: mem_mb = config['execution']['rules']['reports']['memory'] shell: "{RSCRIPT_EXEC} {params.reportR} --logo={params.logo} --prefix='{wildcards.analysis}.salmon.genes' --reportFile={params.reportRmd} --countDataFile={input.counts} --colDataFile={input.coldata} --gtfFile={GTF_FILE} --caseSampleGroups='{params.case}' --controlSampleGroups='{params.control}' --covariates='{params.covariates}' --workdir={params.outdir} --organism='{ORGANISM}' --description='{params.description}' --selfContained='{params.selfContained}' >> {log} 2>&1" + +rule deseq_collate_report3: + input: + html_reports=expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()), + deseq_results=expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq_results.tsv'), analysis = DE_ANALYSIS_LIST.keys()) + params: + mapper="genes", + outdir=os.path.join(OUTPUT_DIR, "report", 'salmon'), + inpdir=os.path.join(OUTPUT_DIR, "report", 'salmon'), + script=os.path.join(SCRIPTS_DIR, "collate_deseq_results.R"), + log: os.path.join(LOG_DIR, "salmon", "collate_genes_deseq.report.log") + output: + os.path.join(OUTPUT_DIR, "report", 'salmon', 'collated.genes.deseq_results.tsv') + resources: + mem_mb = config['execution']['rules']['reports']['memory'] + shell: + "{RSCRIPT_EXEC} {params.script} {params.mapper} {params.inpdir} {params.outdir} >> {log} 2>&1"