Added short script to collate deseq_results.tsv from all the differen…

…t analysis On branch deseq-collate Changes to be committed: new file: scripts/collate_deseq_results.R modified: snakefile.py
BIMSBbioinfo · Feb 21, 2024 · 76fdc77 · 76fdc77
1 parent 5689530
commit 76fdc77
Show file tree

Hide file tree

Showing 2 changed files with 162 additions and 11 deletions.
diff --git a/scripts/collate_deseq_results.R b/scripts/collate_deseq_results.R
@@ -0,0 +1,89 @@
+# PiGx RNAseq Pipeline.
+#
+# Copyright © 2019 Bora Uyar <[email protected]>
+#
+# This file is part of the PiGx RNAseq Pipeline.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+
+
+# Collate DESeq results into one big matrix
+
+args <- commandArgs(trailingOnly = TRUE)
+mapper <- args[1]
+inpDir <- args[2]
+outDir <- args[3]
+
+if (mapper=="hisat2"||mapper=="star") {
+  mapper <- ""
+}
+
+strpat <- paste0(mapper, ".deseq_results.tsv$")
+filenames <- dir(inpDir, pattern = strpat, full.names = TRUE)
+
+#filenames <- list.files(pattern=".deseq_results.tsv$")
+
+all_files <- lapply(filenames, function(x) {
+  file <- read.table(x)
+  sub_file <- subset(file, select = c(0,2,6))
+  # Get the start of filename prefix 
+  prefix = sub(".deseq_results.tsv", "", x)
+  inpstr = paste(inpDir, "/", sep = "")
+  prefix = sub(inpstr, "", prefix)
+  colnames(sub_file) <- paste(prefix, colnames(sub_file), sep='_')
+  return(sub_file)
+})
+
+
+multimerge <- function (mylist) {
+  ## mimics a recursive merge or full outer join
+
+  unames <- unique(unlist(lapply(mylist, rownames)))
+
+  n <- length(unames)
+
+  out <- lapply(mylist, function(df) {
+
+    tmp <- matrix(nr = n, nc = ncol(df), dimnames = list(unames,colnames(df)))
+    tmp[rownames(df), ] <- as.matrix(df)
+    rm(df); gc()
+
+    return(tmp)
+  })
+
+  stopifnot( all( sapply(out, function(x) identical(rownames(x), unames)) ) )
+
+  bigout <- do.call(cbind, out)
+  colnames(bigout) <- paste(rep(names(mylist), sapply(mylist, ncol)), unlist(sapply(mylist, colnames)), sep = "")
+  return(bigout)
+}
+
+
+collated_dataframe <- multimerge(all_files)
+
+
+# save results to out file
+if (mapper=="genes"||mapper=="transcripts") {
+  mapper <- paste0(".", mapper)
+}
+finalname <- paste("collated", mapper, ".deseq_results.tsv", sep="")
+
+collatedFile <- file.path(outDir, finalname)
+
+write.table(x = as.data.frame(collated_dataframe), file = collatedFile, 
+            quote = FALSE, sep = '\t')
+
+#write.table(collated_dataframe, file ="collated.deseq_results.tsv", quote = FALSE, sep = '\t')
diff --git a/snakefile.py b/snakefile.py
@@ -135,15 +135,18 @@ def lookup(column, predicate, fields=[]):
     'final-report': {
         'description': "Produce a comprehensive report.  This is the default target.",
         'files':
-        [os.path.join(OUTPUT_DIR, "input_annotation_stats.tsv"), 
+        [os.path.join(OUTPUT_DIR, "input_annotation_stats.tsv"),
          os.path.join(MULTIQC_DIR, 'multiqc_report.html'),
          os.path.join(COUNTS_DIR, "raw_counts", "salmon", "counts_from_SALMON.transcripts.tsv"),
          os.path.join(COUNTS_DIR, "raw_counts", "salmon", "counts_from_SALMON.genes.tsv"),
          os.path.join(COUNTS_DIR, "normalized", "salmon", "TPM_counts_from_SALMON.transcripts.tsv"),
          os.path.join(COUNTS_DIR, "normalized", "salmon", "TPM_counts_from_SALMON.genes.tsv"),
          os.path.join(COUNTS_DIR, "raw_counts", MAPPER, "counts.tsv"),
          os.path.join(COUNTS_DIR, "normalized", MAPPER, "deseq_normalized_counts.tsv"),
-         os.path.join(COUNTS_DIR, "normalized", MAPPER, "deseq_size_factors.txt")] +
+         os.path.join(COUNTS_DIR, "normalized", MAPPER, "deseq_size_factors.txt"),
+         os.path.join(OUTPUT_DIR, "report", MAPPER, "collated.deseq_results.tsv"),
+         os.path.join(OUTPUT_DIR, "report", 'salmon', "collated.transcripts.deseq_results.tsv"),
+         os.path.join(OUTPUT_DIR, "report", 'salmon', "collated.genes.deseq_results.tsv")] +
         BIGWIG_OUTPUT +
         expand(os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) +
         expand(os.path.join(OUTPUT_DIR, "report", 'salmon', '{analysis}.salmon.transcripts.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) +
@@ -152,22 +155,26 @@ def lookup(column, predicate, fields=[]):
     'deseq_report_star': {
         'description': "Produce one HTML report for each analysis based on STAR results.",
         'files':
-          expand(os.path.join(OUTPUT_DIR, "report", 'star', '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys())
+          expand(os.path.join(OUTPUT_DIR, "report", 'star', '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) +
+          [os.path.join(OUTPUT_DIR, "report", 'star', "collated.deseq_results.tsv")]
     },
     'deseq_report_hisat2': {
         'description': "Produce one HTML report for each analysis based on Hisat2 results.",
         'files':
-          expand(os.path.join(OUTPUT_DIR, "report", 'hisat2', '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys())
+          expand(os.path.join(OUTPUT_DIR, "report", 'hisat2', '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) +
+          [os.path.join(OUTPUT_DIR, "report", 'hisat2', "collated.deseq_results.tsv")]
     },
     'deseq_report_salmon_transcripts': {
         'description': "Produce one HTML report for each analysis based on SALMON results at transcript level.",
         'files':
-          expand(os.path.join(OUTPUT_DIR, "report", 'salmon', '{analysis}.salmon.transcripts.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys())
+          expand(os.path.join(OUTPUT_DIR, "report", 'salmon', '{analysis}.salmon.transcripts.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) +
+          [os.path.join(OUTPUT_DIR, "report", 'salmon', "collated.transcripts.deseq_results.tsv")]
     },
     'deseq_report_salmon_genes': {
         'description': "Produce one HTML report for each analysis based on SALMON results at gene level.",
         'files':
-          expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys())
+          expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) +
+          [os.path.join(OUTPUT_DIR, "report", 'salmon', "collated.genes.deseq_results.tsv")]
     },
     'star_map' : {
         'description': "Produce a STAR mapping results in BAM file format.",
@@ -231,7 +238,8 @@ def lookup(column, predicate, fields=[]):
 OUTPUT_FILES.append(os.path.join(OUTPUT_DIR, "annotations.tgz"))
 
 rule all:
-  input: OUTPUT_FILES
+  input:
+    OUTPUT_FILES,
 
 rule check_annotation_files:
   input: 
@@ -561,7 +569,7 @@ def hisat2_file_arguments(args):
     "{RSCRIPT_EXEC} {params.script} {params.mapped_dir} {input.colDataFile} {output} >> {log} 2>&1"
 
 # create a normalized counts table including all samples
-# using the median-of-ratios normalization procedure of
+# using the median-of-ratios normalization procedure ofcollate_deseq_results.R
 # deseq2
 rule norm_counts_deseq:
     input:
@@ -596,12 +604,30 @@ def hisat2_file_arguments(args):
     logo = LOGO
   log: os.path.join(LOG_DIR, MAPPER, "{analysis}.report.log")
   output:
-    os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq.report.html')
+    os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq.report.html'),
+    os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq_results.tsv')
   resources:
     mem_mb = config['execution']['rules']['reports']['memory']
   shell:
     "{RSCRIPT_EXEC} {params.reportR} --logo={params.logo} --prefix='{wildcards.analysis}' --reportFile={params.reportRmd} --countDataFile={input.counts} --colDataFile={input.coldata} --gtfFile={GTF_FILE} --caseSampleGroups='{params.case}' --controlSampleGroups='{params.control}' --covariates='{params.covariates}'  --workdir={params.outdir} --organism='{ORGANISM}' --description='{params.description}' --selfContained='{params.selfContained}' >> {log} 2>&1"
 
+rule deseq_collate_report1:
+  input:
+    html_reports=expand(os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()),
+    deseq_results=expand(os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq_results.tsv'), analysis = DE_ANALYSIS_LIST.keys())
+  params:
+    mapper=MAPPER,
+    outdir=os.path.join(OUTPUT_DIR, "report", MAPPER),
+    inpdir=os.path.join(OUTPUT_DIR, "report", MAPPER),
+    script=os.path.join(SCRIPTS_DIR, "collate_deseq_results.R"),
+  log: os.path.join(LOG_DIR, MAPPER, "collate_deseq.report.log")
+  output:
+    os.path.join(OUTPUT_DIR, "report", MAPPER, 'collated.deseq_results.tsv')
+  resources:
+    mem_mb = config['execution']['rules']['reports']['memory']
+  shell:
+    "{RSCRIPT_EXEC} {params.script} {params.mapper} {params.inpdir} {params.outdir} >> {log} 2>&1"
+
 rule report2:
   input:
     counts=os.path.join(COUNTS_DIR, "raw_counts", "salmon", "counts_from_SALMON.transcripts.tsv"),
@@ -618,11 +644,29 @@ def hisat2_file_arguments(args):
     logo = LOGO
   log: os.path.join(LOG_DIR, "salmon", "{analysis}.report.salmon.transcripts.log")
   output:
-    os.path.join(OUTPUT_DIR, "report", 'salmon', '{analysis}.salmon.transcripts.deseq.report.html')
+    os.path.join(OUTPUT_DIR, "report", 'salmon', '{analysis}.salmon.transcripts.deseq.report.html'),
+    os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.transcripts.deseq_results.tsv')
   resources:
     mem_mb = config['execution']['rules']['reports']['memory']
   shell: "{RSCRIPT_EXEC} {params.reportR} --logo={params.logo} --prefix='{wildcards.analysis}.salmon.transcripts' --reportFile={params.reportRmd} --countDataFile={input.counts} --colDataFile={input.coldata} --gtfFile={GTF_FILE} --caseSampleGroups='{params.case}' --controlSampleGroups='{params.control}' --covariates='{params.covariates}' --workdir={params.outdir} --organism='{ORGANISM}' --description='{params.description}' --selfContained='{params.selfContained}' >> {log} 2>&1"
 
+rule deseq_collate_report2:
+  input:
+    html_reports=expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.transcripts.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()),
+    deseq_results=expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.transcripts.deseq_results.tsv'), analysis = DE_ANALYSIS_LIST.keys())
+  params:
+    mapper="transcripts",
+    outdir=os.path.join(OUTPUT_DIR, "report", 'salmon'),
+    inpdir=os.path.join(OUTPUT_DIR, "report", 'salmon'),
+    script=os.path.join(SCRIPTS_DIR, "collate_deseq_results.R"),
+  log: os.path.join(LOG_DIR, "salmon", "collate_transcripts_deseq.report.log")
+  output:
+    os.path.join(OUTPUT_DIR, "report", 'salmon', 'collated.transcripts.deseq_results.tsv')
+  resources:
+    mem_mb = config['execution']['rules']['reports']['memory']
+  shell:
+    "{RSCRIPT_EXEC} {params.script} {params.mapper} {params.inpdir} {params.outdir} >> {log} 2>&1"
+
 rule report3:
   input:
     counts=os.path.join(COUNTS_DIR, "raw_counts", "salmon", "counts_from_SALMON.genes.tsv"),
@@ -639,7 +683,25 @@ def hisat2_file_arguments(args):
     logo = LOGO
   log: os.path.join(LOG_DIR, "salmon", "{analysis}.report.salmon.genes.log")
   output:
-    os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq.report.html')
+    os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq.report.html'),
+    os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq_results.tsv')
   resources:
     mem_mb = config['execution']['rules']['reports']['memory']
   shell: "{RSCRIPT_EXEC} {params.reportR} --logo={params.logo} --prefix='{wildcards.analysis}.salmon.genes' --reportFile={params.reportRmd} --countDataFile={input.counts} --colDataFile={input.coldata} --gtfFile={GTF_FILE} --caseSampleGroups='{params.case}' --controlSampleGroups='{params.control}' --covariates='{params.covariates}' --workdir={params.outdir} --organism='{ORGANISM}' --description='{params.description}' --selfContained='{params.selfContained}' >> {log} 2>&1"
+
+rule deseq_collate_report3:
+  input:
+    html_reports=expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()),
+    deseq_results=expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq_results.tsv'), analysis = DE_ANALYSIS_LIST.keys())
+  params:
+    mapper="genes",
+    outdir=os.path.join(OUTPUT_DIR, "report", 'salmon'),
+    inpdir=os.path.join(OUTPUT_DIR, "report", 'salmon'),
+    script=os.path.join(SCRIPTS_DIR, "collate_deseq_results.R"),
+  log: os.path.join(LOG_DIR, "salmon", "collate_genes_deseq.report.log")
+  output:
+    os.path.join(OUTPUT_DIR, "report", 'salmon', 'collated.genes.deseq_results.tsv')
+  resources:
+    mem_mb = config['execution']['rules']['reports']['memory']
+  shell:
+    "{RSCRIPT_EXEC} {params.script} {params.mapper} {params.inpdir} {params.outdir} >> {log} 2>&1"