Skip to content

Commit

Permalink
Added short script to collate deseq_results.tsv from all the differen…
Browse files Browse the repository at this point in the history
…t analysis

On branch deseq-collate
Changes to be committed:
new file:   scripts/collate_deseq_results.R
modified:   snakefile.py
  • Loading branch information
Francesco Santini committed Feb 21, 2024
1 parent 5689530 commit 76fdc77
Show file tree
Hide file tree
Showing 2 changed files with 162 additions and 11 deletions.
89 changes: 89 additions & 0 deletions scripts/collate_deseq_results.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# PiGx RNAseq Pipeline.
#
# Copyright © 2019 Bora Uyar <[email protected]>
#
# This file is part of the PiGx RNAseq Pipeline.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.




# Collate DESeq results into one big matrix

args <- commandArgs(trailingOnly = TRUE)
mapper <- args[1]
inpDir <- args[2]
outDir <- args[3]

if (mapper=="hisat2"||mapper=="star") {
mapper <- ""
}

strpat <- paste0(mapper, ".deseq_results.tsv$")
filenames <- dir(inpDir, pattern = strpat, full.names = TRUE)

#filenames <- list.files(pattern=".deseq_results.tsv$")

all_files <- lapply(filenames, function(x) {
file <- read.table(x)
sub_file <- subset(file, select = c(0,2,6))
# Get the start of filename prefix
prefix = sub(".deseq_results.tsv", "", x)
inpstr = paste(inpDir, "/", sep = "")
prefix = sub(inpstr, "", prefix)
colnames(sub_file) <- paste(prefix, colnames(sub_file), sep='_')
return(sub_file)
})


multimerge <- function (mylist) {
## mimics a recursive merge or full outer join

unames <- unique(unlist(lapply(mylist, rownames)))

n <- length(unames)

out <- lapply(mylist, function(df) {

tmp <- matrix(nr = n, nc = ncol(df), dimnames = list(unames,colnames(df)))
tmp[rownames(df), ] <- as.matrix(df)
rm(df); gc()

return(tmp)
})

stopifnot( all( sapply(out, function(x) identical(rownames(x), unames)) ) )

bigout <- do.call(cbind, out)
colnames(bigout) <- paste(rep(names(mylist), sapply(mylist, ncol)), unlist(sapply(mylist, colnames)), sep = "")
return(bigout)
}


collated_dataframe <- multimerge(all_files)


# save results to out file
if (mapper=="genes"||mapper=="transcripts") {
mapper <- paste0(".", mapper)
}
finalname <- paste("collated", mapper, ".deseq_results.tsv", sep="")

collatedFile <- file.path(outDir, finalname)

write.table(x = as.data.frame(collated_dataframe), file = collatedFile,
quote = FALSE, sep = '\t')

#write.table(collated_dataframe, file ="collated.deseq_results.tsv", quote = FALSE, sep = '\t')
84 changes: 73 additions & 11 deletions snakefile.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -135,15 +135,18 @@ def lookup(column, predicate, fields=[]):
'final-report': {
'description': "Produce a comprehensive report. This is the default target.",
'files':
[os.path.join(OUTPUT_DIR, "input_annotation_stats.tsv"),
[os.path.join(OUTPUT_DIR, "input_annotation_stats.tsv"),
os.path.join(MULTIQC_DIR, 'multiqc_report.html'),
os.path.join(COUNTS_DIR, "raw_counts", "salmon", "counts_from_SALMON.transcripts.tsv"),
os.path.join(COUNTS_DIR, "raw_counts", "salmon", "counts_from_SALMON.genes.tsv"),
os.path.join(COUNTS_DIR, "normalized", "salmon", "TPM_counts_from_SALMON.transcripts.tsv"),
os.path.join(COUNTS_DIR, "normalized", "salmon", "TPM_counts_from_SALMON.genes.tsv"),
os.path.join(COUNTS_DIR, "raw_counts", MAPPER, "counts.tsv"),
os.path.join(COUNTS_DIR, "normalized", MAPPER, "deseq_normalized_counts.tsv"),
os.path.join(COUNTS_DIR, "normalized", MAPPER, "deseq_size_factors.txt")] +
os.path.join(COUNTS_DIR, "normalized", MAPPER, "deseq_size_factors.txt"),
os.path.join(OUTPUT_DIR, "report", MAPPER, "collated.deseq_results.tsv"),
os.path.join(OUTPUT_DIR, "report", 'salmon', "collated.transcripts.deseq_results.tsv"),
os.path.join(OUTPUT_DIR, "report", 'salmon', "collated.genes.deseq_results.tsv")] +
BIGWIG_OUTPUT +
expand(os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) +
expand(os.path.join(OUTPUT_DIR, "report", 'salmon', '{analysis}.salmon.transcripts.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) +
Expand All @@ -152,22 +155,26 @@ def lookup(column, predicate, fields=[]):
'deseq_report_star': {
'description': "Produce one HTML report for each analysis based on STAR results.",
'files':
expand(os.path.join(OUTPUT_DIR, "report", 'star', '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys())
expand(os.path.join(OUTPUT_DIR, "report", 'star', '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) +
[os.path.join(OUTPUT_DIR, "report", 'star', "collated.deseq_results.tsv")]
},
'deseq_report_hisat2': {
'description': "Produce one HTML report for each analysis based on Hisat2 results.",
'files':
expand(os.path.join(OUTPUT_DIR, "report", 'hisat2', '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys())
expand(os.path.join(OUTPUT_DIR, "report", 'hisat2', '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) +
[os.path.join(OUTPUT_DIR, "report", 'hisat2', "collated.deseq_results.tsv")]
},
'deseq_report_salmon_transcripts': {
'description': "Produce one HTML report for each analysis based on SALMON results at transcript level.",
'files':
expand(os.path.join(OUTPUT_DIR, "report", 'salmon', '{analysis}.salmon.transcripts.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys())
expand(os.path.join(OUTPUT_DIR, "report", 'salmon', '{analysis}.salmon.transcripts.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) +
[os.path.join(OUTPUT_DIR, "report", 'salmon', "collated.transcripts.deseq_results.tsv")]
},
'deseq_report_salmon_genes': {
'description': "Produce one HTML report for each analysis based on SALMON results at gene level.",
'files':
expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys())
expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()) +
[os.path.join(OUTPUT_DIR, "report", 'salmon', "collated.genes.deseq_results.tsv")]
},
'star_map' : {
'description': "Produce a STAR mapping results in BAM file format.",
Expand Down Expand Up @@ -231,7 +238,8 @@ def lookup(column, predicate, fields=[]):
OUTPUT_FILES.append(os.path.join(OUTPUT_DIR, "annotations.tgz"))

rule all:
input: OUTPUT_FILES
input:
OUTPUT_FILES,

rule check_annotation_files:
input:
Expand Down Expand Up @@ -561,7 +569,7 @@ def hisat2_file_arguments(args):
"{RSCRIPT_EXEC} {params.script} {params.mapped_dir} {input.colDataFile} {output} >> {log} 2>&1"

# create a normalized counts table including all samples
# using the median-of-ratios normalization procedure of
# using the median-of-ratios normalization procedure ofcollate_deseq_results.R
# deseq2
rule norm_counts_deseq:
input:
Expand Down Expand Up @@ -596,12 +604,30 @@ def hisat2_file_arguments(args):
logo = LOGO
log: os.path.join(LOG_DIR, MAPPER, "{analysis}.report.log")
output:
os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq.report.html')
os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq.report.html'),
os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq_results.tsv')
resources:
mem_mb = config['execution']['rules']['reports']['memory']
shell:
"{RSCRIPT_EXEC} {params.reportR} --logo={params.logo} --prefix='{wildcards.analysis}' --reportFile={params.reportRmd} --countDataFile={input.counts} --colDataFile={input.coldata} --gtfFile={GTF_FILE} --caseSampleGroups='{params.case}' --controlSampleGroups='{params.control}' --covariates='{params.covariates}' --workdir={params.outdir} --organism='{ORGANISM}' --description='{params.description}' --selfContained='{params.selfContained}' >> {log} 2>&1"

rule deseq_collate_report1:
input:
html_reports=expand(os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()),
deseq_results=expand(os.path.join(OUTPUT_DIR, "report", MAPPER, '{analysis}.deseq_results.tsv'), analysis = DE_ANALYSIS_LIST.keys())
params:
mapper=MAPPER,
outdir=os.path.join(OUTPUT_DIR, "report", MAPPER),
inpdir=os.path.join(OUTPUT_DIR, "report", MAPPER),
script=os.path.join(SCRIPTS_DIR, "collate_deseq_results.R"),
log: os.path.join(LOG_DIR, MAPPER, "collate_deseq.report.log")
output:
os.path.join(OUTPUT_DIR, "report", MAPPER, 'collated.deseq_results.tsv')
resources:
mem_mb = config['execution']['rules']['reports']['memory']
shell:
"{RSCRIPT_EXEC} {params.script} {params.mapper} {params.inpdir} {params.outdir} >> {log} 2>&1"

rule report2:
input:
counts=os.path.join(COUNTS_DIR, "raw_counts", "salmon", "counts_from_SALMON.transcripts.tsv"),
Expand All @@ -618,11 +644,29 @@ def hisat2_file_arguments(args):
logo = LOGO
log: os.path.join(LOG_DIR, "salmon", "{analysis}.report.salmon.transcripts.log")
output:
os.path.join(OUTPUT_DIR, "report", 'salmon', '{analysis}.salmon.transcripts.deseq.report.html')
os.path.join(OUTPUT_DIR, "report", 'salmon', '{analysis}.salmon.transcripts.deseq.report.html'),
os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.transcripts.deseq_results.tsv')
resources:
mem_mb = config['execution']['rules']['reports']['memory']
shell: "{RSCRIPT_EXEC} {params.reportR} --logo={params.logo} --prefix='{wildcards.analysis}.salmon.transcripts' --reportFile={params.reportRmd} --countDataFile={input.counts} --colDataFile={input.coldata} --gtfFile={GTF_FILE} --caseSampleGroups='{params.case}' --controlSampleGroups='{params.control}' --covariates='{params.covariates}' --workdir={params.outdir} --organism='{ORGANISM}' --description='{params.description}' --selfContained='{params.selfContained}' >> {log} 2>&1"

rule deseq_collate_report2:
input:
html_reports=expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.transcripts.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()),
deseq_results=expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.transcripts.deseq_results.tsv'), analysis = DE_ANALYSIS_LIST.keys())
params:
mapper="transcripts",
outdir=os.path.join(OUTPUT_DIR, "report", 'salmon'),
inpdir=os.path.join(OUTPUT_DIR, "report", 'salmon'),
script=os.path.join(SCRIPTS_DIR, "collate_deseq_results.R"),
log: os.path.join(LOG_DIR, "salmon", "collate_transcripts_deseq.report.log")
output:
os.path.join(OUTPUT_DIR, "report", 'salmon', 'collated.transcripts.deseq_results.tsv')
resources:
mem_mb = config['execution']['rules']['reports']['memory']
shell:
"{RSCRIPT_EXEC} {params.script} {params.mapper} {params.inpdir} {params.outdir} >> {log} 2>&1"

rule report3:
input:
counts=os.path.join(COUNTS_DIR, "raw_counts", "salmon", "counts_from_SALMON.genes.tsv"),
Expand All @@ -639,7 +683,25 @@ def hisat2_file_arguments(args):
logo = LOGO
log: os.path.join(LOG_DIR, "salmon", "{analysis}.report.salmon.genes.log")
output:
os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq.report.html')
os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq.report.html'),
os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq_results.tsv')
resources:
mem_mb = config['execution']['rules']['reports']['memory']
shell: "{RSCRIPT_EXEC} {params.reportR} --logo={params.logo} --prefix='{wildcards.analysis}.salmon.genes' --reportFile={params.reportRmd} --countDataFile={input.counts} --colDataFile={input.coldata} --gtfFile={GTF_FILE} --caseSampleGroups='{params.case}' --controlSampleGroups='{params.control}' --covariates='{params.covariates}' --workdir={params.outdir} --organism='{ORGANISM}' --description='{params.description}' --selfContained='{params.selfContained}' >> {log} 2>&1"

rule deseq_collate_report3:
input:
html_reports=expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq.report.html'), analysis = DE_ANALYSIS_LIST.keys()),
deseq_results=expand(os.path.join(OUTPUT_DIR, "report", "salmon", '{analysis}.salmon.genes.deseq_results.tsv'), analysis = DE_ANALYSIS_LIST.keys())
params:
mapper="genes",
outdir=os.path.join(OUTPUT_DIR, "report", 'salmon'),
inpdir=os.path.join(OUTPUT_DIR, "report", 'salmon'),
script=os.path.join(SCRIPTS_DIR, "collate_deseq_results.R"),
log: os.path.join(LOG_DIR, "salmon", "collate_genes_deseq.report.log")
output:
os.path.join(OUTPUT_DIR, "report", 'salmon', 'collated.genes.deseq_results.tsv')
resources:
mem_mb = config['execution']['rules']['reports']['memory']
shell:
"{RSCRIPT_EXEC} {params.script} {params.mapper} {params.inpdir} {params.outdir} >> {log} 2>&1"

0 comments on commit 76fdc77

Please sign in to comment.