Fixes #63: when the sample sheet is updated, the collating and subsequent counting steps should follow accordingly
BIMSBbioinfo · Mar 29, 2022 · 2592bf6 · 2592bf6
1 parent 8e8dc75
commit 2592bf6
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 3 deletions.
diff --git a/scripts/collate_read_counts.R b/scripts/collate_read_counts.R
@@ -22,7 +22,8 @@
 args <- commandArgs(trailingOnly = TRUE)
 
 input_dir <- args[1]
-out_file <- args[2]
+colDataFile <- args[2]
+out_file <- args[3]
 
 count_files <- dir(input_dir, pattern = ".read_counts.csv$", full.names = TRUE)
 
@@ -38,6 +39,16 @@ counts_all <- as.data.frame(Reduce(function(dtf1, dtf2)
 rownames(counts_all) <- counts_all$V1
 counts_all$V1 <- NULL
 
+# Subset to keep only the counts for the samples listed in the
+# colDataFile (which is the same as the sample sheet).
+colData <- read.table(colDataFile, header = TRUE, row.names = 1)
+# Samples named in colData that have no column in the collated counts.
+missing_samples <- setdiff(rownames(colData), colnames(counts_all))
+if (length(missing_samples) > 0) {
+  # stop() concatenates its arguments without a separator, so join the
+  # sample names explicitly to keep the message readable.
+  stop("ERROR collating counts for samples in the colData file. ",
+       "The count data for the following samples are missing: ",
+       paste(missing_samples, collapse = ", "))
+}
+# Select the colData samples, in colData order; drop = FALSE keeps the
+# result a data.frame even when only one sample is selected.
+counts_all <- counts_all[, rownames(colData), drop = FALSE]
+
 # save results to out file
 write.table(counts_all, out_file, quote = FALSE,
             sep = '\t')

diff --git a/snakefile.py b/snakefile.py
@@ -512,7 +512,8 @@ def hisat2_file_arguments(args):
 
 rule collate_read_counts:
   input:
-    expand(os.path.join(MAPPED_READS_DIR, MAPPER, "{sample}.read_counts.csv"), sample = SAMPLES)
+    colDataFile = rules.translate_sample_sheet_for_report.output,
+    count_files = expand(os.path.join(MAPPED_READS_DIR, MAPPER, "{sample}.read_counts.csv"), sample = SAMPLES)
   output:
     os.path.join(COUNTS_DIR, "raw_counts", MAPPER, "counts.tsv")
   resources:
@@ -522,7 +523,7 @@ def hisat2_file_arguments(args):
     mapped_dir = os.path.join(MAPPED_READS_DIR, MAPPER),
     script = os.path.join(SCRIPTS_DIR, "collate_read_counts.R")
   shell:
-    "{RSCRIPT_EXEC} {params.script} {params.mapped_dir} {output} >> {log} 2>&1"
+    "{RSCRIPT_EXEC} {params.script} {params.mapped_dir} {input.colDataFile} {output} >> {log} 2>&1"
 
 # create a normalized counts table including all samples
 # using the median-of-ratios normalization procedure of