add transcript-level summary to quant-salmon

NYU-BFX · Feb 2, 2022 · 2f777bf · 2f777bf
1 parent 065a08a
commit 2f777bf
Show file tree

Hide file tree

Showing 2 changed files with 55 additions and 26 deletions.
diff --git a/scripts/quant-merge-salmon.R b/scripts/quant-merge-salmon.R
@@ -38,8 +38,8 @@ load_install_packages("readr")
 
 # import GTF
 txdb = makeTxDbFromGFF(genes_gtf, format = "gtf")
-txkey = keys(txdb, keytype = "TXNAME")
-tx2gene = select(txdb, txkey, "GENEID", "TXNAME")
+txkey = AnnotationDbi::keys(txdb, keytype = "TXNAME")
+tx2gene = AnnotationDbi::select(txdb, txkey, "GENEID", "TXNAME")
 
 # find quant.sf files in a specified directory
 quant_files = list.files(path = quant_sf_dir, pattern = "quant.sf.gz", full.names = TRUE, recursive = TRUE)
@@ -55,57 +55,77 @@ quant_files_names = str_remove(quant_files_names, "/quant.sf.gz")
 quant_files_names = str_remove(quant_files_names, ".*/")
 names(quant_files) = quant_files_names
 
-# import transcript-level estimates and summarizes to the gene-level
-# "salmon" software type uses "TPM" column as abundances and "NumReads" as estimated counts
-# scale using the average transcript length over samples and the library size (lengthScaledTPM)
-txi = NULL
-try(txi <- tximport(quant_files, type = "salmon", tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM"))
+# import and output transcript-level estimates
+# all transcripts get scaled by the same fixed median transcript length (dtuScaledTPM)
+txi_tx = NULL
+try(txi_tx <- tximport(quant_files, type = "salmon", tx2gene = tx2gene, txOut = TRUE, countsFromAbundance = "dtuScaledTPM"))
 
 # if tximport failed and GTF transcripts did not have version decimals, try ignoring version in Salmon files
-if (is.null(txi)) {
-  message("tximport failed")
+if (is.null(txi_tx)) {
+  message("transcript-level tximport failed")
   if (length(str_which(tx2gene$TXNAME, "\\.")) == 0) {
     message("repeating tximport ignoring transcript version")
-    txi = tximport(quant_files, type = "salmon", tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM",
-                   ignoreTxVersion = TRUE)
+    txi_tx = tximport(quant_files, type = "salmon", tx2gene = tx2gene, txOut = TRUE,
+                      countsFromAbundance = "dtuScaledTPM", ignoreTxVersion = TRUE)
   }
+}
 
+# import transcript-level estimates and summarizs to the gene-level
+# scale using the average transcript length over samples and the library size (lengthScaledTPM)
+txi_gene = NULL
+try(txi_gene <- tximport(quant_files, type = "salmon", tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM"))
+
+# if tximport failed and GTF transcripts did not have version decimals, try ignoring version in Salmon files
+if (is.null(txi_gene)) {
+  message("gene-level tximport failed")
+  if (length(str_which(tx2gene$TXNAME, "\\.")) == 0) {
+    message("repeating tximport ignoring transcript version")
+    txi_gene = tximport(quant_files, type = "salmon", tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM",
+                        ignoreTxVersion = TRUE)
+  }
 }
 
-message("num imported samples: ", ncol(txi$counts))
-message("num imported genes:   ", nrow(txi$counts))
+message("num imported samples:     ", ncol(txi_gene$counts))
+message("num imported genes:       ", nrow(txi_gene$counts))
+message("num imported transcripts: ", nrow(txi_tx$counts))
 
 # check that the counts table has a reasonable number of genes
-if (nrow(txi$counts) < 1000) stop("tximport counts table too small")
+if (nrow(txi_gene$counts) < 1000) stop("tximport counts table too small")
 
 # importing estimates for use with differential gene expression methods
 # use the tximport argument countsFromAbundance="lengthScaledTPM" or "scaledTPM"
 # use the gene-level matrix txi$counts as you would a regular count matrix ("bias corrected counts without an offset")
+# "salmon" software type uses "TPM" column as abundances and "NumReads" as estimated counts
 
 # extract bias corrected gene counts
-txi_counts = txi$counts
+txi_counts = txi_gene$counts
 txi_counts = round(txi_counts, 3)
 
 # extract TPMs
-txi_tpms = txi$abundance
+txi_tpms = txi_gene$abundance
 txi_tpms = round(txi_tpms, 3)
 
 # generate random string so the output files do not conflict if running multiple samples in parallel
 rand_str = paste0(sample(LETTERS, 10, replace = TRUE), collapse = "")
 
-# save tximport list (for downstream Bioconductor DGE packages such as edgeR or DESeq2)
-saveRDS(txi, file = glue("temp.{rand_str}.rds"))
-system(glue("mv -vf temp.{rand_str}.rds {out_base}.tximport.gene.lengthScaledTPM.rds"))
+# save tximport list (for downstream DTU packages such as DRIMSeq or DEXSeq)
+saveRDS(txi_tx, file = glue("temp.{rand_str}.transcript.rds"))
+system(glue("mv -vf temp.{rand_str}.transcript.rds {out_base}.tximport.transcript.dtuScaledTPM.rds"))
+Sys.sleep(1)
+
+# save tximport list (for downstream DGE packages such as edgeR or DESeq2)
+saveRDS(txi_gene, file = glue("temp.{rand_str}.gene.rds"))
+system(glue("mv -vf temp.{rand_str}.gene.rds {out_base}.tximport.gene.lengthScaledTPM.rds"))
 Sys.sleep(1)
 
 # save gene-level counts table
 write.table(txi_counts, file = glue("temp.{rand_str}.counts.txt"), quote = FALSE, sep = "\t", col.names = NA)
-system(glue("mv -vf temp.{rand_str}.counts.txt {out_base}.tximport.counts.txt"))
+system(glue("mv -vf temp.{rand_str}.counts.txt {out_base}.tximport.gene.counts.txt"))
 Sys.sleep(1)
 
 # save gene-level counts table
 write.table(txi_tpms, file = glue("temp.{rand_str}.tpms.txt"), quote = FALSE, sep = "\t", col.names = NA)
-system(glue("mv -vf temp.{rand_str}.tpms.txt {out_base}.tximport.tpms.txt"))
+system(glue("mv -vf temp.{rand_str}.tpms.txt {out_base}.tximport.gene.tpms.txt"))
 Sys.sleep(1)
 
 

diff --git a/segments/quant-salmon.sh b/segments/quant-salmon.sh
@@ -187,14 +187,23 @@ fi
 
 # clean up
 
-# clean up the full output table
-# Name	Length	EffectiveLength	TPM	NumReads
+# quant.sf quantification file columns
+# Name: the name of the target transcript provided in the input transcript database
+# Length: the length of the target transcript in nucleotides
+# EffectiveLength: the computed effective length of the target transcript (takes into account all factors being modeled)
+# TPM: estimate of the relative abundance of this transcript in units of TPM (recommended for downstream analysis)
+# NumReads: the expected number of reads that have originated from each transcript
+
+# clean up the full quantification table (usually not relevant since a merged table is generated later)
+# columns: Name, Length, EffectiveLength, TPM, NumReads
+# use NumReads (estimate of the number of reads mapping to each transcript) as counts
 echo -e "#GENE\t${sample}" > "$salmon_counts_txt"
 cat "$salmon_quant_genes_sf" | grep -v "EffectiveLength" | cut -f 1,5 | LC_ALL=C sort -k1,1 >> "$salmon_counts_txt"
+# use TPM (relative abundance of this transcript) as TPMs
 echo -e "#GENE\t${sample}" > "$salmon_tpms_txt"
 cat "$salmon_quant_genes_sf" | grep -v "EffectiveLength" | cut -f 1,4 | LC_ALL=C sort -k1,1 >> "$salmon_tpms_txt"
 
-# rename and gzip the quant.sf quantification file
+# rename (to have all samples in one directory) and compress the quant.sf quantification file
 mv -v "$salmon_quant_sf" "${salmon_quant_dir}/${sample}.quant.sf"
 gzip "${salmon_quant_dir}/${sample}.quant.sf"
 
@@ -235,9 +244,9 @@ cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "
 #########################
 
 
-# generate counts matrix for all samples
+# generate a merged counts matrix for all samples with tximport
 # this may be possible with "salmon quantmerge" in the future (does not currently support gene-level output)
-# "tximport is, and has been, the recommended way to aggregate transcript-level abundances to the gene-level"
+# "tximport is ... the recommended way to aggregate transcript-level abundances to the gene-level"
 
 # file base of merged output
 merged_counts_base="${proj_dir}/quant.salmon"