Skip to content

Commit

Permalink
add transcript-level summary to quant-salmon
Browse files Browse the repository at this point in the history
  • Loading branch information
igordot committed Feb 2, 2022
1 parent 065a08a commit 2f777bf
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 26 deletions.
62 changes: 41 additions & 21 deletions scripts/quant-merge-salmon.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ load_install_packages("readr")

# import GTF
txdb = makeTxDbFromGFF(genes_gtf, format = "gtf")
txkey = keys(txdb, keytype = "TXNAME")
tx2gene = select(txdb, txkey, "GENEID", "TXNAME")
txkey = AnnotationDbi::keys(txdb, keytype = "TXNAME")
tx2gene = AnnotationDbi::select(txdb, txkey, "GENEID", "TXNAME")

# find quant.sf files in a specified directory
quant_files = list.files(path = quant_sf_dir, pattern = "quant.sf.gz", full.names = TRUE, recursive = TRUE)
Expand All @@ -55,57 +55,77 @@ quant_files_names = str_remove(quant_files_names, "/quant.sf.gz")
quant_files_names = str_remove(quant_files_names, ".*/")
names(quant_files) = quant_files_names

# import transcript-level estimates and summarizes to the gene-level
# "salmon" software type uses "TPM" column as abundances and "NumReads" as estimated counts
# scale using the average transcript length over samples and the library size (lengthScaledTPM)
txi = NULL
try(txi <- tximport(quant_files, type = "salmon", tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM"))
# import and output transcript-level estimates
# all transcripts get scaled by the same fixed median transcript length (dtuScaledTPM)
txi_tx = NULL
try(txi_tx <- tximport(quant_files, type = "salmon", tx2gene = tx2gene, txOut = TRUE, countsFromAbundance = "dtuScaledTPM"))

# if tximport failed and GTF transcripts did not have version decimals, try ignoring version in Salmon files
if (is.null(txi)) {
message("tximport failed")
if (is.null(txi_tx)) {
message("transcript-level tximport failed")
if (length(str_which(tx2gene$TXNAME, "\\.")) == 0) {
message("repeating tximport ignoring transcript version")
txi = tximport(quant_files, type = "salmon", tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM",
ignoreTxVersion = TRUE)
txi_tx = tximport(quant_files, type = "salmon", tx2gene = tx2gene, txOut = TRUE,
countsFromAbundance = "dtuScaledTPM", ignoreTxVersion = TRUE)
}
}

# import transcript-level estimates and summarizs to the gene-level
# scale using the average transcript length over samples and the library size (lengthScaledTPM)
txi_gene = NULL
try(txi_gene <- tximport(quant_files, type = "salmon", tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM"))

# if tximport failed and GTF transcripts did not have version decimals, try ignoring version in Salmon files
if (is.null(txi_gene)) {
message("gene-level tximport failed")
if (length(str_which(tx2gene$TXNAME, "\\.")) == 0) {
message("repeating tximport ignoring transcript version")
txi_gene = tximport(quant_files, type = "salmon", tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM",
ignoreTxVersion = TRUE)
}
}

message("num imported samples: ", ncol(txi$counts))
message("num imported genes: ", nrow(txi$counts))
message("num imported samples: ", ncol(txi_gene$counts))
message("num imported genes: ", nrow(txi_gene$counts))
message("num imported transcripts: ", nrow(txi_tx$counts))

# check that the counts table has a reasonable number of genes
if (nrow(txi$counts) < 1000) stop("tximport counts table too small")
if (nrow(txi_gene$counts) < 1000) stop("tximport counts table too small")

# importing estimates for use with differential gene expression methods
# use the tximport argument countsFromAbundance="lengthScaledTPM" or "scaledTPM"
# use the gene-level matrix txi$counts as you would a regular count matrix ("bias corrected counts without an offset")
# "salmon" software type uses "TPM" column as abundances and "NumReads" as estimated counts

# extract bias corrected gene counts
txi_counts = txi$counts
txi_counts = txi_gene$counts
txi_counts = round(txi_counts, 3)

# extract TPMs
txi_tpms = txi$abundance
txi_tpms = txi_gene$abundance
txi_tpms = round(txi_tpms, 3)

# generate random string so the output files do not conflict if running multiple samples in parallel
rand_str = paste0(sample(LETTERS, 10, replace = TRUE), collapse = "")

# save tximport list (for downstream Bioconductor DGE packages such as edgeR or DESeq2)
saveRDS(txi, file = glue("temp.{rand_str}.rds"))
system(glue("mv -vf temp.{rand_str}.rds {out_base}.tximport.gene.lengthScaledTPM.rds"))
# save tximport list (for downstream DTU packages such as DRIMSeq or DEXSeq)
saveRDS(txi_tx, file = glue("temp.{rand_str}.transcript.rds"))
system(glue("mv -vf temp.{rand_str}.transcript.rds {out_base}.tximport.transcript.dtuScaledTPM.rds"))
Sys.sleep(1)

# save tximport list (for downstream DGE packages such as edgeR or DESeq2)
saveRDS(txi_gene, file = glue("temp.{rand_str}.gene.rds"))
system(glue("mv -vf temp.{rand_str}.gene.rds {out_base}.tximport.gene.lengthScaledTPM.rds"))
Sys.sleep(1)

# save gene-level counts table
write.table(txi_counts, file = glue("temp.{rand_str}.counts.txt"), quote = FALSE, sep = "\t", col.names = NA)
system(glue("mv -vf temp.{rand_str}.counts.txt {out_base}.tximport.counts.txt"))
system(glue("mv -vf temp.{rand_str}.counts.txt {out_base}.tximport.gene.counts.txt"))
Sys.sleep(1)

# save gene-level counts table
write.table(txi_tpms, file = glue("temp.{rand_str}.tpms.txt"), quote = FALSE, sep = "\t", col.names = NA)
system(glue("mv -vf temp.{rand_str}.tpms.txt {out_base}.tximport.tpms.txt"))
system(glue("mv -vf temp.{rand_str}.tpms.txt {out_base}.tximport.gene.tpms.txt"))
Sys.sleep(1)


Expand Down
19 changes: 14 additions & 5 deletions segments/quant-salmon.sh
Original file line number Diff line number Diff line change
Expand Up @@ -187,14 +187,23 @@ fi

# clean up

# clean up the full output table
# Name Length EffectiveLength TPM NumReads
# quant.sf quantification file columns
# Name: the name of the target transcript provided in the input transcript database
# Length: the length of the target transcript in nucleotides
# EffectiveLength: the computed effective length of the target transcript (takes into account all factors being modeled)
# TPM: estimate of the relative abundance of this transcript in units of TPM (recommended for downstream analysis)
# NumReads: the expected number of reads that have originated from each transcript

# clean up the full quantification table (usually not relevant since a merged table is generated later)
# columns: Name, Length, EffectiveLength, TPM, NumReads
# use NumReads (estimate of the number of reads mapping to each transcript) as counts
echo -e "#GENE\t${sample}" > "$salmon_counts_txt"
cat "$salmon_quant_genes_sf" | grep -v "EffectiveLength" | cut -f 1,5 | LC_ALL=C sort -k1,1 >> "$salmon_counts_txt"
# use TPM (relative abundance of this transcript) as TPMs
echo -e "#GENE\t${sample}" > "$salmon_tpms_txt"
cat "$salmon_quant_genes_sf" | grep -v "EffectiveLength" | cut -f 1,4 | LC_ALL=C sort -k1,1 >> "$salmon_tpms_txt"

# rename and gzip the quant.sf quantification file
# rename (to have all samples in one directory) and compress the quant.sf quantification file
mv -v "$salmon_quant_sf" "${salmon_quant_dir}/${sample}.quant.sf"
gzip "${salmon_quant_dir}/${sample}.quant.sf"

Expand Down Expand Up @@ -235,9 +244,9 @@ cat ${summary_dir}/*.${segment_name}.csv | LC_ALL=C sort -t ',' -k1,1 | uniq > "
#########################


# generate counts matrix for all samples
# generate a merged counts matrix for all samples with tximport
# this may be possible with "salmon quantmerge" in the future (does not currently support gene-level output)
# "tximport is, and has been, the recommended way to aggregate transcript-level abundances to the gene-level"
# "tximport is ... the recommended way to aggregate transcript-level abundances to the gene-level"

# file base of merged output
merged_counts_base="${proj_dir}/quant.salmon"
Expand Down

0 comments on commit 2f777bf

Please sign in to comment.