-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathcounts_matrix_from_SALMON.R
85 lines (73 loc) · 3.36 KB
/
counts_matrix_from_SALMON.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# PiGx RNAseq Pipeline.
#
# Copyright © 2017, 2018 Bora Uyar <[email protected]>
#
# This file is part of the PiGx RNAseq Pipeline.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# R script takes outputs from salmon and uses tximport package to create a matrix
# that is genes x samples
library(data.table)
args <- commandArgs(trailingOnly = TRUE)
salmon_output_folder <- args[1] #folder where salmon files are written
counts_dir <- args[2] #where to save to collated count files
colDataFile <- args[3]
writeCounts <- function(colDataFile, salmon_output_folder, counts_dir, type) {
colData <- read.table(colDataFile)
if(type == 'transcripts') {
files <- file.path(salmon_output_folder, rownames(colData), "quant.sf")
} else if ( type == 'genes') {
files <- file.path(salmon_output_folder, rownames(colData), "quant.genes.sf")
}
names(files) <- rownames(colData)
txi <- tximport::tximport(files, type = "salmon", txOut = TRUE)
if(length(unique(colData$sample_type)) < 2) {
dds <- DESeq2::DESeqDataSetFromTximport(txi, colData, ~1)
} else {
dds <- DESeq2::DESeqDataSetFromTximport(txi, colData, ~group)
}
#save raw counts
write.table(x = DESeq2::counts(dds),
file = file.path(counts_dir, paste0("counts_from_SALMON.", type,".tsv")),
quote = F, sep = '\t')
}
writeTPMcounts <- function(colDataFile, salmon_output_folder, counts_dir, type) {
colData <- read.table(colDataFile)
if(type == 'transcripts') {
files <- file.path(salmon_output_folder, rownames(colData), "quant.sf")
} else if ( type == 'genes') {
files <- file.path(salmon_output_folder, rownames(colData), "quant.genes.sf")
}
names(files) <- rownames(colData)
tpmTables <- lapply(names(files), function(n) {
f <- files[[n]]
df <- subset(fread(f), select = c('Name', 'TPM'))
colnames(df) <- c('Name', n)
return(df)
})
names(tpmTables) <- names(files)
#tpmMatrix = data.frame(Reduce(function(...) merge(..., all = TRUE, by = c('Name')), tpmTables))
tpmMatrix = Reduce(function(...) merge(..., all = TRUE, by = c('Name')), tpmTables)
rownames(tpmMatrix) <- tpmMatrix$Name
tpmMatrix <- tpmMatrix[-1]
write.table(x = tpmMatrix,
file = file.path(counts_dir, paste0("TPM_counts_from_SALMON.", type,".tsv")),
quote = F, sep = '\t')
}
raw_counts_dir <- file.path(counts_dir, "raw_counts", "salmon")
writeCounts(colDataFile, salmon_output_folder, raw_counts_dir, type = 'transcripts')
writeCounts(colDataFile, salmon_output_folder, raw_counts_dir, type = 'genes')
normalized_counts_dir <- file.path(counts_dir, "normalized", "salmon")
writeTPMcounts(colDataFile, salmon_output_folder, normalized_counts_dir, type = 'transcripts')
writeTPMcounts(colDataFile, salmon_output_folder, normalized_counts_dir, type = 'genes')