-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathrunDeseqReport.R
292 lines (262 loc) · 10.5 KB
/
runDeseqReport.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
# PiGx RNAseq Pipeline.
#
# Copyright © 2017 Bora Uyar <[email protected]>
# Copyright © 2018 Jonathan Ronen <[email protected]>
# Copyright © 2018, 2022 Ricardo Wurmus <[email protected]>
#
# This file is part of the PiGx RNAseq Pipeline.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#' runReport
#'
#' Generate a DESeq2 Report in a self-contained HTML file
#'
#'
#' @param reportFile Path to .Rmd script to generate a HTML report
#' @param countDataFile Path to count data file which contains raw read counts
#' per gene/transcript for each sample replicate
#' @param colDataFile Path to a tab-separated file with the experimental set-up
#' description. The row-names are the sample names, and the columns consist of
#' meta-data such as sample group, batch, sequencing run, condition, treatment
#' etc.
#' @param gtfFile Path to the GTF file that was used as reference to calculate
#' countDataFile
#' @param caseSampleGroups Comma separated list of sample group names (not
#' sample replicate names) that should be treated as 'case' groups (e.g.
#' mutant or treated samples)
#' @param controlSampleGroups Comma separated list of sample group names (not
#' sample replicate names) that should be treated as 'control' groups (e.g.
#' wild-type or untreated samples)
#' @param covariates Comma separated list of co-variates to control for in
#' differential expression analysis (e.g. batch, age, temperature,
#' sequencing_technology etc.). Must correspond to a column field in the
#' colDataFile file).
#' @param workdir Path to working directory where the output files will be
#' written
#' @param organism The organism for which the analysis is done (e.g. hsapiens,
#' mmusculus, celegans) via g:Profiler. This argument only affects GO term
#' analysis results. If the organism is not supported or there is no internet
#' connection, GO results will not be displayed.
#' @param prefix Prefix to be attached to the beginning of output files
#' @param logo Location of PiGx logo
#' @param quiet boolean value (default: FALSE). If set to TRUE, progress bars
#' and chunk labels will be suppressed while knitting the Rmd file.
#' @param selfContained boolean value (default: TRUE). By default, the generated
#' html file will be self-contained, which means that all figures and tables
#' will be embedded in a single html file with no external dependencies (See
#' rmarkdown::html_document)
#' @return An html generated using rmarkdown/knitr/pandoc that contains
#' interactive figures, tables, and text that provide an overview of the
#' experiment
runReport <- function(reportFile,
countDataFile,
colDataFile,
gtfFile,
caseSampleGroups,
controlSampleGroups,
covariates,
geneSetsFolder,
workdir = getwd(),
organism,
prefix,
logo,
selfContained = TRUE,
quiet = FALSE) {
# knitr 0.39 changed the default behavior.
options(knitr.graphics.rel_path = FALSE)
outFile <- paste0(prefix, '.deseq.report.html')
htmlwidgets::setWidgetIdSeed(1234)
rmarkdown::render(
input = reportFile,
output_dir = workdir,
intermediates_dir = file.path(workdir, prefix),
clean = TRUE,
output_file = outFile,
output_format = rmarkdown::html_document(
code_folding = 'hide',
depth = 2,
toc = TRUE,
toc_float = TRUE,
theme = 'lumen',
number_sections = TRUE
),
output_options = list(self_contained = selfContained),
params = list(countDataFile = countDataFile,
colDataFile = colDataFile,
gtfFile = gtfFile,
caseSampleGroups = caseSampleGroups,
controlSampleGroups = controlSampleGroups,
covariates = covariates,
prefix = prefix,
workdir = workdir,
logo = logo,
organism = organism),
quiet = quiet
)
if(dir.exists(file.path(workdir, prefix))) {
unlink(file.path(workdir, prefix), recursive = TRUE)
}
}
#1. Collect arguments
args <- commandArgs(TRUE)
cat("arguments:", args,"\n")
## Default setting when no arguments passed
if(length(args) == 0) {
args <- c("--help")
}
help_command = "
runDeseqReport.R: Generate an HTML report based on the analysis of raw count data for differential expression analysis using DESeq2
Arguments:
--reportFile Path to .Rmd script to generate a HTML report
--countDataFile Path to count data file which contains raw read counts
per gene/transcript for each sample replicate
--colDataFile Path to a tab-separated file with the experimental set-up
description. The row-names are the sample names, and the columns consist of
meta-data such as sample group, batch, sequencing run, condition, treatment
etc.
--gtfFile Path to the GTF file that was used as reference to calculate
countDataFile
--caseSampleGroups Comma separated list of sample group names (not
sample replicate names) that should be treated as 'case' groups (e.g.
mutant or treated samples)
--controlSampleGroups Comma separated list of sample group names (not
sample replicate names) that should be treated as 'control' groups (e.g.
wild-type or untreated samples)
--covariates Comma separated list of co-variates to control for in
differential expression analysis (e.g. batch, age, temperature,
sequencing_technology etc.). Must correspond to a column field in the
colDataFile file).
--workdir (Optional, default: current working directory) Path to working directory
where the output files will be written
--organism (Optional) The organism for which the analysis is done. Supported
organisms are 'human', 'mouse', 'worm', and 'fly'. This argument only
affects GO term analysis results. If the organism is not supported, GO
results will not be displayed.
--prefix (Optional, default: 'comparison1') Prefix to be attached to the beginning
of output files
--workdir (Optional, default: 'current working directory')
--selfContained boolean value (default: TRUE). By default, the generated
html file will be self-contained, which means that all figures and tables
will be embedded in a single html file with no external dependencies (See
markdown::html_document)
Example:
Rscript runDeseqReport.R --reportFile=./deseqReport.Rmd \\\
--countDataFile=./sample_data/counts.tsv \\\
--colDataFile=./sample_data/colData.tsv \\\
--gtfFile=./Ensembl.Celegans.90.gtf \\\
--caseSampleGroups='spt.16_N2_L4, hmg.4_N2_L4' \\\
--controlSampleGroups='ctrl_N2_L4' \\\
--covariates='batch, temp' \\\
--workdir=`pwd` \\\
--organism='hsapiens' \\\
--prefix='spt-16_hmg-4_vs_ctrl' \\\
--logo='/usr/local/share/pigx_rnaseq/logo.png' \\\
--selfContained=TRUE"
## Help section
if("--help" %in% args) {
cat(help_command, "\n")
q(save="no")
}
## Parse arguments (we expect the form --arg=value)
#parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
parseArgs <- function(x) {
myArgs <- unlist(strsplit(x, "--"))
myArgs <- myArgs[myArgs != '']
#when no values are provided for the argument
myArgs <- gsub(pattern = "=$", replacement = "= ", x = myArgs)
myArgs <- as.data.frame(do.call(rbind, strsplit(myArgs, "=")))
myArgs$V2 <- gsub(' ', '', myArgs$V2)
return(myArgs)
}
argsDF <- parseArgs(args)
argsL <- as.list(as.character(argsDF$V2))
names(argsL) <- argsDF$V1
if(!("reportFile" %in% argsDF$V1)) {
cat(help_command, "\n")
stop("Missing argument: reportFile. Provide the path to .Rmd script")
}
if(!("countDataFile" %in% argsDF$V1)) {
cat(help_command, "\n")
stop("Missing argument: countDataFile. Provide the path to counts data file")
}
if(!("colDataFile" %in% argsDF$V1)) {
cat(help_command, "\n")
stop("Missing argument: colDataFile. Provide the path to colData file which defines the experimental design\n")
}
if(!("gtfFile" %in% argsDF$V1)) {
cat(help_command, "\n")
stop("Missing argument: gtfFile Provide the path to GTF file\n")
}
if(!("caseSampleGroups") %in% argsDF$V1) {
cat(help_command, "\n")
stop("Missing argument: caseSampleGroups Provide a comma separated list of sample group names that will be treated as 'case' samples")
}
if(!("controlSampleGroups") %in% argsDF$V1) {
cat(help_command, "\n")
stop("Missing argument: controlSampleGroups Provide a comma separated list of sample group names that will be treated as 'case' samples")
}
if(!("covariates") %in% argsDF$V1) {
covariates <- ''
} else {
covariates <- argsL$covariates
}
if(!("organism") %in% argsDF$V1) {
cat(help_command, "\n")
warning("Missing argument: organism Will skip GO term analysis\n")
organism <- ''
} else {
organism <- argsL$organism
}
if(!("prefix" %in% argsDF$V1)) {
cat(help_command, "\n")
prefix <- 'comparison1'
warning("No prefix provided. Will use '",prefix,"' as the prefix to output files\n")
} else {
prefix <- argsL$prefix
}
if(!("workdir" %in% argsDF$V1)) {
workdir <- getwd()
warning("No output folder provided. Setting working directory to current directory:\n",workdir,"\n")
} else {
workdir <- argsL$workdir
cat("setting working directory to ",workdir,"\n")
}
if(!("selfContained" %in% argsDF$V1)) {
selfContained <- TRUE
} else {
selfContained <- argsL$selfContained
}
if(!("logo" %in% argsDF$V1)) {
warning("No logo specified.\n")
} else {
logo <- argsL$logo
}
reportFile = argsL$reportFile
countDataFile = argsL$countDataFile
colDataFile = argsL$colDataFile
gtfFile = argsL$gtfFile
caseSampleGroups = argsL$caseSampleGroups
controlSampleGroups = argsL$controlSampleGroups
runReport(reportFile = reportFile,
countDataFile = countDataFile,
colDataFile = colDataFile,
gtfFile = gtfFile,
caseSampleGroups = caseSampleGroups,
controlSampleGroups = controlSampleGroups,
covariates = covariates,
workdir = workdir,
organism = organism,
prefix = prefix,
logo = logo,
selfContained = selfContained)