-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgmql_read.R
executable file
·254 lines (234 loc) · 7.95 KB
/
gmql_read.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
#' Function read
#'
#' It reads a GMQL dataset, as a folder containing some homogeneous samples on
#' disk or as a GRangesList, saving it in Scala memory in a way that can be
#' referenced in R. It is also used to read a repository dataset in case of
#' remote processing.
#'
#' @importFrom rJava J .jnull .jarray
#' @importFrom methods is
#'
#' @param dataset folder path for GMQL dataset or dataset name on repository
#' @param parser string used to parse the dataset files.
#' The parsers available are:
#' \itemize{
#' \item{BedParser}
#' \item{BroadPeakParser}
#' \item{NarrowPeakParser}
#' \item{CustomParser}
#' }
#' Default is CustomParser.
#' @param is_local logical value indicating local or remote dataset
#'
#' @return GMQLDataset object. It contains the value to use as input
#' for the subsequent GMQLDataset method
#'
#' @details
#' Normally, a GMQL dataset contains an XML schema file that contains
#' name of region attributes. (e.g chr, start, stop, strand)
#' The CustomParser reads this XML schema;
#' if you already know what kind of schema your files have, use one of the
#' parsers defined, without reading any XML schema.
#'
#' If GRangesList has no metadata: i.e. metadata() is empty, two metadata are
#' generated:
#' \itemize{
#' \item{"provider" = "PoliMi"}
#' \item{"application" = "RGMQL"}
#' }
#'
#' NOTE:
#' The folder layout must obey the following rules and adopt
#' the following layout:
#' The dataset folder can have any name, but must contain the
#' sub-folder named "files".
#' The sub-folder "files" contains the dataset files and
#' the schema xml file.
#' The schema files adopt the following naming conventions:
#'
#' - "schema.xml"
#' - "test.schema"
#'
#' The names must be in LOWERCASE. Any other schema file
#' will not be considered; if both are present, "test.schema" will be used.
#'
#' @examples
#'
#' ## This statement initializes and runs the GMQL server for local execution
#' ## and creation of results on disk. Then, with system.file() it defines
#' ## the path to the folder "DATASET" in the subdirectory "example"
#' ## of the package "RGMQL" and opens such folder as a GMQL dataset
#' ## named "data" using CustomParser
#'
#' init_gmql()
#' test_path <- system.file("example", "DATASET", package = "RGMQL")
#' data = read_gmql(test_path)
#'
#' ## This statement opens such folder as a GMQL dataset named "data" using
#' ## "NarrowPeakParser"
#' dataPeak = read_gmql(test_path,"NarrowPeakParser")
#'
#' ## This statement reads a remote public dataset stored into GMQL system
#' ## repository. For a public dataset in a (remote) GMQL repository the
#' ## prefix "public." is needed before dataset name
#'
#' remote_url = "http://www.gmql.eu/gmql-rest/"
#' login_gmql(remote_url)
#' data1 = read_gmql("public.Example_Dataset_1", is_local = FALSE)
#'
#'
#' @name read_gmql
#' @rdname read-function
#' @export
#'
read_gmql <- function(
    dataset,
    parser = "CustomParser",
    is_local = TRUE
) {
    .check_input(dataset)
    .check_logical(is_local)
    WrappeR <- J("it/polimi/genomics/r/Wrapper")
    parser_name <- .check_parser(parser)
    if (is_local) {
        if (!dir.exists(dataset))
            stop("folder does not exist")
        # Strip trailing slashes so basename() yields the folder name itself.
        dataset <- sub("/*[/]$", "", dataset)
        # The Scala backend expects the path of the "files" sub-folder.
        if (basename(dataset) != "files")
            dataset <- file.path(dataset, "files")
        schema_XML <- .retrieve_schema(dataset, TRUE)
        # Remote-only arguments are passed as Java nulls in the local case.
        schema_matrix <- .jnull("java/lang/String")
        url <- .jnull("java/lang/String")
        coords_sys <- .jnull("java/lang/String")
        type <- .jnull("java/lang/String")
    } else {
        if (!exists("GMQL_credentials", envir = .GlobalEnv))
            stop("You have to log on using login function")
        credential <- get("GMQL_credentials", envir = .GlobalEnv)
        url <- credential$remote_url
        if (is.null(url))
            stop("You have to log on using login function")
        if (identical(parser_name, "CUSTOMPARSER")) {
            # Fetch the remote schema and flatten it to a [name, type] matrix.
            list <- show_schema(url, dataset)
            coords_sys <- list$coordinate_system
            type <- list$type
            schema_names <- vapply(
                list$fields, function(x) { x$name }, character(1)
            )
            schema_type <- vapply(
                list$fields, function(x) { x$type }, character(1)
            )
            schema_matrix <- cbind(schema_names, schema_type)
            if (is.null(schema_matrix) || !length(schema_matrix))
                schema_matrix <- .jnull("java/lang/String")
            else
                schema_matrix <- .jarray(schema_matrix, dispatch = TRUE)
        } else {
            schema_matrix <- .jnull("java/lang/String")
            # Bug fix: coords_sys and type were previously left undefined on
            # this path, causing "object 'coords_sys' not found" at the
            # readDataset() call below whenever a non-custom parser was used
            # with a remote dataset.
            coords_sys <- .jnull("java/lang/String")
            type <- .jnull("java/lang/String")
        }
        schema_XML <- .jnull("java/lang/String")
    }
    # readDataset() returns a two-element vector: [error flag, payload].
    response <- WrappeR$readDataset(
        dataset,
        parser_name,
        is_local,
        schema_matrix,
        schema_XML,
        coords_sys,
        type
    )
    error <- strtoi(response[1])
    data <- response[2]
    if (error)
        stop(data)
    else
        GMQLDataset(data)
}
#' @importFrom S4Vectors metadata
#' @importFrom rJava J .jarray
#'
#' @param samples GRangesList to be loaded into the GMQL (Scala) backend
#'
#' @name read_gmql
#' @rdname read-function
#' @export
#'
read_GRangesList <- function(samples) {
    if (!is(samples, "GRangesList"))
        stop("only GrangesList")
    meta <- S4Vectors::metadata(samples)
    if (is.null(meta) || !length(meta)) {
        # No metadata: synthesize the two default pairs for every sample.
        len <- length(samples)
        warning(
            "No metadata.\nWe provide two metadata for you:
            \n1.provider = PoliMi\n2.application = RGMQL\n"
        )
        # Bug fix: each sample contributes exactly TWO metadata rows
        # (provider, application), so every sample index must be repeated
        # twice. The previous `each = len` produced len*len indices, which
        # mismatched the 2*len rows of meta_matrix whenever len != 2.
        index_meta <- rep(seq_len(len), each = 2)
        rep_meta <- rep(
            c("provider", "PoliMi", "application", "RGMQL"),
            times = len
        )
        meta_matrix <- matrix(rep_meta, ncol = 2, byrow = TRUE)
        meta_matrix <- cbind(index_meta, meta_matrix)
    } else {
        # Flatten the metadata list; the sample index is recovered from the
        # suffix that unlist() appends to each name (e.g. "meta_1.provider").
        unlist_meta <- unlist(meta)
        names_meta <- names(unlist_meta)
        group_names <- gsub(".*_([0-9]*)\\..*", "\\1", names_meta)
        names(unlist_meta) <- NULL
        meta_matrix <- cbind(group_names, names_meta, unlist_meta)
    }
    df <- data.frame(samples)
    df <- df[-2] # delete group_name
    len_df <- dim(df)[1] # number of rows
    col_types <- vapply(df, class, character(1))
    col_names <- names(col_types)
    # If the ranges carry GTF-style columns ("phase" present), rename to the
    # GTF vocabulary; otherwise rename to the tab-delimited vocabulary and
    # shift start to 0-based coordinates.
    if ("phase" %in% col_names) {
        col_names <- plyr::revalue(
            col_names, c(type = "feature", phase = "frame", seqnames = "seqname")
        )
        schema_matrix <- cbind(toupper(col_types), col_names)
        schema_matrix <- schema_matrix[
            setdiff(rownames(schema_matrix), c("group", "width")), ]
    } else {
        col_names <- plyr::revalue(
            col_names,
            c(start = "left", end = "right", seqnames = "chr")
        )
        schema_matrix <- cbind(col_names, toupper(col_types))
        # GMQL regions are 0-based half-open; GRanges starts are 1-based.
        df$start = df$start - 1
        schema_matrix <- schema_matrix[
            setdiff(rownames(schema_matrix), c("group", "width")), ]
    }
    region_matrix <- as.matrix(vapply(df, as.character, character(len_df)))
    region_matrix[is.na(region_matrix)] <- "NA"
    region_matrix <- region_matrix[, setdiff(colnames(region_matrix), "width")]
    rownames(schema_matrix) <- NULL
    colnames(schema_matrix) <- NULL
    schema_matrix <- .jarray(schema_matrix, dispatch = TRUE)
    meta_matrix <- .jarray(meta_matrix, dispatch = TRUE)
    region_matrix <- .jarray(region_matrix, dispatch = TRUE)
    WrappeR <- J("it/polimi/genomics/r/Wrapper")
    response <- WrappeR$read(
        meta_matrix,
        region_matrix,
        schema_matrix,
        "default",
        "TAB"
    )
    GMQLDataset(response)
}
# Validate a parser name (case-insensitively) and return it in upper case.
# Stops with an error if the name is not one of the supported parsers.
.check_parser <- function(parser) {
    normalized <- toupper(parser)
    supported <- c(
        "BEDPARSER", "BROADPEAKPARSER", "NARROWPEAKPARSER", "CUSTOMPARSER"
    )
    if (!(normalized %in% supported)) {
        stop("parser not defined")
    }
    normalized
}