3
3
# ' @description Performs one and two sample t-tests on vectors of data.
4
4
# ' @details Summary statistics are obtained from each of the data sets that are located on the
5
5
# ' distinct computers/servers. And then grand means and variances are calculated. Those are used
6
- # ' for performing t-test.
7
- # ' @param x a character, the name of a (non-empty) numeric vector of data values.
6
+ # ' for performing t-test. The funtion allows for the calculation of t-test between two continuous variables
7
+ # ' or between a continuous and a factor variable; the latter option requires a formula (see parameter \code{dataframe}).
8
+ # ' If a formula is provided all other but 'conf.level=0.95' are ignored.
9
+ # ' @param x a character, the name of a (non-empty) numeric vector of data values or a formula of the
10
+ # ' form 'a~b' where 'a' is the name of a continuous variable and 'b' that of a factor variable.
8
11
# ' @param y a character, the name of an optional (non-empty) numeric vector of data values.
9
12
# ' @param type a character which tells if the test is ran for the pooled data or not.
10
13
# ' By default \code{type} is set to 'combine' and a t.test of the pooled data is
31
34
# ' was a one-sample test or a two-sample test.
32
35
# ' \code{alternative} a character string describing the alternative hypothesis
33
36
# ' \code{method} a character string indicating what type of t-test was performed
37
+ # ' @return an object of type 'htest' if both x and y are continuous and a list otherwise.
34
38
# ' @author Isaeva, J.; Gaye, A.
35
39
# ' @export
36
40
# ' @examples {
37
41
# '
38
42
# ' # load that contains the login details
39
43
# ' data(logindata)
40
44
# '
41
- # ' # login and assign specific variable(s)
42
- # ' myvar <- list("LAB_HDL", "LAB_TSC")
43
- # ' opals <- datashield.login(logins=logindata,assign=TRUE,variables=myvar)
45
+ # ' # login and assign all the variables
46
+ # ' opals <- datashield.login(logins=logindata,assign=TRUE)
44
47
# '
45
48
# ' # Example 1: Run a t.test of the pooled data for the variables 'LAB_HDL' and 'LAB_TSC' - default
46
49
# ' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC')
47
- # '
48
- # ' # Example 2: Run a t.test for each study separately for the same variables as above
50
+ # '
51
+ # ' # Example 2: Run a test to compare the mean of a continuous variable across the two categories of a categorical variable
52
+ # ' s <- ds.tTest(x='D$PM_BMI_CONTINUOUS~D$GENDER')
53
+ # '
54
+ # ' # Example 3: Run a t.test for each study separately for the same variables as above
49
55
# ' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', type='split')
50
56
# '
51
- # ' # Example 3 : Run a paired t.test of the pooled data
57
+ # ' # Example 4 : Run a paired t.test of the pooled data
52
58
# ' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', paired=TRUE)
53
59
# '
54
- # ' # Example 4 : Run a paired t.test for each study separately
60
+ # ' # Example 5 : Run a paired t.test for each study separately
55
61
# ' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', paired=TRUE, type='split')
56
62
# '
57
- # ' # Example 5 : Run a t.test of the pooled data with different alternatives
63
+ # ' # Example 6 : Run a t.test of the pooled data with different alternatives
58
64
# ' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', alternative='greater')
59
65
# ' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', alternative='less')
60
66
# '
61
- # ' # Example 6 : Run a t.test of the pooled data with mu different from zero
67
+ # ' # Example 7 : Run a t.test of the pooled data with mu different from zero
62
68
# ' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', mu=-4)
63
69
# '
64
- # ' # Example 7 : Run a t.test of the pooled data assuming that variances of variables are equal
70
+ # ' # Example 8 : Run a t.test of the pooled data assuming that variances of variables are equal
65
71
# ' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', var.equal=TRUE)
66
72
# '
67
- # ' # Example 8 : Run a t.test of the pooled data with 90% confidence interval
73
+ # ' # Example 9 : Run a t.test of the pooled data with 90% confidence interval
68
74
# ' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', conf.level=0.90)
69
75
# '
70
- # ' # Example 9 : Run a one-sample t.test of the pooled data
76
+ # ' # Example 10 : Run a one-sample t.test of the pooled data
71
77
# ' ds.tTest(x='D$LAB_HDL')
72
78
# ' # the below example should not work, paired t.test is not possible if the 'y' variable is missing
73
79
# ' # ds.tTest(x='D$LAB_HDL', paired=TRUE)
@@ -85,268 +91,34 @@ ds.tTest <- function (x=NULL, y=NULL, type="combine", alternative="two.sided", m
85
91
}
86
92
87
93
if (is.null(x )){
88
- stop(" Please provide the name of the x vector!" , call. = FALSE )
89
- }
90
-
91
- # get the names of the variables used for the analysis
92
- # the input variable might be given as column table (i.e. D$x)
93
- # or just as a vector not attached to a table (i.e. x)
94
- # we have to make sure the function deals with each case
95
- if (is.null(y )){
96
- xname <- extract(x )
97
- dname = xname $ elements
98
- variables <- dname
99
- }else {
100
- xname <- extract(x )
101
- yname <- extract(y )
102
- dname1 = xname $ elements
103
- dname2 = yname $ elements
104
- variables <- c(dname1 , dname2 )
105
- dname = paste(dname1 , ' and' , dname2 )
94
+ stop(" Please provide the name of the x vector or a formula if performing t.test between a numeric and a factor vector!" , call. = FALSE )
106
95
}
107
96
108
- # call the function that checks theinput variables are defined in all the studies
109
- if (is.null(y )){
110
- obj2lookfor <- xname $ holders
111
- if (is.na(obj2lookfor )){
112
- defined <- isDefined(datasources , variables [1 ])
97
+ # check if the user provided two continuous variables or a formula to run t-test between a continuous and a facyor variable
98
+ # depending on what the user specified call the relevant function ('tTestHelper1' in the 1st case or 'tTesHelper2' ortherwise)
99
+ # tTeshelper
100
+ if (length(unlist(strsplit(x , split = ' ~' )))== 2 ){
101
+ if (type == " combine" ){
102
+ results <- tTestHelper2(x , conf.level , datasources )
113
103
}else {
114
- defined <- isDefined(datasources , obj2lookfor )
115
- }
116
- }else {
117
- obj2lookfor1 <- xname $ holders
118
- obj2lookfor2 <- yname $ holders
119
- if (is.na(obj2lookfor1 )){
120
- defined <- isDefined(datasources , variables [1 ])
121
- }else {
122
- defined <- isDefined(datasources , obj2lookfor1 )
123
- }
124
- if (is.na(obj2lookfor2 )){
125
- defined <- isDefined(datasources , variables [2 ])
126
- }else {
127
- defined <- isDefined(datasources , obj2lookfor2 )
128
- }
129
- }
130
-
131
- # call the internal function that checks an input object is of the same class in all studies.
132
- if (is.null(y )){
133
- typ1 <- checkClass(datasources , x )
134
- }else {
135
- typ1 <- checkClass(datasources , x )
136
- typ2 <- checkClass(datasources , y )
137
- }
138
-
139
- # number of studies
140
- num.sources = length(datasources )
141
-
142
- if (type == " combine" ){
143
-
144
- # Performs t-test on merged data sets
145
- if (! missing(mu ) && (length(mu ) != 1 || is.na(mu )))
146
- stop(" 'mu' must be a single number" )
147
- if (! missing(conf.level ) && (length(conf.level ) != 1 || ! is.finite(conf.level ) ||
148
- conf.level < 0 || conf.level > 1 ))
149
- stop(" 'conf.level' must be a single number between 0 and 1" )
150
- if (! is.null(y )) {
151
- if (paired ) {
152
- cally = paste0(" complete.cases(" , x , " ," , y , " )" )
153
- datashield.assign(datasources , ' pair.compl.obs' , as.symbol(cally ))
154
- cally = paste0(" subsetDS('" , x , " ', pair.compl.obs)" )
155
- datashield.assign(datasources , ' xok' , as.symbol(cally ))
156
- cally = paste0(" subsetDS('" , y , " ', pair.compl.obs)" )
157
- datashield.assign(datasources , ' yok' , as.symbol(cally ))
158
- } else {
159
- cally = paste0(" complete.cases(" , x , " )" )
160
- datashield.assign(datasources , ' not.na.x' , as.symbol(cally ))
161
- cally = paste0(" subsetDS('" , x , " ',not.na.x)" )
162
- datashield.assign(datasources , ' xok' , as.symbol(cally ))
163
-
164
- cally = paste0(" complete.cases(" , y , " )" )
165
- datashield.assign(datasources , ' not.na.y' , as.symbol(cally ))
166
- cally = paste0(" subsetDS('" , y , " ',not.na.y)" )
167
- datashield.assign(datasources , ' yok' , as.symbol(cally ))
168
- }
169
- } else {
170
- if (paired )
171
- stop(" 'y' is missing for paired test" )
172
- cally = paste0(" complete.cases(" , x , " )" )
173
- datashield.assign(datasources , ' not.na.x' , as.symbol(cally ))
174
- cally = paste0(" subsetDS('" , x , " ', not.na.x)" )
175
- datashield.assign(datasources , ' xok' , as.symbol(cally ))
176
- cally = paste0(" as.null(" , x , " )" )
177
- datashield.assign(datasources , ' yok' , as.symbol(cally )) # does not matter that as.null(x) since we just want to set y to NULL
178
- }
179
-
180
- if (paired ) {
181
- cally = paste0(" (yok)" ," *(" ,- 1 ," )" )
182
- datashield.assign(datasources , ' minus_y' , as.symbol(cally ))
183
- # datashield.assign(datasources, 'dummy', quote(cbind(xok, minus_y)))
184
- datashield.assign(datasources , ' xok' , as.symbol(" xok+minus_y" ))
185
- datashield.assign(datasources , ' yok' , as.symbol(" as.null(yok)" ))
186
- }
187
-
188
- length.local.x = datashield.aggregate(datasources , as.symbol(" NROW(xok)" ))
189
- mean.local.x = datashield.aggregate(datasources , as.symbol(" meanDS(xok)" ))
190
- var.local.x = datashield.aggregate(datasources , as.symbol(" varDS(xok)" ))
191
-
192
- length.total.x = 0
193
- sum.weighted.x = 0
194
-
195
- for (i in 1 : num.sources )
196
- if (! is.null(length.local.x [[i ]]))
197
- length.total.x = length.total.x + length.local.x [[i ]]
198
-
199
- for (i in 1 : num.sources )
200
- if (! is.null(length.local.x [[i ]]))
201
- sum.weighted.x = sum.weighted.x + length.local.x [[i ]]* mean.local.x [[i ]]
202
-
203
- if (! is.na(sum.weighted.x ))
204
- mean.global.x = sum.weighted.x / length.total.x else
205
- stop(paste(" Check the data supplied: global " , variables [1 ], " mean is NA" , sep = " " ))
206
- estimate = mean.global.x
207
-
208
- nrows_var.x = NROW(var.local.x [[1 ]])
209
- ncols_var.x = NCOL(var.local.x [[1 ]])
210
- dummy.sum.x = matrix (0 , nrows_var.x , ncols_var.x )
211
-
212
- for (i in 1 : num.sources ) {
213
- if (! is.null(var.local.x [[i ]]) & ! is.null(mean.local.x [[i ]]))
214
- if (! is.na(var.local.x [[i ]]) & ! is.na(mean.local.x [[i ]])) {
215
- var.weight.x = (length.local.x [[i ]]- 1 )* var.local.x [[i ]]
216
- add.elem.x = length.local.x [[i ]]* (mean.local.x [[i ]]%x% t(mean.local.x [[i ]]))
217
- dummy.sum.x = dummy.sum.x + var.weight.x + add.elem.x
218
- }
219
- }
220
- mean.global.products.x = length.total.x * (mean.global.x %x% t(mean.global.x ))
221
- var.global.x = 1 / (length.total.x - 1 )* (dummy.sum.x - mean.global.products.x )
222
-
223
- null.y = datashield.aggregate(datasources , as.symbol(" is.null(yok)" ))
224
- null.y = unlist(null.y )
225
-
226
- if (all(null.y )) {
227
- if (length.total.x < 2 )
228
- stop(" not enough 'x' observations" )
229
- df <- length.total.x - 1
230
- stderr <- sqrt(var.global.x / length.total.x )
231
- if (stderr < 10 * .Machine $ double.eps * abs(mean.global.x ))
232
- stop(" data are essentially constant" )
233
- tstat <- (mean.global.x - mu )/ stderr
234
- method <- ifelse(paired , " Paired t-test" , " One Sample t-test" )
235
- names(estimate ) <- ifelse(paired , " mean of the differences" , paste(" mean of" , variables [1 ], sep = " " ))
236
- } else {
237
- length.local.y = datashield.aggregate(datasources , as.symbol(" NROW(yok)" ))
238
-
239
- length.total.y = 0
240
- sum.weighted.y = 0
241
-
242
- for (i in 1 : num.sources )
243
- if (! is.null(length.local.y [[i ]]))
244
- length.total.y = length.total.y + length.local.y [[i ]]
245
-
246
- if (length.total.x < 1 || (! var.equal && length.total.x < 2 ))
247
- stop(paste(" not enough " , variables [1 ], " observations" , sep = " " ))
248
- if (length.total.y < 1 || (! var.equal && length.total.y < 2 ))
249
- stop(paste(" not enough " , variables [2 ], " observations" , sep = " " ))
250
- if (var.equal && length.total.x + length.total.y < 3 )
251
- stop(" not enough observations" )
252
-
253
- mean.local.y = datashield.aggregate(datasources , as.symbol(" meanDS(yok)" ))
254
- var.local.y = datashield.aggregate(datasources , as.symbol(" varDS(yok)" ))
255
- method <- paste(if (! var.equal )
256
- " Welch" , " Two Sample t-test" )
257
-
258
- length.total.y = 0
259
- sum.weighted.y = 0
260
-
261
- for (i in 1 : num.sources )
262
- if (! is.null(length.local.y [[i ]])) {
263
- length.total.y = length.total.y + length.local.y [[i ]]
264
- sum.weighted.y = sum.weighted.y + length.local.y [[i ]]* mean.local.y [[i ]]
104
+ if (type == " split" ){
105
+ results <- vector(" list" , length(datasources ))
106
+ for (i in 1 : length(datasources )){
107
+ message(paste0(" ----" ,names(datasources )[i ], " ----" ))
108
+ out <- tTestHelper2(x , conf.level , datasources [i ])
109
+ results [[i ]] <- out
110
+ message(" " )
111
+ rm(out )
265
112
}
266
- if (! is.na(sum.weighted.y ))
267
- mean.global.y = sum.weighted.y / length.total.y else
268
- stop(paste(" Check the data supplied: global " , variables [2 ], " mean is NA" , sep = " " ))
269
-
270
- estimate <- c(mean.global.x , mean.global.y )
271
- names(estimate ) <- c(paste(" mean of " , variables [1 ], sep = " " ), paste(" mean of " , variables [2 ], sep = " " ))
272
-
273
- nrows_var.y = NROW(var.local.y [[1 ]])
274
- ncols_var.y = NCOL(var.local.y [[1 ]])
275
- dummy.sum.y = matrix (0 , nrows_var.y , ncols_var.y )
276
-
277
- for (i in 1 : num.sources ) {
278
- if (! is.null(var.local.y [[i ]]) & ! is.null(mean.local.y [[i ]]))
279
- if (! is.na(var.local.y [[i ]]) & ! is.na(mean.local.y [[i ]])) {
280
- var.weight.y = (length.local.y [[i ]]- 1 )* var.local.y [[i ]]
281
- add.elem.y = length.local.y [[i ]]* (mean.local.y [[i ]]%x% t(mean.local.y [[i ]]))
282
- dummy.sum.y = dummy.sum.y + var.weight.y + add.elem.y
283
- }
113
+ names(results ) <- names(datasources )
114
+ }else {
115
+ stop(' Function argument "type" has to be either "combine" or "split"' )
284
116
}
285
- mean.global.products.y = length.total.y * (mean.global.y %x% t(mean.global.y ))
286
- var.global.y = 1 / (length.total.y - 1 )* (dummy.sum.y - mean.global.products.y )
287
-
288
- if (var.equal ) {
289
- df <- length.total.x + length.total.x - 2
290
- v <- 0
291
- if (length.total.x > 1 )
292
- v <- v + (length.total.x - 1 ) * var.global.x
293
- if (length.total.y > 1 )
294
- v <- v + (length.total.y - 1 ) * var.global.y
295
- v <- v / df
296
- stderr <- sqrt(v * (1 / length.total.x + 1 / length.total.y ))
297
- } else {
298
- stderrx <- sqrt(var.global.x / length.total.x )
299
- stderry <- sqrt(var.global.y / length.total.y )
300
- stderr <- sqrt(stderrx ^ 2 + stderry ^ 2 )
301
- df <- stderr ^ 4 / (stderrx ^ 4 / (length.total.x - 1 ) + stderry ^ 4 / (length.total.y - 1 ))
302
- }
303
- if (stderr < 10 * .Machine $ double.eps * max(abs(mean.global.x ),
304
- abs(mean.global.y )))
305
- stop(" data are essentially constant" )
306
- tstat <- (mean.global.x - mean.global.y - mu )/ stderr
307
- }
308
-
309
-
310
- if (alternative == " less" ) {
311
- pval <- pt(tstat , df )
312
- cint <- c(- Inf , tstat + qt(conf.level , df ))
313
- } else if (alternative == " greater" ) {
314
- pval <- pt(tstat , df , lower.tail = FALSE )
315
- cint <- c(tstat - qt(conf.level , df ), Inf )
316
- } else {
317
- pval <- 2 * pt(- abs(tstat ), df )
318
- alpha <- 1 - conf.level
319
- cint <- qt(1 - alpha / 2 , df )
320
- cint <- tstat + c(- cint , cint )
321
117
}
322
- cint <- mu + cint * stderr
323
- names(tstat ) <- " t"
324
- names(df ) <- " df"
325
- names(mu ) <- if (paired || ! is.null(y ))
326
- " difference in means" else
327
- " mean"
328
- attr(cint , " conf.level" ) <- conf.level
329
- rval <- list (statistic = tstat , parameter = df , p.value = pval ,
330
- conf.int = cint , estimate = estimate , null.value = mu ,
331
- alternative = alternative , method = method , data.name = dname )
332
- class(rval ) <- " htest"
333
-
334
- # delete files that are no more required
335
- datashield.rm(datasources , ' pair.compl.obs' )
336
- datashield.rm(datasources , ' xok' )
337
- datashield.rm(datasources , ' yok' )
338
- datashield.rm(datasources , ' not.na.x' )
339
- datashield.rm(datasources , ' minus_y' )
340
-
341
- return (rval )
342
-
118
+
343
119
}else {
344
- if (type == " split" ){
345
- cally <- paste0(" t.test(" , x , " ," , y , " ,alternative='" ,alternative , " ',mu=" ,mu , " ,paired=" ,paired , " ,var.equal=" ,var.equal , " ,conf.level=" ,conf.level ," )" )
346
- results <- datashield.aggregate(datasources , as.symbol(cally ))
347
- return (results )
348
- }else {
349
- stop(' Function argument "type" has to be either "combine" or "split"' )
350
- }
120
+ results <- tTestHelper1(x , y , type , alternative , mu , paired , var.equal , conf.level , datasources )
351
121
}
122
+
123
+ return (results )
352
124
}
0 commit comments