Skip to content
This repository was archived by the owner on Oct 22, 2019. It is now read-only.

Commit d70a841

Browse files
author
agaye
committed
Amended ds.tTest to allow for a test between a continuous and a categorical variable.
1 parent a8723e8 commit d70a841

File tree

6 files changed

+613
-283
lines changed

6 files changed

+613
-283
lines changed

R/ds.tTest.R

Lines changed: 42 additions & 270 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,11 @@
33
#' @description Performs one and two sample t-tests on vectors of data.
44
#' @details Summary statistics are obtained from each of the data sets that are located on the
55
#' distinct computers/servers. And then grand means and variances are calculated. Those are used
6-
#' for performing t-test.
7-
#' @param x a character, the name of a (non-empty) numeric vector of data values.
6+
#' for performing t-test. The funtion allows for the calculation of t-test between two continuous variables
7+
#' or between a continuous and a factor variable; the latter option requires a formula (see parameter \code{dataframe}).
8+
#' If a formula is provided all other but 'conf.level=0.95' are ignored.
9+
#' @param x a character, the name of a (non-empty) numeric vector of data values or a formula of the
10+
#' form 'a~b' where 'a' is the name of a continuous variable and 'b' that of a factor variable.
811
#' @param y a character, the name of an optional (non-empty) numeric vector of data values.
912
#' @param type a character which tells if the test is ran for the pooled data or not.
1013
#' By default \code{type} is set to 'combine' and a t.test of the pooled data is
@@ -31,43 +34,46 @@
3134
#' was a one-sample test or a two-sample test.
3235
#' \code{alternative} a character string describing the alternative hypothesis
3336
#' \code{method} a character string indicating what type of t-test was performed
37+
#' @return an object of type 'htest' if both x and y are continuous and a list otherwise.
3438
#' @author Isaeva, J.; Gaye, A.
3539
#' @export
3640
#' @examples {
3741
#'
3842
#' # load that contains the login details
3943
#' data(logindata)
4044
#'
41-
#' # login and assign specific variable(s)
42-
#' myvar <- list("LAB_HDL", "LAB_TSC")
43-
#' opals <- datashield.login(logins=logindata,assign=TRUE,variables=myvar)
45+
#' # login and assign all the variables
46+
#' opals <- datashield.login(logins=logindata,assign=TRUE)
4447
#'
4548
#' # Example 1: Run a t.test of the pooled data for the variables 'LAB_HDL' and 'LAB_TSC' - default
4649
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC')
47-
#'
48-
#' # Example 2: Run a t.test for each study separately for the same variables as above
50+
#'
51+
#' # Example 2: Run a test to compare the mean of a continuous variable across the two categories of a categorical variable
52+
#' s <- ds.tTest(x='D$PM_BMI_CONTINUOUS~D$GENDER')
53+
#'
54+
#' # Example 3: Run a t.test for each study separately for the same variables as above
4955
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', type='split')
5056
#'
51-
#' # Example 3: Run a paired t.test of the pooled data
57+
#' # Example 4: Run a paired t.test of the pooled data
5258
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', paired=TRUE)
5359
#'
54-
#' # Example 4: Run a paired t.test for each study separately
60+
#' # Example 5: Run a paired t.test for each study separately
5561
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', paired=TRUE, type='split')
5662
#'
57-
#' # Example 5: Run a t.test of the pooled data with different alternatives
63+
#' # Example 6: Run a t.test of the pooled data with different alternatives
5864
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', alternative='greater')
5965
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', alternative='less')
6066
#'
61-
#' # Example 6: Run a t.test of the pooled data with mu different from zero
67+
#' # Example 7: Run a t.test of the pooled data with mu different from zero
6268
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', mu=-4)
6369
#'
64-
#' # Example 7: Run a t.test of the pooled data assuming that variances of variables are equal
70+
#' # Example 8: Run a t.test of the pooled data assuming that variances of variables are equal
6571
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', var.equal=TRUE)
6672
#'
67-
#' # Example 8: Run a t.test of the pooled data with 90% confidence interval
73+
#' # Example 9: Run a t.test of the pooled data with 90% confidence interval
6874
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', conf.level=0.90)
6975
#'
70-
#' # Example 9: Run a one-sample t.test of the pooled data
76+
#' # Example 10: Run a one-sample t.test of the pooled data
7177
#' ds.tTest(x='D$LAB_HDL')
7278
#' # the below example should not work, paired t.test is not possible if the 'y' variable is missing
7379
#' # ds.tTest(x='D$LAB_HDL', paired=TRUE)
@@ -85,268 +91,34 @@ ds.tTest <- function (x=NULL, y=NULL, type="combine", alternative="two.sided", m
8591
}
8692

8793
if(is.null(x)){
88-
stop("Please provide the name of the x vector!", call.=FALSE)
89-
}
90-
91-
# get the names of the variables used for the analysis
92-
# the input variable might be given as column table (i.e. D$x)
93-
# or just as a vector not attached to a table (i.e. x)
94-
# we have to make sure the function deals with each case
95-
if(is.null(y)){
96-
xname <- extract(x)
97-
dname = xname$elements
98-
variables <- dname
99-
}else{
100-
xname <- extract(x)
101-
yname <- extract(y)
102-
dname1 = xname$elements
103-
dname2 = yname$elements
104-
variables <- c(dname1, dname2)
105-
dname = paste(dname1, 'and', dname2)
94+
stop("Please provide the name of the x vector or a formula if performing t.test between a numeric and a factor vector!", call.=FALSE)
10695
}
10796

108-
# call the function that checks theinput variables are defined in all the studies
109-
if(is.null(y)){
110-
obj2lookfor <- xname$holders
111-
if(is.na(obj2lookfor)){
112-
defined <- isDefined(datasources, variables[1])
97+
# check if the user provided two continuous variables or a formula to run t-test between a continuous and a facyor variable
98+
# depending on what the user specified call the relevant function ('tTestHelper1' in the 1st case or 'tTesHelper2' ortherwise)
99+
# tTeshelper
100+
if(length(unlist(strsplit(x, split='~')))==2){
101+
if(type=="combine"){
102+
results <- tTestHelper2(x, conf.level, datasources)
113103
}else{
114-
defined <- isDefined(datasources, obj2lookfor)
115-
}
116-
}else{
117-
obj2lookfor1 <- xname$holders
118-
obj2lookfor2 <- yname$holders
119-
if(is.na(obj2lookfor1)){
120-
defined <- isDefined(datasources, variables[1])
121-
}else{
122-
defined <- isDefined(datasources, obj2lookfor1)
123-
}
124-
if(is.na(obj2lookfor2)){
125-
defined <- isDefined(datasources, variables[2])
126-
}else{
127-
defined <- isDefined(datasources, obj2lookfor2)
128-
}
129-
}
130-
131-
# call the internal function that checks an input object is of the same class in all studies.
132-
if(is.null(y)){
133-
typ1 <- checkClass(datasources, x)
134-
}else{
135-
typ1 <- checkClass(datasources, x)
136-
typ2 <- checkClass(datasources, y)
137-
}
138-
139-
# number of studies
140-
num.sources = length(datasources)
141-
142-
if(type == "combine"){
143-
144-
# Performs t-test on merged data sets
145-
if (!missing(mu) && (length(mu) != 1 || is.na(mu)))
146-
stop("'mu' must be a single number")
147-
if (!missing(conf.level) && (length(conf.level) != 1 || !is.finite(conf.level) ||
148-
conf.level < 0 || conf.level > 1))
149-
stop("'conf.level' must be a single number between 0 and 1")
150-
if (!is.null(y)) {
151-
if (paired) {
152-
cally = paste0("complete.cases(", x, ",", y, ")")
153-
datashield.assign(datasources, 'pair.compl.obs', as.symbol(cally))
154-
cally = paste0("subsetDS('", x, "', pair.compl.obs)")
155-
datashield.assign(datasources, 'xok', as.symbol(cally))
156-
cally = paste0("subsetDS('", y, "', pair.compl.obs)")
157-
datashield.assign(datasources, 'yok', as.symbol(cally))
158-
} else {
159-
cally = paste0("complete.cases(", x, ")")
160-
datashield.assign(datasources, 'not.na.x', as.symbol(cally))
161-
cally = paste0("subsetDS('", x, "',not.na.x)")
162-
datashield.assign(datasources, 'xok', as.symbol(cally))
163-
164-
cally = paste0("complete.cases(", y, ")")
165-
datashield.assign(datasources, 'not.na.y', as.symbol(cally))
166-
cally = paste0("subsetDS('", y, "',not.na.y)")
167-
datashield.assign(datasources, 'yok', as.symbol(cally))
168-
}
169-
} else {
170-
if (paired)
171-
stop("'y' is missing for paired test")
172-
cally = paste0("complete.cases(", x, ")")
173-
datashield.assign(datasources, 'not.na.x', as.symbol(cally))
174-
cally = paste0("subsetDS('", x, "', not.na.x)")
175-
datashield.assign(datasources, 'xok', as.symbol(cally))
176-
cally = paste0("as.null(", x, ")")
177-
datashield.assign(datasources, 'yok', as.symbol(cally)) # does not matter that as.null(x) since we just want to set y to NULL
178-
}
179-
180-
if (paired) {
181-
cally = paste0("(yok)","*(",-1,")")
182-
datashield.assign(datasources, 'minus_y', as.symbol(cally))
183-
# datashield.assign(datasources, 'dummy', quote(cbind(xok, minus_y)))
184-
datashield.assign(datasources, 'xok', as.symbol("xok+minus_y"))
185-
datashield.assign(datasources, 'yok', as.symbol("as.null(yok)"))
186-
}
187-
188-
length.local.x = datashield.aggregate(datasources, as.symbol("NROW(xok)"))
189-
mean.local.x = datashield.aggregate(datasources, as.symbol("meanDS(xok)"))
190-
var.local.x = datashield.aggregate(datasources, as.symbol("varDS(xok)"))
191-
192-
length.total.x = 0
193-
sum.weighted.x = 0
194-
195-
for (i in 1:num.sources)
196-
if (!is.null(length.local.x[[i]]))
197-
length.total.x = length.total.x + length.local.x[[i]]
198-
199-
for (i in 1:num.sources)
200-
if (!is.null(length.local.x[[i]]))
201-
sum.weighted.x = sum.weighted.x + length.local.x[[i]]*mean.local.x[[i]]
202-
203-
if (!is.na(sum.weighted.x))
204-
mean.global.x = sum.weighted.x/length.total.x else
205-
stop(paste("Check the data supplied: global ", variables[1], " mean is NA", sep=""))
206-
estimate = mean.global.x
207-
208-
nrows_var.x = NROW(var.local.x[[1]])
209-
ncols_var.x = NCOL(var.local.x[[1]])
210-
dummy.sum.x = matrix(0, nrows_var.x, ncols_var.x)
211-
212-
for (i in 1:num.sources) {
213-
if (!is.null(var.local.x[[i]]) & !is.null(mean.local.x[[i]]))
214-
if (!is.na(var.local.x[[i]]) & !is.na(mean.local.x[[i]])) {
215-
var.weight.x = (length.local.x[[i]]-1)*var.local.x[[i]]
216-
add.elem.x = length.local.x[[i]]*(mean.local.x[[i]]%x%t(mean.local.x[[i]]))
217-
dummy.sum.x = dummy.sum.x +var.weight.x+add.elem.x
218-
}
219-
}
220-
mean.global.products.x = length.total.x*(mean.global.x%x%t(mean.global.x))
221-
var.global.x = 1/(length.total.x-1)*(dummy.sum.x-mean.global.products.x)
222-
223-
null.y = datashield.aggregate(datasources, as.symbol("is.null(yok)"))
224-
null.y = unlist(null.y)
225-
226-
if (all(null.y)) {
227-
if (length.total.x < 2)
228-
stop("not enough 'x' observations")
229-
df <- length.total.x - 1
230-
stderr <- sqrt(var.global.x/length.total.x)
231-
if (stderr < 10 * .Machine$double.eps * abs(mean.global.x))
232-
stop("data are essentially constant")
233-
tstat <- (mean.global.x - mu)/stderr
234-
method <- ifelse(paired, "Paired t-test", "One Sample t-test")
235-
names(estimate) <- ifelse(paired, "mean of the differences", paste("mean of", variables[1], sep=""))
236-
} else {
237-
length.local.y = datashield.aggregate(datasources, as.symbol("NROW(yok)"))
238-
239-
length.total.y = 0
240-
sum.weighted.y = 0
241-
242-
for (i in 1:num.sources)
243-
if (!is.null(length.local.y[[i]]))
244-
length.total.y = length.total.y + length.local.y[[i]]
245-
246-
if (length.total.x < 1 || (!var.equal && length.total.x < 2))
247-
stop(paste("not enough ", variables[1], "observations", sep=""))
248-
if (length.total.y < 1 || (!var.equal && length.total.y < 2))
249-
stop(paste("not enough ", variables[2], "observations", sep=""))
250-
if (var.equal && length.total.x + length.total.y < 3)
251-
stop("not enough observations")
252-
253-
mean.local.y = datashield.aggregate(datasources, as.symbol("meanDS(yok)"))
254-
var.local.y = datashield.aggregate(datasources, as.symbol("varDS(yok)"))
255-
method <- paste(if (!var.equal)
256-
"Welch", "Two Sample t-test")
257-
258-
length.total.y = 0
259-
sum.weighted.y = 0
260-
261-
for (i in 1:num.sources)
262-
if (!is.null(length.local.y[[i]])) {
263-
length.total.y = length.total.y + length.local.y[[i]]
264-
sum.weighted.y = sum.weighted.y + length.local.y[[i]]*mean.local.y[[i]]
104+
if(type=="split"){
105+
results <- vector("list", length(datasources))
106+
for(i in 1:length(datasources)){
107+
message(paste0("----",names(datasources)[i], "----"))
108+
out <- tTestHelper2(x, conf.level, datasources[i])
109+
results[[i]] <- out
110+
message(" ")
111+
rm(out)
265112
}
266-
if (!is.na(sum.weighted.y))
267-
mean.global.y = sum.weighted.y/length.total.y else
268-
stop(paste("Check the data supplied: global ", variables[2], " mean is NA", sep=""))
269-
270-
estimate <- c(mean.global.x, mean.global.y)
271-
names(estimate) <- c(paste("mean of ", variables[1], sep=""), paste("mean of ", variables[2], sep=""))
272-
273-
nrows_var.y = NROW(var.local.y[[1]])
274-
ncols_var.y = NCOL(var.local.y[[1]])
275-
dummy.sum.y = matrix(0, nrows_var.y, ncols_var.y)
276-
277-
for (i in 1:num.sources) {
278-
if (!is.null(var.local.y[[i]]) & !is.null(mean.local.y[[i]]))
279-
if (!is.na(var.local.y[[i]]) & !is.na(mean.local.y[[i]])) {
280-
var.weight.y = (length.local.y[[i]]-1)*var.local.y[[i]]
281-
add.elem.y = length.local.y[[i]]*(mean.local.y[[i]]%x%t(mean.local.y[[i]]))
282-
dummy.sum.y = dummy.sum.y +var.weight.y+add.elem.y
283-
}
113+
names(results) <- names(datasources)
114+
}else{
115+
stop('Function argument "type" has to be either "combine" or "split"')
284116
}
285-
mean.global.products.y = length.total.y*(mean.global.y%x%t(mean.global.y))
286-
var.global.y = 1/(length.total.y-1)*(dummy.sum.y-mean.global.products.y)
287-
288-
if (var.equal) {
289-
df <- length.total.x + length.total.x - 2
290-
v <- 0
291-
if (length.total.x > 1)
292-
v <- v + (length.total.x - 1) * var.global.x
293-
if (length.total.y > 1)
294-
v <- v + (length.total.y - 1) * var.global.y
295-
v <- v/df
296-
stderr <- sqrt(v * (1/length.total.x + 1/length.total.y))
297-
} else {
298-
stderrx <- sqrt(var.global.x/length.total.x)
299-
stderry <- sqrt(var.global.y/length.total.y)
300-
stderr <- sqrt(stderrx^2 + stderry^2)
301-
df <- stderr^4/(stderrx^4/(length.total.x - 1) + stderry^4/(length.total.y - 1))
302-
}
303-
if (stderr < 10 * .Machine$double.eps * max(abs(mean.global.x),
304-
abs(mean.global.y)))
305-
stop("data are essentially constant")
306-
tstat <- (mean.global.x - mean.global.y - mu)/stderr
307-
}
308-
309-
310-
if (alternative == "less") {
311-
pval <- pt(tstat, df)
312-
cint <- c(-Inf, tstat + qt(conf.level, df))
313-
} else if (alternative == "greater") {
314-
pval <- pt(tstat, df, lower.tail = FALSE)
315-
cint <- c(tstat - qt(conf.level, df), Inf)
316-
} else {
317-
pval <- 2 * pt(-abs(tstat), df)
318-
alpha <- 1 - conf.level
319-
cint <- qt(1 - alpha/2, df)
320-
cint <- tstat + c(-cint, cint)
321117
}
322-
cint <- mu + cint * stderr
323-
names(tstat) <- "t"
324-
names(df) <- "df"
325-
names(mu) <- if (paired || !is.null(y))
326-
"difference in means" else
327-
"mean"
328-
attr(cint, "conf.level") <- conf.level
329-
rval <- list(statistic = tstat, parameter = df, p.value = pval,
330-
conf.int = cint, estimate = estimate, null.value = mu,
331-
alternative = alternative, method = method, data.name=dname)
332-
class(rval) <- "htest"
333-
334-
# delete files that are no more required
335-
datashield.rm(datasources, 'pair.compl.obs')
336-
datashield.rm(datasources, 'xok')
337-
datashield.rm(datasources, 'yok')
338-
datashield.rm(datasources, 'not.na.x')
339-
datashield.rm(datasources, 'minus_y')
340-
341-
return(rval)
342-
118+
343119
}else{
344-
if(type == "split"){
345-
cally <- paste0("t.test(", x, ",", y, ",alternative='",alternative, "',mu=",mu, ",paired=",paired, ",var.equal=",var.equal, ",conf.level=",conf.level,")")
346-
results <- datashield.aggregate(datasources, as.symbol(cally))
347-
return(results)
348-
}else{
349-
stop('Function argument "type" has to be either "combine" or "split"')
350-
}
120+
results <- tTestHelper1(x, y, type, alternative, mu, paired, var.equal, conf.level, datasources)
351121
}
122+
123+
return(results)
352124
}

0 commit comments

Comments
 (0)