Skip to content
This repository has been archived by the owner on Oct 22, 2019. It is now read-only.

Commit

Permalink
Amended ds.tTest to allow for a test between a continuous and a categ…
Browse files Browse the repository at this point in the history
…orical variable.
  • Loading branch information
agaye committed Mar 24, 2015
1 parent a8723e8 commit d70a841
Show file tree
Hide file tree
Showing 6 changed files with 613 additions and 283 deletions.
312 changes: 42 additions & 270 deletions R/ds.tTest.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
#' @description Performs one and two sample t-tests on vectors of data.
#' @details Summary statistics are obtained from each of the data sets that are located on the
#' distinct computers/servers. And then grand means and variances are calculated. Those are used
#' for performing t-test.
#' @param x a character, the name of a (non-empty) numeric vector of data values.
#' for performing t-test. The funtion allows for the calculation of t-test between two continuous variables
#' or between a continuous and a factor variable; the latter option requires a formula (see parameter \code{dataframe}).
#' If a formula is provided all other but 'conf.level=0.95' are ignored.
#' @param x a character, the name of a (non-empty) numeric vector of data values or a formula of the
#' form 'a~b' where 'a' is the name of a continuous variable and 'b' that of a factor variable.
#' @param y a character, the name of an optional (non-empty) numeric vector of data values.
#' @param type a character which tells if the test is ran for the pooled data or not.
#' By default \code{type} is set to 'combine' and a t.test of the pooled data is
Expand All @@ -31,43 +34,46 @@
#' was a one-sample test or a two-sample test.
#' \code{alternative} a character string describing the alternative hypothesis
#' \code{method} a character string indicating what type of t-test was performed
#' @return an object of type 'htest' if both x and y are continuous and a list otherwise.
#' @author Isaeva, J.; Gaye, A.
#' @export
#' @examples {
#'
#' # load that contains the login details
#' data(logindata)
#'
#' # login and assign specific variable(s)
#' myvar <- list("LAB_HDL", "LAB_TSC")
#' opals <- datashield.login(logins=logindata,assign=TRUE,variables=myvar)
#' # login and assign all the variables
#' opals <- datashield.login(logins=logindata,assign=TRUE)
#'
#' # Example 1: Run a t.test of the pooled data for the variables 'LAB_HDL' and 'LAB_TSC' - default
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC')
#'
#' # Example 2: Run a t.test for each study separately for the same variables as above
#'
#' # Example 2: Run a test to compare the mean of a continuous variable across the two categories of a categorical variable
#' s <- ds.tTest(x='D$PM_BMI_CONTINUOUS~D$GENDER')
#'
#' # Example 3: Run a t.test for each study separately for the same variables as above
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', type='split')
#'
#' # Example 3: Run a paired t.test of the pooled data
#' # Example 4: Run a paired t.test of the pooled data
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', paired=TRUE)
#'
#' # Example 4: Run a paired t.test for each study separately
#' # Example 5: Run a paired t.test for each study separately
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', paired=TRUE, type='split')
#'
#' # Example 5: Run a t.test of the pooled data with different alternatives
#' # Example 6: Run a t.test of the pooled data with different alternatives
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', alternative='greater')
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', alternative='less')
#'
#' # Example 6: Run a t.test of the pooled data with mu different from zero
#' # Example 7: Run a t.test of the pooled data with mu different from zero
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', mu=-4)
#'
#' # Example 7: Run a t.test of the pooled data assuming that variances of variables are equal
#' # Example 8: Run a t.test of the pooled data assuming that variances of variables are equal
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', var.equal=TRUE)
#'
#' # Example 8: Run a t.test of the pooled data with 90% confidence interval
#' # Example 9: Run a t.test of the pooled data with 90% confidence interval
#' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', conf.level=0.90)
#'
#' # Example 9: Run a one-sample t.test of the pooled data
#' # Example 10: Run a one-sample t.test of the pooled data
#' ds.tTest(x='D$LAB_HDL')
#' # the below example should not work, paired t.test is not possible if the 'y' variable is missing
#' # ds.tTest(x='D$LAB_HDL', paired=TRUE)
Expand All @@ -85,268 +91,34 @@ ds.tTest <- function (x=NULL, y=NULL, type="combine", alternative="two.sided", m
}

if(is.null(x)){
stop("Please provide the name of the x vector!", call.=FALSE)
}

# get the names of the variables used for the analysis
# the input variable might be given as column table (i.e. D$x)
# or just as a vector not attached to a table (i.e. x)
# we have to make sure the function deals with each case
if(is.null(y)){
xname <- extract(x)
dname = xname$elements
variables <- dname
}else{
xname <- extract(x)
yname <- extract(y)
dname1 = xname$elements
dname2 = yname$elements
variables <- c(dname1, dname2)
dname = paste(dname1, 'and', dname2)
stop("Please provide the name of the x vector or a formula if performing t.test between a numeric and a factor vector!", call.=FALSE)
}

# call the function that checks theinput variables are defined in all the studies
if(is.null(y)){
obj2lookfor <- xname$holders
if(is.na(obj2lookfor)){
defined <- isDefined(datasources, variables[1])
# check if the user provided two continuous variables or a formula to run t-test between a continuous and a facyor variable
# depending on what the user specified call the relevant function ('tTestHelper1' in the 1st case or 'tTesHelper2' ortherwise)
# tTeshelper
if(length(unlist(strsplit(x, split='~')))==2){
if(type=="combine"){
results <- tTestHelper2(x, conf.level, datasources)
}else{
defined <- isDefined(datasources, obj2lookfor)
}
}else{
obj2lookfor1 <- xname$holders
obj2lookfor2 <- yname$holders
if(is.na(obj2lookfor1)){
defined <- isDefined(datasources, variables[1])
}else{
defined <- isDefined(datasources, obj2lookfor1)
}
if(is.na(obj2lookfor2)){
defined <- isDefined(datasources, variables[2])
}else{
defined <- isDefined(datasources, obj2lookfor2)
}
}

# call the internal function that checks an input object is of the same class in all studies.
if(is.null(y)){
typ1 <- checkClass(datasources, x)
}else{
typ1 <- checkClass(datasources, x)
typ2 <- checkClass(datasources, y)
}

# number of studies
num.sources = length(datasources)

if(type == "combine"){

# Performs t-test on merged data sets
if (!missing(mu) && (length(mu) != 1 || is.na(mu)))
stop("'mu' must be a single number")
if (!missing(conf.level) && (length(conf.level) != 1 || !is.finite(conf.level) ||
conf.level < 0 || conf.level > 1))
stop("'conf.level' must be a single number between 0 and 1")
if (!is.null(y)) {
if (paired) {
cally = paste0("complete.cases(", x, ",", y, ")")
datashield.assign(datasources, 'pair.compl.obs', as.symbol(cally))
cally = paste0("subsetDS('", x, "', pair.compl.obs)")
datashield.assign(datasources, 'xok', as.symbol(cally))
cally = paste0("subsetDS('", y, "', pair.compl.obs)")
datashield.assign(datasources, 'yok', as.symbol(cally))
} else {
cally = paste0("complete.cases(", x, ")")
datashield.assign(datasources, 'not.na.x', as.symbol(cally))
cally = paste0("subsetDS('", x, "',not.na.x)")
datashield.assign(datasources, 'xok', as.symbol(cally))

cally = paste0("complete.cases(", y, ")")
datashield.assign(datasources, 'not.na.y', as.symbol(cally))
cally = paste0("subsetDS('", y, "',not.na.y)")
datashield.assign(datasources, 'yok', as.symbol(cally))
}
} else {
if (paired)
stop("'y' is missing for paired test")
cally = paste0("complete.cases(", x, ")")
datashield.assign(datasources, 'not.na.x', as.symbol(cally))
cally = paste0("subsetDS('", x, "', not.na.x)")
datashield.assign(datasources, 'xok', as.symbol(cally))
cally = paste0("as.null(", x, ")")
datashield.assign(datasources, 'yok', as.symbol(cally)) # does not matter that as.null(x) since we just want to set y to NULL
}

if (paired) {
cally = paste0("(yok)","*(",-1,")")
datashield.assign(datasources, 'minus_y', as.symbol(cally))
# datashield.assign(datasources, 'dummy', quote(cbind(xok, minus_y)))
datashield.assign(datasources, 'xok', as.symbol("xok+minus_y"))
datashield.assign(datasources, 'yok', as.symbol("as.null(yok)"))
}

length.local.x = datashield.aggregate(datasources, as.symbol("NROW(xok)"))
mean.local.x = datashield.aggregate(datasources, as.symbol("meanDS(xok)"))
var.local.x = datashield.aggregate(datasources, as.symbol("varDS(xok)"))

length.total.x = 0
sum.weighted.x = 0

for (i in 1:num.sources)
if (!is.null(length.local.x[[i]]))
length.total.x = length.total.x + length.local.x[[i]]

for (i in 1:num.sources)
if (!is.null(length.local.x[[i]]))
sum.weighted.x = sum.weighted.x + length.local.x[[i]]*mean.local.x[[i]]

if (!is.na(sum.weighted.x))
mean.global.x = sum.weighted.x/length.total.x else
stop(paste("Check the data supplied: global ", variables[1], " mean is NA", sep=""))
estimate = mean.global.x

nrows_var.x = NROW(var.local.x[[1]])
ncols_var.x = NCOL(var.local.x[[1]])
dummy.sum.x = matrix(0, nrows_var.x, ncols_var.x)

for (i in 1:num.sources) {
if (!is.null(var.local.x[[i]]) & !is.null(mean.local.x[[i]]))
if (!is.na(var.local.x[[i]]) & !is.na(mean.local.x[[i]])) {
var.weight.x = (length.local.x[[i]]-1)*var.local.x[[i]]
add.elem.x = length.local.x[[i]]*(mean.local.x[[i]]%x%t(mean.local.x[[i]]))
dummy.sum.x = dummy.sum.x +var.weight.x+add.elem.x
}
}
mean.global.products.x = length.total.x*(mean.global.x%x%t(mean.global.x))
var.global.x = 1/(length.total.x-1)*(dummy.sum.x-mean.global.products.x)

null.y = datashield.aggregate(datasources, as.symbol("is.null(yok)"))
null.y = unlist(null.y)

if (all(null.y)) {
if (length.total.x < 2)
stop("not enough 'x' observations")
df <- length.total.x - 1
stderr <- sqrt(var.global.x/length.total.x)
if (stderr < 10 * .Machine$double.eps * abs(mean.global.x))
stop("data are essentially constant")
tstat <- (mean.global.x - mu)/stderr
method <- ifelse(paired, "Paired t-test", "One Sample t-test")
names(estimate) <- ifelse(paired, "mean of the differences", paste("mean of", variables[1], sep=""))
} else {
length.local.y = datashield.aggregate(datasources, as.symbol("NROW(yok)"))

length.total.y = 0
sum.weighted.y = 0

for (i in 1:num.sources)
if (!is.null(length.local.y[[i]]))
length.total.y = length.total.y + length.local.y[[i]]

if (length.total.x < 1 || (!var.equal && length.total.x < 2))
stop(paste("not enough ", variables[1], "observations", sep=""))
if (length.total.y < 1 || (!var.equal && length.total.y < 2))
stop(paste("not enough ", variables[2], "observations", sep=""))
if (var.equal && length.total.x + length.total.y < 3)
stop("not enough observations")

mean.local.y = datashield.aggregate(datasources, as.symbol("meanDS(yok)"))
var.local.y = datashield.aggregate(datasources, as.symbol("varDS(yok)"))
method <- paste(if (!var.equal)
"Welch", "Two Sample t-test")

length.total.y = 0
sum.weighted.y = 0

for (i in 1:num.sources)
if (!is.null(length.local.y[[i]])) {
length.total.y = length.total.y + length.local.y[[i]]
sum.weighted.y = sum.weighted.y + length.local.y[[i]]*mean.local.y[[i]]
if(type=="split"){
results <- vector("list", length(datasources))
for(i in 1:length(datasources)){
message(paste0("----",names(datasources)[i], "----"))
out <- tTestHelper2(x, conf.level, datasources[i])
results[[i]] <- out
message(" ")
rm(out)
}
if (!is.na(sum.weighted.y))
mean.global.y = sum.weighted.y/length.total.y else
stop(paste("Check the data supplied: global ", variables[2], " mean is NA", sep=""))

estimate <- c(mean.global.x, mean.global.y)
names(estimate) <- c(paste("mean of ", variables[1], sep=""), paste("mean of ", variables[2], sep=""))

nrows_var.y = NROW(var.local.y[[1]])
ncols_var.y = NCOL(var.local.y[[1]])
dummy.sum.y = matrix(0, nrows_var.y, ncols_var.y)

for (i in 1:num.sources) {
if (!is.null(var.local.y[[i]]) & !is.null(mean.local.y[[i]]))
if (!is.na(var.local.y[[i]]) & !is.na(mean.local.y[[i]])) {
var.weight.y = (length.local.y[[i]]-1)*var.local.y[[i]]
add.elem.y = length.local.y[[i]]*(mean.local.y[[i]]%x%t(mean.local.y[[i]]))
dummy.sum.y = dummy.sum.y +var.weight.y+add.elem.y
}
names(results) <- names(datasources)
}else{
stop('Function argument "type" has to be either "combine" or "split"')
}
mean.global.products.y = length.total.y*(mean.global.y%x%t(mean.global.y))
var.global.y = 1/(length.total.y-1)*(dummy.sum.y-mean.global.products.y)

if (var.equal) {
df <- length.total.x + length.total.x - 2
v <- 0
if (length.total.x > 1)
v <- v + (length.total.x - 1) * var.global.x
if (length.total.y > 1)
v <- v + (length.total.y - 1) * var.global.y
v <- v/df
stderr <- sqrt(v * (1/length.total.x + 1/length.total.y))
} else {
stderrx <- sqrt(var.global.x/length.total.x)
stderry <- sqrt(var.global.y/length.total.y)
stderr <- sqrt(stderrx^2 + stderry^2)
df <- stderr^4/(stderrx^4/(length.total.x - 1) + stderry^4/(length.total.y - 1))
}
if (stderr < 10 * .Machine$double.eps * max(abs(mean.global.x),
abs(mean.global.y)))
stop("data are essentially constant")
tstat <- (mean.global.x - mean.global.y - mu)/stderr
}


if (alternative == "less") {
pval <- pt(tstat, df)
cint <- c(-Inf, tstat + qt(conf.level, df))
} else if (alternative == "greater") {
pval <- pt(tstat, df, lower.tail = FALSE)
cint <- c(tstat - qt(conf.level, df), Inf)
} else {
pval <- 2 * pt(-abs(tstat), df)
alpha <- 1 - conf.level
cint <- qt(1 - alpha/2, df)
cint <- tstat + c(-cint, cint)
}
cint <- mu + cint * stderr
names(tstat) <- "t"
names(df) <- "df"
names(mu) <- if (paired || !is.null(y))
"difference in means" else
"mean"
attr(cint, "conf.level") <- conf.level
rval <- list(statistic = tstat, parameter = df, p.value = pval,
conf.int = cint, estimate = estimate, null.value = mu,
alternative = alternative, method = method, data.name=dname)
class(rval) <- "htest"

# delete files that are no more required
datashield.rm(datasources, 'pair.compl.obs')
datashield.rm(datasources, 'xok')
datashield.rm(datasources, 'yok')
datashield.rm(datasources, 'not.na.x')
datashield.rm(datasources, 'minus_y')

return(rval)


}else{
if(type == "split"){
cally <- paste0("t.test(", x, ",", y, ",alternative='",alternative, "',mu=",mu, ",paired=",paired, ",var.equal=",var.equal, ",conf.level=",conf.level,")")
results <- datashield.aggregate(datasources, as.symbol(cally))
return(results)
}else{
stop('Function argument "type" has to be either "combine" or "split"')
}
results <- tTestHelper1(x, y, type, alternative, mu, paired, var.equal, conf.level, datasources)
}

return(results)
}
Loading

0 comments on commit d70a841

Please sign in to comment.