From d70a84175d36b3078097db1d96ad0e6d2047f475 Mon Sep 17 00:00:00 2001 From: agaye Date: Tue, 24 Mar 2015 00:05:05 -0400 Subject: [PATCH] Amended ds.tTest to allow for a test between a continuous and a categorical variable. --- R/ds.tTest.R | 312 ++++++-------------------------------------- R/tTestHelper1.R | 276 +++++++++++++++++++++++++++++++++++++++ R/tTestHelper2.R | 201 ++++++++++++++++++++++++++++ man/ds.tTest.Rd | 38 ++++-- man/tTestHelper1.Rd | 37 ++++++ man/tTestHelper2.Rd | 32 +++++ 6 files changed, 613 insertions(+), 283 deletions(-) create mode 100644 R/tTestHelper1.R create mode 100644 R/tTestHelper2.R create mode 100644 man/tTestHelper1.Rd create mode 100644 man/tTestHelper2.Rd diff --git a/R/ds.tTest.R b/R/ds.tTest.R index aebc174..782767e 100644 --- a/R/ds.tTest.R +++ b/R/ds.tTest.R @@ -3,8 +3,11 @@ #' @description Performs one and two sample t-tests on vectors of data. #' @details Summary statistics are obtained from each of the data sets that are located on the #' distinct computers/servers. And then grand means and variances are calculated. Those are used -#' for performing t-test. -#' @param x a character, the name of a (non-empty) numeric vector of data values. +#' for performing t-test. The funtion allows for the calculation of t-test between two continuous variables +#' or between a continuous and a factor variable; the latter option requires a formula (see parameter \code{dataframe}). +#' If a formula is provided all other but 'conf.level=0.95' are ignored. +#' @param x a character, the name of a (non-empty) numeric vector of data values or a formula of the +#' form 'a~b' where 'a' is the name of a continuous variable and 'b' that of a factor variable. #' @param y a character, the name of an optional (non-empty) numeric vector of data values. #' @param type a character which tells if the test is ran for the pooled data or not. #' By default \code{type} is set to 'combine' and a t.test of the pooled data is @@ -31,6 +34,7 @@ #' was a one-sample test or a two-sample test. #' \code{alternative} a character string describing the alternative hypothesis #' \code{method} a character string indicating what type of t-test was performed +#' @return an object of type 'htest' if both x and y are continuous and a list otherwise. #' @author Isaeva, J.; Gaye, A. #' @export #' @examples { @@ -38,36 +42,38 @@ #' # load that contains the login details #' data(logindata) #' -#' # login and assign specific variable(s) -#' myvar <- list("LAB_HDL", "LAB_TSC") -#' opals <- datashield.login(logins=logindata,assign=TRUE,variables=myvar) +#' # login and assign all the variables +#' opals <- datashield.login(logins=logindata,assign=TRUE) #' #' # Example 1: Run a t.test of the pooled data for the variables 'LAB_HDL' and 'LAB_TSC' - default #' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC') -#' -#' # Example 2: Run a t.test for each study separately for the same variables as above +#' +#' # Example 2: Run a test to compare the mean of a continuous variable across the two categories of a categorical variable +#' s <- ds.tTest(x='D$PM_BMI_CONTINUOUS~D$GENDER') +#' +#' # Example 3: Run a t.test for each study separately for the same variables as above #' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', type='split') #' -#' # Example 3: Run a paired t.test of the pooled data +#' # Example 4: Run a paired t.test of the pooled data #' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', paired=TRUE) #' -#' # Example 4: Run a paired t.test for each study separately +#' # Example 5: Run a paired t.test for each study separately #' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', paired=TRUE, type='split') #' -#' # Example 5: Run a t.test of the pooled data with different alternatives +#' # Example 6: Run a t.test of the pooled data with different alternatives #' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', alternative='greater') #' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', alternative='less') #' -#' # Example 6: Run a t.test of the pooled data with mu different from zero +#' # Example 7: Run a t.test of the pooled data with mu different from zero #' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', mu=-4) #' -#' # Example 7: Run a t.test of the pooled data assuming that variances of variables are equal +#' # Example 8: Run a t.test of the pooled data assuming that variances of variables are equal #' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', var.equal=TRUE) #' -#' # Example 8: Run a t.test of the pooled data with 90% confidence interval +#' # Example 9: Run a t.test of the pooled data with 90% confidence interval #' ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', conf.level=0.90) #' -#' # Example 9: Run a one-sample t.test of the pooled data +#' # Example 10: Run a one-sample t.test of the pooled data #' ds.tTest(x='D$LAB_HDL') #' # the below example should not work, paired t.test is not possible if the 'y' variable is missing #' # ds.tTest(x='D$LAB_HDL', paired=TRUE) @@ -85,268 +91,34 @@ ds.tTest <- function (x=NULL, y=NULL, type="combine", alternative="two.sided", m } if(is.null(x)){ - stop("Please provide the name of the x vector!", call.=FALSE) - } - - # get the names of the variables used for the analysis - # the input variable might be given as column table (i.e. D$x) - # or just as a vector not attached to a table (i.e. x) - # we have to make sure the function deals with each case - if(is.null(y)){ - xname <- extract(x) - dname = xname$elements - variables <- dname - }else{ - xname <- extract(x) - yname <- extract(y) - dname1 = xname$elements - dname2 = yname$elements - variables <- c(dname1, dname2) - dname = paste(dname1, 'and', dname2) + stop("Please provide the name of the x vector or a formula if performing t.test between a numeric and a factor vector!", call.=FALSE) } - # call the function that checks theinput variables are defined in all the studies - if(is.null(y)){ - obj2lookfor <- xname$holders - if(is.na(obj2lookfor)){ - defined <- isDefined(datasources, variables[1]) + # check if the user provided two continuous variables or a formula to run t-test between a continuous and a facyor variable + # depending on what the user specified call the relevant function ('tTestHelper1' in the 1st case or 'tTesHelper2' ortherwise) + # tTeshelper + if(length(unlist(strsplit(x, split='~')))==2){ + if(type=="combine"){ + results <- tTestHelper2(x, conf.level, datasources) }else{ - defined <- isDefined(datasources, obj2lookfor) - } - }else{ - obj2lookfor1 <- xname$holders - obj2lookfor2 <- yname$holders - if(is.na(obj2lookfor1)){ - defined <- isDefined(datasources, variables[1]) - }else{ - defined <- isDefined(datasources, obj2lookfor1) - } - if(is.na(obj2lookfor2)){ - defined <- isDefined(datasources, variables[2]) - }else{ - defined <- isDefined(datasources, obj2lookfor2) - } - } - - # call the internal function that checks an input object is of the same class in all studies. - if(is.null(y)){ - typ1 <- checkClass(datasources, x) - }else{ - typ1 <- checkClass(datasources, x) - typ2 <- checkClass(datasources, y) - } - - # number of studies - num.sources = length(datasources) - - if(type == "combine"){ - - # Performs t-test on merged data sets - if (!missing(mu) && (length(mu) != 1 || is.na(mu))) - stop("'mu' must be a single number") - if (!missing(conf.level) && (length(conf.level) != 1 || !is.finite(conf.level) || - conf.level < 0 || conf.level > 1)) - stop("'conf.level' must be a single number between 0 and 1") - if (!is.null(y)) { - if (paired) { - cally = paste0("complete.cases(", x, ",", y, ")") - datashield.assign(datasources, 'pair.compl.obs', as.symbol(cally)) - cally = paste0("subsetDS('", x, "', pair.compl.obs)") - datashield.assign(datasources, 'xok', as.symbol(cally)) - cally = paste0("subsetDS('", y, "', pair.compl.obs)") - datashield.assign(datasources, 'yok', as.symbol(cally)) - } else { - cally = paste0("complete.cases(", x, ")") - datashield.assign(datasources, 'not.na.x', as.symbol(cally)) - cally = paste0("subsetDS('", x, "',not.na.x)") - datashield.assign(datasources, 'xok', as.symbol(cally)) - - cally = paste0("complete.cases(", y, ")") - datashield.assign(datasources, 'not.na.y', as.symbol(cally)) - cally = paste0("subsetDS('", y, "',not.na.y)") - datashield.assign(datasources, 'yok', as.symbol(cally)) - } - } else { - if (paired) - stop("'y' is missing for paired test") - cally = paste0("complete.cases(", x, ")") - datashield.assign(datasources, 'not.na.x', as.symbol(cally)) - cally = paste0("subsetDS('", x, "', not.na.x)") - datashield.assign(datasources, 'xok', as.symbol(cally)) - cally = paste0("as.null(", x, ")") - datashield.assign(datasources, 'yok', as.symbol(cally)) # does not matter that as.null(x) since we just want to set y to NULL - } - - if (paired) { - cally = paste0("(yok)","*(",-1,")") - datashield.assign(datasources, 'minus_y', as.symbol(cally)) - # datashield.assign(datasources, 'dummy', quote(cbind(xok, minus_y))) - datashield.assign(datasources, 'xok', as.symbol("xok+minus_y")) - datashield.assign(datasources, 'yok', as.symbol("as.null(yok)")) - } - - length.local.x = datashield.aggregate(datasources, as.symbol("NROW(xok)")) - mean.local.x = datashield.aggregate(datasources, as.symbol("meanDS(xok)")) - var.local.x = datashield.aggregate(datasources, as.symbol("varDS(xok)")) - - length.total.x = 0 - sum.weighted.x = 0 - - for (i in 1:num.sources) - if (!is.null(length.local.x[[i]])) - length.total.x = length.total.x + length.local.x[[i]] - - for (i in 1:num.sources) - if (!is.null(length.local.x[[i]])) - sum.weighted.x = sum.weighted.x + length.local.x[[i]]*mean.local.x[[i]] - - if (!is.na(sum.weighted.x)) - mean.global.x = sum.weighted.x/length.total.x else - stop(paste("Check the data supplied: global ", variables[1], " mean is NA", sep="")) - estimate = mean.global.x - - nrows_var.x = NROW(var.local.x[[1]]) - ncols_var.x = NCOL(var.local.x[[1]]) - dummy.sum.x = matrix(0, nrows_var.x, ncols_var.x) - - for (i in 1:num.sources) { - if (!is.null(var.local.x[[i]]) & !is.null(mean.local.x[[i]])) - if (!is.na(var.local.x[[i]]) & !is.na(mean.local.x[[i]])) { - var.weight.x = (length.local.x[[i]]-1)*var.local.x[[i]] - add.elem.x = length.local.x[[i]]*(mean.local.x[[i]]%x%t(mean.local.x[[i]])) - dummy.sum.x = dummy.sum.x +var.weight.x+add.elem.x - } - } - mean.global.products.x = length.total.x*(mean.global.x%x%t(mean.global.x)) - var.global.x = 1/(length.total.x-1)*(dummy.sum.x-mean.global.products.x) - - null.y = datashield.aggregate(datasources, as.symbol("is.null(yok)")) - null.y = unlist(null.y) - - if (all(null.y)) { - if (length.total.x < 2) - stop("not enough 'x' observations") - df <- length.total.x - 1 - stderr <- sqrt(var.global.x/length.total.x) - if (stderr < 10 * .Machine$double.eps * abs(mean.global.x)) - stop("data are essentially constant") - tstat <- (mean.global.x - mu)/stderr - method <- ifelse(paired, "Paired t-test", "One Sample t-test") - names(estimate) <- ifelse(paired, "mean of the differences", paste("mean of", variables[1], sep="")) - } else { - length.local.y = datashield.aggregate(datasources, as.symbol("NROW(yok)")) - - length.total.y = 0 - sum.weighted.y = 0 - - for (i in 1:num.sources) - if (!is.null(length.local.y[[i]])) - length.total.y = length.total.y + length.local.y[[i]] - - if (length.total.x < 1 || (!var.equal && length.total.x < 2)) - stop(paste("not enough ", variables[1], "observations", sep="")) - if (length.total.y < 1 || (!var.equal && length.total.y < 2)) - stop(paste("not enough ", variables[2], "observations", sep="")) - if (var.equal && length.total.x + length.total.y < 3) - stop("not enough observations") - - mean.local.y = datashield.aggregate(datasources, as.symbol("meanDS(yok)")) - var.local.y = datashield.aggregate(datasources, as.symbol("varDS(yok)")) - method <- paste(if (!var.equal) - "Welch", "Two Sample t-test") - - length.total.y = 0 - sum.weighted.y = 0 - - for (i in 1:num.sources) - if (!is.null(length.local.y[[i]])) { - length.total.y = length.total.y + length.local.y[[i]] - sum.weighted.y = sum.weighted.y + length.local.y[[i]]*mean.local.y[[i]] + if(type=="split"){ + results <- vector("list", length(datasources)) + for(i in 1:length(datasources)){ + message(paste0("----",names(datasources)[i], "----")) + out <- tTestHelper2(x, conf.level, datasources[i]) + results[[i]] <- out + message(" ") + rm(out) } - if (!is.na(sum.weighted.y)) - mean.global.y = sum.weighted.y/length.total.y else - stop(paste("Check the data supplied: global ", variables[2], " mean is NA", sep="")) - - estimate <- c(mean.global.x, mean.global.y) - names(estimate) <- c(paste("mean of ", variables[1], sep=""), paste("mean of ", variables[2], sep="")) - - nrows_var.y = NROW(var.local.y[[1]]) - ncols_var.y = NCOL(var.local.y[[1]]) - dummy.sum.y = matrix(0, nrows_var.y, ncols_var.y) - - for (i in 1:num.sources) { - if (!is.null(var.local.y[[i]]) & !is.null(mean.local.y[[i]])) - if (!is.na(var.local.y[[i]]) & !is.na(mean.local.y[[i]])) { - var.weight.y = (length.local.y[[i]]-1)*var.local.y[[i]] - add.elem.y = length.local.y[[i]]*(mean.local.y[[i]]%x%t(mean.local.y[[i]])) - dummy.sum.y = dummy.sum.y +var.weight.y+add.elem.y - } + names(results) <- names(datasources) + }else{ + stop('Function argument "type" has to be either "combine" or "split"') } - mean.global.products.y = length.total.y*(mean.global.y%x%t(mean.global.y)) - var.global.y = 1/(length.total.y-1)*(dummy.sum.y-mean.global.products.y) - - if (var.equal) { - df <- length.total.x + length.total.x - 2 - v <- 0 - if (length.total.x > 1) - v <- v + (length.total.x - 1) * var.global.x - if (length.total.y > 1) - v <- v + (length.total.y - 1) * var.global.y - v <- v/df - stderr <- sqrt(v * (1/length.total.x + 1/length.total.y)) - } else { - stderrx <- sqrt(var.global.x/length.total.x) - stderry <- sqrt(var.global.y/length.total.y) - stderr <- sqrt(stderrx^2 + stderry^2) - df <- stderr^4/(stderrx^4/(length.total.x - 1) + stderry^4/(length.total.y - 1)) - } - if (stderr < 10 * .Machine$double.eps * max(abs(mean.global.x), - abs(mean.global.y))) - stop("data are essentially constant") - tstat <- (mean.global.x - mean.global.y - mu)/stderr - } - - - if (alternative == "less") { - pval <- pt(tstat, df) - cint <- c(-Inf, tstat + qt(conf.level, df)) - } else if (alternative == "greater") { - pval <- pt(tstat, df, lower.tail = FALSE) - cint <- c(tstat - qt(conf.level, df), Inf) - } else { - pval <- 2 * pt(-abs(tstat), df) - alpha <- 1 - conf.level - cint <- qt(1 - alpha/2, df) - cint <- tstat + c(-cint, cint) } - cint <- mu + cint * stderr - names(tstat) <- "t" - names(df) <- "df" - names(mu) <- if (paired || !is.null(y)) - "difference in means" else - "mean" - attr(cint, "conf.level") <- conf.level - rval <- list(statistic = tstat, parameter = df, p.value = pval, - conf.int = cint, estimate = estimate, null.value = mu, - alternative = alternative, method = method, data.name=dname) - class(rval) <- "htest" - - # delete files that are no more required - datashield.rm(datasources, 'pair.compl.obs') - datashield.rm(datasources, 'xok') - datashield.rm(datasources, 'yok') - datashield.rm(datasources, 'not.na.x') - datashield.rm(datasources, 'minus_y') - - return(rval) - + }else{ - if(type == "split"){ - cally <- paste0("t.test(", x, ",", y, ",alternative='",alternative, "',mu=",mu, ",paired=",paired, ",var.equal=",var.equal, ",conf.level=",conf.level,")") - results <- datashield.aggregate(datasources, as.symbol(cally)) - return(results) - }else{ - stop('Function argument "type" has to be either "combine" or "split"') - } + results <- tTestHelper1(x, y, type, alternative, mu, paired, var.equal, conf.level, datasources) } + + return(results) } diff --git a/R/tTestHelper1.R b/R/tTestHelper1.R new file mode 100644 index 0000000..bd01297 --- /dev/null +++ b/R/tTestHelper1.R @@ -0,0 +1,276 @@ +#' +#' @title runs a t-test for two continuous variables +#' @description This is an internal function. +#' @param type see the calling function +#' @param alternative see the calling function +#' @param mu see the calling function +#' @param paired see the calling function +#' @param var.equal see the calling function +#' @param conf.level see the calling function +#' @param datasources see the calling function +#' @keywords internal see the calling function +#' @return the results of the t-test +#' +tTestHelper1 <- function(x, y, type, alternative, mu, paired, var.equal, conf.level, datasources){ + # get the names of the variables used for the analysis + # the input variable might be given as column table (i.e. D$x) + # or just as a vector not attached to a table (i.e. x) + # we have to make sure the function deals with each case + if(is.null(y)){ + xname <- extract(x) + dname = xname$elements + variables <- dname + }else{ + xname <- extract(x) + yname <- extract(y) + dname1 = xname$elements + dname2 = yname$elements + variables <- c(dname1, dname2) + dname = paste(dname1, 'and', dname2) + } + + # call the function that checks theinput variables are defined in all the studies + if(is.null(y)){ + obj2lookfor <- xname$holders + if(is.na(obj2lookfor)){ + defined <- isDefined(datasources, variables[1]) + }else{ + defined <- isDefined(datasources, obj2lookfor) + } + }else{ + obj2lookfor1 <- xname$holders + obj2lookfor2 <- yname$holders + if(is.na(obj2lookfor1)){ + defined <- isDefined(datasources, variables[1]) + }else{ + defined <- isDefined(datasources, obj2lookfor1) + } + if(is.na(obj2lookfor2)){ + defined <- isDefined(datasources, variables[2]) + }else{ + defined <- isDefined(datasources, obj2lookfor2) + } + } + + # call the internal function that checks an input object is of the same class in all studies. + if(is.null(y)){ + typ1 <- checkClass(datasources, x) + }else{ + typ1 <- checkClass(datasources, x) + typ2 <- checkClass(datasources, y) + } + + # number of studies + num.sources = length(datasources) + + if(type == "combine"){ + + # Performs t-test on merged data sets + if (!missing(mu) && (length(mu) != 1 || is.na(mu))) + stop("'mu' must be a single number") + if (!missing(conf.level) && (length(conf.level) != 1 || !is.finite(conf.level) || + conf.level < 0 || conf.level > 1)) + stop("'conf.level' must be a single number between 0 and 1") + if (!is.null(y)) { + if (paired) { + cally = paste0("complete.cases(", x, ",", y, ")") + datashield.assign(datasources, 'pair.compl.obs', as.symbol(cally)) + cally = paste0("subsetDS('", x, "', pair.compl.obs)") + datashield.assign(datasources, 'xok', as.symbol(cally)) + cally = paste0("subsetDS('", y, "', pair.compl.obs)") + datashield.assign(datasources, 'yok', as.symbol(cally)) + } else { + cally = paste0("complete.cases(", x, ")") + datashield.assign(datasources, 'not.na.x', as.symbol(cally)) + cally = paste0("subsetDS('", x, "',not.na.x)") + datashield.assign(datasources, 'xok', as.symbol(cally)) + + cally = paste0("complete.cases(", y, ")") + datashield.assign(datasources, 'not.na.y', as.symbol(cally)) + cally = paste0("subsetDS('", y, "',not.na.y)") + datashield.assign(datasources, 'yok', as.symbol(cally)) + } + } else { + if (paired) + stop("'y' is missing for paired test") + cally = paste0("complete.cases(", x, ")") + datashield.assign(datasources, 'not.na.x', as.symbol(cally)) + cally = paste0("subsetDS('", x, "', not.na.x)") + datashield.assign(datasources, 'xok', as.symbol(cally)) + cally = paste0("as.null(", x, ")") + datashield.assign(datasources, 'yok', as.symbol(cally)) # does not matter that as.null(x) since we just want to set y to NULL + } + + if (paired) { + cally = paste0("(yok)","*(",-1,")") + datashield.assign(datasources, 'minus_y', as.symbol(cally)) + # datashield.assign(datasources, 'dummy', quote(cbind(xok, minus_y))) + datashield.assign(datasources, 'xok', as.symbol("xok+minus_y")) + datashield.assign(datasources, 'yok', as.symbol("as.null(yok)")) + } + + length.local.x = datashield.aggregate(datasources, as.symbol("NROW(xok)")) + mean.local.x = datashield.aggregate(datasources, as.symbol("meanDS(xok)")) + var.local.x = datashield.aggregate(datasources, as.symbol("varDS(xok)")) + + length.total.x = 0 + sum.weighted.x = 0 + + for (i in 1:num.sources) + if (!is.null(length.local.x[[i]])) + length.total.x = length.total.x + length.local.x[[i]] + + for (i in 1:num.sources) + if (!is.null(length.local.x[[i]])) + sum.weighted.x = sum.weighted.x + length.local.x[[i]]*mean.local.x[[i]] + + if (!is.na(sum.weighted.x)) + mean.global.x = sum.weighted.x/length.total.x else + stop(paste("Check the data supplied: global ", variables[1], " mean is NA", sep="")) + estimate = mean.global.x + + nrows_var.x = NROW(var.local.x[[1]]) + ncols_var.x = NCOL(var.local.x[[1]]) + dummy.sum.x = matrix(0, nrows_var.x, ncols_var.x) + + for (i in 1:num.sources) { + if (!is.null(var.local.x[[i]]) & !is.null(mean.local.x[[i]])) + if (!is.na(var.local.x[[i]]) & !is.na(mean.local.x[[i]])) { + var.weight.x = (length.local.x[[i]]-1)*var.local.x[[i]] + add.elem.x = length.local.x[[i]]*(mean.local.x[[i]]%x%t(mean.local.x[[i]])) + dummy.sum.x = dummy.sum.x +var.weight.x+add.elem.x + } + } + mean.global.products.x = length.total.x*(mean.global.x%x%t(mean.global.x)) + var.global.x = 1/(length.total.x-1)*(dummy.sum.x-mean.global.products.x) + + null.y = datashield.aggregate(datasources, as.symbol("is.null(yok)")) + null.y = unlist(null.y) + + if (all(null.y)) { + if (length.total.x < 2) + stop("not enough 'x' observations") + df <- length.total.x - 1 + stderr <- sqrt(var.global.x/length.total.x) + if (stderr < 10 * .Machine$double.eps * abs(mean.global.x)) + stop("data are essentially constant") + tstat <- (mean.global.x - mu)/stderr + method <- ifelse(paired, "Paired t-test", "One Sample t-test") + names(estimate) <- ifelse(paired, "mean of the differences", paste("mean of", variables[1], sep="")) + } else { + length.local.y = datashield.aggregate(datasources, as.symbol("NROW(yok)")) + + length.total.y = 0 + sum.weighted.y = 0 + + for (i in 1:num.sources) + if (!is.null(length.local.y[[i]])) + length.total.y = length.total.y + length.local.y[[i]] + + if (length.total.x < 1 || (!var.equal && length.total.x < 2)) + stop(paste("not enough ", variables[1], "observations", sep="")) + if (length.total.y < 1 || (!var.equal && length.total.y < 2)) + stop(paste("not enough ", variables[2], "observations", sep="")) + if (var.equal && length.total.x + length.total.y < 3) + stop("not enough observations") + + mean.local.y = datashield.aggregate(datasources, as.symbol("meanDS(yok)")) + var.local.y = datashield.aggregate(datasources, as.symbol("varDS(yok)")) + method <- paste(if (!var.equal) + "Welch", "Two Sample t-test") + + length.total.y = 0 + sum.weighted.y = 0 + + for (i in 1:num.sources) + if (!is.null(length.local.y[[i]])) { + length.total.y = length.total.y + length.local.y[[i]] + sum.weighted.y = sum.weighted.y + length.local.y[[i]]*mean.local.y[[i]] + } + if (!is.na(sum.weighted.y)) + mean.global.y = sum.weighted.y/length.total.y else + stop(paste("Check the data supplied: global ", variables[2], " mean is NA", sep="")) + + estimate <- c(mean.global.x, mean.global.y) + names(estimate) <- c(paste("mean of ", variables[1], sep=""), paste("mean of ", variables[2], sep="")) + + nrows_var.y = NROW(var.local.y[[1]]) + ncols_var.y = NCOL(var.local.y[[1]]) + dummy.sum.y = matrix(0, nrows_var.y, ncols_var.y) + + for (i in 1:num.sources) { + if (!is.null(var.local.y[[i]]) & !is.null(mean.local.y[[i]])) + if (!is.na(var.local.y[[i]]) & !is.na(mean.local.y[[i]])) { + var.weight.y = (length.local.y[[i]]-1)*var.local.y[[i]] + add.elem.y = length.local.y[[i]]*(mean.local.y[[i]]%x%t(mean.local.y[[i]])) + dummy.sum.y = dummy.sum.y +var.weight.y+add.elem.y + } + } + mean.global.products.y = length.total.y*(mean.global.y%x%t(mean.global.y)) + var.global.y = 1/(length.total.y-1)*(dummy.sum.y-mean.global.products.y) + + if (var.equal) { + df <- length.total.x + length.total.x - 2 + v <- 0 + if (length.total.x > 1) + v <- v + (length.total.x - 1) * var.global.x + if (length.total.y > 1) + v <- v + (length.total.y - 1) * var.global.y + v <- v/df + stderr <- sqrt(v * (1/length.total.x + 1/length.total.y)) + } else { + stderrx <- sqrt(var.global.x/length.total.x) + stderry <- sqrt(var.global.y/length.total.y) + stderr <- sqrt(stderrx^2 + stderry^2) + df <- stderr^4/(stderrx^4/(length.total.x - 1) + stderry^4/(length.total.y - 1)) + } + if (stderr < 10 * .Machine$double.eps * max(abs(mean.global.x), + abs(mean.global.y))) + stop("data are essentially constant") + tstat <- (mean.global.x - mean.global.y - mu)/stderr + } + + + if (alternative == "less") { + pval <- pt(tstat, df) + cint <- c(-Inf, tstat + qt(conf.level, df)) + } else if (alternative == "greater") { + pval <- pt(tstat, df, lower.tail = FALSE) + cint <- c(tstat - qt(conf.level, df), Inf) + } else { + pval <- 2 * pt(-abs(tstat), df) + alpha <- 1 - conf.level + cint <- qt(1 - alpha/2, df) + cint <- tstat + c(-cint, cint) + } + cint <- mu + cint * stderr + names(tstat) <- "t" + names(df) <- "df" + names(mu) <- if (paired || !is.null(y)) + "difference in means" else + "mean" + attr(cint, "conf.level") <- conf.level + rval <- list(statistic = tstat, parameter = df, p.value = pval, + conf.int = cint, estimate = estimate, null.value = mu, + alternative = alternative, method = method, data.name=dname) + class(rval) <- "htest" + + # delete files that are no more required + datashield.rm(datasources, 'pair.compl.obs') + datashield.rm(datasources, 'xok') + datashield.rm(datasources, 'yok') + datashield.rm(datasources, 'not.na.x') + datashield.rm(datasources, 'minus_y') + + return(rval) + + }else{ + if(type == "split"){ + cally <- paste0("t.test(", x, ",", y, ",alternative='",alternative, "',mu=",mu, ",paired=",paired, ",var.equal=",var.equal, ",conf.level=",conf.level,")") + results <- datashield.aggregate(datasources, as.symbol(cally)) + return(results) + }else{ + stop('Function argument "type" has to be either "combine" or "split"') + } + } +} \ No newline at end of file diff --git a/R/tTestHelper2.R b/R/tTestHelper2.R new file mode 100644 index 0000000..4c1bbea --- /dev/null +++ b/R/tTestHelper2.R @@ -0,0 +1,201 @@ +#' +#' @title Uses glm to compute means of numeric vector across factor vector +#' @description This is an internal function. +#' @param formula a character form of a formula (e.g. 'a~b') +#' @param CI a numeric, the confidence interval +#' @param family a description of the error distribution function to use in the model +#' @param datasources see the calling function +#' @keywords internal see the calling function +#' @return results similar to that of a t.test +#' +tTestHelper2 <- function(formula, CI, datasources) { + + # The variables names + a <- unlist(strsplit(formula, split='~'))[1] + b <- unlist(strsplit(formula, split='~'))[2] + + # check if the variables in the formula are defined in all the studies, if not defined a message is thrown and the process stops + defined <- isDefined(datasources, a) + defined <- isDefined(datasources, b) + + # Check that the variables in the formula are of the right type: 'numeric'/'integer' for the outcome and 'factor' for the covariate + typ1 <- checkClass(datasources, a) + if(typ1 != "numeric" & typ1 != "integer"){ + stop(paste0(" ", a, " must be a numeric vector!"), call.=FALSE) + } + typ2 <- checkClass(datasources, b) + if(typ2 != "factor"){ + stop(paste0(" ", b, " must be a factor vector!"), call.=FALSE) + }else{ + # the covariate must have only two categories (as for t.test in R), so stop and throw a message otherwise + cally <- paste0("levels(", b, ")") + levels_all <- datashield.aggregate(datasources, as.symbol(cally)) + classes <- unique(unlist(levels_all)) + if(length(classes) != 2){ + stop(paste0(" ", b, " must two and only two categories!"), call.=FALSE) + } + } + + # turn the formula provided as a character into a formula object + formula <- as.formula(formula) + family <- 'gaussian' + + # number of 'valid' studies (those that passed the checks) and vector of beta values + numstudies <- length(datasources) + + # start beta values + beta.vect.next <- c(0,0) + beta.vect.temp <- paste0(as.character(beta.vect.next), collapse=",") + + # Iterations need to be counted. Start off with the count at 0 + # and increment by 1 at each new iteration + iteration.count <- 0 + + # identify the correct dimension for start beta coeffs by calling the 1st component of glmDS + cally1 <- call('glmDS1', formula, family, beta.vect=beta.vect.temp, NULL) + + study.summary <- datashield.aggregate(datasources, cally1) + num.par.glm <- study.summary[[1]][[1]][[2]] + + beta.vect.next <- rep(0, num.par.glm) + beta.vect.temp <- paste0(as.character(beta.vect.next), collapse=",") + + + # Provide arbitrary starting value for deviance to enable subsequent calculation of the + # change in deviance between iterations + dev.old <- 9.99e+99 + + #Convergence state needs to be monitored. + converge.state <- FALSE + + # Define a convergence criterion. This value of epsilon corresponds to that used + # by default for GLMs in R (see section S3 for details) + epsilon <- 1.0e-08 + + f <- NULL + + while(!converge.state && iteration.count < 20) { + + iteration.count <- iteration.count+1 + + # now call second component of glmDS to generate score vectors and informations matrices + cally2 <- call('glmDS2', formula, family, beta.vect=beta.vect.temp, NULL, NULL, NULL) + + study.summary <- datashield.aggregate(datasources, cally2) + + + .select <- function(l, field) { + lapply(l, function(obj) {obj[[field]]}) + } + + info.matrix.total<-Reduce(f="+", .select(study.summary, 'info.matrix')) + score.vect.total<-Reduce(f="+", .select(study.summary, 'score.vect')) + dev.total<-Reduce(f="+", .select(study.summary, 'dev')) + + if(iteration.count==1) { + # Sum participants only during first iteration. + nsubs.total <- Reduce(f="+", .select(study.summary, 'numsubs')) + # Save family + f <- study.summary[[1]]$family + } + + # create variance covariance matrix as inverse of information matrix + variance.covariance.matrix.total<-solve(info.matrix.total) + + # create beta vector update terms + beta.update.vect<-variance.covariance.matrix.total %*% score.vect.total + + # add update terms to current beta vector to obtain new beta vector for next iteration + if(iteration.count==1){ + beta.vect.next<-rep(0,length(beta.update.vect)) + } + + beta.vect.next<-beta.vect.next+beta.update.vect + beta.vect.temp <- paste0(as.character(beta.vect.next), collapse=",") + + # calculate value of convergence statistic and test whether meets convergence criterion + converge.value<-abs(dev.total-dev.old)/(abs(dev.total)+0.1) + if(converge.value<=epsilon)converge.state<-TRUE + if(converge.value>epsilon)dev.old<-dev.total + + } + + # if convergence has been obtained, declare final (maximum likelihood) beta vector, + # and calculate the corresponding standard errors, z scores and p values + # (the latter two to be consistent with the output of a standard GLM analysis) + # then print out final model summary + if(converge.state){ + + beta.vect.final<-beta.vect.next + + scale.par <- 1 + scale.par <- dev.total / (nsubs.total-length(beta.vect.next)) + + se.vect.final <- sqrt(diag(variance.covariance.matrix.total)) * sqrt(scale.par) + z.vect.final<-beta.vect.final/se.vect.final + pval.vect.final<-2*pnorm(-abs(z.vect.final)) + parameter.names<-names(score.vect.total[,1]) + model.parameters<-cbind(beta.vect.final,se.vect.final,z.vect.final,pval.vect.final) + dimnames(model.parameters)<-list(parameter.names,c("Estimate","Std. Error","z-value","p-value")) + + if(CI > 0){ + + ci.mult <- qnorm(1-(1-CI)/2) + low.ci.lp <- model.parameters[,1]-ci.mult*model.parameters[,2] + hi.ci.lp <- model.parameters[,1]+ci.mult*model.parameters[,2] + estimate.lp <- model.parameters[,1] + + estimate.natural <- estimate.lp + low.ci.natural <- low.ci.lp + hi.ci.natural <- hi.ci.lp + name1 <- paste0("low",CI,"CI") + name2 <- paste0("high",CI,"CI") + ci.mat <- cbind(low.ci.lp,hi.ci.lp) + dimnames(ci.mat) <- list(NULL,c(name1,name2)) + } + + model.parameters <- cbind(model.parameters,ci.mat) + + formulatext <- Reduce(paste, deparse(formula)) + + glmds <- list( + formula = formulatext, + coefficients = model.parameters, + dev = dev.total, + nsubs = nsubs.total, + df = (nsubs.total-length(beta.vect.next)), + iter = iteration.count + ) + + class(glmds) <- 'glmds' + + # get the values that one would expect from a t.test: means for each category, p.value etc... + meanA <- round(glmds$coefficients[1,1],4) + meanB <- round(meanA + glmds$coefficients[2,1],4) + meanLabel <- paste0("mean in group ", classes[1], " & ", "mean in group ", classes[2]) + output <- list( + statistic = glmds$coefficients[2,3], + parameter = glmds$df, + p.value = glmds$coefficients[2,4], + conf.int = c(glmds$coefficients[2,5], glmds$coefficients[2,6]), + estimate = c(meanA, meanB), + alternative = "two.sided", + method = "GLM", + data.name = paste0(a, " by ", b) + ) + # print results to screen in a 't.test' way + mylabels <- c("z ", "df ", "p.value ", paste0(CI*100, " percent confidence interval"), "sample estimates:\nmeanLabel") + message("GLM to assess difference of statistical means across two categories") + message(paste0("data: ", a, " by ", b)) + message(paste0("z = ", round(glmds$coefficients[2,3],4), ", df = ", glmds$df, ", p.value = ", signif(glmds$coefficients[2,4]),4)) + message(paste0(CI*100, " percent confidence interval:")) + message(signif(output$conf.int),5) + message(paste0("sample estimates: ", meanLabel)) + message(paste0(output$estimate[1], " ", output$estimate[2])) + + return(output) + } else { + warning(paste("GLM did not converge after", 20, "iterations.")) + return(NULL) + } +} \ No newline at end of file diff --git a/man/ds.tTest.Rd b/man/ds.tTest.Rd index 6ac6f59..4142b7f 100644 --- a/man/ds.tTest.Rd +++ b/man/ds.tTest.Rd @@ -8,7 +8,9 @@ ds.tTest(x = NULL, y = NULL, type = "combine", } \arguments{ \item{x}{a character, the name of a (non-empty) numeric - vector of data values.} + vector of data values or a formula of the form 'a~b' + where 'a' is the name of a continuous variable and 'b' + that of a factor variable.} \item{y}{a character, the name of an optional (non-empty) numeric vector of data values.} @@ -59,6 +61,9 @@ two-sample test. \code{alternative} a character string describing the alternative hypothesis \code{method} a character string indicating what type of t-test was performed + +an object of type 'htest' if both x and y are continuous +and a list otherwise. } \description{ Performs one and two sample t-tests on vectors of data. @@ -69,7 +74,12 @@ Performs one and two sample t-tests on vectors of data. Summary statistics are obtained from each of the data sets that are located on the distinct computers/servers. And then grand means and variances are calculated. Those are -used for performing t-test. +used for performing t-test. The funtion allows for the +calculation of t-test between two continuous variables or +between a continuous and a factor variable; the latter +option requires a formula (see parameter \code{dataframe}). +If a formula is provided all other but 'conf.level=0.95' +are ignored. } \examples{ { @@ -77,36 +87,38 @@ used for performing t-test. # load that contains the login details data(logindata) - # login and assign specific variable(s) - myvar <- list("LAB_HDL", "LAB_TSC") - opals <- datashield.login(logins=logindata,assign=TRUE,variables=myvar) + # login and assign all the variables + opals <- datashield.login(logins=logindata,assign=TRUE) # Example 1: Run a t.test of the pooled data for the variables 'LAB_HDL' and 'LAB_TSC' - default ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC') - # Example 2: Run a t.test for each study separately for the same variables as above + # Example 2: Run a test to compare the mean of a continuous variable across the two categories of a categorical variable + s <- ds.tTest(x='D$PM_BMI_CONTINUOUS~D$GENDER') + + # Example 3: Run a t.test for each study separately for the same variables as above ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', type='split') - # Example 3: Run a paired t.test of the pooled data + # Example 4: Run a paired t.test of the pooled data ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', paired=TRUE) - # Example 4: Run a paired t.test for each study separately + # Example 5: Run a paired t.test for each study separately ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', paired=TRUE, type='split') - # Example 5: Run a t.test of the pooled data with different alternatives + # Example 6: Run a t.test of the pooled data with different alternatives ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', alternative='greater') ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', alternative='less') - # Example 6: Run a t.test of the pooled data with mu different from zero + # Example 7: Run a t.test of the pooled data with mu different from zero ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', mu=-4) - # Example 7: Run a t.test of the pooled data assuming that variances of variables are equal + # Example 8: Run a t.test of the pooled data assuming that variances of variables are equal ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', var.equal=TRUE) - # Example 8: Run a t.test of the pooled data with 90\% confidence interval + # Example 9: Run a t.test of the pooled data with 90\% confidence interval ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', conf.level=0.90) - # Example 9: Run a one-sample t.test of the pooled data + # Example 10: Run a one-sample t.test of the pooled data ds.tTest(x='D$LAB_HDL') # the below example should not work, paired t.test is not possible if the 'y' variable is missing # ds.tTest(x='D$LAB_HDL', paired=TRUE) diff --git a/man/tTestHelper1.Rd b/man/tTestHelper1.Rd new file mode 100644 index 0000000..35123cb --- /dev/null +++ b/man/tTestHelper1.Rd @@ -0,0 +1,37 @@ +\name{tTestHelper1} +\alias{tTestHelper1} +\title{runs a t-test for two continuous variables} +\usage{ +tTestHelper1(x, y, type, alternative, mu, paired, var.equal, conf.level, + datasources) +} +\arguments{ + \item{type}{see the calling function} + + \item{alternative}{see the calling function} + + \item{mu}{see the calling function} + + \item{paired}{see the calling function} + + \item{var.equal}{see the calling function} + + \item{conf.level}{see the calling function} + + \item{datasources}{see the calling function} +} +\value{ +the results of the t-test +} +\description{ +This is an internal function. +} +\details{ + +} +\keyword{calling} +\keyword{function} +\keyword{internal} +\keyword{see} +\keyword{the} + diff --git a/man/tTestHelper2.Rd b/man/tTestHelper2.Rd new file mode 100644 index 0000000..2bf5b26 --- /dev/null +++ b/man/tTestHelper2.Rd @@ -0,0 +1,32 @@ +\name{tTestHelper2} +\alias{tTestHelper2} +\title{Uses glm to compute means of numeric vector across factor vector} +\usage{ +tTestHelper2(formula, CI, datasources) +} +\arguments{ + \item{formula}{a character form of a formula (e.g. + 'a~b')} + + \item{CI}{a numeric, the confidence interval} + + \item{family}{a description of the error distribution + function to use in the model} + + \item{datasources}{see the calling function} +} +\value{ +results similar to that of a t.test +} +\description{ +This is an internal function. +} +\details{ + +} +\keyword{calling} +\keyword{function} +\keyword{internal} +\keyword{see} +\keyword{the} +