From d70a84175d36b3078097db1d96ad0e6d2047f475 Mon Sep 17 00:00:00 2001
From: agaye <ag13748@it017324.(none)>
Date: Tue, 24 Mar 2015 00:05:05 -0400
Subject: [PATCH] Amended ds.tTest to allow for a test between a continuous and
 a categorical variable.

---
 R/ds.tTest.R        | 312 ++++++--------------------------------------
 R/tTestHelper1.R    | 276 +++++++++++++++++++++++++++++++++++++++
 R/tTestHelper2.R    | 201 ++++++++++++++++++++++++++++
 man/ds.tTest.Rd     |  38 ++++--
 man/tTestHelper1.Rd |  37 ++++++
 man/tTestHelper2.Rd |  32 +++++
 6 files changed, 613 insertions(+), 283 deletions(-)
 create mode 100644 R/tTestHelper1.R
 create mode 100644 R/tTestHelper2.R
 create mode 100644 man/tTestHelper1.Rd
 create mode 100644 man/tTestHelper2.Rd

diff --git a/R/ds.tTest.R b/R/ds.tTest.R
index aebc174..782767e 100644
--- a/R/ds.tTest.R
+++ b/R/ds.tTest.R
@@ -3,8 +3,11 @@
 #' @description Performs one and two sample t-tests on vectors of data.
 #' @details Summary statistics are obtained from each of the data sets that are located on the 
 #' distinct computers/servers. And then grand means and variances are calculated. Those are used 
-#' for performing t-test.
-#' @param x a character, the name of a (non-empty) numeric vector of data values.
+#' for performing t-test. The funtion allows for the calculation of t-test between two continuous variables
+#' or between a continuous and a factor variable; the latter option requires a formula (see parameter \code{dataframe}).
+#' If a formula is provided all other but 'conf.level=0.95' are ignored.
+#' @param x a character, the name of a (non-empty) numeric vector of data values or a formula of the 
+#' form 'a~b' where 'a' is the name of a continuous variable and 'b' that of a factor variable.
 #' @param y a character, the name of an optional (non-empty) numeric vector of data values.
 #' @param type a character which tells if the test is ran for the pooled data or not. 
 #' By default \code{type} is set to 'combine' and a t.test of the pooled data is 
@@ -31,6 +34,7 @@
 #' was a one-sample test or a two-sample test.
 #' \code{alternative} a character string describing the alternative hypothesis 
 #' \code{method} a character string indicating what type of t-test was performed 
+#' @return an object of type 'htest' if both x and y are continuous and a list otherwise.
 #' @author Isaeva, J.; Gaye, A.
 #' @export
 #' @examples {
@@ -38,36 +42,38 @@
 #'   # load that contains the login details
 #'   data(logindata)
 #' 
-#'   # login and assign specific variable(s)
-#'   myvar <- list("LAB_HDL", "LAB_TSC")
-#'   opals <- datashield.login(logins=logindata,assign=TRUE,variables=myvar)
+#'   # login and assign all the variables
+#'   opals <- datashield.login(logins=logindata,assign=TRUE)
 #' 
 #'   # Example 1: Run a t.test of the pooled data for the variables 'LAB_HDL' and 'LAB_TSC' - default
 #'   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC')
-#' 
-#'   # Example 2: Run a t.test for each study separately for the same variables as above
+#'   
+#'   # Example 2: Run a test to compare the mean of a continuous variable across the two categories of a categorical variable
+#'   s <- ds.tTest(x='D$PM_BMI_CONTINUOUS~D$GENDER')
+#'    
+#'   # Example 3: Run a t.test for each study separately for the same variables as above
 #'   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', type='split')
 #' 
-#'   # Example 3: Run a paired t.test of the pooled data
+#'   # Example 4: Run a paired t.test of the pooled data
 #'   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', paired=TRUE)
 #' 
-#'   # Example 4: Run a paired t.test for each study separately
+#'   # Example 5: Run a paired t.test for each study separately
 #'   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', paired=TRUE, type='split')
 #' 
-#'   # Example 5: Run a t.test of the pooled data with different alternatives
+#'   # Example 6: Run a t.test of the pooled data with different alternatives
 #'   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', alternative='greater')
 #'   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', alternative='less')
 #' 
-#'   # Example 6: Run a t.test of the pooled data with mu different from zero
+#'   # Example 7: Run a t.test of the pooled data with mu different from zero
 #'   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', mu=-4)
 #' 
-#'   # Example 7: Run a t.test of the pooled data assuming that variances of variables are equal
+#'   # Example 8: Run a t.test of the pooled data assuming that variances of variables are equal
 #'   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', var.equal=TRUE)
 #' 
-#'   # Example 8: Run a t.test of the pooled data with 90% confidence interval
+#'   # Example 9: Run a t.test of the pooled data with 90% confidence interval
 #'   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', conf.level=0.90)
 #' 
-#'   # Example 9: Run a one-sample t.test of the pooled data
+#'   # Example 10: Run a one-sample t.test of the pooled data
 #'   ds.tTest(x='D$LAB_HDL')
 #'   # the below example should not work, paired t.test is not possible if the 'y' variable is missing
 #'   # ds.tTest(x='D$LAB_HDL', paired=TRUE) 
@@ -85,268 +91,34 @@ ds.tTest <- function (x=NULL, y=NULL, type="combine", alternative="two.sided", m
   }
   
   if(is.null(x)){
-    stop("Please provide the name of the x vector!", call.=FALSE)
-  }
-    
-  # get the names of the variables used for the analysis
-  # the input variable might be given as column table (i.e. D$x)
-  # or just as a vector not attached to a table (i.e. x)
-  # we have to make sure the function deals with each case
-  if(is.null(y)){
-    xname <- extract(x)
-    dname = xname$elements
-    variables <- dname
-  }else{
-    xname <- extract(x)
-    yname <- extract(y)
-    dname1 = xname$elements
-    dname2 = yname$elements
-    variables <- c(dname1, dname2)
-    dname = paste(dname1, 'and', dname2)
+    stop("Please provide the name of the x vector or a formula if performing t.test between a numeric and a factor vector!", call.=FALSE)
   }
   
-  # call the function that checks theinput variables are defined in all the studies
-  if(is.null(y)){
-    obj2lookfor <- xname$holders
-    if(is.na(obj2lookfor)){
-      defined <- isDefined(datasources, variables[1])
+  # check if the user provided two continuous variables or a formula to run t-test between a continuous and a facyor variable 
+  # depending on what the user specified call the relevant function ('tTestHelper1' in the 1st case or 'tTesHelper2' ortherwise)
+  # tTeshelper 
+  if(length(unlist(strsplit(x, split='~')))==2){
+    if(type=="combine"){
+      results <- tTestHelper2(x, conf.level, datasources)
     }else{
-      defined <- isDefined(datasources, obj2lookfor)
-    }
-  }else{
-    obj2lookfor1 <- xname$holders
-    obj2lookfor2 <- yname$holders
-    if(is.na(obj2lookfor1)){
-      defined <- isDefined(datasources, variables[1])
-    }else{
-      defined <- isDefined(datasources, obj2lookfor1)
-    }
-    if(is.na(obj2lookfor2)){
-      defined <- isDefined(datasources, variables[2])
-    }else{
-      defined <- isDefined(datasources, obj2lookfor2)
-    }
-  }
-  
-  # call the internal function that checks an input object is of the same class in all studies.
-  if(is.null(y)){
-    typ1 <- checkClass(datasources, x)
-  }else{
-    typ1 <- checkClass(datasources, x)
-    typ2 <- checkClass(datasources, y)
-  }
-  
-  # number of studies
-  num.sources = length(datasources)
-  
-  if(type == "combine"){
-    
-    # Performs t-test on merged data sets
-    if (!missing(mu) && (length(mu) != 1 || is.na(mu))) 
-      stop("'mu' must be a single number")
-    if (!missing(conf.level) && (length(conf.level) != 1 || !is.finite(conf.level) || 
-                                   conf.level < 0 || conf.level > 1)) 
-      stop("'conf.level' must be a single number between 0 and 1")
-    if (!is.null(y)) {
-      if (paired) {
-        cally = paste0("complete.cases(",  x, ",", y, ")")
-        datashield.assign(datasources, 'pair.compl.obs', as.symbol(cally))
-        cally = paste0("subsetDS('",  x, "', pair.compl.obs)")
-        datashield.assign(datasources, 'xok', as.symbol(cally))
-        cally = paste0("subsetDS('",  y, "', pair.compl.obs)")
-        datashield.assign(datasources, 'yok', as.symbol(cally))
-      } else {
-        cally = paste0("complete.cases(",  x, ")")
-        datashield.assign(datasources, 'not.na.x', as.symbol(cally))
-        cally = paste0("subsetDS('",  x, "',not.na.x)")
-        datashield.assign(datasources, 'xok', as.symbol(cally))
-
-        cally = paste0("complete.cases(",  y, ")")
-        datashield.assign(datasources, 'not.na.y', as.symbol(cally))
-        cally = paste0("subsetDS('",  y, "',not.na.y)")
-        datashield.assign(datasources, 'yok', as.symbol(cally))
-      }
-    } else {
-      if (paired) 
-      stop("'y' is missing for paired test")
-      cally = paste0("complete.cases(",  x, ")")
-      datashield.assign(datasources, 'not.na.x', as.symbol(cally))
-      cally = paste0("subsetDS('",  x, "', not.na.x)")
-      datashield.assign(datasources, 'xok', as.symbol(cally))
-      cally = paste0("as.null(",  x, ")")
-      datashield.assign(datasources, 'yok', as.symbol(cally)) # does not matter that as.null(x) since we just want to set y to NULL
-    }    
-    
-    if (paired) {
-      cally = paste0("(yok)","*(",-1,")")
-      datashield.assign(datasources, 'minus_y', as.symbol(cally))
-      # datashield.assign(datasources, 'dummy', quote(cbind(xok, minus_y)))
-      datashield.assign(datasources, 'xok', as.symbol("xok+minus_y"))
-      datashield.assign(datasources, 'yok', as.symbol("as.null(yok)"))
-    }
-    
-    length.local.x = datashield.aggregate(datasources, as.symbol("NROW(xok)"))
-    mean.local.x = datashield.aggregate(datasources, as.symbol("meanDS(xok)"))
-    var.local.x = datashield.aggregate(datasources, as.symbol("varDS(xok)"))
-    
-    length.total.x = 0
-    sum.weighted.x = 0
-    
-    for (i in 1:num.sources) 
-      if (!is.null(length.local.x[[i]]))
-        length.total.x = length.total.x + length.local.x[[i]]
-    
-    for (i in 1:num.sources) 
-      if (!is.null(length.local.x[[i]]))
-        sum.weighted.x = sum.weighted.x + length.local.x[[i]]*mean.local.x[[i]]
-    
-    if (!is.na(sum.weighted.x))
-      mean.global.x = sum.weighted.x/length.total.x else
-        stop(paste("Check the data supplied: global ", variables[1], " mean is NA", sep=""))
-    estimate = mean.global.x
-    
-    nrows_var.x = NROW(var.local.x[[1]])
-    ncols_var.x = NCOL(var.local.x[[1]])
-    dummy.sum.x = matrix(0, nrows_var.x, ncols_var.x)
-    
-    for (i in 1:num.sources) {
-      if (!is.null(var.local.x[[i]]) & !is.null(mean.local.x[[i]]))
-        if (!is.na(var.local.x[[i]]) & !is.na(mean.local.x[[i]])) {
-          var.weight.x = (length.local.x[[i]]-1)*var.local.x[[i]]
-          add.elem.x = length.local.x[[i]]*(mean.local.x[[i]]%x%t(mean.local.x[[i]]))
-          dummy.sum.x = dummy.sum.x +var.weight.x+add.elem.x
-        }      
-    }
-    mean.global.products.x = length.total.x*(mean.global.x%x%t(mean.global.x))
-    var.global.x = 1/(length.total.x-1)*(dummy.sum.x-mean.global.products.x)
-    
-    null.y = datashield.aggregate(datasources, as.symbol("is.null(yok)"))
-    null.y = unlist(null.y)
-    
-    if (all(null.y)) {
-      if (length.total.x < 2) 
-        stop("not enough 'x' observations")
-      df <- length.total.x - 1
-      stderr <- sqrt(var.global.x/length.total.x)
-      if (stderr < 10 * .Machine$double.eps * abs(mean.global.x)) 
-        stop("data are essentially constant")
-      tstat <- (mean.global.x - mu)/stderr
-      method <- ifelse(paired, "Paired t-test", "One Sample t-test")
-      names(estimate) <- ifelse(paired, "mean of the differences", paste("mean of", variables[1], sep=""))
-    } else {
-      length.local.y = datashield.aggregate(datasources, as.symbol("NROW(yok)"))
-      
-      length.total.y = 0
-      sum.weighted.y = 0
-      
-      for (i in 1:num.sources) 
-        if (!is.null(length.local.y[[i]]))
-          length.total.y = length.total.y + length.local.y[[i]]
-      
-      if (length.total.x < 1 || (!var.equal && length.total.x < 2)) 
-        stop(paste("not enough ", variables[1], "observations", sep=""))
-      if (length.total.y < 1 || (!var.equal && length.total.y < 2)) 
-        stop(paste("not enough ", variables[2], "observations", sep=""))
-      if (var.equal && length.total.x + length.total.y < 3) 
-        stop("not enough observations")
-      
-      mean.local.y = datashield.aggregate(datasources, as.symbol("meanDS(yok)"))
-      var.local.y = datashield.aggregate(datasources, as.symbol("varDS(yok)"))
-      method <- paste(if (!var.equal) 
-        "Welch", "Two Sample t-test")
-      
-      length.total.y = 0
-      sum.weighted.y = 0
-      
-      for (i in 1:num.sources) 
-        if (!is.null(length.local.y[[i]])) {
-          length.total.y = length.total.y + length.local.y[[i]]
-          sum.weighted.y = sum.weighted.y + length.local.y[[i]]*mean.local.y[[i]]
+      if(type=="split"){
+        results <- vector("list", length(datasources))
+        for(i in 1:length(datasources)){
+          message(paste0("----",names(datasources)[i], "----"))
+          out <- tTestHelper2(x, conf.level, datasources[i])
+          results[[i]] <- out
+          message(" ")
+          rm(out)
         }
-      if (!is.na(sum.weighted.y))
-        mean.global.y = sum.weighted.y/length.total.y else
-          stop(paste("Check the data supplied: global ", variables[2], " mean is NA", sep=""))
-      
-      estimate <- c(mean.global.x, mean.global.y)
-      names(estimate) <- c(paste("mean of ", variables[1], sep=""), paste("mean of ", variables[2], sep=""))
-      
-      nrows_var.y = NROW(var.local.y[[1]])
-      ncols_var.y = NCOL(var.local.y[[1]])
-      dummy.sum.y = matrix(0, nrows_var.y, ncols_var.y)
-      
-      for (i in 1:num.sources) {
-        if (!is.null(var.local.y[[i]]) & !is.null(mean.local.y[[i]]))
-          if (!is.na(var.local.y[[i]]) & !is.na(mean.local.y[[i]])) {
-            var.weight.y = (length.local.y[[i]]-1)*var.local.y[[i]]
-            add.elem.y = length.local.y[[i]]*(mean.local.y[[i]]%x%t(mean.local.y[[i]]))
-            dummy.sum.y = dummy.sum.y +var.weight.y+add.elem.y
-          }      
+        names(results) <- names(datasources)
+      }else{
+        stop('Function argument "type" has to be either "combine" or "split"')
       }
-      mean.global.products.y = length.total.y*(mean.global.y%x%t(mean.global.y))
-      var.global.y = 1/(length.total.y-1)*(dummy.sum.y-mean.global.products.y)
-      
-      if (var.equal) {
-        df <- length.total.x + length.total.x - 2
-        v <- 0
-        if (length.total.x > 1) 
-          v <- v + (length.total.x - 1) * var.global.x
-        if (length.total.y > 1) 
-          v <- v + (length.total.y - 1) * var.global.y
-        v <- v/df
-        stderr <- sqrt(v * (1/length.total.x + 1/length.total.y))
-      } else {
-        stderrx <- sqrt(var.global.x/length.total.x)
-        stderry <- sqrt(var.global.y/length.total.y)
-        stderr <- sqrt(stderrx^2 + stderry^2)
-        df <- stderr^4/(stderrx^4/(length.total.x - 1) + stderry^4/(length.total.y - 1))
-      }
-      if (stderr < 10 * .Machine$double.eps * max(abs(mean.global.x), 
-                                                  abs(mean.global.y))) 
-        stop("data are essentially constant")
-      tstat <- (mean.global.x - mean.global.y - mu)/stderr
-    }
-    
-    
-    if (alternative == "less") {
-      pval <- pt(tstat, df)
-      cint <- c(-Inf, tstat + qt(conf.level, df))
-    } else if (alternative == "greater") {
-      pval <- pt(tstat, df, lower.tail = FALSE)
-      cint <- c(tstat - qt(conf.level, df), Inf)
-    } else {
-      pval <- 2 * pt(-abs(tstat), df)
-      alpha <- 1 - conf.level
-      cint <- qt(1 - alpha/2, df)
-      cint <- tstat + c(-cint, cint)
     }
-    cint <- mu + cint * stderr
-    names(tstat) <- "t"
-    names(df) <- "df"
-    names(mu) <- if (paired || !is.null(y)) 
-      "difference in means" else
-        "mean"
-    attr(cint, "conf.level") <- conf.level
-    rval <- list(statistic = tstat, parameter = df, p.value = pval, 
-                 conf.int = cint, estimate = estimate, null.value = mu, 
-                 alternative = alternative, method = method, data.name=dname)
-    class(rval) <- "htest"
-    
-    # delete files that are no more required
-    datashield.rm(datasources, 'pair.compl.obs')
-    datashield.rm(datasources, 'xok')
-    datashield.rm(datasources, 'yok')
-    datashield.rm(datasources, 'not.na.x')
-    datashield.rm(datasources, 'minus_y')  
-    
-    return(rval)
-    
+
   }else{
-    if(type == "split"){
-      cally <- paste0("t.test(", x, ",", y, ",alternative='",alternative, "',mu=",mu, ",paired=",paired, ",var.equal=",var.equal, ",conf.level=",conf.level,")")
-      results <- datashield.aggregate(datasources, as.symbol(cally))
-      return(results)
-    }else{
-      stop('Function argument "type" has to be either "combine" or "split"')
-    }
+    results <- tTestHelper1(x, y, type, alternative, mu, paired, var.equal, conf.level, datasources)
   }
+  
+  return(results)
 }
diff --git a/R/tTestHelper1.R b/R/tTestHelper1.R
new file mode 100644
index 0000000..bd01297
--- /dev/null
+++ b/R/tTestHelper1.R
@@ -0,0 +1,276 @@
+#' 
+#' @title runs a t-test for two continuous variables
+#' @description This is an internal function.
+#' @param type see the calling function
+#' @param alternative see the calling function
+#' @param mu see the calling function 
+#' @param paired see the calling function
+#' @param var.equal see the calling function
+#' @param conf.level see the calling function
+#' @param datasources see the calling function
+#' @keywords internal see the calling function
+#' @return the results of the t-test 
+#'
+tTestHelper1 <- function(x, y, type, alternative, mu, paired, var.equal, conf.level, datasources){
+  # get the names of the variables used for the analysis
+  # the input variable might be given as column table (i.e. D$x)
+  # or just as a vector not attached to a table (i.e. x)
+  # we have to make sure the function deals with each case
+  if(is.null(y)){
+    xname <- extract(x)
+    dname = xname$elements
+    variables <- dname
+  }else{
+    xname <- extract(x)
+    yname <- extract(y)
+    dname1 = xname$elements
+    dname2 = yname$elements
+    variables <- c(dname1, dname2)
+    dname = paste(dname1, 'and', dname2)
+  }
+  
+  # call the function that checks theinput variables are defined in all the studies
+  if(is.null(y)){
+    obj2lookfor <- xname$holders
+    if(is.na(obj2lookfor)){
+      defined <- isDefined(datasources, variables[1])
+    }else{
+      defined <- isDefined(datasources, obj2lookfor)
+    }
+  }else{
+    obj2lookfor1 <- xname$holders
+    obj2lookfor2 <- yname$holders
+    if(is.na(obj2lookfor1)){
+      defined <- isDefined(datasources, variables[1])
+    }else{
+      defined <- isDefined(datasources, obj2lookfor1)
+    }
+    if(is.na(obj2lookfor2)){
+      defined <- isDefined(datasources, variables[2])
+    }else{
+      defined <- isDefined(datasources, obj2lookfor2)
+    }
+  }
+  
+  # call the internal function that checks an input object is of the same class in all studies.
+  if(is.null(y)){
+    typ1 <- checkClass(datasources, x)
+  }else{
+    typ1 <- checkClass(datasources, x)
+    typ2 <- checkClass(datasources, y)
+  }
+  
+  # number of studies
+  num.sources = length(datasources)
+  
+  if(type == "combine"){
+    
+    # Performs t-test on merged data sets
+    if (!missing(mu) && (length(mu) != 1 || is.na(mu))) 
+      stop("'mu' must be a single number")
+    if (!missing(conf.level) && (length(conf.level) != 1 || !is.finite(conf.level) || 
+                                   conf.level < 0 || conf.level > 1)) 
+      stop("'conf.level' must be a single number between 0 and 1")
+    if (!is.null(y)) {
+      if (paired) {
+        cally = paste0("complete.cases(",  x, ",", y, ")")
+        datashield.assign(datasources, 'pair.compl.obs', as.symbol(cally))
+        cally = paste0("subsetDS('",  x, "', pair.compl.obs)")
+        datashield.assign(datasources, 'xok', as.symbol(cally))
+        cally = paste0("subsetDS('",  y, "', pair.compl.obs)")
+        datashield.assign(datasources, 'yok', as.symbol(cally))
+      } else {
+        cally = paste0("complete.cases(",  x, ")")
+        datashield.assign(datasources, 'not.na.x', as.symbol(cally))
+        cally = paste0("subsetDS('",  x, "',not.na.x)")
+        datashield.assign(datasources, 'xok', as.symbol(cally))
+        
+        cally = paste0("complete.cases(",  y, ")")
+        datashield.assign(datasources, 'not.na.y', as.symbol(cally))
+        cally = paste0("subsetDS('",  y, "',not.na.y)")
+        datashield.assign(datasources, 'yok', as.symbol(cally))
+      }
+    } else {
+      if (paired) 
+        stop("'y' is missing for paired test")
+      cally = paste0("complete.cases(",  x, ")")
+      datashield.assign(datasources, 'not.na.x', as.symbol(cally))
+      cally = paste0("subsetDS('",  x, "', not.na.x)")
+      datashield.assign(datasources, 'xok', as.symbol(cally))
+      cally = paste0("as.null(",  x, ")")
+      datashield.assign(datasources, 'yok', as.symbol(cally)) # does not matter that as.null(x) since we just want to set y to NULL
+    }    
+    
+    if (paired) {
+      cally = paste0("(yok)","*(",-1,")")
+      datashield.assign(datasources, 'minus_y', as.symbol(cally))
+      # datashield.assign(datasources, 'dummy', quote(cbind(xok, minus_y)))
+      datashield.assign(datasources, 'xok', as.symbol("xok+minus_y"))
+      datashield.assign(datasources, 'yok', as.symbol("as.null(yok)"))
+    }
+    
+    length.local.x = datashield.aggregate(datasources, as.symbol("NROW(xok)"))
+    mean.local.x = datashield.aggregate(datasources, as.symbol("meanDS(xok)"))
+    var.local.x = datashield.aggregate(datasources, as.symbol("varDS(xok)"))
+    
+    length.total.x = 0
+    sum.weighted.x = 0
+    
+    for (i in 1:num.sources) 
+      if (!is.null(length.local.x[[i]]))
+        length.total.x = length.total.x + length.local.x[[i]]
+    
+    for (i in 1:num.sources) 
+      if (!is.null(length.local.x[[i]]))
+        sum.weighted.x = sum.weighted.x + length.local.x[[i]]*mean.local.x[[i]]
+    
+    if (!is.na(sum.weighted.x))
+      mean.global.x = sum.weighted.x/length.total.x else
+        stop(paste("Check the data supplied: global ", variables[1], " mean is NA", sep=""))
+    estimate = mean.global.x
+    
+    nrows_var.x = NROW(var.local.x[[1]])
+    ncols_var.x = NCOL(var.local.x[[1]])
+    dummy.sum.x = matrix(0, nrows_var.x, ncols_var.x)
+    
+    for (i in 1:num.sources) {
+      if (!is.null(var.local.x[[i]]) & !is.null(mean.local.x[[i]]))
+        if (!is.na(var.local.x[[i]]) & !is.na(mean.local.x[[i]])) {
+          var.weight.x = (length.local.x[[i]]-1)*var.local.x[[i]]
+          add.elem.x = length.local.x[[i]]*(mean.local.x[[i]]%x%t(mean.local.x[[i]]))
+          dummy.sum.x = dummy.sum.x +var.weight.x+add.elem.x
+        }      
+    }
+    mean.global.products.x = length.total.x*(mean.global.x%x%t(mean.global.x))
+    var.global.x = 1/(length.total.x-1)*(dummy.sum.x-mean.global.products.x)
+    
+    null.y = datashield.aggregate(datasources, as.symbol("is.null(yok)"))
+    null.y = unlist(null.y)
+    
+    if (all(null.y)) {
+      if (length.total.x < 2) 
+        stop("not enough 'x' observations")
+      df <- length.total.x - 1
+      stderr <- sqrt(var.global.x/length.total.x)
+      if (stderr < 10 * .Machine$double.eps * abs(mean.global.x)) 
+        stop("data are essentially constant")
+      tstat <- (mean.global.x - mu)/stderr
+      method <- ifelse(paired, "Paired t-test", "One Sample t-test")
+      names(estimate) <- ifelse(paired, "mean of the differences", paste("mean of", variables[1], sep=""))
+    } else {
+      length.local.y = datashield.aggregate(datasources, as.symbol("NROW(yok)"))
+      
+      length.total.y = 0
+      sum.weighted.y = 0
+      
+      for (i in 1:num.sources) 
+        if (!is.null(length.local.y[[i]]))
+          length.total.y = length.total.y + length.local.y[[i]]
+      
+      if (length.total.x < 1 || (!var.equal && length.total.x < 2)) 
+        stop(paste("not enough ", variables[1], "observations", sep=""))
+      if (length.total.y < 1 || (!var.equal && length.total.y < 2)) 
+        stop(paste("not enough ", variables[2], "observations", sep=""))
+      if (var.equal && length.total.x + length.total.y < 3) 
+        stop("not enough observations")
+      
+      mean.local.y = datashield.aggregate(datasources, as.symbol("meanDS(yok)"))
+      var.local.y = datashield.aggregate(datasources, as.symbol("varDS(yok)"))
+      method <- paste(if (!var.equal) 
+        "Welch", "Two Sample t-test")
+      
+      length.total.y = 0
+      sum.weighted.y = 0
+      
+      for (i in 1:num.sources) 
+        if (!is.null(length.local.y[[i]])) {
+          length.total.y = length.total.y + length.local.y[[i]]
+          sum.weighted.y = sum.weighted.y + length.local.y[[i]]*mean.local.y[[i]]
+        }
+      if (!is.na(sum.weighted.y))
+        mean.global.y = sum.weighted.y/length.total.y else
+          stop(paste("Check the data supplied: global ", variables[2], " mean is NA", sep=""))
+      
+      estimate <- c(mean.global.x, mean.global.y)
+      names(estimate) <- c(paste("mean of ", variables[1], sep=""), paste("mean of ", variables[2], sep=""))
+      
+      nrows_var.y = NROW(var.local.y[[1]])
+      ncols_var.y = NCOL(var.local.y[[1]])
+      dummy.sum.y = matrix(0, nrows_var.y, ncols_var.y)
+      
+      for (i in 1:num.sources) {
+        if (!is.null(var.local.y[[i]]) & !is.null(mean.local.y[[i]]))
+          if (!is.na(var.local.y[[i]]) & !is.na(mean.local.y[[i]])) {
+            var.weight.y = (length.local.y[[i]]-1)*var.local.y[[i]]
+            add.elem.y = length.local.y[[i]]*(mean.local.y[[i]]%x%t(mean.local.y[[i]]))
+            dummy.sum.y = dummy.sum.y +var.weight.y+add.elem.y
+          }      
+      }
+      mean.global.products.y = length.total.y*(mean.global.y%x%t(mean.global.y))
+      var.global.y = 1/(length.total.y-1)*(dummy.sum.y-mean.global.products.y)
+      
+      if (var.equal) {
+        df <- length.total.x + length.total.x - 2
+        v <- 0
+        if (length.total.x > 1) 
+          v <- v + (length.total.x - 1) * var.global.x
+        if (length.total.y > 1) 
+          v <- v + (length.total.y - 1) * var.global.y
+        v <- v/df
+        stderr <- sqrt(v * (1/length.total.x + 1/length.total.y))
+      } else {
+        stderrx <- sqrt(var.global.x/length.total.x)
+        stderry <- sqrt(var.global.y/length.total.y)
+        stderr <- sqrt(stderrx^2 + stderry^2)
+        df <- stderr^4/(stderrx^4/(length.total.x - 1) + stderry^4/(length.total.y - 1))
+      }
+      if (stderr < 10 * .Machine$double.eps * max(abs(mean.global.x), 
+                                                  abs(mean.global.y))) 
+        stop("data are essentially constant")
+      tstat <- (mean.global.x - mean.global.y - mu)/stderr
+    }
+    
+    
+    if (alternative == "less") {
+      pval <- pt(tstat, df)
+      cint <- c(-Inf, tstat + qt(conf.level, df))
+    } else if (alternative == "greater") {
+      pval <- pt(tstat, df, lower.tail = FALSE)
+      cint <- c(tstat - qt(conf.level, df), Inf)
+    } else {
+      pval <- 2 * pt(-abs(tstat), df)
+      alpha <- 1 - conf.level
+      cint <- qt(1 - alpha/2, df)
+      cint <- tstat + c(-cint, cint)
+    }
+    cint <- mu + cint * stderr
+    names(tstat) <- "t"
+    names(df) <- "df"
+    names(mu) <- if (paired || !is.null(y)) 
+      "difference in means" else
+        "mean"
+    attr(cint, "conf.level") <- conf.level
+    rval <- list(statistic = tstat, parameter = df, p.value = pval, 
+                 conf.int = cint, estimate = estimate, null.value = mu, 
+                 alternative = alternative, method = method, data.name=dname)
+    class(rval) <- "htest"
+    
+    # delete files that are no more required
+    datashield.rm(datasources, 'pair.compl.obs')
+    datashield.rm(datasources, 'xok')
+    datashield.rm(datasources, 'yok')
+    datashield.rm(datasources, 'not.na.x')
+    datashield.rm(datasources, 'minus_y')  
+    
+    return(rval)
+    
+  }else{
+    if(type == "split"){
+      cally <- paste0("t.test(", x, ",", y, ",alternative='",alternative, "',mu=",mu, ",paired=",paired, ",var.equal=",var.equal, ",conf.level=",conf.level,")")
+      results <- datashield.aggregate(datasources, as.symbol(cally))
+      return(results)
+    }else{
+      stop('Function argument "type" has to be either "combine" or "split"')
+    }
+  }
+}
\ No newline at end of file
diff --git a/R/tTestHelper2.R b/R/tTestHelper2.R
new file mode 100644
index 0000000..4c1bbea
--- /dev/null
+++ b/R/tTestHelper2.R
@@ -0,0 +1,201 @@
+#' 
+#' @title Uses glm to compute means of numeric vector across factor vector
+#' @description This is an internal function.
+#' @param formula a character form of a formula (e.g. 'a~b')
+#' @param CI a numeric, the confidence interval
+#' @param family a description of the error distribution function to use in the model
+#' @param datasources see the calling function
+#' @keywords internal see the calling function
+#' @return results similar to that of a t.test 
+#'
+tTestHelper2 <- function(formula, CI, datasources) {
+  
+  # The variables names 
+  a <- unlist(strsplit(formula, split='~'))[1]
+  b <- unlist(strsplit(formula, split='~'))[2]
+  
+  # check if the variables in the formula are defined in all the studies, if not defined a message is thrown and the process stops
+  defined <- isDefined(datasources, a)
+  defined <- isDefined(datasources, b)
+  
+  # Check that the variables in the formula are of the right type: 'numeric'/'integer' for the outcome and 'factor' for the covariate
+  typ1 <- checkClass(datasources, a)
+  if(typ1 != "numeric" & typ1 != "integer"){
+    stop(paste0(" ", a, " must be a numeric vector!"), call.=FALSE)
+  }
+  typ2 <- checkClass(datasources, b)
+  if(typ2 != "factor"){
+    stop(paste0(" ", b, " must be a factor vector!"), call.=FALSE)
+  }else{
+    # the covariate must have only two categories (as for t.test in R), so stop and throw a message otherwise    
+    cally <- paste0("levels(", b, ")")
+    levels_all <- datashield.aggregate(datasources, as.symbol(cally))
+    classes <- unique(unlist(levels_all))
+    if(length(classes) != 2){
+      stop(paste0(" ", b, " must two and only two categories!"), call.=FALSE)
+    }
+  }
+  
+  # turn the formula provided as a character into a formula object
+  formula <- as.formula(formula)
+  family <- 'gaussian'
+  
+  # number of 'valid' studies (those that passed the checks) and vector of beta values
+  numstudies <- length(datasources)
+  
+  # start beta values
+  beta.vect.next <- c(0,0)
+  beta.vect.temp <- paste0(as.character(beta.vect.next), collapse=",")
+  
+  # Iterations need to be counted. Start off with the count at 0
+  # and increment by 1 at each new iteration
+  iteration.count <- 0
+  
+  # identify the correct dimension for start beta coeffs by calling the 1st component of glmDS
+  cally1 <- call('glmDS1', formula, family, beta.vect=beta.vect.temp, NULL)
+  
+  study.summary <- datashield.aggregate(datasources, cally1)
+  num.par.glm <- study.summary[[1]][[1]][[2]]
+  
+  beta.vect.next <- rep(0, num.par.glm)
+  beta.vect.temp <- paste0(as.character(beta.vect.next), collapse=",")
+  
+  
+  # Provide arbitrary starting value for deviance to enable subsequent calculation of the
+  # change in deviance between iterations
+  dev.old <- 9.99e+99
+  
+  #Convergence state needs to be monitored.
+  converge.state <- FALSE
+  
+  # Define a convergence criterion. This value of epsilon corresponds to that used
+  # by default for GLMs in R (see section S3 for details)
+  epsilon <- 1.0e-08
+  
+  f <- NULL
+  
+  while(!converge.state && iteration.count < 20) {
+    
+    iteration.count <- iteration.count+1
+    
+    # now call second component of glmDS to generate score vectors and informations matrices
+    cally2 <- call('glmDS2', formula, family, beta.vect=beta.vect.temp, NULL, NULL, NULL)
+    
+    study.summary <- datashield.aggregate(datasources, cally2)
+    
+    
+    .select <- function(l, field) {
+      lapply(l, function(obj) {obj[[field]]})
+    }
+    
+    info.matrix.total<-Reduce(f="+", .select(study.summary, 'info.matrix'))
+    score.vect.total<-Reduce(f="+", .select(study.summary, 'score.vect'))
+    dev.total<-Reduce(f="+", .select(study.summary, 'dev'))
+    
+    if(iteration.count==1) {
+      # Sum participants only during first iteration.
+      nsubs.total <- Reduce(f="+", .select(study.summary, 'numsubs'))
+      # Save family
+      f <- study.summary[[1]]$family
+    }
+    
+    # create variance covariance matrix as inverse of information matrix
+    variance.covariance.matrix.total<-solve(info.matrix.total)
+    
+    # create beta vector update terms
+    beta.update.vect<-variance.covariance.matrix.total %*% score.vect.total
+    
+    # add update terms to current beta vector to obtain new beta vector for next iteration
+    if(iteration.count==1){
+      beta.vect.next<-rep(0,length(beta.update.vect))
+    }
+    
+    beta.vect.next<-beta.vect.next+beta.update.vect
+    beta.vect.temp <- paste0(as.character(beta.vect.next), collapse=",")
+  
+    # calculate value of convergence statistic and test whether meets convergence criterion
+    converge.value<-abs(dev.total-dev.old)/(abs(dev.total)+0.1)
+    if(converge.value<=epsilon)converge.state<-TRUE
+    if(converge.value>epsilon)dev.old<-dev.total
+    
+  }
+  
+  # if convergence has been obtained, declare final (maximum likelihood) beta vector,
+  # and calculate the corresponding standard errors, z scores and p values
+  # (the latter two to be consistent with the output of a standard GLM analysis)
+  # then print out final model summary
+  if(converge.state){
+    
+    beta.vect.final<-beta.vect.next
+    
+    scale.par <- 1
+    scale.par <- dev.total / (nsubs.total-length(beta.vect.next))
+    
+    se.vect.final <- sqrt(diag(variance.covariance.matrix.total)) * sqrt(scale.par)
+    z.vect.final<-beta.vect.final/se.vect.final
+    pval.vect.final<-2*pnorm(-abs(z.vect.final))
+    parameter.names<-names(score.vect.total[,1])
+    model.parameters<-cbind(beta.vect.final,se.vect.final,z.vect.final,pval.vect.final)
+    dimnames(model.parameters)<-list(parameter.names,c("Estimate","Std. Error","z-value","p-value"))
+    
+    if(CI > 0){
+      
+      ci.mult <- qnorm(1-(1-CI)/2)
+      low.ci.lp <- model.parameters[,1]-ci.mult*model.parameters[,2]
+      hi.ci.lp <- model.parameters[,1]+ci.mult*model.parameters[,2]
+      estimate.lp <- model.parameters[,1]
+      
+      estimate.natural <- estimate.lp
+      low.ci.natural <- low.ci.lp
+      hi.ci.natural <- hi.ci.lp
+      name1 <- paste0("low",CI,"CI")
+      name2 <- paste0("high",CI,"CI")
+      ci.mat <- cbind(low.ci.lp,hi.ci.lp)
+      dimnames(ci.mat) <- list(NULL,c(name1,name2))   
+   }
+    
+    model.parameters <- cbind(model.parameters,ci.mat)
+    
+    formulatext <- Reduce(paste, deparse(formula))
+    
+    glmds <- list(
+      formula = formulatext,
+      coefficients = model.parameters,
+      dev = dev.total,
+      nsubs = nsubs.total,
+      df = (nsubs.total-length(beta.vect.next)),
+      iter = iteration.count
+    )
+    
+    class(glmds) <- 'glmds'
+   
+    # get the values that one would expect from a t.test: means for each category, p.value etc...
+    meanA <- round(glmds$coefficients[1,1],4)
+    meanB <- round(meanA + glmds$coefficients[2,1],4)
+    meanLabel <- paste0("mean in group ", classes[1], " & ",  "mean in group ", classes[2])
+    output <- list(
+      statistic = glmds$coefficients[2,3],
+      parameter = glmds$df,
+      p.value = glmds$coefficients[2,4],
+      conf.int = c(glmds$coefficients[2,5], glmds$coefficients[2,6]),
+      estimate = c(meanA, meanB),
+      alternative = "two.sided",
+      method = "GLM",
+      data.name = paste0(a, " by ", b)
+    )
+    # print results to screen in a 't.test' way
+    mylabels <- c("z ", "df ", "p.value ", paste0(CI*100, " percent confidence interval"), "sample estimates:\nmeanLabel")
+    message("GLM to assess difference of statistical means across two categories")
+    message(paste0("data: ", a, " by ", b))
+    message(paste0("z = ", round(glmds$coefficients[2,3],4), ", df = ", glmds$df, ", p.value = ", signif(glmds$coefficients[2,4]),4))
+    message(paste0(CI*100, " percent confidence interval:"))
+    message(signif(output$conf.int),5)
+    message(paste0("sample estimates: ", meanLabel))
+    message(paste0(output$estimate[1], " ", output$estimate[2]))
+   
+    return(output)
+  } else {
+    warning(paste("GLM did not converge after", 20, "iterations."))
+    return(NULL)
+  }
+}
\ No newline at end of file
diff --git a/man/ds.tTest.Rd b/man/ds.tTest.Rd
index 6ac6f59..4142b7f 100644
--- a/man/ds.tTest.Rd
+++ b/man/ds.tTest.Rd
@@ -8,7 +8,9 @@ ds.tTest(x = NULL, y = NULL, type = "combine",
 }
 \arguments{
   \item{x}{a character, the name of a (non-empty) numeric
-  vector of data values.}
+  vector of data values or a formula of the form 'a~b'
+  where 'a' is the name of a continuous variable and 'b'
+  that of a factor variable.}
 
   \item{y}{a character, the name of an optional (non-empty)
   numeric vector of data values.}
@@ -59,6 +61,9 @@ two-sample test. \code{alternative} a character string
 describing the alternative hypothesis \code{method} a
 character string indicating what type of t-test was
 performed
+
+an object of type 'htest' if both x and y are continuous
+and a list otherwise.
 }
 \description{
 Performs one and two sample t-tests on vectors of data.
@@ -69,7 +74,12 @@ Performs one and two sample t-tests on vectors of data.
 Summary statistics are obtained from each of the data sets
 that are located on the distinct computers/servers. And
 then grand means and variances are calculated. Those are
-used for performing t-test.
+used for performing t-test. The funtion allows for the
+calculation of t-test between two continuous variables or
+between a continuous and a factor variable; the latter
+option requires a formula (see parameter \code{dataframe}).
+If a formula is provided all other but 'conf.level=0.95'
+are ignored.
 }
 \examples{
 {
@@ -77,36 +87,38 @@ used for performing t-test.
   # load that contains the login details
   data(logindata)
 
-  # login and assign specific variable(s)
-  myvar <- list("LAB_HDL", "LAB_TSC")
-  opals <- datashield.login(logins=logindata,assign=TRUE,variables=myvar)
+  # login and assign all the variables
+  opals <- datashield.login(logins=logindata,assign=TRUE)
 
   # Example 1: Run a t.test of the pooled data for the variables 'LAB_HDL' and 'LAB_TSC' - default
   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC')
 
-  # Example 2: Run a t.test for each study separately for the same variables as above
+  # Example 2: Run a test to compare the mean of a continuous variable across the two categories of a categorical variable
+  s <- ds.tTest(x='D$PM_BMI_CONTINUOUS~D$GENDER')
+
+  # Example 3: Run a t.test for each study separately for the same variables as above
   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', type='split')
 
-  # Example 3: Run a paired t.test of the pooled data
+  # Example 4: Run a paired t.test of the pooled data
   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', paired=TRUE)
 
-  # Example 4: Run a paired t.test for each study separately
+  # Example 5: Run a paired t.test for each study separately
   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', paired=TRUE, type='split')
 
-  # Example 5: Run a t.test of the pooled data with different alternatives
+  # Example 6: Run a t.test of the pooled data with different alternatives
   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', alternative='greater')
   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', alternative='less')
 
-  # Example 6: Run a t.test of the pooled data with mu different from zero
+  # Example 7: Run a t.test of the pooled data with mu different from zero
   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', mu=-4)
 
-  # Example 7: Run a t.test of the pooled data assuming that variances of variables are equal
+  # Example 8: Run a t.test of the pooled data assuming that variances of variables are equal
   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', var.equal=TRUE)
 
-  # Example 8: Run a t.test of the pooled data with 90\% confidence interval
+  # Example 9: Run a t.test of the pooled data with 90\% confidence interval
   ds.tTest(x='D$LAB_HDL', y='D$LAB_TSC', conf.level=0.90)
 
-  # Example 9: Run a one-sample t.test of the pooled data
+  # Example 10: Run a one-sample t.test of the pooled data
   ds.tTest(x='D$LAB_HDL')
   # the below example should not work, paired t.test is not possible if the 'y' variable is missing
   # ds.tTest(x='D$LAB_HDL', paired=TRUE)
diff --git a/man/tTestHelper1.Rd b/man/tTestHelper1.Rd
new file mode 100644
index 0000000..35123cb
--- /dev/null
+++ b/man/tTestHelper1.Rd
@@ -0,0 +1,37 @@
+\name{tTestHelper1}
+\alias{tTestHelper1}
+\title{runs a t-test for two continuous variables}
+\usage{
+tTestHelper1(x, y, type, alternative, mu, paired, var.equal, conf.level,
+  datasources)
+}
+\arguments{
+  \item{type}{see the calling function}
+
+  \item{alternative}{see the calling function}
+
+  \item{mu}{see the calling function}
+
+  \item{paired}{see the calling function}
+
+  \item{var.equal}{see the calling function}
+
+  \item{conf.level}{see the calling function}
+
+  \item{datasources}{see the calling function}
+}
+\value{
+the results of the t-test
+}
+\description{
+This is an internal function.
+}
+\details{
+
+}
+\keyword{calling}
+\keyword{function}
+\keyword{internal}
+\keyword{see}
+\keyword{the}
+
diff --git a/man/tTestHelper2.Rd b/man/tTestHelper2.Rd
new file mode 100644
index 0000000..2bf5b26
--- /dev/null
+++ b/man/tTestHelper2.Rd
@@ -0,0 +1,32 @@
+\name{tTestHelper2}
+\alias{tTestHelper2}
+\title{Uses glm to compute means of numeric vector across factor vector}
+\usage{
+tTestHelper2(formula, CI, datasources)
+}
+\arguments{
+  \item{formula}{a character form of a formula (e.g.
+  'a~b')}
+
+  \item{CI}{a numeric, the confidence interval}
+
+  \item{family}{a description of the error distribution
+  function to use in the model}
+
+  \item{datasources}{see the calling function}
+}
+\value{
+results similar to that of a t.test
+}
+\description{
+This is an internal function.
+}
+\details{
+
+}
+\keyword{calling}
+\keyword{function}
+\keyword{internal}
+\keyword{see}
+\keyword{the}
+