diff --git a/R-pkg/DESCRIPTION b/R-pkg/DESCRIPTION index 7d1f1529..6f4d93cf 100644 --- a/R-pkg/DESCRIPTION +++ b/R-pkg/DESCRIPTION @@ -1,8 +1,8 @@ Package: diyabcGUI Type: Package Title: Graphical User Interface for DYIABC-RF software -Version: 1.0.14 -Date: 2021-03-12 +Version: 1.1.0 +Date: 2021-04-26 Authors@R: c( person( "Ghislain", "Durif", comment="diyabcGUI main developer", @@ -55,15 +55,18 @@ VignetteBuilder: knitr Imports: dplyr, fs, + future, ggplot2, lubridate, jsonlite, magrittr, + mime, parallel, pbapply, processx, + readr, rlang, shiny, shinybusy (>= 0.2.2), - shinydashboard, shinyFiles, + shinydashboard, shinyFeedback, shinyFiles, shinyhelper, shinyjs, shinyWidgets, stringr, tibble, diff --git a/R-pkg/NAMESPACE b/R-pkg/NAMESPACE index 575391ac..cca76099 100644 --- a/R-pkg/NAMESPACE +++ b/R-pkg/NAMESPACE @@ -25,6 +25,9 @@ importFrom(fs,file_chmod) importFrom(fs,file_copy) importFrom(fs,file_delete) importFrom(fs,path_home) +importFrom(future,future) +importFrom(future,multisession) +importFrom(future,plan) importFrom(ggplot2,aes) importFrom(ggplot2,aes_string) importFrom(ggplot2,annotate) @@ -53,12 +56,16 @@ importFrom(ggplot2,ylim) importFrom(jsonlite,fromJSON) importFrom(lubridate,now) importFrom(magrittr,"%>%") +importFrom(mime,guess_type) importFrom(parallel,detectCores) importFrom(parallel,makeCluster) importFrom(parallel,stopCluster) importFrom(pbapply,pblapply) importFrom(processx,process) +importFrom(readr,read_file) importFrom(rlang,duplicate) +importFrom(shinyFeedback,feedbackWarning) +importFrom(shinyFeedback,useShinyFeedback) importFrom(shinyWidgets,actionBttn) importFrom(shinyWidgets,actionGroupButtons) importFrom(shinyWidgets,ask_confirmation) @@ -74,7 +81,9 @@ importFrom(shinydashboard,dashboardHeader) importFrom(shinydashboard,dashboardPage) importFrom(shinydashboard,dashboardSidebar) importFrom(shinydashboard,menuItem) +importFrom(shinydashboard,renderMenu) importFrom(shinydashboard,sidebarMenu) +importFrom(shinydashboard,sidebarMenuOutput) importFrom(shinydashboard,tabItem) importFrom(shinydashboard,tabItems) importFrom(shinydashboard,updateTabItems) diff --git a/R-pkg/R/01_directory.R b/R-pkg/R/01_directory.R index b85a4e29..954abbe1 100644 --- a/R-pkg/R/01_directory.R +++ b/R-pkg/R/01_directory.R @@ -75,3 +75,44 @@ mk_proj_dir <- function(tag = "diyabc") { # output return(tmp_dir) } + + +#' Clean project directory +#' @keywords internal +#' @description Remove all files and sub-folders in a given project directory. +#' @author Ghislain Durif +#' @param proj_dir character string, path to project directory. +clean_proj_dir <- function(proj_dir) { + # check if project directory exists + if(length(proj_dir) != 1 && !dir.exists(proj_dir)) { + stop("Input argument should be a valid path to a project directory") + } + # sub-folders + subdir_list <- list.dirs(proj_dir, full.names = FALSE, recursive = FALSE) + subdir_list <- subdir_list[subdir_list != ""] + if(length(subdir_list) > 0) { + fs::dir_delete(file.path(proj_dir, subdir_list)) + } + # files + file_list <- list.files(proj_dir) + if(length(file_list) > 0) { + fs::file_delete(file.path(proj_dir, file_list)) + } +} + + +#' Clean binary directory +#' @keywords internal +#' @author Ghislain Durif +clean_bin_dir <- function() { + # bin directory + path <- bin_dir() + # existing binary file + existing_bin_files <- list.files(path) + existing_bin_files <- existing_bin_files[str_detect(existing_bin_files, + "diyabc|abcranger|dll")] + # delete diyabc/abcranger files + if(length(existing_bin_files) > 0) { + fs::file_delete(file.path(path, existing_bin_files)) + } +} diff --git a/R-pkg/R/02_regex.R b/R-pkg/R/02_regex.R index cd418dd5..6f0f72ea 100644 --- a/R-pkg/R/02_regex.R +++ b/R-pkg/R/02_regex.R @@ -12,6 +12,13 @@ num_regex <- function() { return("[0-9]+\\.?[0-9]*") } +#' return numerical (xxEyy notation) regex +#' @keywords internal +#' @author Ghislain Durif +numexp_regex <- function() { + return("[0-9]+\\.?[0-9]*((E|e)\\-?[0-9]+)?") +} + #' return event numerical rate regex #' @keywords internal #' @author Ghislain Durif diff --git a/R-pkg/R/03_utils.R b/R-pkg/R/03_utils.R index 39c33bc2..8cff1382 100644 --- a/R-pkg/R/03_utils.R +++ b/R-pkg/R/03_utils.R @@ -42,21 +42,6 @@ find_bin <- function(prog = "diyabc") { return(bin_file) } -#' Clean binary directory -#' @keywords internal -#' @author Ghislain Durif -clean_bin_dir <- function() { - # bin directory - path <- bin_dir() - # existing binary file - existing_bin_files <- list.files(path) - existing_bin_files <- existing_bin_files[str_detect(existing_bin_files, - "diyabc|abcranger|dll")] - # delete diyabc/abcranger files - if(length(existing_bin_files) > 0) { - fs::file_delete(file.path(path, existing_bin_files)) - } -} #' Find which OS is running #' @keywords internal diff --git a/R-pkg/R/04_environment.R b/R-pkg/R/04_environment.R index bf8da83b..7407850b 100644 --- a/R-pkg/R/04_environment.R +++ b/R-pkg/R/04_environment.R @@ -6,85 +6,138 @@ #' (diyabc-rf analysis and data generation) inside `diyabcGUI` global #' environment. init_diyabc_env <- function() { - assign("ap", NULL, env) # diyabc-rf analysis project - assign("dp", NULL, env) # data generation project + # diyabc-rf analysis project + assign("ap", reactiveValues(), env) + + # diyabc-rf dataset metadata + assign("md", reactiveValues(), env) + + # diyabc-rf training set simulation sub-module + assign("ts", reactiveValues(), env) + + # diyabc-rf random forest sub-module + assign("rf", reactiveValues(), env) + + # data generation project + assign("dp", reactiveValues(), env) } #' Initialize environment for DIYABC-RF pipeline #' @keywords internal #' @author Ghislain Durif init_diyabcrf_env <- function() { + ## environment attributes - # clean environment - tmp <- reactiveValues( + # analysis project + tmp_ap <- list( ## project setup - name = NULL, # project name - dir = NULL, # project directory - locus_type = NULL, # "SNP" or "MSS" - seq_mode = NULL, # "IndSeq" or "PoolSeq" + proj_name = NULL, # project name + proj_dir = NULL, # project directory + locus_type = "snp", # "snp" or "mss" + seq_mode = "indseq", # "indseq" or "poolseq" + proj_type = "new", # "new", "existing" or "example" + proj_file_list = NULL, # content of the project + file_modif = NULL, # counter for project file modification + # (upload, new header, ...) + header_check = NULL, # result of header file check + reftable_check = NULL, # result of reftable file check + statobs_check = NULL, # result of statobs file check ## observed data data_file = NULL, # observed data file name - metadata = reactiveValues( - # number of loci in the data file - n_loci = NULL, - # table of locus description: name, type, number - locus_des = NULL - ), - ## training set simulation - ts = reactiveValues( - # list of historical models - hist_model = NULL, - # list number of parameters per model - n_param = NULL, - # list of model priors (discrete probabilities) - model_prior = NULL, - # table of historical model parameters (name, type, priors) - param = NULL, - # list of conditions on historical parameters - cond = NULL, - # table of loci description - loci_des = NULL, - # number of loci group - n_group = NULL, - # list of group priors for MSS data - mss_prior = NULL, - # specific ref table column names for MSS data - mss_reftab_colname = NULL - ), - ## random forest analysis - rf = reactiveValues( - # analysis mode: "param_estim" or "model_choice" - mode = NULL, - # number of samples - n_ref = NULL, - # minimal node size - min_node_size = NULL, - # number of tree - n_tree = NULL, - # number of noise columns - noise_columns = NULL, - # boolean: if TRUE, disable LDA for model choice or PLS for - # parameter estimation - no_linear = NULL, - # percentage of maximum explained Y-variance for retaining pls axis - pls_max_var = NULL, - # Chosen scenario (mandatory for parameter estimation) - chosen_scenario = NULL, - # number of oob testing samples (mandatory for parameter estimation) - noob = NULL, - # name of the parameter of interest (mandatory for parameter - # estimation) - parameter = NULL, - # subset and/or groups of models - groups = NULL - ) + data_check = NULL # result of data file check + ) + + # dataset metadata + tmp_md <- list( + # number of loci in the data file + n_loci = NULL, + # table of locus description: name, type, number + locus_des = NULL + ) + + # training set simulation + tmp_ts <-list( + # list of historical models + hist_model = NULL, + # list number of parameters per model + n_param = NULL, + # list of model priors (discrete probabilities) + model_prior = NULL, + # table of historical model parameters (name, type, priors) + param = NULL, + # list of conditions on historical parameters + cond = NULL, + # table of loci description + loci_des = NULL, + # number of loci group + n_group = NULL, + # list of group priors for MSS data + mss_prior = NULL, + # specific ref table column names for MSS data + mss_reftab_colname = NULL + ) + + # random forest analysis + tmp_rf <- list( + # analysis mode: "param_estim" or "model_choice" + mode = NULL, + # number of samples + n_ref = NULL, + # minimal node size + min_node_size = NULL, + # number of tree + n_tree = NULL, + # number of noise columns + noise_columns = NULL, + # boolean: if TRUE, disable LDA for model choice or PLS for + # parameter estimation + no_linear = NULL, + # percentage of maximum explained Y-variance for retaining pls axis + pls_max_var = NULL, + # Chosen scenario (mandatory for parameter estimation) + chosen_scenario = NULL, + # number of oob testing samples (mandatory for parameter estimation) + noob = NULL, + # name of the parameter of interest (mandatory for parameter + # estimation) + parameter = NULL, + # subset and/or groups of models + groups = NULL ) - # init project directory - tmp$dir <- mk_proj_dir("diyabc_rf") - # init env - assign("ap", tmp, env) + ## clean up and define environment + list2reactiveValues(tmp_ap, ap) + list2reactiveValues(tmp_md, md) + list2reactiveValues(tmp_ts, ts) + list2reactiveValues(tmp_rf, rf) +} + +#' Project env reset +#' @keywords internal +#' @author Ghislain Durif +reset_ap <- function() { + env$ap$header_check <- NULL + env$ap$reftable_check <- NULL + env$ap$statobs_check <- NULL + env$ap$data_file <- NULL + env$ap$data_check <- NULL +} + +#' Reset environment for DIYABC-RF pipeline +#' @keywords internal +#' @author Ghislain Durif +reset_diyabcrf_env <- function() { + # proj dir + env$ap$proj_dir <- mk_proj_dir("diyabc_rf") + # ap reset + reset_ap() + # reset specific var + env$ap$locus_type <- "snp" + env$ap$seq_mode <- "indseq" + env$ap$proj_type <- "new" + # proj files + update_proj_file("ap") } #' Initialize environment for data generation pipeline @@ -93,12 +146,15 @@ init_diyabcrf_env <- function() { init_datagen_env <- function() { # clean environment - tmp <- reactiveValues( + tmp_dp <- list( ## project setup - name = NULL, # project name - dir = NULL, # project directory - locus_type = NULL, # "SNP" or "MSS" - seq_mode = NULL, # "IndSeq" or "PoolSeq" + proj_name = NULL, # project name + proj_dir = NULL, # project directory + locus_type = NULL, # "SNP" or "MSS" + seq_mode = NULL, # "IndSeq" or "PoolSeq" + proj_file_list = NULL, # content of the project + file_modif = NULL, # counter for project file modification + # (upload, new header, ...) ## data description model = NULL, # historical model param = NULL, # list of parameter values @@ -110,18 +166,28 @@ init_datagen_env <- function() { sex_ratio = NULL # sex ratio in the simulation ) - # init project directory - tmp$dir <- mk_proj_dir("diyabc_datagen") - - # init env - assign("dp", tmp, env) + ## clean up and define environment + list2reactiveValues(tmp_dp, dp) +} + +#' Reset environment for data generation pipeline +#' @keywords internal +#' @author Ghislain Durif +reset_datagen_env <- function() { + # proj dir + env$dp$proj_dir <- mk_proj_dir("diyabc_datagen") } + #' Print content of diyabc-rf project sub-environment for debugging purpose #' @keywords internal #' @author Ghislain Durif debug_ap <- function() { - pprint(reactiveValuesToList(env$ap)) + if(is.reactivevalues(env$ap)) { + pprint(reactiveValuesToList(env$ap)) + } else { + pprint(env$ap) + } } #' Print content of metadata inside diyabc-rf project sub-environment @@ -129,7 +195,11 @@ debug_ap <- function() { #' @keywords internal #' @author Ghislain Durif debug_ap_metadata <- function() { - pprint(reactiveValuesToList(env$ap$metadata)) + if(is.reactivevalues(env$ap$metadata)) { + pprint(reactiveValuesToList(env$ap$metadata)) + } else { + pprint(env$ap$metadata) + } } #' Print content of training set simulation setup inside diyabc-rf project @@ -137,7 +207,11 @@ debug_ap_metadata <- function() { #' @keywords internal #' @author Ghislain Durif debug_ap_ts <- function() { - pprint(reactiveValuesToList(env$ap$ts)) + if(is.reactivevalues(env$ap)) { + pprint(reactiveValuesToList(env$ap$ts)) + } else { + pprint(env$ap$ts) + } } #' Print content of random forest setup inside diyabc-rf project @@ -145,7 +219,11 @@ debug_ap_ts <- function() { #' @keywords internal #' @author Ghislain Durif debug_ap_rf <- function() { - pprint(reactiveValuesToList(env$ap$rf)) + if(is.reactivevalues(env$ap$rf)) { + pprint(reactiveValuesToList(env$ap$rf)) + } else { + pprint(env$ap$rf) + } } #' Print content of data generation project sub-environment for debugging @@ -153,7 +231,32 @@ debug_ap_rf <- function() { #' @keywords internal #' @author Ghislain Durif debug_dp <- function() { - pprint(reactiveValuesToList(env$dp)) + if(is.reactivevalues(env$ap)) { + pprint(reactiveValuesToList(env$dp)) + } else { + pprint(env$dp) + } +} + +#' Fill reactiveValues from package global environment with named list elements +#' @keywords internal +#' @author Ghislain Durif +#' @param named_list list of named elements to fill the reactiveValues `rval` +#' input arguments. +#' @param tag `reactiveValues` variable from package global environment to be +#' filled with named elements from `named_list` input arguments. +list2reactiveValues <- function(named_list, tag) { + # check + if(length(names(named_list)) != length(named_list)) { + stop("'named_list' input arg should be a named list.") + } + # elements names + name_vec <- names(named_list) + # fill + for(ind in 1:length(named_list)) { + env[[as.character(substitute(tag))]][[name_vec[ind]]] <<- + named_list[[ind]] + } } @@ -180,3 +283,14 @@ getter <- function(var1, var2, env = diyabc_env) { setter <- function(val, var1, var2, env = diyabc_env) { env[[ as.character(substitute(var1)) ]][[ as.character(substitute(var2)) ]] <<- val } + +#' Update project files +#' @keywords internal +#' @author Ghislain Durif +update_proj_file <- function(tag = "ap") { + # increment file modification counter + env[[tag]]$file_modif <- + ifelse(!is.null(env[[tag]]$file_modif), env[[tag]]$file_modif, 0) + 1 + # file list + env[[tag]]$proj_file_list <- list.files(env[[tag]]$proj_dir) +} diff --git a/R-pkg/R/21_historical_model.R b/R-pkg/R/21_historical_model.R index 70d4fdbb..cd811cf4 100644 --- a/R-pkg/R/21_historical_model.R +++ b/R-pkg/R/21_historical_model.R @@ -74,7 +74,7 @@ hist_model_server <- function(input, output, session, updateTextAreaInput(session, "scenario", value = local$raw_scenario) }) - # unvalidate if scenario is edited + # invalidate if scenario is edited observeEvent(input$scenario, { local$validated <- FALSE }) diff --git a/R-pkg/R/23_genetic_loci.R b/R-pkg/R/23_genetic_loci.R index 2b41e4b5..27c2ed22 100644 --- a/R-pkg/R/23_genetic_loci.R +++ b/R-pkg/R/23_genetic_loci.R @@ -29,13 +29,10 @@ data_type_ui <- function(id) { #' Data type module server #' @keywords internal #' @author Ghislain Durif -#' @importFrom shinyjs disable enable -data_type_server <- function(input, output, session) { - # init output - out <- reactiveValues( - locus_type = NULL, - seq_mode = NULL - ) +#' @param tag character string, type of project identified by `"ap"` +#' (for diyabc-rf analysis project) or `"dp"` (data generation project). +data_type_server <- function(input, output, session, tag = "ap") { + # disable seq mode if relevant observeEvent(input$locus_type, { req(input$locus_type) @@ -47,22 +44,16 @@ data_type_server <- function(input, output, session) { shinyjs::enable("seq_mode") } }) - # react + + # update env observeEvent(input$locus_type, { req(input$locus_type) - out$locus_type <- input$locus_type + env[[tag]]$locus_type <<- input$locus_type }) observeEvent(input$seq_mode, { req(input$seq_mode) - out$seq_mode <- input$seq_mode + env[[tag]]$seq_mode <<- input$seq_mode }) - # # debugging - # observe({ - # logging("locus type:", out$locus_type) - # logging("seq mode:", out$seq_mode) - # }) - # output - return(out) } diff --git a/R-pkg/R/31_project_utils.R b/R-pkg/R/31_project_utils.R index 493173c8..1ce01a80 100644 --- a/R-pkg/R/31_project_utils.R +++ b/R-pkg/R/31_project_utils.R @@ -1,7 +1,7 @@ #' Project naming module ui #' @keywords internal #' @author Ghislain Durif -proj_name_ui <- function(id, label = "Project name", default = NULL) { +proj_name_ui <- function(id, label = "Project name") { ns <- NS(id) tagList( h3(label), @@ -11,113 +11,63 @@ proj_name_ui <- function(id, label = "Project name", default = NULL) { textInput( ns("proj_name"), label = NULL, - value = ifelse( - !is.null(default), - default, - "" - ), placeholder = "project name" ) ), column( width = 3, actionButton( - ns("validate_proj_name"), + ns("validate"), label = "Validate", icon = icon("check"), width = '100%' ) ) ), - uiOutput(ns("feedback_proj_name")) + uiOutput(ns("feedback")) ) } #' Project naming module server #' @keywords internal #' @author Ghislain Durif -proj_name_server <- function(input, output, session) { +#' @param tag character string, type of project identified by `"ap"` +#' (for diyabc-rf analysis project) or `"dp"` (data generation project). +proj_name_server <- function(input, output, session, tag = "ap") { # init local - local <- reactiveValues( - modified_proj_name = FALSE, - valid_proj_name = FALSE - ) - - # init output - out <- reactiveValues(proj_name = NULL, valid_proj_name = FALSE) + local <- reactiveValues(modified = TRUE) # new input observeEvent(input$proj_name, { - shinyjs::enable("validate_proj_name") - local$modified_proj_name <- TRUE - - # check project name - local$valid_proj_name <- FALSE - if(!is.null(input$proj_name)) { - if(str_length(input$proj_name) > 0) { - local$valid_proj_name <- TRUE - } - } + shinyjs::enable("validate") + local$modified <- TRUE }) # validate input - observeEvent(input$validate_proj_name, { - req(input$validate_proj_name) - - local$modified_proj_name <- FALSE - - out$proj_name <- input$proj_name - - out$valid_proj_name <- local$valid_proj_name - - shinyjs::disable("validate_proj_name") - }) - - # valid proj name ? - observeEvent(local$valid_proj_name, { - req(!is.null(local$valid_proj_name)) + observeEvent(input$validate, { + req(input$validate) - if(!local$valid_proj_name) { - shinyjs::enable("validate_proj_name") + local$modified <- FALSE + + # check project name + if(isTruthy(input$proj_name)) { + shinyjs::disable("validate") + env[[tag]]$proj_name <<- input$proj_name } }) + # # debugging + # observe({ + # logging("proj name:", env[[tag]]$proj_name) + # }) + # feedback on project name - output$feedback_proj_name <- renderUI({ - feedback<- tagList() - - # check if modified proj name - if(!is.null(local$modified_proj_name)) { - if(local$modified_proj_name) { - feedback <- tagAppendChild( - feedback, - helpText( - icon("warning"), - "Project name is not validated." - ) - ) - } - } - - # check proj name - if(!is.null(local$valid_proj_name)) { - if(!local$valid_proj_name) { - feedback <- tagAppendChild( - feedback, - helpText( - icon("warning"), "Project name is missing." - ) - ) - } - } else { - - } - - # feedback - feedback + observe({ + feedbackWarning( + "proj_name", + local$modified || !isTruthy(input$proj_name), + "Project name is missing or not validated." + ) }) - - # output - return(out) } diff --git a/R-pkg/R/32_project_admin.R b/R-pkg/R/32_project_admin.R index 85d03247..eb5b049c 100644 --- a/R-pkg/R/32_project_admin.R +++ b/R-pkg/R/32_project_admin.R @@ -37,55 +37,49 @@ proj_admin_ui <- function(id) { #' Project administration server #' @keywords internal #' @author Ghislain Durif -proj_admin_server <- function(input, output, session, - proj_dir = reactive({NULL}), - proj_name = reactive({NULL})) { - # init local - local <- reactiveValues( - proj_dir = NULL, - proj_name = NULL - ) - # get input - observe({ - local$proj_dir <- proj_dir() - local$proj_name <- proj_name() - }) +#' @param tag character string, type of project identified by `"ap"` +#' (for diyabc-rf analysis project) or `"dp"` (data generation project). +proj_admin_server <- function(input, output, session, tag = NULL) { + # init output out <- reactiveValues(reset = NULL) - # save - output$save <- downloadHandler( - filename = function() { - file_name <- "project_name.zip" - if(!is.null(local$proj_name)) { - if(str_length(local$proj_name) > 0) { - file_name <- str_c(local$proj_name, ".zip") + + # check input + if(isTruthy(tag)) { + + # save + output$save <- downloadHandler( + filename = function() { + file_name <- "project_name.zip" + if(isTruthy(env[[tag]]$proj_name)) { + file_name <- str_c(env[[tag]]$proj_name, ".zip") } + return(file_name) + }, + content = function(file) { + req(env[[tag]]$proj_dir) + wd <- getwd() + on.exit(setwd(wd)) + setwd(env[[tag]]$proj_dir) + cleanup_diyabc_run(env[[tag]]$proj_dir) + cleanup_abcranger_run(env[[tag]]$proj_dir) + zip::zip(file, list.files(env[[tag]]$proj_dir)) } - return(file_name) - }, - content = function(file) { - wd <- getwd() - on.exit(setwd(wd)) - setwd(local$proj_dir) - cleanup_diyabc_run(local$proj_dir) - cleanup_abcranger_run(local$proj_dir) - zip::zip(file, list.files(local$proj_dir)) - } - ) - - ## reset - observeEvent(input$reset, { - ask_confirmation( - inputId = "reset_confirmation", - title = "Want to confirm ?" ) - }) - observeEvent(input$reset_confirmation, { - req(!is.null(input$reset_confirmation)) - if(isTRUE(input$reset_confirmation)) { + + ## reset + observeEvent(input$reset, { + ask_confirmation( + inputId = "reset_confirmation", + title = "Want to confirm ?" + ) + }) + + observeEvent(input$reset_confirmation, { + req(input$reset_confirmation) out$reset <- ifelse(!is.null(out$reset), out$reset, 0) + 1 - } - }) + }) + } ## output return(out) diff --git a/R-pkg/R/41_input_read.R b/R-pkg/R/41_input_read.R new file mode 100644 index 00000000..a169354f --- /dev/null +++ b/R-pkg/R/41_input_read.R @@ -0,0 +1,646 @@ +#' Read and parse headerRF.txt and header.txt file +#' @keywords internal +#' @author Ghislain Durif +#' @description +#' Content: see doc +#' @param file_name string, (server-side) path to a headersim file. +#' @param file_type string, MIME file type. +#' @param locus_type string, "snp" or "mss" +#' @importFrom readr read_file +read_header <- function(file_name, file_type, locus_type = "snp") { + + # init output + out <- list( + msg = list(), valid = TRUE, + data_file = NULL, locus_desc = NULL, + n_param = NULL, n_prior = NULL, n_stat = NULL, + cond_list = NULL, prior_list = NULL, + n_group = NULL, group_prior_list = NULL, + n_scen = NULL, scenario_list = NULL, n_param_scen = NULL, + simu_mode = NULL, + header_file = NULL + ) + + current_line <- 0 + + # check file_name + tmp <- check_file_name(file_name) + if(!tmp) { + out$valid <- FALSE + msg <- tagList("Invalid file name.") + out$msg <- append(out$msg, list(msg)) + } + + # check file_type + if(file_type != "text/plain") { + out$valid <- FALSE + msg <- tagList("Wrong file type.") + out$msg <- append(out$msg, list(msg)) + } + + # continue ? + if(!out$valid) { + return(out) + } + + ## HEADER FILE NAME + out$header_file <- basename(file_name) + + ## HEADER FILE CONTENT + # read whole file in one string and split it by new line + header <- unlist(str_split(read_file(file_name), "\n")) + + ## data file + out$data_file <- header[1] + header <- header[-1] + current_line <- current_line + 1 + + ## number of parameters and statistics + pttrn <- "^[0-9]+ parameters and [0-9]+ summary statistics$" + strng <- header[1] + header <- header[-1] + current_line <- current_line + 1 + if(!str_detect(strng, pttrn)) { + out$valid <- FALSE + msg <- tagList( + "Issue with line", tags$b(as.character(current_line)), + "that should match the pattern", tags$code(pttrn), "." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + pttrn <- "[0-9]+(?= parameters)" + out$n_param <- as.integer(str_extract(strng, pttrn)) + pttrn <- "[0-9]+(?= summary statistics)" + out$n_stat <- as.integer(str_extract(strng, pttrn)) + + ## empty line + strng <- header[1] + header <- header[-1] + current_line <- current_line + 1 + if(strng != "") { + out$valid <- FALSE + msg <- tagList( + "Issue with line", tags$b(as.character(current_line)), + "that should be empty." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## scenarios + pttrn <- "^[0-9]+ scenarios:( [0-9]+)+ ?$" + strng <- header[1] + header <- header[-1] + current_line <- current_line + 1 + # find section + if(!str_detect(strng, pttrn)) { + out$valid <- FALSE + msg <- tagList( + "Issue with line", tags$b(as.character(current_line)), + "that should match the pattern", tags$code(pttrn), "." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## scenario config + pttrn <- "[0-9]+(?= scenarios:)" + out$n_scen <- as.integer(str_extract(strng, pttrn)) + pttrn <- "(?<= )[0-9]+" + nrow_per_scenario <- as.integer(unlist(str_extract_all(strng, pttrn))) + ## extract scenarii + row_seq <- cumsum(c(1, nrow_per_scenario+1)) + parsed_scenario_list <- lapply( + split( + header[(min(row_seq):(max(row_seq)-1))], + rep(seq(row_seq), diff(c(row_seq, max(row_seq)))) + ), + function(content) parse_header_scenario(content) + ) + # next + header <- header[-(1:(max(row_seq)-1))] + current_line <- current_line + max(row_seq) - 1 + # check extracted scnenarii + scen_check <- unlist(lapply( + parsed_scenario_list, function(item) item$valid + )) + scen_id <- unlist(lapply( + parsed_scenario_list, function(item) item$id + )) + if(!all(scen_check)) { + out$valid <- FALSE + msg <- tagList( + "Issue with format of following scenarii:", + str_c(scen_id[!scen_check], collapse = ", "), "." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + # number of parameters per scenario + out$n_param_scen <- unlist(unname(lapply( + parsed_scenario_list, function(item) item$n_param + ))) + # extract raw scenario list + out$scenario_list <- unlist(unname(lapply( + parsed_scenario_list, function(item) item$scenario + ))) + + ## empty line + strng <- header[1] + header <- header[-1] + current_line <- current_line + 1 + if(strng != "") { + out$valid <- FALSE + msg <- tagList( + "Issue with line", tags$b(as.character(current_line)), + "that should be empty." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## historical parameters + pttrn <- "^historical parameters priors \\([0-9]+,[0-9]+\\)$" + strng <- header[1] + header <- header[-1] + current_line <- current_line + 1 + # find section + if(!any(str_detect(strng, pttrn))) { + out$valid <- FALSE + msg <- tagList( + "Issue with line", tags$b(as.character(current_line)), + "that should match the pattern", tags$code(pttrn), "." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # number of priors + pttrn <- "(?<= \\()[0-9]+" + out$n_prior <- as.integer(str_extract(strng, pttrn)) + # number of conditions + pttrn <- "[0-9]+(?=\\)$)" + out$n_cond <- as.integer(str_extract(strng, pttrn)) + + ## parameter config + # extract priors + out$prior_list <- header[1:out$n_prior] + header <- header[-(1:out$n_prior)] + current_line <- current_line + out$n_prior + # check extracted priors + prior_check <- unlist(lapply(out$prior_list, check_header_prior)) + if(!all(prior_check)) { + out$valid <- FALSE + msg <- tagList( + "Issue with format of parameter priors at lines:", + str_c(which(!prior_check) + current_line, collapse = ", "), + "." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # extract conditions + if(out$n_cond > 0) { + + out$cond_list <- header[1:out$n_cond] + header <- header[-(1:out$n_cond)] + current_line <- current_line + out$n_cond + # check extracted conds + cond_check <- unlist(lapply(out$cond_list, check_header_cond)) + if(!all(cond_check)) { + out$valid <- FALSE + msg <- tagList( + "Issue with format of parameter conditions at lines:", + str_c(which(!cond_check) + current_line, collapse = ", "), + "." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # generation mode + pttrn <- "DRAW UNTIL" + out$simu_mode <- header[1] + header <- header[-1] + current_line <- current_line + 1 + if(out$simu_mode != pttrn) { + out$valid <- FALSE + msg <- tagList( + "Missing 'DRAW UNTIL' after conditions at lines:", + as.character(current_line+1), + "." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + } + + ## empty line (and check for unnecessary simulation mode) + strng <- header[1] + header <- header[-1] + current_line <- current_line + 1 + if(strng != "") { + if(strng == "DRAW UNTIL") { + out$valid <- FALSE + msg <- tagList( + "Issue with line", tags$b(as.character(current_line)), + "unnecessary 'DRAW UNTIL'." + ) + out$msg <- append(out$msg, list(msg)) + } else { + out$valid <- FALSE + msg <- tagList( + "Issue with line", tags$b(as.character(current_line)), + "that should be empty." + ) + out$msg <- append(out$msg, list(msg)) + } + return(out) + } + + ## loci description + pttrn <- "^loci description \\([0-9]+\\)$" + strng <- header[1] + header <- header[-1] + current_line <- current_line + 1 + # find section + if(!any(str_detect(strng, pttrn))) { + out$valid <- FALSE + msg <- tagList( + "Issue with line", tags$b(as.character(current_line)), + "that should match the pattern", tags$code(pttrn), "." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # number of loci description + pttrn <- "(?<=^loci description \\()[0-9]+" + out$n_locus_desc <- as.integer(str_extract(strng, pttrn)) + + # extract loci description + out$locus_desc <- header[1:out$n_locus_desc] + header <- header[-(1:out$n_locus_desc)] + current_line <- current_line + out$n_locus_desc + # check extracted loci description + locus_desc_check <- unlist(lapply( + out$locus_desc, + check_header_locus_desc, type = locus_type + )) + if(!all(locus_desc_check)) { + out$valid <- FALSE + msg <- tagList( + "Issue with format of locus description at lines:", + str_c(which(!locus_desc_check) + current_line, collapse = ", "), + "." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## empty line + strng <- header[1] + header <- header[-1] + current_line <- current_line + 1 + if(strng != "") { + out$valid <- FALSE + msg <- tagList( + "Issue with line", tags$b(as.character(current_line)), + "that should be empty." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## group prior (for microsat/sequence) + if(locus_type == "mss") { + + pttrn <- "^group priors \\([0-9]+\\)$" + strng <- header[1] + header <- header[-1] + current_line <- current_line + 1 + # find section + if(!any(str_detect(strng, pttrn))) { + out$valid <- FALSE + msg <- tagList( + "Issue with line", tags$b(as.character(current_line)), + "that should match the pattern", tags$code(pttrn), "." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # number of group prior + pttrn <- "(?<=^group priors \\()[0-9]" + out$n_group <- as.integer(str_extract(strng, pttrn)) + + # find next section + tmp_current_line <- current_line + end_sec <- head(which(header == ""), 1) + content <- header[1:(end_sec-1)] + header <- header[-(1:(end_sec-1))] + current_line <- current_line + end_sec - 1 + + # extract group prior + check_group_prior <- parse_header_group_prior( + content, out$n_group, tmp_current_line + ) + + if(!check_group_prior$valid) { + out$valid <- FALSE + msg <- tagList( + "Issue with format of group prior around line ", + tags$b(as.character(check_group_prior$current_line)), + "." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + current_line <- check_group_prior$current_line + out$group_prior_list <- check_group_prior$group_prior + + ## empty line + strng <- header[1] + header <- header[-1] + current_line <- current_line + 1 + if(strng != "") { + out$valid <- FALSE + msg <- tagList( + "Issue with line", tags$b(as.character(current_line)), + "that should be empty." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + } + + ## group summary statistics + pttrn <- "^group summary statistics \\([0-9]+\\)$" + strng <- header[1] + header <- header[-1] + current_line <- current_line + 1 + # find section + if(!any(str_detect(strng, pttrn))) { + out$valid <- FALSE + msg <- tagList( + "Issue with line", tags$b(as.character(current_line)), + "that should match the pattern", tags$code(pttrn), "." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # number of summary stats + pttrn <- "(?<=^group summary statistics \\()[0-9]+" + tmp_n_stat <- sum(as.integer(str_extract(strng, pttrn)), na.rm = TRUE) + # check + if(out$n_stat != tmp_n_stat) { + out$valid <- FALSE + msg <- tagList( + "Issue with line", tags$b(as.character(current_line)), + "the number of summary statistics is different", + "from the number given in line 2." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## + # TODO: parse end of file + + # output + return(out) +} + + +#' Parse scenario in header file +#' @keywords internal +#' @author Ghislain Durif +#' @description +#' Content: see doc +#' @param content string vector, scenario description +parse_header_scenario <- function(content) { + + # init output + out <- list( + valid = TRUE, + id = NULL, n_param = NULL, + prior = NULL, scenario = NULL + ) + + # description (1st line) + strng <- content[1] + pttrn <- str_c("^scenario [0-9]+ \\[", num_regex(), "\\] \\([0-9]+\\)$") + if(!str_detect(strng, pttrn)) { + out$valid <- FALSE + } else { + # scenario id + pttrn <- "(?<=^scenario )[0-9]+" + out$id <- as.integer(str_extract(strng, pttrn)) + # scenario prior + pttrn <- str_c("(?<= \\[)", num_regex(), "(?=\\] )") + out$prior <- as.numeric(str_extract(strng, pttrn)) + # number of parameters in scenario + pttrn <- "(?<= \\()[0-9]+(?=\\)$)" + out$n_param <- as.integer(str_extract(strng, pttrn)) + ## scenario + out$scenario <- str_c(content[-1], collapse = "\n") + } + ## output + return(out) +} + + +#' Parse scenario in header file +#' @keywords internal +#' @author Ghislain Durif +#' @description +#' Content: see doc +#' @param content string vector, scenario description +parse_header_group_prior <- function(content, n_group, current_line) { + + # init output + out <- list( + valid = FALSE, ind = NULL, group_prior = NULL, mss_type = NULL, + current_line = current_line + ) + + # loop over group priors + for(ind in 1:n_group) { + # which group + out$ind <- ind + # description (1st line) + pttrn <- str_c("^group G[0-9]+ \\[(M|S)\\]$") + strng <- content[1] + content <- content[-1] + out$current_line <- out$current_line + 1 + if(!str_detect(strng, pttrn)) { + out$valid <- FALSE + return(out) + } else { + group_prior_head <- strng + # locus type + pttrn <- "(?<=\\[)(M|S)(?=\\])" + out$mss_type[ind] <- str_extract(strng, pttrn) + n_line <- switch ( + out$mss_type[ind], + "M" = 6, + "S" = 7, + NA + ) + out$valid <- check_header_group_prior( + head(content, n_line), type = out$mss_type[ind] + ) + if(!out$valid) { + return(out) + } + out$group_prior[ind] <- str_c( + c(group_prior_head, head(content, n_line)), collapse = "\n" + ) + content <- content[-(1:n_line)] + out$current_line <- out$current_line + n_line + } + } + + ## output + return(out) +} + + + +#' Read and parse statobsRF.txt file +#' @keywords internal +#' @author Ghislain Durif +#' @description +#' Content: see doc +#' @param file_name string, (server-side) path to a headersim file. +#' @param file_type string, MIME file type. +#' @param n_stat integer, number of summary statistics in reftable. +read_statobs <- function(file_name, file_type, n_stat) { + + # init output + out <- list(msg = list(), valid = TRUE) + + # check file_name + tmp <- check_file_name(file_name) + if(!tmp) { + out$valid <- FALSE + msg <- tagList("Invalid file name.") + out$msg <- append(out$msg, list(msg)) + } + + # check file_type + if(file_type != "text/plain") { + out$valid <- FALSE + msg <- tagList("Wrong file type.") + out$msg <- append(out$msg, list(msg)) + } + + # continue ? + if(!out$valid) { + return(out) + } + + # try reading it + tmp <- tryCatch( + read.table(file_name), + error = function(e) return(NULL) + ) + + if(is.null(tmp)) { + out$valid <- FALSE + msg <- tagList("Issue with statobs file format.") + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # check number of rows and columns + if(nrow(tmp) != 2) { + out$valid <- FALSE + msg <- tagList( + "The statobs file is supposed to contain two lines:", + "one with the names of the summary statistics,", + "and a second with the corresponding values", + "computed on the dataset." + ) + out$msg <- append(out$msg, list(msg)) + } + + if(ncol(tmp) != n_stat) { + out$valid <- FALSE + msg <- tagList( + "The number of summary statistics in the statobs file", + "does not correspond to the number of summary statistics", + "in the reftable file." + ) + out$msg <- append(out$msg, list(msg)) + } + + # output + return(out) +} + +#' Read and parse reftableRF.bin file +#' @keywords internal +#' @author Ghislain Durif +#' @author François-David Collin +#' @description +#' Content: see doc +#' @param file_name string, (server-side) path to a headersim file. +#' @param file_type string, MIME file type. +read_reftable <- function(file_name, file_type) { + + # init output + out <- list( + msg = list(), valid = TRUE, + n_rec = NULL, n_scen = NULL, n_rec_scen = NULL, n_param_scen = NULL, + n_stat = NULL + ) + + # check file_name + tmp <- check_file_name(file_name) + if(!tmp) { + out$valid <- FALSE + msg <- tagList("Invalid file name.") + out$msg <- append(out$msg, list(msg)) + } + + # check file_type + if(file_type != "application/octet-stream") { + out$valid <- FALSE + msg <- tagList("Wrong file type.") + out$msg <- append(out$msg, list(msg)) + } + + # continue ? + if(!out$valid) { + return(out) + } + + ## Reftable feed + # Stream from reftable file + to_read = file(file_name,"rb") + # number of records + out$n_rec <- readBin(to_read, integer(), endian = "little") + # number of scenarios + out$n_scen <- readBin(to_read, integer(), endian = "little") + # number of records for each scenario + out$n_rec_scen <- readBin(to_read, integer(), n = out$n_scen, + endian = "little") + # number of used parameters (non constant) + out$n_param_scen <- readBin(to_read, integer(), n = out$n_scen, + endian = "little") + # number of stats + out$n_stat <- readBin(to_read, integer(), endian = "little") + + # close stream + close(to_read) + + # output + return(out) +} \ No newline at end of file diff --git a/R-pkg/R/41_io_management.R b/R-pkg/R/41_io_management.R index c7a53af8..6d10c5f9 100644 --- a/R-pkg/R/41_io_management.R +++ b/R-pkg/R/41_io_management.R @@ -1,1725 +1,3 @@ -#' Check file_name -#' @keywords internal -#' @author Ghislain Durif -check_file_name <- function(file_name) { - valid <- TRUE - if(!is.character(file_name)) - valid <- FALSE - else if(!file.exists(file_name)) - valid <- FALSE - return(valid) -} - -#' Check data file -#' @keywords internal -#' @author Ghislain Durif -#' @param data_file string, data file name. -#' @param data_dir string, path to directory where data file is stored. -#' @param locus_type string, locus type `"mss"` or `"snp"`. -#' @param seq_mode string, `"indseq"` or `"poolseq"`. -#' @param expected_data_file string, expected data file name for -#' existing project, default is NULL. -#' @importFrom tools file_ext -check_data_file <- function(data_file, data_dir, locus_type, seq_mode, - expected_data_file = NULL) { - # output - out <- NULL - # ## debugging - # logging("data_file = ", data_file) - ## mss locus - if(locus_type == "mss") { - out <- check_mss_data_file(data_file, data_dir, expected_data_file) - ## snp locus / indseq - } else if((locus_type == "snp") & (seq_mode == "indseq")) { - out <- check_indseq_snp_data_file(data_file, data_dir, - expected_data_file) - ## snp locus / poolseq - } else if((locus_type == "snp") & (seq_mode == "poolseq")) { - out <- check_poolseq_snp_data_file(data_file, data_dir, - expected_data_file) - } else { - stop("Issue with input arguments") - } - - ## output - return(out) -} - -#' Check IndSeq SNP data file -#' @keywords internal -#' @author Ghislain Durif -#' @param data_file string, data file name. -#' @param data_dir string, path to directory where data file is stored. -#' @param expected_data_file string, expected data file name for -#' existing project, default is NULL. -#' @importFrom tools file_ext -#' @importFrom parallel makeCluster stopCluster -#' @importFrom pbapply pblapply -check_indseq_snp_data_file <- function(data_file, data_dir, - expected_data_file = NULL) { - # init output and intermediate - header <- NULL - info <- NULL - spec <- NULL - valid <- TRUE - - content <- NULL - indiv_info <- NULL - - locus <- NULL - locus_details <- NULL - n_loci <- NULL - n_pop <- NULL - n_indiv <- NULL - snp_type <- NULL - maf <- NULL - - err <- list() - msg <- list() - - # data path - data_path <- file.path(data_dir, data_file) - - ## file exists and is not empty? - if(!file.exists(data_path)) { - err <- append(err, "Input data file does not exist") - valid <- FALSE - } else if(file.info(data_path)$size == 0) { - err <- append(err, "Input data file is empty") - valid <- FALSE - } else { - ## check file type - if(tools::file_ext(data_path) != "snp") { - err <- append( - err, - "IndSeq SNP files should have an extension '.snp'." - ) - valid <- FALSE - } else { - ## info - info <- readLines(data_path, n = 1, warn = FALSE) - # sex ratio - pttrn <- "(?<=<)NM=[0-9]+\\.?[0-9]*NF(?=>)" - if(str_detect(info, pttrn)) { - msg <- append( - msg, - str_c("Sex ratio:", str_extract(info, pttrn), sep = " ") - ) - } else { - err <- append( - err, - str_c( - "Issue with IndSeq SNP file header first line:", - "missing 'sex ratio', see manual", sep = " " - ) - ) - valid <- FALSE - } - # MAF - pttrn <- "(?<=<)MAF=[:graph:]+(?=>)" - if(str_detect(info, pttrn)) { - msg <- append( - msg, - str_c("Minimum allele frequency criterion:", - str_extract(info, pttrn), sep = " ") - ) - # numeric MAF - pttrn <- "(?<=)" - if(str_detect(info, pttrn)) { - tmp_maf <- str_extract(info, pttrn) - maf <- as.numeric(tmp_maf) - if(maf < 0 | maf > 1) { - err <- append( - err, - str_c( - "Issue with IndSeq SNP file header first line:", - "MAF should be a real number between 0 and 1", - "or the keyword 'hudson', see manual.", - sep = " " - ) - ) - valid <- FALSE - } - } - } else { - msg <- append( - msg, - str_c( - "Missing 'minimum allele frequency criterion (MAF)'", - "in IndSeq SNP file header first line:", - "Hudson algorithm will be used.", sep = " " - ) - ) - } - # additional info - pttrn <- str_c( - "()", - "()", - sep = "|" - ) - add_info <- str_trim(str_replace_all(info, pttrn, "")) - if(str_length(add_info) > 0) { - msg <- append( - msg, - str_c("Additional information:", add_info, sep = " ") - ) - } - ## header - header <- tryCatch( - unname(unlist(read.table(file = data_path, - skip = 1, nrows = 1))), - error = function(e) e - ) - if("error" %in% class(header)) { - err <- append( - err, - str_c( - "Issue with IndSeq SNP file header second line:", - header$message, sep = " " - ) - ) - valid <- FALSE - } else { - # upper case - header <- str_to_upper(header) - # header format - if(header[1] != "IND" & header[2] != "SEX" & - header[3] != "POP" & - !all(header[-(1:3)] %in% c("A", "H", "X", "Y", "M"))) { - err <- append( - err, - str_c( - "Issue with IndSeq SNP file header second line:", - "required format is 'IND SEX POP' followed by", - "a letter indicator among 'A', 'H', 'X', 'Y', 'M'", - "for each SNP locus in the file (see manual).", - sep = " " - ) - ) - valid <- FALSE - } - # nb of locus - n_loci <- length(header) - 3 - # locus type - candidate_locus <- c("A", "H", "X", "Y", "M") - locus_encoding <- str_c(header[-(1:3)], collapse = " ") - locus_details <- Reduce("rbind", lapply( - candidate_locus, - function(pttrn) { - count <- str_count(locus_encoding, pttrn) - return(data.frame( - count = count, - type = pttrn, - stringsAsFactors = FALSE - )) - } - )) - # save snp type for filtering - snp_type <- header[-(1:3)] - # merge header - header_length <- length(header) - header <- str_c(header[1:min(20,header_length)], - collapse = " ") - if(header_length > 20) - header <- str_c(header, "...", sep = " ") - msg <- append( - msg, - str_c("header:", str_c("'", header, "'"), sep = " ") - ) - ## content - content <- tryCatch( - read.table(file = data_path, skip = 2), - error = function(e) e - ) - if("error" %in% class(content)) { - err <- append( - err, - str_c( - "Issue with IndSeq SNP data file content:", - content$message, sep = " " - ) - ) - valid <- FALSE - } else { - # check for SEX column with only F (interpreted as FALSE) - if(is.logical(content[,2])) { - content[,2] <- ifelse(content[,2], "T", "F") - } - # check number of locus - if(n_loci != (ncol(content) - 3)) { - err <- append( - err, - str_c( - "Issue with IndSeq SNP data file content:", - "number of loci not consistent between", - "file header and file content.", - sep = " " - ) - ) - valid <- FALSE - } - # check sex content - if(!all(str_to_upper(as.character(content[,2])) %in% - c("F", "M", "9"))) { - err <- append( - err, - str_c( - "Issue with IndSeq SNP data file content:", - "'SEX' column should contain only", - "'F' for female, 'M' for male", - "or '9' for missing values (see manual).", - sep = " " - ) - ) - valid <- FALSE - } - # number of pop - n_pop <- length(unique(content[,3])) - # number of individuals - n_indiv <- nrow(content) - msg <- append( - msg, - str_c( - "Sample:", - n_indiv, "individuals", - "from", n_pop, "populations", - sep = " ") - ) - # reformat data - indiv_info <- content[,1:3] - colnames(indiv_info) <- c("IND", "SEX", "POP") - content <- t(as.matrix(content[,-(1:3)])) - # check for any - if(any(is.na(content)) | any(is.na(indiv_info))) { - err <- append( - err, - str_c( - "Issue with IndSeq SNP file data content:", - "NA values were found", - sep = " " - ) - ) - valid <- FALSE - } else { - # check SNP encoding - if(!all(content %in% c(0,1,2,9))) { - err <- append( - err, - str_c( - "Issue with IndSeq SNP file data content:", - "SNP encoding should be '0', '1', '2'", - "or '9' for missing data (see manual).", - sep = " " - ) - ) - valid <- FALSE - } - } - } - } - } - ## filtering locus - if(valid) { - filtered_locus <- filter_snp_indseq( - content, indiv_info, snp_type, locus_details, maf - ) - if(is.null(filtered_locus)) { - err <- append( - err, - str_c( - "Issue with IndSeq SNP file data content:", - "error during SNP filtering, ", - "probable issue with SNP data encoding (see manual).", - sep = " " - ) - ) - valid <- FALSE - } else { - locus <- unname(unlist(lapply( - split(filtered_locus, seq(nrow(filtered_locus))), - function(item) { - if(item$count > 0) - return(str_c(item$count - item$filter, - " <", item$type, ">")) - else - return(NULL) - } - ))) - locus_msg <- unname(unlist(lapply( - split(filtered_locus, seq(nrow(filtered_locus))), - function(item) { - if(item$count > 0) { - item_type <- str_c("<", item$type, ">") - txt <- str_c( - item$count - item$filter, item_type, sep = " " - ) - if(item$filter > 0) { - txt <- str_c( - item$count - item$filter, item_type, - "(note:", item$filter, item_type, - "loci are filtered out", - "based on MAF criterion)", sep = " " - ) - } - if(item$mono > 0) { - str_c( - item$count - item$filter, item_type, - "(note:", item$filter, item_type, - "loci, including ", - item$mono, item_type, - "monomorphic loci, are filtered out", - "based on MAF criterion)", sep = " " - ) - } - return(txt) - } else - return(NULL) - } - ))) - msg <- append( - msg, - str_c("Total number of loci =", n_loci, sep = " ") - ) - msg <- append( - msg, - str_c("Available loci:", str_c(locus_msg, collapse = "; "), - sep = " ") - ) - } - } - ## output - spec <- lst(locus, n_indiv, n_loci, n_pop) - } - ## expected data file ? - if(!is.null(expected_data_file)) { - if(data_file != expected_data_file) { - err <- append( - err, - str_c( - "Data file expected by provided 'header.txt'", - "or 'headerRF.txt' file(s)", - "and data file provided by user are different.", - sep = " " - ) - ) - valid <- FALSE - } - } - ## output - out <- lst(header, info, spec, valid, err, msg) - return(out) -} - -#' Locus specific filter for IndSeq SNP data based on MAF -#' @keywords internal -#' @author Ghislain Durif -#' @param snp_data integer vector encoding for each individual the -#' number of ancestral allele for the loci, i.e. `0`, `1` and `2` for -#' autosome (`A`) and X-chromosome (`X`) in female, or `0` and `1` for -#' haploid locus (`H`), Y-chromosome (`Y`) in male and -#' mitochondrial locus (`M`). -#' Note : missing values are encoded by a `9`. -#' @param sex_data character vector encoding for each individual the sex, i.e. -#' `"F"` for female and `"M"` for male. -#' Note : missing values are encoded by a `"9"`. -#' @param locus_type character encoding the type of the locus -#' (among `A`, `H`, `X`, `Y`, `M`). -#' @param maf double between 0 and 1 -indseq_locus_filter <- function(snp_data, sex_data, locus_type, maf) { - - # init - af <- 0 - issue <- FALSE - - # identify missing data and compute data size without missing data - non_missing_data <- (snp_data != 9) - true_data_size <- sum(non_missing_data) - # identify female, male, and missing sex - female_ind <- (sex_data == "F") - male_ind <- (sex_data == "M") - missing_sex <- (sex_data == "9") - - if(locus_type == "A") { - ## autosome - if(true_data_size > 0) { - # reference allele frequence - af <- sum(snp_data[non_missing_data]) / (2 * true_data_size) - } - } else if(locus_type %in% c("H", "M")) { - ## haploid & mitochondrial - if(true_data_size > 0) { - # reference allele frequence - af <- sum(snp_data[non_missing_data]) / true_data_size - } - } else if(locus_type == "X") { - ## X-chromosome - issue <- any(missing_sex & non_missing_data) | - any(male_ind & (snp_data == 2)) - - specific_ind <- non_missing_data & !missing_sex - - specific_data_size <- sum(specific_ind) - - if(specific_data_size > 0) { - weighted_data_size <- 2 * sum(non_missing_data & female_ind) + - sum(non_missing_data & male_ind) - # reference allele frequence - af <- sum(snp_data[specific_ind]) / weighted_data_size - } - } else if(locus_type == "Y") { - ## Y-chromosome - issue <- any(missing_sex & non_missing_data) | - any(male_ind & (snp_data == 2)) | - any(female_ind & (snp_data != 9)) - - specific_ind <- non_missing_data & male_ind - - specific_data_size <- sum(specific_ind) - - if(specific_data_size > 0) { - # reference allele frequence - af <- sum(snp_data[specific_ind]) / specific_data_size - } - } - - # filtering - # TODO check > or >= for maf filter - filter_ind <- !((af >= maf) & ((1 - af) >= maf)) - mono_ind <- !((af > 0) & ((1 - af) > 0)) - if(maf == 0) { - filter_ind <- mono_ind - } - - # filter - return(data.frame( - filter = filter_ind, - mono = mono_ind, - issue = issue, - stringsAsFactors = FALSE - )) -} - -#' Filter IndSeq SNP data based on MAF -#' @keywords internal -#' @author Ghislain Durif -#' @param content data.frame with data file content, with columns -#' `IND` (individual id), `SEX` (female or male), `POP` (population id), -#' and remaining columns corresponding to SNPs where each entry encode the -#' number of ancestral allele for the loci, i.e. `0`, `1` and `2` for -#' autosome (`A`) and X-chromosome (`X`) in female, or `0` and `1` for -#' haploid locus (`H`), Y-chromosome (`Y`) in male and -#' mitochondrial locus (`M`). -#' Note : missing values are encoded by a `9`. -#' @param snp_type vector of column header, being `IND`, `SEX`, `POP` followed -#' by each locus type (among `A`, `H`, `X`, `Y`, `M`). -#' @param locus_details data.frame with two columns, `count` being the number -#' of locus for each type in the data, and `type` being the corresponding locus -#' type (among `A`, `H`, `X`, `Y`, `M`). -#' @importFrom dplyr bind_rows -filter_snp_indseq <- function(content, indiv_info, snp_type, locus_details, - maf=0.05) { - - ncore <- getOption("diyabcGUI")$ncore - snp_filter <- NULL - - if(is.null(maf)) maf <- 0 - - snp_filter <- pblapply( - 1:nrow(content), - function(ind) { - out <- indseq_locus_filter( - snp_data = content[ind,], - sex_data = indiv_info$SEX, - locus_type = snp_type[ind], - maf = maf - ) - }, - cl = ncore - ) - - seek_error <- unlist(lapply( - snp_filter, - function(item) "try-error" %in% class(item) - )) - if(any(seek_error)) { - error_msg <- attributes( - snp_filter[[ which(seek_error[1]) ]] - )$condition$message - err <- str_c( - "Issue when checking IndSeq SNP data file", - "content:", - error_msg, - sep = " " - ) - pprint(err) - return(NULL) - } else { - # no error - snp_filter <- Reduce("bind_rows", snp_filter) - - # extract number of filtered loci by type - tmp_filter <- tapply( - snp_filter$filter, - snp_type, - sum - ) - tmp_filter <- data.frame( - filter=tmp_filter, type=names(tmp_filter), - row.names=NULL, stringsAsFactors = FALSE - ) - - # extract number of monomorphic loci by type - tmp_mono <- tapply( - snp_filter$mono, - snp_type, - sum - ) - tmp_mono <- data.frame( - mono=tmp_mono, type=names(tmp_mono), - row.names=NULL, stringsAsFactors = FALSE - ) - - # extract number of loci with issue regarding sex by type - tmp_issue <- tapply( - snp_filter$issue, - snp_type, - sum - ) - tmp_issue <- data.frame( - issue=tmp_issue, type=names(tmp_issue), - row.names=NULL, stringsAsFactors = FALSE - ) - - # merge all results into locos_details table - locus_details <- merge(locus_details, tmp_filter) - locus_details <- merge(locus_details, tmp_mono) - locus_details <- merge(locus_details, tmp_issue) - } - - return(locus_details) -} - -#' Check PoolSeq SNP data file -#' @keywords internal -#' @author Ghislain Durif -#' @param data_file string, data file name. -#' @param data_dir string, path to directory where data file is stored. -#' @param expected_data_file string, expected data file name for -#' existing project, default is NULL. -#' @importFrom tools file_ext -check_poolseq_snp_data_file <- function(data_file, data_dir, - expected_data_file = NULL) { - # output - header <- NULL - info <- NULL - spec <- NULL - valid <- TRUE - - err <- list() - msg <- list() - - locus <- NULL - n_loci <- NULL - n_pop <- NULL - n_indiv <- NULL - mrc <- NULL - - # data path - data_path <- file.path(data_dir, data_file) - - ## file exists and is not empty? - if(!file.exists(data_path)) { - err <- append(err, "Input data file does not exist") - valid <- FALSE - } else if(file.info(data_path)$size == 0) { - err <- append(err, "Input data file is empty") - valid <- FALSE - } else { - ## check file type - if(tools::file_ext(data_path) != "snp") { - err <- append( - err, - "PoolSeq SNP files should have an extension '.snp'." - ) - valid <- FALSE - } else { - ## info - info <- readLines(data_path, n = 1, warn = FALSE) - # sex ratio - pttrn <- "(?<=<)NM=[0-9]+\\.?[0-9]*NF(?=>)" - if(str_detect(info, pttrn)) { - msg <- append( - msg, - str_c("Sex ratio:", str_extract(info, pttrn), sep = " ") - ) - } else { - err <- append( - err, - str_c( - "Issue with PoolSeq SNP file header first line:", - "missing 'sex ratio', see manual", sep = " " - ) - ) - valid <- FALSE - } - # MRC - pttrn <- "(?<=<)MRC=[0-9]+(?=>)" - if(str_detect(info, pttrn)) { - msg <- append( - msg, - str_c( - "Minimum read count:", - str_extract(info, pttrn), sep = " " - ) - ) - pttrn <- "(?<=)" - mrc <- as.integer(str_extract(info, pttrn)) - } else { - err <- append( - err, - str_c( - "Issue with PoolSeq SNP file header first line:", - "missing 'minimum read count (MRC)', see manual", - sep = " " - ) - ) - valid <- FALSE - } - # additional info - pttrn <- str_c( - "()", - "()", - sep = "|" - ) - add_info <- str_trim(str_replace_all(info, pttrn, "")) - if(str_length(add_info) > 0) { - msg <- append( - msg, - str_c("Additional information in data file header:", - add_info, sep = " ") - ) - } - ## header - header <- tryCatch( - as.vector(read.table(file = data_path, skip = 1, nrows = 1)), - error = function(e) e - ) - if("error" %in% class(header)) { - err <- append( - err, - str_c( - "Issue with PoolSeq SNP file header second line:", - header$message, sep = " " - ) - ) - valid <- FALSE - } else { - # header format - if(header[1] != "POOL" & - header[2] != "POP_NAME:HAPLOID_SAMPLE_SIZE" & - !all(str_detect(header[-(1:2)], "POP[0-9]+:[0-9]+"))) { - err <- append( - err, - str_c( - "Issue with PoolSeq SNP file header second line:", - "required format is", - "'POOL POP_NAME:HAPLOID_SAMPLE_SIZE' followed by", - "a character string 'POP:'", - "for each population in the file (see manual).", - sep = " " - ) - ) - valid <- FALSE - } - # number of population - n_pop <- length(header) - 2 - # merge header - header <- str_c(header, collapse = " ") - msg <- append( - msg, - str_c("header:", str_c("'", header, "'"), sep = " ") - ) - ## content - content <- tryCatch( - read.table(file = data_path, skip = 2), - error = function(e) e - ) - if("error" %in% class(content)) { - err <- append( - err, - str_c( - "Issue with PoolSeq SNP data file content:", - content$message, sep = " " - ) - ) - valid <- FALSE - } else { - # check number of population - if(ncol(content) %% 2 != 0) { - err <- append( - err, - str_c( - "Issue with PoolSeq SNP data file content:", - "number of column should be even,", - "providing pair of counts for reference", - "and alternate alleles at each locus", - "in each popultation.", - sep = " " - ) - ) - valid <- FALSE - } - if((ncol(content) / 2) != n_pop) { - err <- append( - err, - str_c( - "Issue with PoolSeq SNP data", - "file second-line header", - "or content:", - "number of population indicated in", - "file second-line header", - "does not correspond to", - "number of columns in file content.", - sep = " " - ) - ) - valid <- FALSE - } - msg <- append( - msg, - str_c("Number of population(s) =", n_pop, sep = " ") - ) - # nb of locus - n_loci <- nrow(content) - msg <- append( - msg, - str_c("Total number of loci =", n_loci, sep = " ") - ) - # check data encoding - check_snp_encoding <- apply(content, 2, is.integer) - if(!all(unlist(check_snp_encoding))) { - err <- append( - err, - str_c( - "Issue with PoolSeq SNP file data content:", - "SNP encoding should be read counts,", - "i.e. positive integer (see manual).", - sep = " " - ) - ) - valid <- FALSE - } - if(any(is.na(content))) { - err <- append( - err, - str_c( - "Issue with PoolSeq SNP file data content:", - "no missing values are allowed.", - sep = " " - ) - ) - valid <- FALSE - } - # loci - if(valid) { - # filtering locus - # TODO check (> or >=) and (& or |) for mrc filter - allele1_count <- apply( - content[,rep(c(TRUE,FALSE), n_pop)], 1, sum - ) - allele2_count <- apply( - content[,rep(c(FALSE,TRUE), n_pop)], 1, sum - ) - snp_filter <- (allele1_count >= mrc) & - (allele2_count >= mrc) - # locus type - locus <- str_c(sum(snp_filter), "", sep = " ") - msg <- append( - msg, - str_c( - "Available loci:", locus, - "(note:", sum(!snp_filter), - "loci are filtered out", - "based on MRC criterion)", - sep = " " - ) - ) - } - } - } - } - # output - spec <- lst(locus, n_indiv, n_loci, n_pop) - } - ## expected data file ? - if(!is.null(expected_data_file)) { - if(data_file != expected_data_file) { - err <- append( - err, - str_c( - "Data file expected by provided 'header.txt'", - "or 'headerRF.txt' file(s)", - "and data file provided by user are different.", - sep = " " - ) - ) - valid <- FALSE - } - } - ## output - out <- lst(header, info, spec, valid, err, msg) - return(out) -} - -#' Check Microsat/Sequence (GenePop) data file -#' @keywords internal -#' @author Ghislain Durif -#' @param data_file string, data file name. -#' @param data_dir string, path to directory where data file is stored. -#' @param expected_data_file string, expected data file name for -#' existing project, default is NULL. -#' @importFrom tools file_ext -check_mss_data_file <- function(data_file, data_dir, - expected_data_file = NULL) { - # output - header <- NULL - info <- NULL - spec <- NULL - valid <- TRUE - - err <- list() - msg <- list() - - # data path - data_path <- file.path(data_dir, data_file) - - ## file exists and is not empty? - if(!file.exists(data_path)) { - err <- append(err, "Input data file does not exist") - valid <- FALSE - } else if(file.info(data_path)$size == 0) { - err <- append(err, "Input data file is empty") - valid <- FALSE - } else { - ## init output - locus <- NULL - locus_mode <- NULL - locus_name <- NULL - seq_length <- NULL - n_loci <- NULL - n_pop <- NULL - n_indiv <- NULL - pop_size <- NULL - ## check file type - if(tools::file_ext(data_path) != "mss") { - err <- append( - err, - "Microsat/Sequence files should have an extension '.mss'" - ) - valid <- FALSE - } else { - ## info - info <- readLines(data_path, n = 1, warn = FALSE) - # sex ratio - pttrn <- "(?<=<)NM=[0-9]+\\.?[0-9]*NF(?=>)" - if(str_detect(info, pttrn)) { - msg <- append( - msg, - str_c("Sex ratio:", str_extract(info, pttrn), sep = " ") - ) - } else { - err <- append( - err, - str_c( - "Issue with Microsat/Sequence file header first line:", - "missing 'sex ratio', see manual", sep = " " - ) - ) - valid <- FALSE - } - # additional info - pttrn <- "" - add_info <- str_trim(str_replace_all(info, pttrn, "")) - if(str_length(add_info) > 0) { - msg <- append( - msg, - str_c("Additional information:", add_info, sep = " ") - ) - } - - ## file content - file_content <- readLines(data_path, warn = FALSE) - - ## locus description - pttrn <- "(?<=<)(A|H|X|Y|M)(?=>)" - locus_pttrn_match <- str_extract(file_content, pttrn) - # check if present - if(all(is.na(locus_pttrn_match))) { - err <- append( - err, - str_c( - "Issue with Microsat/Sequence file locus description:", - "missing locus description, see manual", sep = " " - ) - ) - valid <- FALSE - } else { - # check if description follows title line - locus_match_ind <- which(!is.na(locus_pttrn_match)) - if(!all(locus_match_ind == 2:tail(locus_match_ind, 1))) { - err <- append( - err, - str_c( - "Issue with Microsat/Sequence file locus", - "description:", - "locus description should be located", - "at beginning of data file, after title line,", - "with a single locus per line, see manual", - sep = " " - ) - ) - valid <- FALSE - } - - # check locus format - # pttrn <- "^(Locus )?[A-Za-z0-9_-]+ <(A|H|X|Y|M)>$" - pttrn <- "^[A-Za-z0-9\\s_\\-]+ <(A|H|X|Y|M)>$" - locus_spec_match_ind <- which(str_detect(file_content, pttrn)) - if(!all(locus_spec_match_ind %in% locus_match_ind) & - !all(locus_match_ind %in% locus_spec_match_ind)) { - err <- append( - err, - str_c( - "Issue with Microsat/Sequence file locus", - "description format at rows:", - str_c( - locus_spec_match_ind[!locus_spec_match_ind %in% locus_match_ind], - collapse = ", " - ), - sep = " " - ), - str_c( - "You can use the following character to specify", - "locus names:", - "'A-Z', 'a-z', '0-9', '_', '-' and ' '", - sep = " " - ) - ) - valid <- FALSE - } - - ## number of locus - n_loci <- length(locus_match_ind) - msg <- append( - msg, - str_c("Number of loci =", n_loci, sep = " ") - ) - - ## locus info - pttrn <- "(?<=<)(A|H|X|Y|M)(?=>$)" - locus <- str_extract(file_content[locus_match_ind], pttrn) - - ## locus name - pttrn <- "^[A-Za-z0-9\\s_\\-]+(?= <)" - locus_name <- str_extract(file_content[locus_match_ind], pttrn) - locus_name <- str_replace_all(locus_name, " +", "_") - - if(length(unique(locus_name)) != length(locus_name)) { - err <- append( - err, - str_c( - "Issue with Microsat/Sequence file locus", - "description:", - "each locus should have a unique name", - sep = " " - ) - ) - valid <- FALSE - } - } - - ## remove unnecessary space - file_content <- str_replace_all( - str_trim(file_content), - " +", " " - ) - - ## population - pttrn <- "^(?i)pop(?-i)$" - pop_match_ind <- which(str_detect(file_content, pttrn)) - # check - if(length(pop_match_ind) == 0) { - err <- append( - err, - str_c( - "Issue with Microsat/Sequence file locus", - "data description:", - "keyword 'POP' is missing, see manual", - sep = " " - ) - ) - valid <- FALSE - } else { - # number of pop - n_pop <- length(pop_match_ind) - # pop size - pop_size <- diff(c(pop_match_ind, length(file_content))) - - c(rep(1, n_pop - 1), 0) - # number of individuals - n_indiv <- sum(pop_size) - # msg - msg <- append( - msg, - str_c( - "Sample:", - n_indiv, "individuals", - "from", n_pop, "populations", - sep = " ") - ) - - ## DATA content - data_ind <- head(pop_match_ind, 1):length(file_content) - data_ind <- data_ind[!data_ind %in% pop_match_ind] - data_content <- file_content[data_ind] - - # write data content to a temporary file - data_content <- str_replace_all( - str_replace_all(data_content, ",", " "), - " +", ";" - ) - tmpFile <- file.path(data_dir, "tmp_data_file.csv") - tmp <- writeLines(data_content, tmpFile) - on.exit({ - if(file.exists(tmpFile)) - fs::file_delete(tmpFile) - }) - - # read data as data.frame - data_content <- tryCatch( - read.table(tmpFile, sep = ";", stringsAsFactors = FALSE, - colClasses = "character"), - error = function(e) e - ) - if("error" %in% class(data_content)) { - err <- append( - err, - str_c( - "Issue with Microsat/Sequences data file content:", - content$message, sep = " " - ) - ) - valid <- FALSE - } else { - ##### check data - ## check number of loci - if(ncol(data_content) != n_loci + 1) { - err <- append( - err, - str_c( - "Issue with Microsat/Sequences data file", - "content:", - "number of declared loci at beginning of file", - "does not correspond to number of actual loci", - "in the data", sep = " " - ) - ) - valid <- FALSE - } - - # format data.frame - colnames(data_content) <- c("indiv", - str_c("locus", 1:n_loci)) - data_content$pop <- as.character(rep(1:n_pop, pop_size)) - - ## indiv name first column - if(!all(str_detect(data_content$indiv, - "[A-Za-z0-9_-]+"))) { - err <- append( - err, - str_c( - "Issue with Microsat/Sequences data file", - "content:", - "First column should provide individual names,", - "you can use the following character to specify", - "such names:", - "'A-Z', 'a-z', '0-9', '_', '-'", - sep = " " - ) - ) - valid <- FALSE - } - - ## check locus encoding - microsat_hap_encoding <- "^[0-9]{3}$" - microsat_dip_encoding <- "^[0-9]{6}$" - microsat_x_encoding <- "^[0-9]{3}|[0-9]{6}$" - nucleotid_encoding <- "[ATCG]*" - seq_hap_encoding <- str_c("^<\\[", - nucleotid_encoding, - "\\]>") - seq_dip_encoding <- str_c("^<\\[", - nucleotid_encoding, - "\\]\\[", - nucleotid_encoding, - "\\]>") - seq_x_encoding <- str_c("^<\\[", - nucleotid_encoding, - "\\](\\[", - nucleotid_encoding, - "\\])?>") - - # locus data - locus_data <- data_content[,2:(ncol(data_content) - 1)] - - if(ncol(data_content) == 3) { - locus_data <- data.frame(locus1 = locus_data) - } - - # microsat locus - microsat_hap_locus <- which( - apply( - locus_data, 2, function(loc) { - return(all(str_detect(loc, - microsat_hap_encoding))) - } - ) & (! locus == "X") - ) - microsat_dip_locus <- which( - apply( - locus_data, 2, function(loc) { - return(all(str_detect(loc, - microsat_dip_encoding))) - } - ) & (! locus == "X") - ) - microsat_x_locus <- which( - apply( - locus_data, 2, function(loc) { - return(all(str_detect(loc, - microsat_x_encoding))) - } - ) & (locus == "X") - ) - - # seq locus - seq_hap_locus <- which( - apply( - locus_data, 2, function(loc) { - return(all(str_detect(loc, - seq_hap_encoding))) - } - ) & (! locus == "X") - ) - seq_dip_locus <- which( - apply( - locus_data, 2, function(loc) { - return(all(str_detect(loc, - seq_dip_encoding))) - } - ) & (! locus == "X") - ) - seq_x_locus <- which( - apply( - locus_data, 2, function(loc) { - return(all(str_detect(loc, - seq_x_encoding))) - } - ) & (locus == "X") - ) - - # check if issue with formating - if(!all(sort(c(microsat_hap_locus, microsat_dip_locus, - microsat_x_locus, - seq_hap_locus, seq_dip_locus, - seq_x_locus)) == - 1:n_loci)) { - err <- append( - err, - str_c( - "Issue with Microsat/Sequences data file", - "content in columns:", - str_c( - (1:n_loci)[!(1:n_loci) %in% - c(microsat_hap_locus, - microsat_dip_locus, - microsat_x_locus, - seq_hap_locus, - seq_dip_locus, - seq_x_locus)], - collapse = ", " - ), - sep = " " - ) - ) - valid <- FALSE - } else { - ## locus mode (microsat or sequence, hap or dip) - tmp_locus_mode <- data.frame( - mode = c(rep("microsat_hap", - length(microsat_hap_locus)), - rep("microsat_dip", - length(microsat_dip_locus)), - rep("microsat_xy", - length(microsat_x_locus)), - rep("seq_hap", - length(seq_hap_locus)), - rep("seq_dip", - length(seq_dip_locus)), - rep("seq_xy", - length(seq_x_locus))), - index = c(microsat_hap_locus, microsat_dip_locus, - microsat_x_locus, - seq_hap_locus, seq_dip_locus, - seq_x_locus) - ) - locus_mode <- - tmp_locus_mode$mode[order(tmp_locus_mode$index)] - - ## check that A are diploid locus - if(!all(which(locus == "A") %in% - c(microsat_dip_locus, seq_dip_locus))) { - err <- append( - err, - str_c( - "Issue with Microsat/Sequences data file", - "content:", - "Autosomal diploid (A)", - "should all be diploid loci.", - sep = " " - ) - ) - valid <- FALSE - } - - ## check that H, M and Y are haploid locus - if(!all(which(locus %in% c("H", "M", "Y")) %in% - c(microsat_hap_locus, seq_hap_locus))) { - err <- append( - err, - str_c( - "Issue with Microsat/Sequences data file", - "content:", - "Autosomal haploid (H), Y-linked (Y)", - "and Mitochondrial (M) loci", - "should all be haploid loci.", - sep = " " - ) - ) - valid <- FALSE - } - } - - ## check seq locus length - seq_length <- unlist(lapply( - 1:n_loci, - function(col_ind) { - if(col_ind %in% c(seq_hap_locus, seq_dip_locus, - seq_x_locus)) { - - tmp <- unlist(lapply( - 1:n_indiv, - function(row_ind) { - return( - str_length(str_extract_all( - locus_data[row_ind,col_ind], - "\\[[ATCG]*\\]", - simplify = TRUE - )) - 2 - ) - } - )) - - out <- max(tmp) - if(!all(unique(tmp) %in% c(0, out))) { - return(-100) - } else { - return(out) - } - } else { - return(NA) - } - } - )) - - if(any(seq_length[!is.na(seq_length)] == -100)) { - err <- append( - err, - str_c( - "Issue with Microsat/Sequences data file", - "content:", - "non-missing sequence data attached to", - "following loci", - str_c(which(seq_length == -100), - collapse = ", "), - "do not have the same length in all", - "individuals", - sep = " " - ) - ) - valid <- FALSE - } - } - } - } - - # output - spec <- lst(locus, locus_mode, locus_name, seq_length, - n_indiv, n_loci, n_pop, pop_size) - } - ## expected data file ? - if(!is.null(expected_data_file)) { - if(data_file != expected_data_file) { - err <- append( - err, - str_c( - "Data file expected by provided 'header.txt'", - "or 'headerRF.txt' file(s)", - "and data file provided by user are different.", - sep = " " - ) - ) - valid <- FALSE - } - } - ## output - out <- lst(header, info, spec, valid, err, msg) - return(out) -} - -#' Parse diyabc header file -#' @keywords internal -#' @author Ghislain Durif -#' @description -#' Content: see doc -#' @param file_name string, (server-side) path to a header file. -#' @param file_type string, MIME file type. -#' @param locus_type string, `"mss"` for MicroSat/Sequence or `"snp"` for SNP. -parse_diyabc_header <- function(file_name, file_type, locus_type) { - # init output - data_file <- NULL - issues <- list() - loci_description <- NULL - n_loci_des <- NULL - n_param <- NULL - n_prior <- NULL - n_sumstat <- NULL - raw_cond_list <- NULL - raw_prior_list <- NULL - raw_group_prior_list <- NULL - raw_scenario_list <- NULL - simu_mode <- NULL - valid <- TRUE - # check file name - valid <- check_file_name(file_name) - # check file type - if(file_type != "text/plain") - valid <- FALSE - ## read file - if(valid) { - # local variable - next_sec_line <- 3 - # file content - raw_content <- readLines(file_name, warn = FALSE) - raw_content <- raw_content[raw_content != ""] - ## data file - data_file <- raw_content[1] - ## number of paramters and statistics - strng <- raw_content[2] - pttrn <- "^[0-9]+ parameters and [0-9]+ summary statistics$" - if(!str_detect(strng, pttrn)) { - issues <- append(issues, pttrn) - valid <- FALSE - } else { - pttrn <- "[0-9]+(?= parameters)" - n_param <- as.integer(str_extract(strng, pttrn)) - pttrn <- "[0-9]+(?= summary statistics)" - n_sumstat <- as.integer(str_extract(strng, pttrn)) - } - - ## scenarios - pttrn <- "^[0-9]+ scenarios:( [0-9]+)+ ?$" - # find section - if(!any(str_detect(raw_content, pttrn))) { - issues <- append(issues, pttrn) - valid <- FALSE - } else if(!(which(str_detect(raw_content, pttrn)) == next_sec_line)) { - issues <- append(issues, pttrn) - valid <- FALSE - } else { - strng <- raw_content[3] - pttrn <- "[0-9]+(?= scenarios:)" - n_scenario <- as.integer(str_extract(strng, pttrn)) - pttrn <- "(?<= )[0-9]+" - lines_per_scenario <- as.integer(unlist(str_extract_all(strng, - pttrn))) - # extract scenarii - line_seq <- cumsum(c(1, lines_per_scenario+1)) - scenario_list <- lapply( - split( - raw_content[(min(line_seq):(max(line_seq)-1)) - + next_sec_line], - rep(seq(line_seq), diff(c(line_seq, max(line_seq)))) - ), - function(content) { - parse_diyabc_header_scenarii(content) - }) - # check extracted scnenarii - valid <- all(unlist(lapply(scenario_list, - function(item) item$valid))) - # extract raw scenario list - raw_scenario_list <- unlist(unname( - lapply( - scenario_list, - function(item) item$raw_scenario - ) - )) - # next section - next_sec_line <- next_sec_line + max(line_seq) - } - - ## historical parameters - pttrn <- "^historical parameters priors \\([0-9]+,[0-9]+\\)$" - # find section - if(!any(str_detect(raw_content, pttrn))) { - issues <- append(issues, pttrn) - valid <- FALSE - } else if(!(which(str_detect(raw_content, pttrn)) == next_sec_line)) { - issues <- append(issues, pttrn) - valid <- FALSE - } else { - strng <- raw_content[next_sec_line] - next_sec_line <- next_sec_line + 1 - # number of priors - pttrn <- "(?<= \\()[0-9]+" - n_prior <- as.integer(str_extract(strng, pttrn)) - # number of conditions - pttrn <- "[0-9]+(?=\\)$)" - n_cond <- as.integer(str_extract(strng, pttrn)) - # extract priors - raw_prior_list <- raw_content[next_sec_line:(next_sec_line - + n_prior - 1)] - next_sec_line <- next_sec_line + n_prior - # check extracted priors - valid <- all(unlist(lapply(raw_prior_list, check_header_prior))) - # extract conditions - if(n_cond > 0) { - raw_cond_list <- raw_content[next_sec_line:(next_sec_line - + n_cond - 1)] - next_sec_line <- next_sec_line + n_cond - # check extracted conditions - valid <- all(unlist(lapply(raw_cond_list, check_header_cond))) - - # generation mode - simu_mode <- raw_content[next_sec_line] - next_sec_line <- next_sec_line + 1 - if(simu_mode != "DRAW UNTIL") { - issues <- append(issues, "missing 'DRAW UNTIL'") - valid <- FALSE - } - } else { - if(str_detect(raw_content[next_sec_line], "DRAW UNTIL")) { - issues <- append(issues, "unnecessary 'DRAW UNTIL'") - valid <- FALSE - } - } - } - - ## loci description - pttrn <- "^loci description \\([0-9]+\\)$" - # find section - if(!any(str_detect(raw_content, pttrn))) { - issues <- append(issues, pttrn) - valid <- FALSE - } else if(!(which(str_detect(raw_content, pttrn)) == next_sec_line)) { - issues <- append(issues, pttrn) - valid <- FALSE - } else { - strng <- raw_content[next_sec_line] - next_sec_line <- next_sec_line + 1 - # number of loci description - pttrn <- "(?<=^loci description \\()[0-9]+" - n_loci_des <- as.integer(str_extract(strng, pttrn)) - # extract loci description - loci_description <- raw_content[next_sec_line:(next_sec_line - + n_loci_des - 1)] - next_sec_line <- next_sec_line + n_loci_des - # check loci description - valid <- all(unlist(lapply(loci_description, - check_header_loci_des))) - } - - ## group prior (for microsat/sequence) - if(locus_type == "mss") { - pttrn <- "^group priors \\([0-9]+\\)$" - # find section - if(!any(str_detect(raw_content, pttrn))) { - issues <- append(issues, pttrn) - valid <- FALSE - } else if(!(which(str_detect(raw_content, pttrn)) == next_sec_line)) { - issues <- append(issues, pttrn) - valid <- FALSE - } else { - strng <- raw_content[next_sec_line] - next_sec_line <- next_sec_line + 1 - # check next section - pttrn <- "^group summary statistics \\([0-9]+\\)$" - tmp_next <- head(which(str_detect(raw_content, pttrn)), 1) - if(length(tmp_next) == 0) { - issues <- append(issues, pttrn) - valid <- FALSE - } else if(tmp_next <= next_sec_line) { - issues <- append(issues, pttrn) - valid <- FALSE - } else { - # extract info - raw_group_prior_list <- raw_content[next_sec_line:(tmp_next-1)] - - # next section - next_sec_line <- tmp_next - } - } - } - - ## group summary statistics - pttrn <- "^group summary statistics \\([0-9]+\\)$" - # find section - if(!any(str_detect(raw_content, pttrn))) { - issues <- append(issues, pttrn) - valid <- FALSE - } else if(!(head(which(str_detect(raw_content, pttrn)), 1) - == next_sec_line)) { - issues <- append(issues, pttrn) - valid <- FALSE - } else { - strng <- raw_content[next_sec_line:length(raw_content)] - next_sec_line <- next_sec_line + 1 - # number of summary stats - pttrn <- "(?<=^group summary statistics \\()[0-9]+" - tmp_n_sumstat <- sum(as.integer(str_extract(strng, pttrn)), - na.rm = TRUE) - # check - valid <- (n_sumstat == tmp_n_sumstat) - # TODO: parse end of file - } - } - ## output - return(lst(data_file, loci_description, - n_loci_des, n_param, n_prior, n_sumstat, - raw_cond_list, raw_prior_list, raw_group_prior_list, - raw_scenario_list, simu_mode, valid)) -} - -#' Parse diyabc header file scenarii -#' @keywords internal -#' @author Ghislain Durif -#' @description -#' Content: see doc -#' @param content string vector, scenario description -parse_diyabc_header_scenarii <- function(content) { - # init output - issues <- list() - id <- NULL - n_param <- NULL - prior <- NULL - raw_scenario <- NULL - valid <- TRUE - # first line - strng <- content[1] - pttrn <- str_c("^scenario [0-9]+ \\[", num_regex(), "\\] \\([0-9]+\\)$") - if(!str_detect(strng, pttrn)) { - issues <- append(issues, pttrn) - valid <- FALSE - } else { - # scenario id - pttrn <- "(?<=^scenario )[0-9]+" - id <- as.integer(str_extract(strng, pttrn)) - # scenario prior - pttrn <- str_c("(?<= \\[)", num_regex(), "(?=\\] )") - prior <- as.numeric(str_extract(strng, pttrn)) - # number of parameters in scenario - pttrn <- "(?<= \\()[0-9]+(?=\\)$)" - n_param <- as.integer(str_extract(strng, pttrn)) - ## raw scenario - raw_scenario <- str_c(content[-1], collapse = "\n") - } - ## output - return(lst(id, n_param, prior, raw_scenario, valid)) -} - -#' Check header file prior definition -#' @keywords internal -#' @author Ghislain Durif -#' @description -#' Content: see doc -#' @param cstrng string, prior description. -check_header_prior <- function(strng) { - # init output - issues <- list() - valid <- TRUE - # check - pttrn <- str_c(single_param_regex(), " ", - "(N|T|A)", " ", - "(UN|LU|NO|LN)", "\\[", - str_c(rep(num_regex(), 4), collapse = ","), - "\\]") - valid <- str_detect(strng, pttrn) - ## output - return(valid) -} - -#' Check header file condition definition -#' @keywords internal -#' @author Ghislain Durif -#' @description -#' Content: see doc -#' @param strng string, prior description. -check_header_cond <- function(strng) { - # init output - issues <- list() - valid <- TRUE - # check - pttrn <- str_c("^", single_param_regex(), "(<|=<|>|>=)", - single_param_regex(), "$") - valid <- str_detect(strng, pttrn) - ## output - return(valid) -} - -#' Check header file loci description -#' @keywords internal -#' @author Ghislain Durif -#' @description -#' Content: see doc -#' @param strng string, prior description. -#' @param type string, `"mss"` or `"snp"` -check_header_loci_des <- function(strng, type = "mss") { - # init output - issues <- list() - valid <- TRUE - # init local - pttrn <- NULL - # Microsat/Sequence - if(type == "mss") { - # Locus_M_A_1_ [M] G1 2 40 - # Locus_S_A_21_ [S] G4 1000 - pttrn <- str_c("^", single_param_regex(), " ", - "<(A|H|X|Y|M)>", " ", - "\\[(M|S)\\]", " ", - "G[0-9]+", " ", - "[0-9]+", "( [0-9]+)?", "$") - } else if(type == "snp") { - # 5000 G1 from 1 - pttrn <- str_c("^", "[0-9]+", " ", - "<(A|H|X|Y|M)>", " ", - "G[0-9]+", " ", - "from ", "[0-9]+", "$") - } - # check - valid <- str_detect(strng, pttrn) - ## output - return(valid) -} - #' Parse diyabc simulation header file #' @keywords internal #' @author Ghislain Durif @@ -1727,6 +5,4 @@ check_header_loci_des <- function(strng, type = "mss") { #' Content: see doc #' @param file_name string, (server-side) path to a headersim file. #' @param type string, MIME file type. -parse_diyabc_headersim <- function(file_name, type) { - -} \ No newline at end of file +parse_diyabc_headersim <- function(file_name, type) {} diff --git a/R-pkg/R/42_input_check.R b/R-pkg/R/42_input_check.R new file mode 100644 index 00000000..0c1f024a --- /dev/null +++ b/R-pkg/R/42_input_check.R @@ -0,0 +1,153 @@ +#' Check file_name +#' @keywords internal +#' @author Ghislain Durif +check_file_name <- function(file_name) { + valid <- TRUE + if((length(file_name) != 1) || !is.character(file_name) || + !file.exists(file_name)) valid <- FALSE + return(valid) +} + +#' Check header file prior definition +#' @keywords internal +#' @author Ghislain Durif +#' @description +#' Content: see doc +#' @param cstrng string, prior description. +check_header_prior <- function(strng) { + # check + pttrn <- str_c(single_param_regex(), " ", + "(N|T|A)", " ", + "(UN|LU|NO|LN)", "\\[", + str_c(rep(num_regex(), 4), collapse = ","), + "\\]") + valid <- str_detect(strng, pttrn) + ## output + return(valid) +} + +#' Check header file condition definition +#' @keywords internal +#' @author Ghislain Durif +#' @description +#' Content: see doc +#' @param strng string, prior description. +check_header_cond <- function(strng) { + # check + pttrn <- str_c("^", single_param_regex(), "(<|=<|>|>=)", + single_param_regex(), "$") + valid <- str_detect(strng, pttrn) + ## output + return(valid) +} + +#' Check header file loci description +#' @keywords internal +#' @author Ghislain Durif +#' @description +#' Content: see doc +#' @param strng string, prior description. +#' @param type string, `"mss"` or `"snp"` +check_header_locus_desc <- function(strng, type = "mss") { + # init output + valid <- FALSE + # Microsat/Sequence + if(type == "mss") { + # Locus_M_A_1_ [M] G1 2 40 + pttrn1 <- str_c("^", single_param_regex(), " ", + "<(A|H|X|Y|M)>", " ", + "\\[M\\]", " ", + "G[0-9]+", " ", + "[0-9]+", " ", "[0-9]+", "$") + # Locus_S_A_21_ [S] G4 1000 + pttrn2 <- str_c("^", single_param_regex(), " ", + "<(A|H|X|Y|M)>", " ", + "\\[S\\]", " ", + "G[0-9]+", " ", + "[0-9]+", "$") + + # check + valid <- str_detect(strng, pttrn1) || str_detect(strng, pttrn2) + } else if(type == "snp") { + # 5000 G1 from 1 + pttrn <- str_c("^", "[0-9]+", " ", + "<(A|H|X|Y|M)>", " ", + "G[0-9]+", " ", + "from ", "[0-9]+", "$") + + # check + valid <- str_detect(strng, pttrn) + } + ## output + return(valid) +} + +#' Check header file group prior definition +#' @keywords internal +#' @author Ghislain Durif +#' @description +#' Content: see doc +#' @param content string vector of group prior descriptions. +#' @param type string, locus type `"M"` or `"S"`. +check_header_group_prior <- function(content, type = "M") { + + prior_regex_list <- NULL + + prior_param_regex4 <- str_c(rep(numexp_regex(), 4), collapse = ",") + prior_param_regex2 <- str_c(rep(numexp_regex(), 2), collapse = ",") + + # vector of regex + if(type == "M") { + prior_regex_list <- c( + str_c("^MEANMU (UN|LU|GA)\\[", prior_param_regex4, "\\]$"), + str_c( + "^GAMMU GA\\[", prior_param_regex2, ",Mean_u,", + numexp_regex(), "\\]$" + ), + str_c("^MEANP (UN|LU|GA)\\[", prior_param_regex4, "\\]$"), + str_c( + "^GAMP GA\\[", prior_param_regex2, ",Mean_P,", + numexp_regex(), "\\]$" + ), + str_c("^MEANSNI (UN|LU|GA)\\[", prior_param_regex4, "\\]$"), + str_c( + "^GAMSNI GA\\[", prior_param_regex2, ",Mean_u_SNI,", + numexp_regex(), "\\]$" + ) + ) + } else if(type == "S") { + prior_regex_list <- c( + str_c("^MEANMU (UN|LU|GA)\\[", prior_param_regex4, "\\]$"), + str_c( + "^GAMMU GA\\[", prior_param_regex2, ",Mean_u,", + numexp_regex(), "\\]$" + ), + str_c("^MEANK1 (UN|LU|GA)\\[", prior_param_regex4, "\\]$"), + str_c( + "^GAMK1 GA\\[", prior_param_regex2, ",Mean_k1,", + numexp_regex(), "\\]$" + ), + str_c("^MEANK2 (UN|LU|GA)\\[", prior_param_regex4, "\\]$"), + str_c( + "^GAMK2 GA\\[", prior_param_regex2, ",Mean_k2,", + numexp_regex(), "\\]$" + ), + str_c("^MODEL (TN|HKY|JK|K2P) ", int_regex(), " ", numexp_regex()) + ) + } + + # check number of group prior + if(length(content) != length(prior_regex_list)) { + return(FALSE) + } + + # check all pattern + check_pttrn <- unlist(lapply( + 1:length(content), + function(ind) return(str_detect(content[ind], prior_regex_list[ind])) + )) + valid <- all(check_pttrn) + + ## output + return(valid) +} diff --git a/R-pkg/R/43_data_read.R b/R-pkg/R/43_data_read.R new file mode 100644 index 00000000..1e970e01 --- /dev/null +++ b/R-pkg/R/43_data_read.R @@ -0,0 +1,1308 @@ +#' Read and parse IndSeq SNP data file +#' @keywords internal +#' @author Ghislain Durif +#' @param data_file string, data file name. +#' @param data_dir string, path to directory where data file is stored. +#' @importFrom tools file_ext +#' @importFrom parallel makeCluster stopCluster +#' @importFrom pbapply pblapply +read_indseq_snp_data <- function(data_file, data_dir) { + + # init output + out <- list( + msg = list(), valid = TRUE, + data_file = NULL, n_loci = NULL, locus_count = NULL, + n_pop = NULL, n_indiv = NULL, + sex_ratio = NULL, maf = NULL + ) + + # full path + file_name <- file.path(data_dir, data_file) + + # check file_name + tmp <- check_file_name(file_name) + if(!tmp) { + out$valid <- FALSE + msg <- tagList("Invalid data file name.") + out$msg <- append(out$msg, list(msg)) + } + + # check file content + if(file.info(file_name)$size == 0) { + out$valid <- FALSE + msg <- tagList("Data file is empty.") + out$msg <- append(out$msg, list(msg)) + } + + # check file_type + if(tools::file_ext(file_name) != "snp") { + out$valid <- FALSE + msg <- tagList( + "IndSeq SNP files should have an extension", + tags$code(".snp"), "." + ) + out$msg <- append(out$msg, list(msg)) + } + + # continue ? + if(!out$valid) { + return(out) + } + + # data file name + out$data_file <- data_file + + ## DATA FILE CONTENT + + ## HEADER 1 + header1 <- readLines(file_name, n = 1, warn = FALSE) + + # sex ratio + pttrn <- "(?i)NM=[0-9]+\\.?[0-9]*NF(?-i)" + if(!str_detect(header1, pttrn)) { + out$valid <- FALSE + msg <- tagList( + "Missing", tags$b("sex ratio"), "in first line header." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + out$sex_ratio <- str_extract(header1, pttrn) + + # MAF + pttrn <- "(?i)(?<=MAF=)([0-9]+\\.?[0-9]*|hudson)(?-i)" + if(!str_detect(header1, pttrn)) { + out$valid <- FALSE + msg <- tagList( + "Missing", tags$b("Minimum Allele Frequency"), + "(MAF) in first line header.", + "MAF should be a real number between 0 and 1", + "or the keyword", tags$code("hudson"), + ", see manual." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + out$maf <- str_extract(header1, pttrn) + + if(str_detect(out$maf, "^[0-9]+\\.?[0-9]*$")) { + out$maf <- as.numeric(out$maf) + if(out$maf < 0 || out$maf > 1) { + out$valid <- FALSE + msg <- tagList( + tags$b("Minimum Allele Frequency"), + "(MAF) should be a real number between 0 and 1", + "or the keyword", tags$code("hudson"), + ", see manual." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + } + + ## HEADER 2 + header2 <- tryCatch( + unname(unlist(read.table(file = file_name, skip = 1, nrows = 1))), + error = function(e) return(e) + ) + if("error" %in% class(header2)) { + out$valid <- FALSE + msg <- tagList( + "Formatting issue with second line header, ", + "impossible to read it, see manual." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # upper case + header2 <- str_to_upper(header2) + + # header 2 content + if(header2 != "IND" && header2[2] != "SEX" && header2[3] != "POP" && + !all(header2[-(1:3)] %in% c("A", "H", "X", "Y", "M"))) { + out$valid <- FALSE + msg <- tagList( + "Formatting issue with header second line.", + "Required column titles are", tags$code("IND"), + tags$code("SEX"), tags$code("POP"), + "followed by a letter indicator among", + tags$code("A"), tags$code("H"), tags$code("X"), + tags$code("Y"), tags$code("M"), + "for each SNP locus in the file (see manual)." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # nb of locus + out$n_loci <- length(header2) - 3 + + # locus type + candidate_locus <- c("A", "H", "X", "Y", "M") + locus_encoding <- str_c(header2[-(1:3)], collapse = " ") + locus_count <- Reduce("rbind", lapply( + candidate_locus, + function(pttrn) { + count <- str_count(locus_encoding, pttrn) + return(data.frame( + count = count, + type = pttrn, + stringsAsFactors = FALSE + )) + } + )) + + # save snp type for filtering + snp_type <- header2[-(1:3)] + + ## DATA FILE CONTENT + content <- tryCatch( + read.table(file_name, skip = 2), error = function(e) return(e) + ) + if("error" %in% class(content)) { + out$valid <- FALSE + msg <- tagList( + "Formatting issue with data, ", + "impossible to read the file, see manual." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # check for SEX column with only F (interpreted as FALSE) + if(is.logical(content[,2])) { + content[,2] <- ifelse(content[,2], "T", "F") + } + + # check number of locus + if(out$n_loci != (ncol(content) - 3)) { + out$valid <- FALSE + msg <- tagList( + "Number of loci not consistent between", + "file header and file content." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # check sex content + if(!all(str_to_upper(as.character(content[,2])) %in% + c("F", "M", "9"))) { + out$valid <- FALSE + msg <- tagList( + tags$code("SEX"), "column should only contain", + tags$code("F"), "for female,", + tags$code("M"), "for male or", + tags$code("9"), "for missing values (see manual)." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # number of pop + out$n_pop <- length(unique(content[,3])) + + # number of individuals + out$n_indiv <- nrow(content) + + ## REFORMAT DATA (to speed up the checks) + + # individual information + indiv_info <- content[,1:3] + colnames(indiv_info) <- c("IND", "SEX", "POP") + + # check for any missing values + if(any(is.na(indiv_info))) { + out$valid <- FALSE + msg <- tagList( + tags$code("NA"), "values were found", + "in one (or more) of the columns", + tags$code("IND"), tags$code("SEX"), "or", tags$code("POP"), "." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # SNP info + content <- t(as.matrix(content[,-(1:3)])) + + # check for any missing values + if(any(is.na(content))) { + out$valid <- FALSE + msg <- tagList( + tags$code("NA"), "values were found", + "in one (or more) of the columns encoding the SNPs." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # check data type + if(!is.integer(content)) { + out$valid <- FALSE + msg <- tagList( + "SNP encoding should be only contain integer values." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # check data encoding + if(!all(content %in% c(0,1,2,9))) { + out$valid <- FALSE + msg <- tagList( + "SNP encoding should only be", + tags$code("0"), tags$code("1"), tags$code("2"), + "or", tags$code("9"), "for missing data (see manual)." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## LOCUS FILTERING + # run + snp_check <- check_snp_indseq( + content, indiv_info, snp_type, locus_count, out$maf + ) + # check + if(!snp_check$valid) { + out$valid <- FALSE + out$msg <- append(out$msg, snp_check$msg) + return(out) + } + # results + out$locus_count <- snp_check$locus_count + + ## output + return(out) +} + +#' Processing individual locus specific in IndSeq SNP data file based on MAF +#' @keywords internal +#' @author Ghislain Durif +#' @param snp_data integer vector encoding for each individual the +#' number of ancestral allele for the loci, i.e. `0`, `1` and `2` for +#' autosome (`A`) and X-chromosome (`X`) in female, or `0` and `1` for +#' haploid locus (`H`), Y-chromosome (`Y`) in male and +#' mitochondrial locus (`M`). +#' Note : missing values are encoded by a `9`. +#' @param sex_data character vector encoding for each individual the sex, i.e. +#' `"F"` for female and `"M"` for male. +#' Note : missing values are encoded by a `"9"`. +#' @param pop_data character vector encoding each individual population. +#' @param snp_type character encoding the type of the locus +#' (among `A`, `H`, `X`, `Y`, `M`). +#' @param maf double between 0 and 1 +process_indseq_locus <- function(snp_data, sex_data, pop_data, snp_type, maf) { + + # init local + local <- list( + valid = TRUE, filter = FALSE, mono = FALSE, + missing_pop = NA, issue_X = NA, issue_Y = NA, af = 0, + hudson = FALSE + ) + + ## identify excessive missing data for a single pop + is_missing_pop <- tapply( + snp_data, as.factor(pop_data), function(x) return(x == 9) + ) + check_missing_pop <- unlist(lapply( + 1:length(is_missing_pop), + function(ind) + return(sum(is_missing_pop[[ind]]) == length(is_missing_pop[[ind]])) + )) + if(any(check_missing_pop)) { + local$valid <- FALSE + local$missing_pop <- str_c( + names(is_missing_pop)[check_missing_pop], collapse = "," + ) + } + + ## identify missing data and compute data size without missing data + non_missing_data <- (snp_data != 9) + true_data_size <- sum(non_missing_data) + ## identify female, male, and missing sex + female_ind <- (sex_data == "F") + male_ind <- (sex_data == "M") + missing_sex <- (sex_data == "9") + + if(snp_type == "A") { + ## autosome + if(true_data_size > 0) { + # reference allele frequence + local$af <- sum(snp_data[non_missing_data]) / (2 * true_data_size) + } + } else if(snp_type %in% c("H", "M")) { + ## haploid & mitochondrial + if(true_data_size > 0) { + # reference allele frequence + local$af <- sum(snp_data[non_missing_data]) / true_data_size + } + } else if(snp_type == "X") { + ## X-chromosome + check_X <- (missing_sex & non_missing_data) | + (male_ind & (snp_data == 2)) + if(any(check_X)) { + local$valid <- FALSE + local$issue_X <- str_c(which(check_X), collapse = ",") + } + + specific_ind <- non_missing_data & !missing_sex + specific_data_size <- sum(specific_ind) + + if(specific_data_size > 0) { + weighted_data_size <- 2 * sum(non_missing_data & female_ind) + + sum(non_missing_data & male_ind) + # reference allele frequence + local$af <- sum(snp_data[specific_ind]) / weighted_data_size + } + } else if(snp_type == "Y") { + ## Y-chromosome + check_Y <- (missing_sex & non_missing_data) | + (male_ind & (snp_data == 2)) | (female_ind & (snp_data != 9)) + if(any(check_Y)) { + local$valid <- FALSE + local$issue_Y <- str_c(which(check_Y), collapse = ",") + } + + specific_ind <- non_missing_data & male_ind + specific_data_size <- sum(specific_ind) + + if(specific_data_size > 0) { + # reference allele frequence + local$af <- sum(snp_data[specific_ind]) / specific_data_size + } + } + + # filtering + if(maf == "hudson") { + maf <- 0 + local$hudson <- TRUE + } + # MAF filter + local$filter <- (local$af < maf) || (1 - local$af < maf) + # mono + local$mono <- (local$af == 0) || (1 - local$af == 0) + + if(local$hudson) { + local$filter <- local$mono + } + + # filter + return(data.frame( + valid = local$valid, filter = local$filter, mono = local$mono, + missing_pop = local$missing_pop, + issue_X = local$issue_X, issue_Y = local$issue_Y, af = local$af, + maf = maf, hudson = local$hudson, + stringsAsFactors = FALSE + )) +} + +#' Check (for missing values) and filter (based on MAF) IndSeq SNP data +#' @keywords internal +#' @author Ghislain Durif +#' @param content data.frame with columns corresponding to SNPs +#' where each entry encode the number of ancestral allele for the loci, +#' i.e. `0`, `1` and `2` for autosome (`A`) and X-chromosome (`X`) in female, +#' or `0` and `1` for haploid locus (`H`), Y-chromosome (`Y`) in male and +#' mitochondrial locus (`M`). +#' Note : missing values are encoded by a `9`. +#' @param indiv_info data.frame with columns `IND` (individual id), +#' `SEX` (female or male), `POP` (population id). +#' Note : missing values are encoded by a `9`. +#' @param snp_type vector of locus type (among `A`, `H`, `X`, `Y`, `M`). +#' @param locus_cout data.frame with two columns, `count` being the number +#' of locus for each type in the data, and `type` being the corresponding locus +#' type (among `A`, `H`, `X`, `Y`, `M`). +#' @importFrom dplyr bind_rows +check_snp_indseq <- function(content, indiv_info, snp_type, locus_count, + maf=0.05) { + + # init output + out <- list( + valid = TRUE, locus_count = NULL, msg = list() + ) + + # process SNPs + ncore <- getOption("diyabcGUI")$ncore + snp_list <- pblapply( + 1:nrow(content), + function(ind) { + out <- process_indseq_locus( + snp_data = content[ind,], sex_data = indiv_info$SEX, + pop_data = indiv_info$POP, snp_type = snp_type[ind], + maf = maf + ) + }, + cl = ncore + ) + + seek_error <- unlist(lapply( + snp_list, function(item) "try-error" %in% class(item) + )) + if(any(seek_error)) { + error_msg <- attributes( + snp_list[[ which(seek_error)[1] ]] + )$condition$message + err <- str_c( + "Issue when checking IndSeq SNP data file content:", + error_msg, sep = " " + ) + out$valid <- FALSE + msg <- tagList("Error when checking data file content.") + out$msg <- append(out$msg, list(msg)) + pprint(err) + return(out) + } + + # no error + snp_tab <- Reduce("bind_rows", snp_list) + rm("snp_list") + + # check for invalid locus + if(any(!snp_tab$valid)) { + # missing pop + is_missing_pop <- !is.na(snp_tab$missing_pop) + if(any(is_missing_pop)) { + out$valid <- FALSE + missing_pop <- snp_tab$missing_pop[is_missing_pop] + snp_issue <- which(is_missing_pop) + msg <- tagList( + "Issue with missing data for entire population(s)", + "regarding SNP:", + tags$div( + do.call( + tags$ul, + lapply( + 1:length(snp_issue), + function(ind) { + tags$li( + tags$b(snp_issue[ind]), + "(for population(s)", + tags$code(missing_pop[ind]), ")" + ) + } + ) + ), + style = "column-count:2;" + ), + "Remove this locus (these loci) from your dataset." + ) + out$msg <- append(out$msg, list(msg)) + } + + # X chromosome + is_issue_X <- !is.na(snp_tab$issue_X) + if(any(is_issue_X)) { + out$valid <- FALSE + issue_X <- snp_tab$issue_X[is_issue_X] + snp_issue <- which(is_issue_X) + msg <- tagList( + "Issue with data for SNP on X chromosome (see manual)", + "regarding SNP:", + tags$div( + do.call( + tags$ul, + lapply( + 1:length(snp_issue), + function(ind) { + tags$li( + tags$b(snp_issue[ind]), + "(for individuals", + tags$code(issue_X[ind]), ")" + ) + } + ) + ) + ), + "Remove this locus (these loci) from your dataset." + ) + out$msg <- append(out$msg, list(msg)) + } + # Y chromosome + is_issue_Y <- !is.na(snp_tab$issue_Y) + if(any(is_issue_Y)) { + out$valid <- FALSE + issue_Y <- snp_tab$issue_Y[is_issue_Y] + snp_issue <- which(is_issue_Y) + msg <- tagList( + "Issue with data for SNP on Y chromosome (see manual)", + "regarding SNP:", + tags$div( + do.call( + tags$ul, + lapply( + 1:length(snp_issue), + function(ind) { + tags$li( + tags$b(snp_issue[ind]), + "(for individuals", + tags$code(issue_Y[ind]), ")" + ) + } + ) + ) + ) + ) + out$msg <- append(out$msg, list(msg)) + } + } + + # continue ? + if(!out$valid) { + return(out) + } + + # extract number of filtered loci by type + tmp_filter <- tapply( + snp_tab$filter, snp_type, sum + ) + tmp_filter <- data.frame( + filter=tmp_filter, type=names(tmp_filter), + row.names=NULL, stringsAsFactors = FALSE + ) + + # extract number of monomorphic loci by type + tmp_mono <- tapply( + snp_tab$mono, snp_type, sum + ) + tmp_mono <- data.frame( + mono=tmp_mono, type=names(tmp_mono), + row.names=NULL, stringsAsFactors = FALSE + ) + + # merge all results into locus_count table + out$locus_count <- merge(locus_count, tmp_filter) + out$locus_count <- merge(out$locus_count, tmp_mono) + + # output + return(out) +} + +#' Read and parse PoolSeq SNP data file +#' @keywords internal +#' @author Ghislain Durif +#' @param data_file string, data file name. +#' @param data_dir string, path to directory where data file is stored. +#' @importFrom tools file_ext +read_poolseq_snp_data <- function(data_file, data_dir) { + + # init output + out <- list( + msg = list(), valid = TRUE, + data_file = NULL, n_loci = NULL, locus_count = NULL, + n_pop = NULL, sex_ratio = NULL, mrc = NULL + ) + + # full path + file_name <- file.path(data_dir, data_file) + + # check file_name + tmp <- check_file_name(file_name) + if(!tmp) { + out$valid <- FALSE + msg <- tagList("Invalid data file name.") + out$msg <- append(out$msg, list(msg)) + } + + # check file content + if(file.info(file_name)$size == 0) { + out$valid <- FALSE + msg <- tagList("Data file is empty.") + out$msg <- append(out$msg, list(msg)) + } + + # check file_type + if(tools::file_ext(file_name) != "snp") { + out$valid <- FALSE + msg <- tagList( + "IndSeq SNP files should have an extension", + tags$code(".snp"), "." + ) + out$msg <- append(out$msg, list(msg)) + } + + # continue ? + if(!out$valid) { + return(out) + } + + # data file name + out$data_file <- data_file + + ## DATA FILE CONTENT + + ## HEADER 1 + header1 <- readLines(file_name, n = 1, warn = FALSE) + + # sex ratio + pttrn <- "(?i)NM=[0-9]+\\.?[0-9]*NF(?-i)" + if(!str_detect(header1, pttrn)) { + out$valid <- FALSE + msg <- tagList( + "Missing", tags$b("sex ratio"), "in first line header." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + out$sex_ratio <- str_extract(header1, pttrn) + + # MRC + pttrn <- "(?i)(?<=MRC=)[0-9]+(?-i)" + if(!str_detect(header1, pttrn)) { + out$valid <- FALSE + msg <- tagList( + "Missing", tags$b("Minimum Read Count"), + "(MRC) in first line header.", + "MRC should be a positive or null integer,", + "see manual." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + out$mrc <- as.integer(str_extract(header1, pttrn)) + + ## HEADER 2 + header2 <- tryCatch( + unname(unlist(read.table(file = file_name, skip = 1, nrows = 1))), + error = function(e) return(e) + ) + if("error" %in% class(header2)) { + out$valid <- FALSE + msg <- tagList( + "Formatting issue with second line header, ", + "impossible to read it, see manual." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # upper case + header2 <- str_to_upper(header2) + + # header 2 content + if(header2[1] != "POOL" & + header2[2] != "POP_NAME:HAPLOID_SAMPLE_SIZE" & + !all(str_detect(header2[-(1:2)], "POP[0-9]+:[0-9]+"))) { + out$valid <- FALSE + msg <- tagList( + "Formatting issue with header second line.", + "Required column titles are", tags$code("POOL"), + tags$code("POP_NAME:HAPLOID_SAMPLE_SIZE"), + "followed by a character string", + tags$code("POP:"), + "for each population in the file (see manual)." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## number of population + out$n_pop <- length(header2) - 2 + + ## DATA FILE CONTENT + content <- tryCatch( + read.table(file_name, skip = 2), error = function(e) return(e) + ) + if("error" %in% class(content)) { + out$valid <- FALSE + msg <- tagList( + "Formatting issue with data, ", + "impossible to read the file, see manual." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # check number of population + if(ncol(content) %% 2 != 0) { + out$valid <- FALSE + msg <- tagList( + "Formatting issue with data:", + "number of column should be even,", + "providing pair of counts for reference", + "and alternate alleles at each locus", + "in each popultation, see manual." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + if(ncol(content) != 2*out$n_pop) { + out$valid <- FALSE + msg <- tagList( + "Formatting issue with data:", + "number of population indicated in file second-line header", + "does not correspond to number of columns in file content,", + "see manual." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # nb of locus + out$n_loci <- nrow(content) + + # check data encoding + check_snp_encoding <- apply(content, 1, is.integer) + if(!all(unlist(check_snp_encoding))) { + out$valid <- FALSE + msg <- tagList( + "Issue with data encoding at lines", + tags$code(str_c(which(check_snp_encoding, collapse = ", "))), ".", + "Expecting read counts, i.e. positive or null integer", + "(see manual)." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + if(any(is.na(content))) { + out$valid <- FALSE + msg <- tagList( + "Missing values (i.e.", tags$code("NA"), ")", + "are not allowed (see manual)." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## FILTERING LOCUS + snp_check <- check_snp_poolseq(content, mrc = out$mrc) + out$locus_count <- snp_check$locus_count + + ## output + return(out) +} + +#' Check (for missing values) and filter (based on MRC) IndSeq SNP data +#' @keywords internal +#' @author Ghislain Durif +#' @param content data.frame with columns corresponding to couples of +#' PoolSeq encoding. +check_snp_poolseq <- function(content, mrc = 1) { + + # init output + out <- list(locus_count = NULL) + + # number of pop + n_pop <- ncol(content) / 2 + + # count per allele + allele1_count <- apply( + content[,rep(c(TRUE,FALSE), n_pop)], 1, sum + ) + allele2_count <- apply( + content[,rep(c(FALSE,TRUE), n_pop)], 1, sum + ) + + # extract number of filtered loci by type + tmp_filter <- (allele1_count < mrc) | (allele2_count < mrc) + + # extract number of monomorphic loci by type + tmp_mono <- (allele1_count < 1) | (allele2_count < 1) + + # merge all results into locus_count table + out$locus_count <- data.frame( + type = "A", + count = nrow(content), + filter = sum(tmp_filter), + mono = sum(tmp_mono), + stringsAsFactors = FALSE + ) + + # output + return(out) +} + +#' Check Microsat/Sequence (GenePop) data file +#' @keywords internal +#' @author Ghislain Durif +#' @param data_file string, data file name. +#' @param data_dir string, path to directory where data file is stored. +#' @param expected_data_file string, expected data file name for +#' existing project, default is NULL. +#' @importFrom tools file_ext +#' @importFrom readr read_file +read_mss_data <- function(data_file, data_dir) { + + # init output + out <- list( + msg = list(), valid = TRUE, + data_file = NULL, n_loci = NULL, locus_count = NULL, + n_pop = NULL, n_indiv = NULL, pop_size = NULL, + sex_ratio = NULL + ) + + # ## init output + # locus <- NULL + # locus_mode <- NULL + # locus_name <- NULL + # seq_length <- NULL + # n_loci <- NULL + # n_pop <- NULL + # n_indiv <- NULL + # pop_size <- NULL + + # full path + file_name <- file.path(data_dir, data_file) + + # check file_name + tmp <- check_file_name(file_name) + if(!tmp) { + out$valid <- FALSE + msg <- tagList("Invalid data file name.") + out$msg <- append(out$msg, list(msg)) + } + + # check file content + if(file.info(file_name)$size == 0) { + out$valid <- FALSE + msg <- tagList("Data file is empty.") + out$msg <- append(out$msg, list(msg)) + } + + # check file_type + if(tools::file_ext(file_name) != "mss") { + out$valid <- FALSE + msg <- tagList( + "IndSeq SNP files should have an extension", + tags$code(".mss"), "." + ) + out$msg <- append(out$msg, list(msg)) + } + + # continue ? + if(!out$valid) { + return(out) + } + + # data file name + out$data_file <- data_file + + ## DATA FILE CONTENT + + ## HEADER 1 + header1 <- readLines(file_name, n = 1, warn = FALSE) + + # sex ratio + pttrn <- "(?i)NM=[0-9]+\\.?[0-9]*NF(?-i)" + if(!str_detect(header1, pttrn)) { + out$valid <- FALSE + msg <- tagList( + "Missing", tags$b("sex ratio"), "in first line header." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + out$sex_ratio <- str_extract(header1, pttrn) + + ## FILE CONTENT + file_content <- unlist(str_split(read_file(file_name), "\n")) + + ## locus description + pttrn <- "(?<=<)(A|H|X|Y|M)(?=>)" + locus_pttrn_match <- str_extract(file_content, pttrn) + # check if present + if(all(is.na(locus_pttrn_match))) { + out$valid <- FALSE + msg <- tagList( + "Missing locus description, see manual" + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # check if description follows title line + locus_match_ind <- which(!is.na(locus_pttrn_match)) + if(!all(locus_match_ind == 2:tail(locus_match_ind, 1))) { + out$valid <- FALSE + msg <- tagList( + "Locus description should be located", + "at beginning of data file, after title line,", + "with a single locus per line, see manual." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # check locus description format + pttrn <- "^[A-Za-z0-9\\s_\\-]+ <(A|H|X|Y|M)>$" + locus_desc_match_ind <- which(str_detect(file_content, pttrn)) + if(!all(locus_desc_match_ind == locus_match_ind)) { + out$valid <- FALSE + issue_line <- locus_match_ind[!locus_match_ind %in% + locus_desc_match_ind] + msg <- tagList( + "Issue with Microsat/Sequence locus description format at rows:", + tags$code(str_c(issue_line, collapse = ",")), ".", br(), + "You can use the following character to specify", + "locus names:", + tags$code("A-Z"), tags$code("a-z"), tags$code("0-9"), + tags$code("_"), tags$code("-"), "and", tags$code(" "), "." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## number of locus + out$n_loci <- length(locus_desc_match_ind) + + ## locus description + pttrn <- "(?<=<)(A|H|X|Y|M)(?=>$)" + locus_desc <- str_extract(file_content[locus_match_ind], pttrn) + + ## locus name + pttrn <- "^[A-Za-z0-9\\s_\\-]+(?= <)" + locus_name <- str_extract(file_content[locus_match_ind], pttrn) + locus_name <- str_replace_all(locus_name, " +", "_") + # check + if(length(unique(locus_name)) != length(locus_name)) { + out$valid <- FALSE + msg <- tagList( + "Issue with locus description", + "each locus should have a unique name, see manual." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## population + pttrn <- "^(?i)pop(?-i)$" + pop_match_ind <- which(str_detect(file_content, pttrn)) + # check + if(length(pop_match_ind) == 0) { + out$valid <- FALSE + msg <- tagList( + "Keyword", tags$code("POP"), "is missing, see manual." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## remove unnecessary space + file_content <- str_replace_all( + str_trim(file_content), " +", " " + ) + + ## remove final empty line (if any) + if(tail(file_content, 1) == "") { + file_content <- head(file_content, length(file_content) - 1) + } + + ## number of pop + out$n_pop <- length(pop_match_ind) + + ## pop size + out$pop_size <- diff(c(pop_match_ind, length(file_content))) - + c(rep(1, out$n_pop - 1), 0) + + ## number of individuals + out$n_indiv <- sum(out$pop_size) + + ## population id + pop_id <- unlist(lapply( + 1:out$n_pop, + function(ind) return(rep(ind, each = out$pop_size[ind])) + )) + + ## EXTRACT DATA + data_ind <- head(pop_match_ind, 1):length(file_content) + data_ind <- data_ind[!data_ind %in% pop_match_ind] + data_content <- file_content[data_ind] + + # write data content to a temporary file + data_content <- str_replace_all( + str_replace_all(data_content, ",", " "), + " +", ";" + ) + tmp_file <- file.path(data_dir, "tmp_data_file.csv") + tmp <- writeLines(data_content, tmp_file) + on.exit({ + if(file.exists(tmp_file)) + fs::file_delete(tmp_file) + }) + + # read data as data.frame + data_content <- tryCatch( + read.table(tmp_file, sep = ";", stringsAsFactors = FALSE, + colClasses = "character"), + error = function(e) e + ) + if("error" %in% class(data_content)) { + out$valid <- FALSE + msg <- tagList( + "Issue with Microsat/Sequences data file format, see manual." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ##### CHECK DATA + ## check number of loci + if(ncol(data_content) != out$n_loci + 1) { + out$valid <- FALSE + msg <- tagList( + "Number of declared loci at beginning of file", + "does not correspond to number of actual loci", + "in the data, see manual." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## format data.frame + colnames(data_content) <- c("indiv", str_c("locus", 1:out$n_loci)) + data_content$pop <- as.character(pop_id) + + ## indiv name first column + if(!all(str_detect(data_content$indiv, "[A-Za-z0-9_-]+"))) { + out$valid <- FALSE + msg <- tagList( + "First column should provide individual names,", + "you can use the following character to specify", + "such names:", + tags$code("A-Z"), tags$code("a-z"), tags$code("0-9"), + tags$code("_"), tags$code("-"), "and", tags$code(" "), "." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## check locus encoding + microsat_hap_encoding <- "^[0-9]{3}$" + microsat_dip_encoding <- "^[0-9]{6}$" + microsat_x_encoding <- "^([0-9]{3}|[0-9]{6})$" + nucleotid_encoding <- "[ATCG]*" + seq_hap_encoding <- str_c("^<\\[", nucleotid_encoding, "\\]>") + seq_dip_encoding <- str_c( + "^<\\[", nucleotid_encoding, "\\]\\[", nucleotid_encoding, "\\]>" + ) + seq_x_encoding <- str_c( + "^<\\[", nucleotid_encoding, "\\](\\[", nucleotid_encoding, "\\])?>" + ) + + ## locus data + locus_data <- data_content[,!colnames(data_content) %in% c("indiv", "pop")] + # issue when a single locus + if(ncol(data_content) == 3) { + locus_data <- data.frame(locus1 = locus_data) + } + + ### microsat locus + microsat_hap_locus <- which( + apply( + locus_data, 2, function(loc) { + return(all(str_detect(loc, microsat_hap_encoding))) + } + ) & (locus_desc %in% c("H", "Y", "M")) + ) + microsat_dip_locus <- which( + apply( + locus_data, 2, function(loc) { + return(all(str_detect(loc, microsat_dip_encoding))) + } + ) & (locus_desc == "A") + ) + microsat_x_locus <- which( + apply( + locus_data, 2, function(loc) { + return(all(str_detect(loc, microsat_x_encoding))) + } + ) & (locus_desc == "X") + ) + + ### seq locus + seq_hap_locus <- which( + apply( + locus_data, 2, function(loc) { + return(all(str_detect(loc, seq_hap_encoding))) + } + ) & (locus_desc %in% c("H", "Y", "M")) + ) + seq_dip_locus <- which( + apply( + locus_data, 2, function(loc) { + return(all(str_detect(loc, seq_dip_encoding))) + } + ) & (locus_desc == "A") + ) + seq_x_locus <- which( + apply( + locus_data, 2, function(loc) { + return(all(str_detect(loc, seq_x_encoding))) + } + ) & (locus_desc == "X") + ) + + ## check that A are diploid locus + if(!all(which(locus_desc == "A") %in% + c(microsat_dip_locus, seq_dip_locus))) { + out$valid <- FALSE + msg <- tagList( + "Autosomal diploid (i.e. identified by a", tags$code("A"), ")", + "should all be diploid loci." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## check that H, M and Y are haploid locus + if(!all(which(locus_desc %in% c("H", "M", "Y")) %in% + c(microsat_hap_locus, seq_hap_locus))) { + out$valid <- FALSE + msg <- tagList( + "Autosomal haploid loci", + "(i.e. identified by a", tags$code("H"), ")", + "Y-linked loci (i.e. identified by a", tags$code("Y"), ")", + "and Mitochondrial loci", + "(i.e. identified by a", tags$code("M"), ")", + "should all be haploid loci." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ### check if other issue with formating + if(!all(sort(c(microsat_hap_locus, microsat_dip_locus, + microsat_x_locus, + seq_hap_locus, seq_dip_locus, + seq_x_locus)) == + 1:out$n_loci)) { + out$valid <- FALSE + msg <- tagList( + "Issue with data content of columns:", + tags$code(str_c( + (1:n_loci)[!(1:n_loci) %in% + c(microsat_hap_locus, + microsat_dip_locus, + microsat_x_locus, + seq_hap_locus, + seq_dip_locus, + seq_x_locus)], + collapse = ", " + )), br(), + "See manual." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## locus mode (microsat or sequence, hap or dip) + tmp_locus_mode <- data.frame( + mode = c( + rep("M", length(microsat_hap_locus) + length(microsat_dip_locus) + + length(microsat_x_locus)), + rep("S", length(seq_hap_locus) + length(seq_dip_locus) + + length(seq_x_locus)) + ), + index = c(microsat_hap_locus, microsat_dip_locus, + microsat_x_locus, + seq_hap_locus, seq_dip_locus, + seq_x_locus) + ) + locus_mode <- tmp_locus_mode$mode[order(tmp_locus_mode$index)] + + ## check seq locus length + seq_length <- unlist(lapply( + which(locus_mode == "S"), + function(col_ind) { + tmp <- unlist(lapply( + 1:out$n_indiv, + function(row_ind) { + return( + str_length(str_extract_all( + locus_data[row_ind,col_ind], + "\\[[ATCG]*\\]", + simplify = TRUE + )) - 2 + ) + } + )) + + out <- max(tmp) + if(!all(unique(tmp) %in% c(0, out))) { + return(-100) + } else { + return(out) + } + } + )) + + if(any(seq_length == -100)) { + out$valid <- FALSE + msg <- tagList( + "Non-missing sequence data attached to following loci", + tags$code(str_c(which(seq_length == -100), collapse = ", ")), + "do not have the same length in all", + "individuals." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## check for missing data in microsat locus + microsat_missing_encoding <- "^(0{3}|0{6})$" + missing_microsat <- apply( + locus_data[,which(locus_mode == "M")], c(1,2), + function(item) return(str_detect(item, microsat_missing_encoding)) + ) + + missing_pop <- apply( + missing_microsat, 2, + function(item) { + return(any(unlist(tapply(item, pop_id, sum)) == out$pop_size)) + } + ) + + if(any(missing_pop)) { + out$valid <- FALSE + msg <- tagList( + "Issue with missing data for entire population(s)", + "regarding microsat locus:", + tags$code(str_c(which(missing_pop), collapse = ", ")), br(), + "Remove this locus (these loci) from your dataset." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## check for missing data in seq locus + seq_missing_encoding <- "^<\\[\\](\\[\\])?>$" + missing_seq <- apply( + locus_data[,which(locus_mode == "S")], c(1,2), + function(item) return(str_detect(item, seq_missing_encoding)) + ) + + missing_pop <- apply( + missing_seq, 2, + function(item) { + return(any(unlist(tapply(item, pop_id, sum)) == out$pop_size)) + } + ) + + if(any(missing_pop)) { + out$valid <- FALSE + msg <- tagList( + "Issue with missing data for entire population(s)", + "regarding sequence locus:", + tags$code(str_c(which(missing_pop), collapse = ", ")), br(), + "Remove this locus (these loci) from your dataset." + ) + out$msg <- append(out$msg, list(msg)) + return(out) + } + + ## locus count + out$locus_count <- as.data.frame(table(locus_desc, locus_mode)) + colnames(out$locus_count) <- c("type", "mode", "count") + + ## save locus description and mode + out$locus_desc <- locus_desc + out$locus_mode <- locus_mode + + ## output + return(out) +} diff --git a/R-pkg/R/51_home_page_module.R b/R-pkg/R/51_home_page_module.R index fe3afdd6..b55325f4 100644 --- a/R-pkg/R/51_home_page_module.R +++ b/R-pkg/R/51_home_page_module.R @@ -1,7 +1,7 @@ #' Simplified home page ui #' @keywords internal #' @author Ghislain Durif -simplified_home_page_ui <- function(id) { +home_page_ui <- function(id) { ns <- NS(id) tagList( fluidRow( @@ -98,7 +98,7 @@ simplified_home_page_ui <- function(id) { #' Simlified home page server #' @keywords internal #' @author Ghislain Durif -simplified_home_page_server <- function(input, output, session) { +home_page_server <- function(input, output, session) { # init output out <- reactiveValues( new_analysis_project = NULL, @@ -123,117 +123,3 @@ simplified_home_page_server <- function(input, output, session) { # output return(out) } - -#' Home page ui -#' @keywords internal -#' @author Ghislain Durif -#' @description deprecated -home_page_ui <- function(id) { - ns <- NS(id) - tagList( - fluidRow( - box( - title = "Data analysis", - width = 12, - status = "primary", solidHeader = TRUE, - collapsible = TRUE, - tags$div( - tags$p("Including the two modules of DIYABC Random Forest:"), - tags$ul( - tags$li("training set simulation"), - tags$li("random forest analyses") - ) - ) %>% - helper(type = "markdown", - content = "data_analysis"), - open_project_ui(ns("analysis_project")) - ) - ), - fluidRow( - box( - title = "Pseudo-observed dataset simulation", - width = 12, - status = "info", solidHeader = TRUE, - collapsible = TRUE, collapsed = TRUE, - tags$div( - tags$p("Direct use of DIYABC-RF simulation engine.") %>% - helper(type = "markdown", - content = "data_simulation") - ), - open_project_ui(ns("simu_project")) - ) - ) - ) -} - -#' Home page server -#' @keywords internal -#' @author Ghislain Durif -#' @description deprecated -home_page_server <- function(input, output, session) { - # init output - out <- reactiveValues( - new_analysis_project = NULL, - existing_analysis_project = NULL, - new_simu_project = NULL, - existing_simu_project = NULL - ) - # input - analysis_project <- callModule(open_project_server, "analysis_project") - simu_project <- callModule(open_project_server, "simu_project") - # analysis project reactivity - observeEvent(analysis_project$new, { - out$new_analysis_project <- analysis_project$new - }) - observeEvent(analysis_project$existing, { - out$existing_analysis_project <- analysis_project$existing - }) - # simu project reactivity - observeEvent(simu_project$new, { - out$new_simu_project <- simu_project$new - }) - observeEvent(simu_project$existing, { - out$existing_simu_project <- simu_project$existing - }) - # output - return(out) -} - -#' Open project module ui -#' @description -#' Wrapper to choose between creating a new project and opening an existing one -#' @keywords internal -#' @author Ghislain Durif -#' @importFrom shinyWidgets actionGroupButtons -open_project_ui <- function(id, label = "New project") { - ns <- NS(id) - tagList( - actionGroupButtons( - inputIds = c(ns("new_project"), - ns("existing_project")), - labels = list( - tags$span(icon("plus"), "New project"), - tags$span(icon("folder-open"), "Existing project") - ), - fullwidth = TRUE - ) - ) -} - -#' Open project module server -#' @keywords internal -#' @author Ghislain Durif -open_project_server <- function(input, output, session) { - # init output - out <- reactiveValues(new = NULL, existing = NULL) - # react to new project request - observeEvent(input$new_project, { - out$new <- ifelse(is.null(out$new), 0, out$new) + 1 - }) - # react to open existing project request - observeEvent(input$existing_project, { - out$existing <- ifelse(is.null(out$existing), 0, out$existing) + 1 - }) - # output - return(out) -} diff --git a/R-pkg/R/61_analysis_module.R b/R-pkg/R/61_analysis_module.R new file mode 100644 index 00000000..802439d5 --- /dev/null +++ b/R-pkg/R/61_analysis_module.R @@ -0,0 +1,1102 @@ +#' Analysis module ui +#' @keywords internal +#' @author Ghislain Durif +analysis_module_ui <- function(id) { + ns <- NS(id) + tagList( + tags$style(HTML(".box-header{text-align: center;}")), + fluidRow( + box( + title = tags$b("Project settings"), + width = 12, + status = "primary", solidHeader = FALSE, + collapsible = FALSE, + analysis_proj_set_ui(ns("proj_set")) + ), + box( + title = "Training set simulations", + width = 12, + status = "info", solidHeader = TRUE, + collapsible = TRUE, collapsed = TRUE, + training_set_ui(ns("train_set")) + ), + box( + title = "Random Forest Analyses", + width = 12, + status = "warning", solidHeader = TRUE, + collapsible = TRUE, collapsed = TRUE, + rf_module_ui(ns("rf")) + ), + box( + title = tags$b("Project administration"), + width = 12, + status = "danger", solidHeader = FALSE, + collapsible = FALSE, collapsed = FALSE, + proj_admin_ui(ns("proj_admin")) + ) + ) + ) +} + +#' Analysis module server +#' @keywords internal +#' @author Ghislain Durif +analysis_module_server <- function(input, output, session) { + + ## project setting + proj_set <- callModule(analysis_proj_set_server, "proj_set") + + ## Training set sub-module + # training_set <- callModule(training_set_server, "train_set") + + ## random forest sub-module + # rf <- callModule(rf_module_server, "rf") + + ## action + proj_admin <- callModule(proj_admin_server, "proj_admin", tag = "ap") + + ## reset + observeEvent(proj_admin$reset, { + req(proj_admin$reset) + session$reload() + }) + + ## clean on exit + session$onSessionEnded(function() { + isolate(tryCatch(function() { + if(isTruthy(env$ap$proj_dir)) fs::dir_delete(env$ap$proj_dir) + })) + }) +} + +#' Analysis project setting ui +#' @keywords internal +#' @author Ghislain Durif +analysis_proj_set_ui <- function(id) { + ns <- NS(id) + tagList( + proj_name_ui(ns("proj_name")), + hr(), + h3("Data type"), + data_type_ui(ns("data_type")), + hr(), + proj_type_ui(ns("proj_type")), + hr(), + proj_config_ui(ns("proj_config")), + hr(), + data_file_ui(ns("data_file")), + hr(), + uiOutput(ns("feedback")) + ) +} + + + +#' Analysis project setting server +#' @keywords internal +#' @author Ghislain Durif +#' @importFrom dplyr distinct +#' @importFrom fs file_copy file_delete +analysis_proj_set_server <- function(input, output, session) { + + ## project name + callModule(proj_name_server, "proj_name", tag = "ap") + + ## data type + callModule(data_type_server, "data_type", tag = "ap") + + ## reset project when data type change + observeEvent({c(env$ap$locus_type, env$ap$seq_mode)}, { + req(env$ap$proj_dir) + + # clean before upload + clean_proj_dir(env$ap$proj_dir) + # reset env + reset_ap() + # file modification + update_proj_file("ap") + }) + + ## project type + callModule(proj_type_server, "proj_type") + + ## project status + callModule(proj_config_server, "proj_config") + + ## data file + callModule(data_file_server, "data_file") + + # output$proj_is_ready <- renderUI({ + # if(!(out$valid_proj & out$valid_data_file)) { + # h3(icon("warning"), "Project set up is not ready.", + # style="color:red;text-align:center;") + # } else { + # h4(icon("check"), "Project set up is ok.", + # style="text-align:center;") + # } + # }) + # +} + +#' Project type setting ui +#' @keywords internal +#' @author Ghislain Durif +proj_type_ui <- function(id) { + ns <- NS(id) + # inline help for project type + proj_type_help <- tagList( + "You can either:", + tags$ol( + tags$li("start with a new project;"), + tags$li("open one of your own an existing project;"), + tags$li("open one of the included examples.") + ) + ) + # ui + tagList( + h3("Project type") %>% + helper(type = "inline", content = as.character(proj_type_help)), + radioGroupButtons( + ns("proj_type"), + label = NULL, + choices = c("New project" = "new", + "Existing project" = "existing", + "Example" = "example"), + selected = "new", + justified = TRUE + ), + conditionalPanel( + condition = "input.proj_type == 'new'", + ns = ns, + new_proj_ui(ns("new_proj")) + ), + conditionalPanel( + condition = "input.proj_type == 'existing'", + ns = ns, + existing_proj_ui(ns("existing_proj")) + ), + conditionalPanel( + condition = "input.proj_type == 'example'", + ns = ns, + example_proj_ui(ns("example_proj")), + ), + conditionalPanel( + condition = "input.proj_type !== 'new'", + ns = ns, + proj_file_list_ui(ns("proj_file_list")), + ) + ) +} + +#' Project type setting server +#' @keywords internal +#' @author Ghislain Durif +#' @importFrom dplyr distinct +#' @importFrom fs file_copy file_delete +proj_type_server <- function(input, output, session) { + + ## project type + observeEvent(input$proj_type, { + req(input$proj_type) + env$ap$proj_type <- input$proj_type + }) + + ## New project + callModule(new_proj_server, "new_proj") + + ## Existing project + callModule(existing_proj_server, "existing_proj") + + ## Example project + callModule(example_proj_server, "example_proj") + + ## File list for existing or example project + callModule(proj_file_list_server, "proj_file_list") +} + +#' Feedback on project file list ui +#' @keywords internal +#' @author Ghislain Durif +proj_file_list_ui <- function(id) { + ns <- NS(id) + tagList( + helpText( + uiOutput(ns("feedback_proj_file")) + ) + ) +} + +#' Feedback on project file list server +#' @keywords internal +#' @author Ghislain Durif +proj_file_list_server <- function(input, output, session) { + + # # debugging + # observe({ + # req(env$ap$file_modif) + # req(env$ap$proj_file_list) + # pprint("file modif") + # pprint(env$ap$file_modif) + # pprint("file list") + # pprint(env$ap$proj_file_list) + # }) + + # feedback on list of uploaded files + observeEvent( + {c(env$ap$file_modif, env$ap$proj_file_list, env$ap$proj_type)}, + { + req(env$ap$proj_type %in% c("existing", "example")) + # output + output$feedback_proj_file <- renderUI({ + # default + tag_list <- tags$div( + icon("warning"), "No file was uploaded.", + style = "color: #F89406; margin-top: -15px;" + ) + # else + if(isTruthy(env$ap) && isTruthy(env$ap$proj_file_list)) { + # project files + proj_file_list <- env$ap$proj_file_list + + if(length(proj_file_list) > 0) { + # expected files + expected_files1 <- c("headerRF.txt", "header.txt") + expected_files2 <- c("statobsRF.txt", "reftableRF.bin") + expected_files <- c(expected_files1, expected_files2) + + # important project files that are present + important_files <- expected_files[expected_files %in% + proj_file_list] + + # additional files + additional_files <- proj_file_list[!proj_file_list %in% + important_files] + + # missing files ? + missing_files <- NULL + + missing_header <- !any(expected_files1 %in% + proj_file_list) + if(missing_header) { + missing_files <- c(missing_files, "headerRF.txt") + } + + missing_files2 <- !(expected_files2 %in% proj_file_list) + if(any(missing_files2)) { + missing_files <- c(missing_files, + expected_files2[missing_files2]) + } + # project core files + subitem1 <- NULL + if(length(important_files) > 0) { + subitem1 <- tags$div( + do.call(tags$ul, lapply( + important_files, + function(item) + return(tags$li(tags$code(item))) + )) + ) + } else { + subitem1 <- tags$b("none") + } + # additional files + subitem2 <- NULL + if(length(additional_files) > 0) { + subitem2 <- tags$div( + do.call(tags$ul, lapply( + additional_files, + function(item) + return(tags$li(tags$code(item))) + )) + ) + } else { + subitem2 <- tags$b("none") + } + item1 <- helpText( + h5(icon("comment"), tags$b("Uploaded files")), + fluidRow( + column( + width = 6, + tagList( + tags$p("Project core files:", subitem1) + ) + ), + column( + width = 6, + tagList( + tags$p("Additional files:", subitem2) + ) + ) + ) + ) + # missing files + item2 <- NULL + if(length(missing_files) > 0) { + item2 <- tags$div( + tags$p( + icon("warning"), + "Potentially missing files", + "for an existing project:", + tags$div( + do.call(tags$ul, lapply( + missing_files, + function(item) + return(tags$li(tags$code(item))) + )) + ), + tags$b("Note:"), + "you will be able to generate them below." + ), + style = "color: #F89406;" + ) + } + tag_list <- tagList(item1, item2) + } + } + # output + tag_list + }) + } + ) +} + +#' Project file check ui +#' @keywords internal +#' @author Ghislain Durif +proj_file_check_ui <- function(id) { + ns <- NS(id) + tagList( + uiOutput(ns("global_feedback")), + uiOutput(ns("feedback_header")), + uiOutput(ns("feedback_reftable")), + uiOutput(ns("feedback_statobs")) + ) +} + +#' Project file check server +#' @keywords internal +#' @author Ghislain Durif +proj_file_check_server <- function(input, output, session) { + + ## file check + observeEvent( + {c(env$ap$file_modif, env$ap$proj_file_list, + env$ap$proj_dir, env$ap$locus_type)}, + { + req(env$ap$proj_type %in% c("existing", "example")) + req(env$ap$proj_dir) + req(env$ap$locus_type) + # file check + file_check <- check_proj_file(env$ap$proj_dir, env$ap$locus_type) + # update env + env$ap$header_check <- file_check$header_check + env$ap$reftable_check <- file_check$reftable_check + env$ap$statobs_check <- file_check$statobs_check + } + ) + + ## OUTPUT + # global + output$global_feedback <- renderUI({ + if(is.null(env$ap$header_check) && is.null(env$ap$reftable_check) && + is.null(env$ap$statobs_check)) { + helpText( + "Project is not configured yet.", + tags$p(tags$ul(tags$li( + "For a new project, you will be able to configure it", + "in the panel below." + ))), + tags$p(tags$ul(tags$li( + "For an existing or an example project,", + "you will be able to check the configuration", + "or modify it in the panel below." + ))) + ) + } else { + helpText(h5(icon("comment"), "Current setup")) + } + }) + # header + output$feedback_header <- renderUI({ + if(isTruthy(env$ap$header_check)) { + if(isTruthy(env$ap$header_check$valid)) { + # data file + data_file <- NULL + if(env$ap$header_check$data_file %in% env$ap$proj_file_list) { + data_file <- tagList( + "Data file:", + tags$code(env$ap$header_check$data_file) + ) + } else { + data_file <- tagList( + "Expected data file:", + tags$code(env$ap$header_check$data_file), + "(it can be uploaded below)." + ) + } + # output + helpText( + tags$p(tags$ul(tags$li(data_file))), + tags$p(tags$ul(tags$li( + tags$code(env$ap$header_check$header_file), + "file is ok", + "with", + tags$b(as.character(env$ap$header_check$n_scen)), + ifelse( + env$ap$header_check$n_scen > 1, + "scenarii", + "scenario"), + "." + ))) + ) + } else { + tags$div( + tags$p( + icon("warning"), + "Issue with provided", + tags$code(env$ap$header_check$header_file), + "file:", + do.call( + tags$ul, + lapply(env$ap$header_check$msg, tags$li) + ) + ), + style = "color: #F89406;" + ) + } + } else { + NULL + } + }) + # reftable + output$feedback_reftable <- renderUI({ + if(isTruthy(env$ap$reftable_check)) { + if(isTruthy(env$ap$reftable_check$valid)) { + helpText(tags$p(tags$ul(tags$li( + tags$code("reftableRF.bin"), "file is ok", + "with", + tags$b(as.character(env$ap$reftable_check$n_stat)), + "summary statistics computed over", + tags$b(as.character(env$ap$reftable_check$n_rec)), + "simulations in the training set." + )))) + } else { + tags$div( + tags$p( + icon("warning"), + "Issue with provided", tags$code("reftableRF.bin"), + "file:", + do.call( + tags$ul, + lapply(env$ap$header_check$msg, tags$li) + ) + ), + style = "color: #F89406;" + ) + } + } else { + NULL + } + }) + # statobs + output$feedback_statobs <- renderUI({ + if(isTruthy(env$ap$statobs_check)) { + if(isTruthy(env$ap$statobs_check$valid)) { + helpText(tags$p(tags$ul(tags$li( + tags$code("statobsRF.txt"), + "file is ok." + )))) + } else { + tags$div( + tags$p( + icon("warning"), + "Issue with provided", tags$code("statobsRF.txt"), + "file:", + do.call( + tags$ul, + lapply(env$ap$header_check$msg, tags$li) + ) + ), + style = "color: #F89406;" + ) + } + } else { + NULL + } + }) +} + +#' New project ui +#' @keywords internal +#' @author Ghislain Durif +new_proj_ui <- function(id) { + ns <- NS(id) + tagList( + helpText( + icon("comment"), "You will be able to upload your data file", + "and configure your project below." + ) + ) +} + +#' New project server +#' @keywords internal +#' @author Ghislain Durif +new_proj_server <- function(input, output, session) { + + # clean project directory when choosing this mode + observeEvent(env$ap$proj_type, { + req(env$ap$proj_type == "new") + req(env$ap$proj_dir) + + # clean before upload + clean_proj_dir(env$ap$proj_dir) + # reset env + reset_ap() + # file modification + update_proj_file("ap") + }) +} + +#' Existing project ui +#' @keywords internal +#' @author Ghislain Durif +existing_proj_ui <- function(id) { + ns <- NS(id) + # inline help for proj file input + proj_file_help <- tagList( + tags$ul( + tags$li( + "You can", tags$b("either"), "upload:", + "a project", tags$code("zip"), + "file generated in a previous run", + tags$b("or"), + "single project-related files, including", + tags$code("headerRF.txt"), ", ", + tags$code("reftableRF.bin"), ", ", + tags$code("statobsRF.txt"), "and your observed data file." + ), + tags$li( + "You", tags$b("cannot"), "upload both a project", + tags$code("zip"), "file", + "and single project-related files, those will be ignored.", + style = "margin-top: 10px;" + ), + tags$li( + "When uploading", tags$b("single project-related files"), + "you", tags$b("should"), "upload all required files", + "at the same time (use", tags$code("CTRL+click"), + "to select multiple files in the file chooser window).", + style = "margin-top: 10px;" + ), + tags$li( + "If you", tags$b("re-upload"), "a file or a group of files,", + "it will", tags$b("delete"), "and", tags$b("replace"), + "any previous upload.", + style = "margin-top: 10px;" + ), + tags$li( + "If some project files are missing or have formating issues", + "you will be able to (re)configure", + "the corresponding settings in the next panel.", + style = "margin-top: 10px;" + ) + ) + ) + # ui + tagList( + h4(tags$b("Project files")) %>% + helper( + type = "inline", + content = as.character(proj_file_help) + ), + helpText( + "Use", tags$code("CTRL+click"), "to select more than one file." + ), + fileInput( + ns("file_input"), + label = NULL, + buttonLabel = "Select file(s)", + multiple = TRUE, + accept = c( + ".txt", + ".bin", + ".zip" + ) + ), + uiOutput(ns("feedback_existing")) + ) +} + +#' Existing project server +#' @keywords internal +#' @author Ghislain Durif +existing_proj_server <- function(input, output, session) { + + # # file_input = data.frame with fields 'name', 'size', 'type', 'datapath' + # # debugging + # observe({ + # pprint("file input") + # print(input$file_input) + # }) + + # Feedback on file upload + observe({ + req(env$ap$proj_type == "existing") + + # feedback on missing file + feedbackWarning("file_input", !isTruthy(input$file_input), + "Missing file(s).") + }) + + # reset file upload when another mode is chosen + observeEvent({c(env$ap$proj_type, env$ap$locus_type, env$ap$seq_mode)}, { + shinyjs::reset("file_input") + }) + + # clean project directory when choosing this mode + observeEvent(env$ap$proj_type, { + req(env$ap$proj_type == "existing") + req(env$ap$proj_dir) + + # clean before upload + clean_proj_dir(env$ap$proj_dir) + # reset env + reset_ap() + # file modification + update_proj_file("ap") + }) + + # manage file upload (copy to project directory) + observeEvent(input$file_input, { + req(input$file_input) + req(env$ap$proj_dir) + + # upload + input_check <- tryCatch( + proj_file_input(input$file_input, env$ap$proj_dir), + error = function(e) return(NULL) + ) + + # feedback + output$feedback_existing <- renderUI({ + if(is.null(input_check) || !input_check$valid) { + msg <- "Issue(s) with uploaded file(s)." + feedbackWarning( + "file_input", is.null(input_check) || !input_check$valid, + msg + ) + if(length(input_check$msg) > 0) { + tags$div( + icon("warning"), "Issue(s) with uploaded file(s):", + do.call(tags$ul, lapply(input_check$msg, tags$li)), + style = "color: #F89406; margin-top: -15px;" + ) + } else { + NULL + } + } else { + NULL + } + }) + + # update project file list and check files + if(!is.null(input_check) && isTruthy(input_check$valid)) { + # file modification + update_proj_file("ap") + } else { + # clean before upload + clean_proj_dir(env$ap$proj_dir) + # reset env + reset_ap() + # file modification + update_proj_file("ap") + } + }) +} + +#' Example project ui +#' @keywords internal +#' @author Ghislain Durif +example_proj_ui <- function(id) { + ns <- NS(id) + tagList( + h4(tags$b("Select an example")), + selectInput( + ns("proj_example"), + label = NULL, + choices = c("", "Not available at the moment"), + selected = NULL, + multiple = FALSE + ) + ) +} + +#' Example project server +#' @keywords internal +#' @author Ghislain Durif +example_proj_server <- function(input, output, session) { + + # clean project directory when choosing this mode + observeEvent(env$ap$proj_type, { + req(env$ap$proj_type == "example") + req(env$ap$proj_dir) + + # clean before upload + clean_proj_dir(env$ap$proj_dir) + # reset env + reset_ap() + # file modification + update_proj_file("ap") + }) + + # update possible input + observeEvent({c(env$ap$proj_type, env$ap$locus_type, env$ap$seq_mode)}, { + req(env$ap$proj_type == "example") + req(env$ap$locus_type) + + ## MSS + if(env$ap$locus_type == "mss") { + updateSelectInput( + session, + "proj_example", + choices = c("", "Not available at the moment"), + selected = NULL + ) + ## SNP + } else if(env$ap$locus_type == "snp") { + req(env$ap$seq_mode) + ## IndSeq + if(env$ap$seq_mode == "indseq") { + possible_choices <- basename( + list.dirs( + example_dir() + ) + ) + possible_choices <- + possible_choices[str_detect(possible_choices, "IndSeq")] + updateSelectInput( + session, + "proj_example", + choices = c("", possible_choices), + selected = NULL + ) + ## PoolSeq + } else if(env$ap$seq_mode == "poolseq") { + possible_choices <- basename( + list.dirs( + example_dir() + ) + ) + possible_choices <- + possible_choices[str_detect(possible_choices, "PoolSeq")] + updateSelectInput( + session, + "proj_example", + choices = c("", possible_choices), + selected = NULL + ) + } + } + }) + + # manage file upload (copy to project directory) + observeEvent(input$proj_example, { + req(input$proj_example) + req(env$ap$proj_dir) + + # copy files + proj_files <- list.files( + file.path(example_dir(), input$proj_example) + ) + fs::file_copy( + path = file.path(example_dir(), input$proj_example, proj_files), + new_path = file.path(env$ap$proj_dir, proj_files), + overwrite = TRUE + ) + + # file modification + update_proj_file("ap") + }) +} + +#' Project configuration feedback ui +#' @keywords internal +#' @author Ghislain Durif +proj_config_ui <- function(id) { + ns <- NS(id) + tagList( + h3("Project configuration"), + proj_file_check_ui(ns("proj_file_check")) + ) +} + +#' Project configuration feedback server +#' @keywords internal +#' @author Ghislain Durif +proj_config_server <- function(input, output, session) { + callModule(proj_file_check_server, "proj_file_check") +} + +#' Data file ui +#' @keywords internal +#' @author Ghislain Durif +data_file_ui <- function(id) { + ns <- NS(id) + tagList( + h3("Data file"), + uiOutput(ns("input_data")), + br(), + check_data_ui(ns("check_data_file")) + ) +} + +#' Data file server +#' @keywords internal +#' @author Ghislain Durif +data_file_server <- function(input, output, session) { + + ns <- session$ns + + # input data file + output$input_data <- renderUI({ + if(isTruthy(env$ap$header_check$valid) && + isTruthy(env$ap$proj_file_list) && + (env$ap$header_check$data_file %in% env$ap$proj_file_list)) { + # update data file in env + env$ap$data_file <- env$ap$header_check$data_file + # output + helpText( + icon("comment"), + "Data file was already provided." + ) + } else { + input_data_file_ui(ns("input_data_file")) + } + }) + + # input data file (if necessary) + callModule(input_data_file_server, "input_data_file") + + # check data + callModule(check_data_server, "check_data_file") +} + +#' Input data file ui +#' @keywords internal +#' @author Ghislain Durif +input_data_file_ui <- function(id) { + ns <- NS(id) + tagList( + fileInput( + ns("data_file"), + label = NULL, + buttonLabel = "Select file", + multiple = FALSE + ), + uiOutput(ns("feedback")) + ) +} + +#' Input data file server +#' @keywords internal +#' @author Ghislain Durif +#' @param proj_dir string as a `reactive`, project directory. +input_data_file_server <- function(input, output, session) { + + # Feedback on file upload + observe({ + # feedback on missing file + feedbackWarning("data_file", !isTruthy(input$data_file), + "Missing data file.") + }) + + ## get data file + observeEvent(input$data_file, { + # input$data_file = data.frame with 4 columns: + # name (chr), size (int), type (chr), datapath (chr) + req(env$ap$proj_dir) + req(nrow(input$data_file) == 1) + # check data file name (if header exists) + if(isTruthy(env$ap$header_check$valid) && + isTruthy(env$ap$header_check$data_file)) { + req(input$data_file$name == env$ap$header_check$data_file) + } + # save data file name + env$ap$data_file <- input$data_file$name + # copy to project directory + fs::file_copy(input$data_file$datapath, + file.path(local$proj_dir, out$name), + overwrite = TRUE) + + if(file.exists(input$data_file$datapath)) { + # logging("deleting:", input$data_file$datapath) + fs::file_delete(input$data_file$datapath) + } + }) + + ## feedback + output$feedback <- renderUI({ + if(isTruthy(nrow(input$data_file) == 1)) { + if(isTruthy(env$ap$header_check$valid) && + isTruthy(env$ap$header_check$data_file) && + input$data_file$name != env$ap$header_check$data_file) { + tags$div( + icon("warning"), + "Provided data file name does not match", + "expected data file name from", + tags$code(env$ap$header_check$header_file), "file.", + style = "color: #F89406;" + ) + } else { + NULL + } + } else { + tags$div( + icon("warning"), + "Missing data file.", + style = "color: #F89406;" + ) + } + }) +} + +#' Check data ui +#' @keywords internal +#' @author Ghislain Durif +check_data_ui <- function(id) { + ns <- NS(id) + tagList( + helpText( + icon("clock"), + "Checking the data file may take some time." + ), + fluidRow( + column( + width = 4, + actionButton( + ns("check"), + label = "Check data", + icon = icon("check"), + width = '100%' + ) + ), + column( + width = 8, + uiOutput(ns("feedback_check")) + ) + ), + uiOutput(ns("feedback")), + uiOutput(ns("data_info")) + ) +} + +#' Check data server +#' @keywords internal +#' @author Ghislain Durif +check_data_server <- function(input, output, session) { + + # init local + local <- reactiveValues(run = FALSE) + + ## warning + output$feedback_check <- renderUI({ + if(!isTruthy(env$ap$data_check) && !local$run) { + tags$p( + tags$div( + icon("warning"), "Data file was not checked.", + style = "color: #F89406;" + ) + ) + } else { + NULL + } + }) + + ## run data check + observeEvent(input$check, { + req(input$check) + req(!local$run) + req(env$ap$proj_dir) + req(env$ap$locus_type) + req(env$ap$seq_mode) + req(env$ap$data_file) + + # ask to run check + local$run <- TRUE + }) + + ## data check + observeEvent(local$run, { + req(isTruthy(local$run)) + req(env$ap$data_file) + req(env$ap$proj_dir) + req(env$ap$locus_type) + req(env$ap$seq_mode) + + pprint(env$ap$data_file) + pprint(env$ap$proj_dir) + pprint(env$ap$locus_type) + pprint(env$ap$seq_mode) + + # data file check + env$ap$data_check <- check_data_file( + env$ap$data_file, env$ap$proj_dir, + env$ap$locus_type, env$ap$seq_mode + ) + + # run is over + local$run <- FALSE + }) + + ## data info + output$data_info <- renderUI({ + req(env$ap$locus_type) + req(env$ap$seq_mode) + # data case + tmp_data_case <- NULL + if(env$ap$locus_type == "mss") { + tmp_data_case <- "Microsat/Sequence" + ## snp locus / indseq + } else if((env$ap$locus_type == "snp") && + (env$ap$seq_mode == "indseq")) { + tmp_data_case <- "SNP IndSeq" + ## snp locus / poolseq + } else if((env$ap$locus_type == "snp") && + (env$ap$seq_mode == "poolseq")) { + tmp_data_case <- "SNP PoolSeq" + } + + # # output + # if(isTruthy(env$ap$data_check)) { + # if(isTruthy(env$ap$data_check$valid)) { + # # format_data_info( + # # env$ap$data_check, env$ap$locus_type, env$ap$seq_mode + # # ) + # NULL + # } else if(isTruthy(env$ap$data_check$msg)) { + # tags$div( + # tags$p( + # icon("warning"), + # "Issue with your", tags$b(tmp_data_case), "data file:", + # do.call( + # tags$ul, lapply(env$ap$data_check$msg, tags$li) + # ) + # ), + # style = "color: #F89406;" + # ) + # } else { + # tags$div( + # tags$p( + # icon("warning"), + # "Issue with your", tags$b(tmp_data_case), "data file." + # ), + # style = "color: #F89406;" + # ) + # } + # } else { + # NULL + # } + NULL + }) +} diff --git a/R-pkg/R/61_analysis_page_module.R b/R-pkg/R/61_analysis_page_module.R deleted file mode 100644 index 75d86183..00000000 --- a/R-pkg/R/61_analysis_page_module.R +++ /dev/null @@ -1,1025 +0,0 @@ -#' Analysis page ui -#' @keywords internal -#' @author Ghislain Durif -analysis_page_ui <- function(id) { - ns <- NS(id) - tagList( - tags$style(HTML(".box-header{text-align: center;}")), - fluidRow( - box( - title = tags$b("Project settings"), - width = 12, - status = "primary", solidHeader = FALSE, - collapsible = FALSE, - analysis_proj_set_ui(ns("proj_set")) - ), - box( - title = "Training set simulations", - width = 12, - status = "info", solidHeader = TRUE, - collapsible = TRUE, collapsed = TRUE, - training_set_ui(ns("train_set")) - ), - box( - title = "Random Forest Analyses", - width = 12, - status = "warning", solidHeader = TRUE, - collapsible = TRUE, collapsed = TRUE, - rf_module_ui(ns("rf")) - ), - box( - title = tags$b("Project administration"), - width = 12, - status = "danger", solidHeader = FALSE, - collapsible = FALSE, collapsed = FALSE, - proj_admin_ui(ns("proj_admin")) - ) - ) - ) -} - -#' Analysis page server -#' @keywords internal -#' @author Ghislain Durif -analysis_page_server <- function(input, output, session) { - # namespace - ns <- session$ns - # init local - local <- reactiveValues( - locus_type = NULL, - seq_mode = NULL, - new_proj = NULL, - proj_dir = NULL, - proj_header = NULL, - scenario_list = NULL - ) - # init output - out <- reactiveValues( - setting = NULL, - scenario = NULL, - reset = NULL - ) - ## project setting - proj_set <- callModule(analysis_proj_set_server, "proj_set") - # # output: - # data_file = NULL, - # data_info = NULL, - # locus_type = NULL, - # seq_mode = NULL, - # new_proj = NULL, - # proj_dir = mk_proj_dir(), - # proj_file_list = character(0), - # proj_header_content = list(), - # proj_name = NULL, - # valid_data_file = FALSE - # valid_proj = FALSEvalid_proj = FALSE - - # # debugging - # observe({ - # pprint("#### Project settings ####") - # pprint(reactiveValuesToList(proj_set)) - # }) - - - # FIXME - # update local - observe({ - # FIXME obsolete - local$proj_dir <- proj_set$proj_dir - local$proj_name <- proj_set$proj_name - }) - - ## Training set sub-module - training_set <- callModule( - training_set_server, "train_set", - data_file = reactive(proj_set$data_file), - data_info = reactive(proj_set$data_info), - locus_type = reactive(proj_set$locus_type), - seq_mode = reactive(proj_set$seq_mode), - new_proj = reactive(proj_set$new_proj), - proj_dir = reactive(proj_set$proj_dir), - proj_file_list = reactive({NULL}), - valid_proj = reactive(proj_set$valid_proj) - ) - - # # debugging - # observe({ - # pprint("training set valid proj") - # pprint(training_set$valid_proj) - # }) - - ## random forest module - rf <- callModule( - rf_module_server, "rf", - locus_type = reactive(proj_set$locus_type), - proj_dir = reactive(proj_set$proj_dir), - # proj_file_list = reactive(proj_set$proj_file_list), - valid_proj = reactive(proj_set$valid_proj) - ) - - ## action - proj_admin <- callModule( - proj_admin_server, "proj_admin", - proj_dir = reactive(proj_set$proj_dir), - proj_name = reactive(proj_set$proj_name) - ) - - ## reset - observeEvent(proj_admin$reset, { - req(proj_admin$reset) - out$reset <- proj_admin$reset - session$reload() - }) - - # output - return(out) -} - -#' Analysis project setting ui -#' @keywords internal -#' @author Ghislain Durif -analysis_proj_set_ui <- function(id) { - ns <- NS(id) - tagList( - proj_name_ui(ns("proj_name_setup")), - hr(), - h3("Data type"), - data_type_ui(ns("data_type")), - hr(), - h3("Project type"), - helpText( - "You can either: (i) start with a new project;", - "(ii) open one of your own an existing project;", - "or (iii) open one of the included examples." - ), - radioGroupButtons( - ns("proj_type"), - label = NULL, - choices = c("New project" = "new", - "Existing project" = "existing", - "Example" = "example"), - selected = "new", - justified = TRUE - ), - conditionalPanel( - condition = "input.proj_type == 'existing'", - ns = ns, - h4(tags$b("Project files")), - helpText( - "Use ctrl+click to select more than one file." - ), - fileInput( - ns("file_input"), - label = NULL, - buttonLabel = "Select file(s)", - multiple = TRUE, - accept = c( - ".txt", - ".bin", - ".zip" - ) - ), - helpText( - tags$ul( - tags$li( - "You can", tags$b("either"), "upload:", - "a project", tags$code("zip"), - "file generated in a previous run", - tags$b("or"), - "single project-related files, including", - tags$code("headerRF.txt"), ", ", - tags$code("reftableRF.bin"), ", ", - tags$code("statobsRF.txt"), "." - ), - tags$li( - "You", tags$b("cannot"), "upload both a project", - tags$code("zip"), "file", - "and single project-related files." - ), - tags$li( - "If you re-upload a file, it will over-write the", - "corresponding file that you previously uploaded." - ) - ) - ), - br(), - uiOutput(ns("file_check")), - helpText( - "If some files are missing or have formating issues", - "(identified with a", icon("times"),")", - "you will be able to (re)configure the corresponding", - "settings below." - ) - ), - conditionalPanel( - condition = "input.proj_type == 'example'", - ns = ns, - h4(tags$b("Select an example")), - selectInput( - ns("proj_example"), - label = NULL, - choices = c("", "Not available at the moment"), - selected = NULL, - multiple = FALSE - ), - ), - hr(), - h3("Data file"), - helpText( - icon("clock"), - "Loading and checking the data file may take some time." - ), - conditionalPanel( - condition = "input.proj_type !== 'example'", - ns = ns, - input_data_ui(ns("input_data_file")) - ), - check_data_ui(ns("check_data_file")), - hr(), - uiOutput(ns("proj_is_ready")) - ) -} - -#' Analysis project setting ui -#' @keywords internal -#' @author Ghislain Durif -#' @importFrom dplyr distinct -#' @importFrom fs file_copy file_delete -analysis_proj_set_server <- function(input, output, session) { - # namespace - ns <- session$ns - # init local - local <- reactiveValues( - data_file = NULL, - file_input = NULL, - valid_proj_name = FALSE, - existing_proj_zip = FALSE, - # data.frame with 4 columns: - # name (chr), size (int), type (chr), datapath (chr) - header_data_file = NULL - ) - # init output - out <- reactiveValues( - data_file = NULL, - data_info = NULL, - locus_type = NULL, - seq_mode = NULL, - new_proj = TRUE, - proj_dir = mk_proj_dir("diyabc_rf"), - proj_file_list = character(0), - proj_header_content = list(), - proj_name = NULL, - valid_data_file = FALSE, - valid_proj = FALSE - ) - - # clean on exit - session$onSessionEnded(function() { - isolate(tryCatch(fs::dir_delete(out$proj_dir))) - }) - - # debugging - observe({ - logging("project directory:", out$proj_dir) - }) - - ## project name - proj_name_setup <- callModule(proj_name_server, "proj_name_setup") - - observeEvent(proj_name_setup$proj_name, { - req(proj_name_setup$proj_name) - out$proj_name <- proj_name_setup$proj_name - }) - - observeEvent(proj_name_setup$valid_proj_name, { - req(!is.null(proj_name_setup$valid_proj_name)) - local$valid_proj_name <- proj_name_setup$valid_proj_name - }) - - ## data type - data_type <- callModule(data_type_server, "data_type") - observe({ - req(data_type$locus_type) - req(data_type$seq_mode) - out$locus_type <- data_type$locus_type - out$seq_mode <- data_type$seq_mode - }) - - ## new or existing project - observe({ - req(input$proj_type) - if(input$proj_type == "new") { - out$new_proj <- TRUE - } else if(input$proj_type == "existing") { - out$new_proj <- FALSE - req(!is.null(local$local$proj_file_list)) - if("headerRF.txt" %in% local$proj_file_list) { - out$new_proj <- FALSE - } else { - out$new_proj <- TRUE - } - } else if(input$proj_type == "example") { - out$new_proj <- FALSE - } - }) - - ## Manage existing project - possible_files <- c("headerRF.txt", "reftableRF.bin", "statobsRF.txt") - # copy uploaded files to project working directory (server-side) - observeEvent(input$file_input, { - ## user file input - req(input$file_input) - # data.frame with 4 columns: - # name (chr), size (int), type (chr), datapath (chr) - req(is.data.frame(input$file_input)) - req(nrow(input$file_input) > 0) - - ## extraction - new_file_input <- input$file_input - - tmp_proj_check <- existing_proj_file_check( - new_file_input, possible_files, out$proj_dir, local$file_input - ) - - local$file_input <- tmp_proj_check$file_input - local$existing_proj_zip <- tmp_proj_check$existing_proj_zip - }) - - # print possible files when uploading existing projects - output$file_check <- renderUI({ - helpText( - icon("comment"), "Project-related files check", - tags$p( - tags$div( - style = "column-count:2;", - do.call( - tags$ul, - lapply( - possible_files, - function(item) tags$li( - if(item %in% local$file_input$name) { - ind <- which(item == local$file_input$name) - if(local$file_input$valid[ind]) { - tags$div( - tags$code(item), - icon("check") - ) - } else { - tags$div( - tags$code(item), - icon("times") - ) - } - } else { - tags$code(item) - } - ) - ) - ) - ) - ) - ) - }) - - ## Manage example project - # update possible input - observe({ - req(!is.null(data_type$locus_type)) - req(!is.null(data_type$seq_mode)) - - if(data_type$locus_type == "mss") { - updateSelectInput( - session, - "proj_example", - choices = c("", "Not available at the moment"), - selected = NULL - ) - } else if(data_type$locus_type == "snp" & - data_type$seq_mode == "indseq") { - possible_choices <- basename( - list.dirs( - example_dir() - ) - ) - possible_choices <- possible_choices[str_detect(possible_choices, - "IndSeq")] - updateSelectInput( - session, - "proj_example", - choices = c("", possible_choices), - selected = NULL - ) - } else if(data_type$locus_type == "snp" & - data_type$seq_mode == "poolseq") { - possible_choices <- basename( - list.dirs( - example_dir() - ) - ) - possible_choices <- possible_choices[str_detect(possible_choices, - "PoolSeq")] - updateSelectInput( - session, - "proj_example", - choices = c("", possible_choices), - selected = NULL - ) - } - }) - # copy files if required - observeEvent(input$proj_example, { - - req(input$proj_type == "example") - req(input$proj_example) - - # copy files - proj_files <- list.files( - file.path( - example_dir(), - input$proj_example - ) - ) - fs::file_copy( - path = file.path( - example_dir(), - input$proj_example, - proj_files - ), - new_path = file.path( - out$proj_dir, - proj_files - ), - overwrite = TRUE - ) - - # update file input - # data.frame with 4 columns: - # name (chr), size (int), type (chr), datapath (chr) - local$file_input <- data.frame( - name = proj_files, - size = rep(NA, length(proj_files)), - type = rep(NA, length(proj_files)), - datapath = file.path( - out$proj_dir, - proj_files - ), - valid = rep(TRUE, length(proj_files)) - ) - - ## file type - ind <- which(local$file_input$name == "headerRF.txt") - local$file_input$type[ind] <- "text/plain" - ind <- which(local$file_input$name == "reftableRF.bin") - local$file_input$type[ind] <- "application/octet-stream" - ind <- which(local$file_input$name == "statObsRF.txt") - local$file_input$type[ind] <- "text/plain" - }) - - ## check current project header file - observeEvent(local$file_input, { - - req(is.data.frame(local$file_input)) - req(nrow(local$file_input) > 0) - req(!is.null(input$proj_type)) - - # # debugging - # pprint("file input") - # pprint(local$file_input) - - ## header check - if("headerRF.txt" %in% local$file_input$name) { - ind <- which(local$file_input$name == "headerRF.txt") - header_file_content <- parse_diyabc_header( - file_name = local$file_input$datapath[ind], - file_type = local$file_input$type[ind], - locus_type = data_type$locus_type - ) - # valid header file - local$file_input$valid[ind] <- header_file_content$valid - # header data file name - local$header_data_file <- header_file_content$data_file - # header data file content - out$proj_header_content <- header_file_content - # data from example ? - if(input$proj_type == "example") { - local$data_file <- header_file_content$data_file - } - } - - # # debugging - # pprint("file input") - # pprint(local$file_input) - - ## delete non valid files - lapply( - split(local$file_input, seq(nrow(local$file_input))), - function(item) { - if(!item$valid) { - if(file.exists(item$datapath)) { - logging("deleting:", item$datapath) - fs::file_delete(item$datapath) - } - } - } - ) - - ## project file list - out$proj_file_list <- local$file_input$name[local$file_input$valid] - - # # debugging - # pprint("file_input") - # pprint(local$file_input) - }) - - ## Data file file - data_file <- callModule( - input_data_server, "input_data_file", - proj_dir = reactive(out$proj_dir), - existing_proj_zip = reactive(local$existing_proj_zip) - ) - - # update local if data file upload - observe({ - req(!is.null(data_file$name)) - local$data_file <- data_file$name - }) - - # data file extracted from existing project zip file - observe({ - req(!is.null(local$existing_proj_zip)) - req(!is.null(local$header_data_file)) - - if(local$existing_proj_zip) { - local$data_file <- local$header_data_file - } - }) - - ## Data file check - check_data <- callModule( - check_data_server, "check_data_file", - data_file = reactive(local$data_file), - expected_data_file = reactive(local$header_data_file), - locus_type = reactive(out$locus_type), - seq_mode = reactive(out$seq_mode), - proj_dir = reactive(out$proj_dir) - ) - - # update output - observe({ - out$data_file <- data_file$name - out$data_info <- check_data$info - out$valid_data_file <- check_data$valid - }) - - # valid set up ? - observe({ - - req(!is.null(out$valid_data_file)) - req(!is.null(local$valid_proj_name)) - - # check header if required - valid_header <- TRUE - if(!is.null(out$proj_header_content$valid)) { - valid_header <- out$proj_header_content$valid - } - - out$valid_proj <- local$valid_proj_name & valid_header & out$valid_data_file - - # # debugging - # logging("valid proj?", out$valid_proj) - }) - - output$proj_is_ready <- renderUI({ - if(!(out$valid_proj & out$valid_data_file)) { - h3(icon("warning"), "Project set up is not ready.", - style="color:red;text-align:center;") - } else { - h4(icon("check"), "Project set up is ok.", - style="text-align:center;") - } - }) - - - ## output - return(out) -} - -#' Existing project related file check -#' @keywords internal -#' @author Ghislain Durif -existing_proj_file_check <- function( - new_file_input, possible_files, proj_dir, file_input -) { - # new_file_input - # data.frame with 4 columns: - # name (chr), size (int), type (chr), datapath (chr) - # possible files - # vector of possible files - # proj_dir - # character path - # file_input - # data.frame with already uploaded files - - # init output - out <- list(existing_proj_zip = FALSE, - file_input = file_input) - - # # debugging - # pprint("new file input") - # pprint(new_file_input) - - # check if project zip files was provided - check4zip <- manage_proj_zip_file(new_file_input, possible_files) - new_file_input <- check4zip$new_file_input - out$existing_proj_zip <- check4zip$existing_proj_zip - - # # debugging - # pprint("new file input") - # pprint(new_file_input) - - ## prepare filename check (if relevant) - new_file_input$valid <- rep(TRUE, nrow(new_file_input)) - - ## delete non related files - if(!out$existing_proj_zip) { - - ## filename check - new_file_input$valid <- (new_file_input$name %in% possible_files) - - ## delete - lapply( - split(new_file_input, seq(nrow(new_file_input))), - function(item) { - if(!item$valid) { - if(file.exists(item$datapath)) { - # logging("deleting:", item$datapath) - fs::file_delete(item$datapath) - } - } - } - ) - - new_file_input <- new_file_input[new_file_input$valid,] - } - - # # debugging - # pprint("new file input") - # pprint(str(new_file_input)) - # pprint(new_file_input) - - ## copy files to project directory - if(nrow(new_file_input) > 0) { - lapply( - split(new_file_input, seq(nrow(new_file_input))), - function(item) { - if(item$type == "diyabc_dir") { - fs::dir_copy( - item$datapath, - file.path(proj_dir, item$name), - overwrite = TRUE - ) - if(dir.exists(item$datapath)) { - # logging("deleting:", item$datapath) - fs::dir_delete(item$datapath) - } - } else { - fs::file_copy(item$datapath, - file.path(proj_dir, item$name), - overwrite = TRUE) - if(file.exists(item$datapath)) { - # logging("deleting:", item$datapath) - fs::file_delete(item$datapath) - } - } - } - ) - new_file_input$datapath <- file.path( - proj_dir, - new_file_input$name - ) - - ## update file input list - if(is.null(file_input)) { - out$file_input <- new_file_input - } else { - out$file_input <- rbind( - file_input[!file_input$name %in% new_file_input$name,], - new_file_input - ) - } - } - - # output - return(out) -} - -#' Manage existing project zip file -#' @keywords internal -#' @author Ghislain Durif -manage_proj_zip_file <- function(new_file_input, possible_files) { - # new_file_input - # data.frame with 4 columns: - # name (chr), size (int), type (chr), datapath (chr) - - # init output - out <- list(existing_proj_zip = FALSE, - new_file_input = new_file_input) - - # any uploaded zip file ? - zip_file_ind <- str_detect(string = new_file_input$name, - pattern = "\\.zip$") - find_zip_file <- any(zip_file_ind) - if(find_zip_file) { - - # multiple zip files ? - if(sum(zip_file_ind) > 1) { - txt <- str_c( - "Multiple project zip files were provided. ", - "Only first one (by lexicographical order) ", - "will be considered." - ) - warnings(txt) - new_file_input <- head(new_file_input[zip_file_ind,],1) - } - - # temp data dir - tmp_data_dir <- dirname(new_file_input$datapath[1]) - - # extract project files - unzip( - new_file_input$datapath[1], - exdir = dirname(new_file_input$datapath[1]) - ) - - # remove zip file - fs::file_delete(new_file_input$datapath[1]) - - # list content of zip file - tmp_file_list <- list.files(tmp_data_dir) - - if(length(tmp_file_list) > 0) { - - # check if project-related zip file was provided - out$existing_proj_zip <- any(tmp_file_list %in% possible_files) - - # modify new_file_input - # data.frame with 4 columns: - # name (chr), size (int), type (chr), datapath (chr) - out$new_file_input <- Reduce( - "rbind", - lapply( - tmp_file_list, - function(tmp_file) { - tmp_file_info <- file.info( - file.path(tmp_data_dir, tmp_file) - ) - return( - data.frame( - name = tmp_file, - size = tmp_file_info$size, - type = ifelse( - tmp_file_info$isdir, - "diyabc_dir", - "diyabc_file" - ), - datapath = file.path(tmp_data_dir, tmp_file), - stringsAsFactors = FALSE - ) - ) - } - ) - ) - - ## file type - ind <- which(out$new_file_input$name == "headerRF.txt") - out$new_file_input$type[ind] <- "text/plain" - ind <- which(out$new_file_input$name == "reftableRF.bin") - out$new_file_input$type[ind] <- "application/octet-stream" - ind <- which(out$new_file_input$name == "statObsRF.txt") - out$new_file_input$type[ind] <- "text/plain" - - } else { - out$new_file_input <- head(new_file_input, 0) - } - } - # output - return(out) -} - -#' Input data ui -#' @keywords internal -#' @author Ghislain Durif -input_data_ui <- function(id) { - ns <- NS(id) - tagList( - uiOutput(ns("feedback")), - fileInput( - ns("data_file"), - label = NULL, - buttonLabel = "Select file", - multiple = FALSE - ) - ) -} - -#' Input data server -#' @keywords internal -#' @author Ghislain Durif -#' @param proj_dir string as a `reactive`, project directory. -input_data_server <- function(input, output, session, - proj_dir = reactive({NULL}), - existing_proj_zip = reactive({NULL})) { - ## init local - local <- reactiveValues( - # input - proj_dir = NULL, - existing_proj_zip = NULL - ) - ## get input - observe({ - local$proj_dir <- proj_dir() - local$existing_proj_zip <- existing_proj_zip() - # # debugging - # pprint(paste0("input proj dir = ", local$proj_dir)) - }) - ## init output - out <- reactiveValues( - name = NULL - ) - - ## feedback - output$feedback <- renderUI({ - if(!is.null(local$existing_proj_zip)) { - if(local$existing_proj_zip) { - helpText( - icon("comment"), - "Data file was already extracted from project zip file." - ) - } else { - NULL - } - } else { - NULL - } - }) - - ## get data file - observeEvent(input$data_file, { - # input$data_file = data.frame with 4 columns: - # name (chr), size (int), type (chr), datapath (chr) - req(local$proj_dir) - req(input$data_file) - req(is.data.frame(input$data_file)) - req(nrow(input$data_file) > 0) - # data file - out$name <- input$data_file$name - # copy to project directory - fs::file_copy(input$data_file$datapath, - file.path(local$proj_dir, out$name), - overwrite = TRUE) - - if(file.exists(input$data_file$datapath)) { - # logging("deleting:", input$data_file$datapath) - fs::file_delete(input$data_file$datapath) - } - }) - # # debugging - # observe({ - # logging("data file = ", out$file) - # }) - - ## output - return(out) -} - -#' Check data ui -#' @keywords internal -#' @author Ghislain Durif -check_data_ui <- function(id) { - ns <- NS(id) - tagList( - uiOutput(ns("missing_file")), - uiOutput(ns("data_info")) - ) -} - -#' check data server -#' @keywords internal -#' @author Ghislain Durif -#' @param data_file string as a `reactive`, data file uploaded by the user. -#' @param expected_data_file string as a `reactive`, expected data file if a -#' header file is provided (NULL otherwise). -#' @param locus_type string as a `reactive`, `"mss"` or `"snp"`. -#' @param seq_mode string as a `reactive`, `"indseq"` or `"poolseq"`. -#' @param proj_dir string as a `reactive`, project directory. -check_data_server <- function(input, output, session, - data_file = reactive({NULL}), - expected_data_file = reactive({NULL}), - locus_type = reactive({"snp"}), - seq_mode = reactive({"indseq"}), - proj_dir = reactive({NULL})) { - # init local - local <- reactiveValues( - file_check = NULL, - data_info = NULL, - # input - data_file = NULL, - exp_data_file = NULL, - locus_type = NULL, - seq_mode = NULL, - proj_dir = NULL - ) - # get input - observe({ - local$data_file <- data_file() - local$exp_data_file <- expected_data_file() - local$locus_type <- locus_type() - local$seq_mode <- seq_mode() - local$proj_dir <- proj_dir() - - # # debugging - # pprint(paste0("input data file = ", local$data_file)) - # pprint(paste0("expected data file = ", local$exp_data_file)) - # pprint(paste0("input locus type = ", local$locus_type)) - # pprint(paste0("input seq mode = ", local$seq_mode)) - # pprint(paste0("input proj dir = ", local$proj_dir)) - }) - # init output - out <- reactiveValues( - data_file = NULL, - data_info = NULL, - valid = FALSE - ) - # # debugging - # observe({ - # logging("data file = ", out$file) - # }) - - ## message if missing file - output$missing_file <- renderUI({ - if(is.null(local$data_file)) { - helpText( - icon("warning"), "Missing data file" - ) - } else { - NULL - } - }) - - # data check - observe({ - req(!is.null(local$data_file)) - req(!is.null(local$proj_dir)) - req(!is.null(local$locus_type)) - req(!is.null(local$seq_mode)) - # check - local$file_check <- check_data_file( - local$data_file, local$proj_dir, - local$locus_type, local$seq_mode, - local$exp_data_file - ) - # data info - req(!is.null(local$file_check)) - req(!is.null(local$file_check$valid)) - # valid data - out$valid <- local$file_check$valid - # data spec - req(!is.null(local$file_check$spec)) - out$info <- local$file_check$spec - }) - - # user feedback - output$data_info <- renderUI({ - req(!is.null(local$file_check)) - # show data info - if(local$file_check$valid) { - req(local$file_check$msg) - helpText( - icon("comment"), "Data file info", - do.call( - tags$ul, - lapply(local$file_check$msg, function(item) { - return(tags$li(item)) - }) - ) - ) - } else { - tmp_msg <- NULL - if(!is.null(local$file_check$err)) { - tmp_msg <- do.call( - tags$ul, - lapply(local$file_check$err, function(item) { - return(tags$li(item)) - }) - ) - } - helpText( - icon("warning"), "Issue with data file.", - tmp_msg - ) - } - }) - - # output - return(out) -} diff --git a/R-pkg/R/62_analysis_project_io.R b/R-pkg/R/62_analysis_project_io.R new file mode 100644 index 00000000..88fbdb38 --- /dev/null +++ b/R-pkg/R/62_analysis_project_io.R @@ -0,0 +1,527 @@ +#' Manage uploading of existing project related files +#' @keywords internal +#' @author Ghislain Durif +#' @param file_input data.frame with fields name (chr), size (int), +#' type (chr), datapath (chr) storing new uploaded files. +#' @param proj_dir character string, path to project directory. +proj_file_input <- function(file_input, proj_dir) { + + # init output + out <- list(msg = list(), valid = FALSE) + + # # debugging + # pprint("file input") + # pprint(file_input) + + # check if project zip files was provided + is_zip <- proj_zip_file_input(file_input) + out$msg <- append(out$msg, is_zip$msg) + + if(is_zip$zip_file) { + if(!is_zip$valid) { + out$valid <- FALSE + return(out) + } + file_input <- is_zip$file_input + } + + # # debugging + # pprint("file input") + # pprint(file_input) + + # manage new uploaded project files + if(nrow(file_input) > 0) { + + # empty project directory + fs::dir_delete(list.dirs(proj_dir, recursive = FALSE)) + fs::file_delete(list.files(proj_dir, full.names = TRUE)) + + # copy files to project directory + lapply( + split(file_input, seq(nrow(file_input))), + function(item) { + if(item$type == "diyabc_dir") { + fs::dir_copy( + item$datapath, + file.path(proj_dir, item$name), + overwrite = TRUE + ) + if(dir.exists(item$datapath)) { + # logging("deleting:", item$datapath) + fs::dir_delete(item$datapath) + } + } else { + fs::file_copy(item$datapath, + file.path(proj_dir, item$name), + overwrite = TRUE) + if(file.exists(item$datapath)) { + # logging("deleting:", item$datapath) + fs::file_delete(item$datapath) + } + } + } + ) + + # valid upload ? + out$valid <- TRUE + + } else { + msg <- tagList( + "No file was provided." + ) + out$msg <- append(out$msg, list(msg)) + out$valid <- FALSE + } + + # output + return(out) +} + + +#' Manage existing project zip file +#' @keywords internal +#' @author Ghislain Durif +#' @param file_input data.frame with fields name (chr), size (int), +#' type (chr), datapath (chr) storing new uploaded files. +proj_zip_file_input <- function(file_input) { + + # init output + out <- list( + file_input = NULL, msg = list(), zip_file = FALSE, valid = FALSE + ) + + # any uploaded zip file ? + zip_file_ind <- (file_input$type == "application/zip") + + ## IF NOT + if(!any(zip_file_ind)) { + return(out) + } + + ## ELSE + out$zip_file <- TRUE + + # a single or multiple zip files ? + if(sum(zip_file_ind) > 1) { + msg <- tagList( + "You provided more than", tags$b("one"), "project", + tags$code("zip"), "files." + ) + out$msg <- append(out$msg, list(msg)) + out$valid <- FALSE + return(out) + } + + # a zip file and other files ? + if(nrow(file_input) > 1) { + msg <- tagList( + "You provided a project", tags$code("zip"), "file", + tags$b("and"), "other file(s)." + ) + out$msg <- append(out$msg, list(msg)) + out$valid <- FALSE + return(out) + } + + ## READY TO EXTRACT PROJECT FILES + out$valid <- TRUE + + # tmp dir for extraction + tmp_dir <- mk_proj_dir("diyabc_zip_extract") + + # extract project files + unzip( + file_input$datapath[1], + exdir = tmp_dir + ) + + # list content of zip file + tmp_file_list <- list.files(tmp_dir) + + # check if zip file content was at zip root or inside a root directory + if(length(tmp_file_list) == 1) { + + # if extracted project files inside a root directory + if(file.info(tmp_dir)$isdir) { + # move into extracted project directory + tmp_dir <- file.path(tmp_dir, tmp_file_list) + # update list content of zip file + tmp_file_list <- list.files(tmp_dir) + } + } + + # check extracted project files + if(length(tmp_file_list) > 0) { + + # modify file_input with extracted files + # data.frame with 4 columns: + # name (chr), size (int), type (chr), datapath (chr) + out$file_input <- Reduce( + "rbind", + lapply( + tmp_file_list, + function(tmp_file) { + tmp_file_info <- file.info( + file.path(tmp_dir, tmp_file) + ) + return( + data.frame( + name = tmp_file, + size = tmp_file_info$size, + type = ifelse( + tmp_file_info$isdir, + "diyabc_dir", + "diyabc_file" + ), + datapath = file.path(tmp_dir, tmp_file), + stringsAsFactors = FALSE + ) + ) + } + ) + ) + + ## specific file type + ind <- which(out$file_input$name == "headerRF.txt") + out$file_input$type[ind] <- "text/plain" + ind <- which(out$file_input$name == "header.txt") + out$file_input$type[ind] <- "text/plain" + ind <- which(out$file_input$name == "reftableRF.bin") + out$file_input$type[ind] <- "application/octet-stream" + ind <- which(out$file_input$name == "statobsRF.txt") + out$file_input$type[ind] <- "text/plain" + + } else { + + out$file_input <- head(file_input, 0) + out$valid <- FALSE + msg <- tagList( + "You provided an empty project", tags$code("zip"), "file.", + ) + out$msg <- append(out$msg, list(msg)) + } + + # output + return(out) +} + +#' Check existing project related files +#' @keywords internal +#' @author Ghislain Durif +#' @param proj_dir character string, path to project directory. +#' @param locus_type character string, `"snp"` or `"mss"`. +#' @importFrom mime guess_type +check_proj_file <- function(proj_dir, locus_type = "snp") { + + # init output + out <- list( + msg = list(), valid = TRUE, + header_check = NULL, statobs_check = NULL, + reftable_check = NULL + ) + + # check project directory + if(!dir.exists(proj_dir)) { + out$valid <- FALSE + msg <- tagList("Input project directory does not exist.") + out$msg <- append(out$msg, list(msg)) + return(out) + } + + # project files + proj_file_list <- list.files(proj_dir) + + # check header + if(any(c("header.txt", "headerRF.txt") %in% proj_file_list)) { + header_file <- file.path( + proj_dir, + ifelse( + "headerRF.txt" %in% proj_file_list, + "headerRF.txt", "header.txt" + ) + ) + out$header_check <- tryCatch( + read_header( + file_name = header_file, + file_type = mime::guess_type(header_file), + locus_type = locus_type + ), + error = function(e) return(NULL) + ) + if(is.null(out$header_check)) { + out$valid <- FALSE + msg <- tagList("Issue when checking project header file.") + out$msg <- append(out$msg, list(msg)) + } + } + + # check reftable + if("reftableRF.bin" %in% proj_file_list) { + reftable_file <- file.path(proj_dir, "reftableRF.bin") + out$reftable_check <- tryCatch( + read_reftable( + file_name = reftable_file, + file_type = mime::guess_type(reftable_file) + ), + error = function(e) return(NULL) + ) + if(is.null(out$reftable_check)) { + out$valid <- FALSE + msg <- tagList("Issue when checking project reftable file.") + out$msg <- append(out$msg, list(msg)) + } + + # check agreement between header file (if any and reftable file) + if(isTruthy(out$header_check$valid) && + isTruthy(out$reftable_check$valid)) { + if(out$header_check$n_scen != out$reftable_check$n_scen) { + out$reftable_check$valid <- FALSE + msg <- tagList( + "Different number of scenarii configured in files", + tags$code(out$header_check$header_file), "and", + tags$code("reftableRF.bin"), "." + ) + out$reftable_check$msg <- append(out$msg, list(msg)) + } + if(any(out$header_check$n_param_scen != + out$reftable_check$n_param_scen)) { + out$reftable_check$valid <- FALSE + msg <- tagList( + "Different number of parameters per scenario", + "configured in files", + tags$code(out$header_check$header_file), "and", + tags$code("reftableRF.bin"), "." + ) + out$reftable_check$msg <- append(out$msg, list(msg)) + } + if(out$header_check$n_stat != out$reftable_check$n_stat) { + out$reftable_check$valid <- FALSE + msg <- tagList( + "Different number of summary statistics", + "configured in files", + tags$code(out$header_check$header_file), "and", + tags$code("reftableRF.bin"), "." + ) + out$reftable_check$msg <- append(out$msg, list(msg)) + } + } + } + + # check statobs + if("statobsRF.txt" %in% proj_file_list) { + if(isTruthy(out$reftable_check$n_stat)) { + statobs_file <- file.path(proj_dir, "statobsRF.txt") + out$statobs_check <- tryCatch( + read_statobs( + file_name = statobs_file, + file_type = mime::guess_type(statobs_file), + n_stat = out$reftable_check$n_stat + ), + error = function(e) return(NULL) + ) + if(is.null(out$statobs_check)) { + out$valid <- FALSE + msg <- tagList("Issue when checking project statobs file.") + out$msg <- append(out$msg, list(msg)) + } + } else { + out$valid <- FALSE + msg <- tagList( + "Impossible to check project statobs file", + "because issue with reftable file.") + out$msg <- append(out$msg, list(msg)) + } + } + + # output + return(out) +} + +#' Check data file +#' @keywords internal +#' @author Ghislain Durif +#' @param data_file string, data file name. +#' @param data_dir string, path to directory where data file is stored. +#' @param locus_type string, locus type `"mss"` or `"snp"`. +#' @param seq_mode string, `"indseq"` or `"poolseq"`. +check_data_file <- function(data_file, data_dir, locus_type, seq_mode) { + # output + out <- NULL + # ## debugging + # logging("data_file = ", data_file) + ## mss locus + if(locus_type == "mss") { + out <- read_mss_data(data_file, data_dir) + ## snp locus / indseq + } else if((locus_type == "snp") && (seq_mode == "indseq")) { + out <- read_indseq_snp_data(data_file, data_dir) + ## snp locus / poolseq + } else if((locus_type == "snp") && (seq_mode == "poolseq")) { + out <- read_poolseq_snp_data(data_file, data_dir) + } else { + stop("Issue with input arguments") + } + + ## output + return(out) +} + +#' Check data file in background +#' @keywords internal +#' @author Ghislain Durif +#' @param data_file string, data file name. +#' @param data_dir string, path to directory where data file is stored. +#' @param locus_type string, locus type `"mss"` or `"snp"`. +#' @param seq_mode string, `"indseq"` or `"poolseq"`. +#' @importFrom future plan multisession future +check_data_file_bg <- function(data_file, data_dir, locus_type, seq_mode) { + + # FIXME + + # setup background run + oplan <- plan(multisession) + on.exit(plan(oplan)) + + # setup future + out <- future({ + check_data_file_bg(data_file, data_dir, locus_type, seq_mode) + }) + + ## output + return(out) +} + +#' Format data info for user output +#' @keywords internal +#' @author Ghislain Durif +format_data_info <- function(data_check, locus_type, seq_mode) { + + out <- NULL + + ## microsat/sequence + if(locus_type == "mss") { + out <- tagList( + tags$ul( + tags$li( + "Data file:", tags$code(data_check$data_file) + ), + tags$li( + "Number of populations =", + as.character(data_check$n_pop) + ), + tags$li( + "Number of individuals =", + as.character(data_check$n_indiv) + ), + tags$li( + "Total number of loci =", as.character(data_check$n_loci), + ", including", + tags$b(as.character(sum(data_check$locus_mode == "M"))), + "microsat locus, and", + tags$b(as.character(sum(data_check$locus_mode == "S"))), + "sequence locus." + ), + tags$li( + "Sex ratio in the dataset:", + tags$code(data_check$sex_ratio) + ) + ) + ) + ## snp locus / indseq + } else if((locus_type == "snp") && (seq_mode == "indseq")) { + out <- tagList( + tags$ul( + tags$li( + "Data file:", tags$code(data_check$data_file) + ), + tags$li( + "Number of population pools =", + as.character(data_check$n_pop) + ), + tags$li( + "Filtering with Minimun Allele Frequency (MAF) =", + as.character(data_check$maf) + ), + tags$li( + "Total number of loci =", + as.character(data_check$n_loci), br(), + do.call( + tags$ul, + lapply( + split( + data_check$locus_count, + seq(nrow(data_check$locus_count)) + ), + function(item) { + return(tags$li( + "Number of ", + tags$code(as.character(item$type)), + "loci =", as.character(item$count), br(), + "Number of excluded loci (with MAF <", + as.character(data_check$maf), ") =", + as.character(item$filter), + ", including", as.character(item$mono), + "monomorphic loci.", br(), + tags$b( + "Number of", + tags$code(as.character(item$type)), + "available of loci =", + as.character( + item$count - item$filter + ) + ) + )) + } + ) + ) + ), + tags$li( + "Sex ratio in the dataset:", + tags$code(data_check$sex_ratio) + ) + ) + ) + ## snp locus / poolseq + } else if((locus_type == "snp") && (seq_mode == "poolseq")) { + out <- tagList( + tags$ul( + tags$li( + "Data file:", tags$code(data_check$data_file) + ), + tags$li( + "Number of population pools =", + as.character(data_check$n_pop) + ), + tags$li( + "Filtering with Minimum Read Count (MRC) =", + as.character(data_check$mrc) + ), + tags$li( + "Total number of loci =", as.character(data_check$n_loci), + ), + tags$li( + "Number of excluded loci (with MRC <", + as.character(data_check$n_loci), ") =", + as.character(data_check$locus_count$filter), + ", including", as.character(data_check$locus_count$mono), + "monomorphic loci." + ), + tags$li( + tags$b( + "Total number available of loci =", + as.character( + data_check$n_loci - data_check$locus_count$filter + ) + ) + ), + tags$li( + "Sex ratio in the dataset:", + tags$code(data_check$sex_ratio) + ) + ) + ) + } + + ## output + return(out) +} diff --git a/R-pkg/R/91_datagen_page_module.R b/R-pkg/R/91_datagen_module.R similarity index 100% rename from R-pkg/R/91_datagen_page_module.R rename to R-pkg/R/91_datagen_module.R diff --git a/R-pkg/R/app_index.R b/R-pkg/R/app_index.R index 98d5b4c4..dd49f84c 100644 --- a/R-pkg/R/app_index.R +++ b/R-pkg/R/app_index.R @@ -1,7 +1,8 @@ #' App dashboard simplified sidebar #' @keywords internal #' @author Ghislain Durif -#' @importFrom shinydashboard dashboardSidebar menuItem sidebarMenu +#' @importFrom shinydashboard dashboardSidebar menuItem sidebarMenu +#' sidebarMenuOutput app_sidebar <- function() { dashboardSidebar( sidebarMenu( @@ -11,16 +12,7 @@ app_sidebar <- function() { tabName = "home_tab", icon = icon("home") ), - menuItem( - "DIYABC-RF main pipeline", - tabName = "analysis_tab", - icon = icon("flask") - ), - menuItem( - "Synthetic data file generation", - tabName = "datagen_tab", - icon = icon("dna") - ), + sidebarMenuOutput("menu_tabs"), menuItem( "Preferences", tabName = "pref_tab", @@ -39,23 +31,25 @@ app_sidebar <- function() { #' @keywords internal #' @author Ghislain Durif #' @importFrom shinydashboard dashboardBody tabItems tabItem +#' @importFrom shinyFeedback useShinyFeedback #' @importFrom shinyjs useShinyjs app_body <- function() { dashboardBody( useShinyjs(), + useShinyFeedback(), add_busy_spinner(spin = "fading-circle", margins = c(0, 10)), tabItems( tabItem( tabName = "home_tab", - simplified_home_page_ui("home_page") + home_page_ui("home_page") ), tabItem( tabName = "analysis_tab", - analysis_page_ui("analysis_page") + analysis_module_ui("analysis_module") ), tabItem( tabName = "datagen_tab", - datagen_page_ui("datagen_page") + # datagen_module_ui("datagen_module") ), tabItem( tabName = "pref_tab", @@ -79,42 +73,73 @@ app_body <- function() { #' App simplified dashboard server function #' @keywords internal #' @author Ghislain Durif +#' @importFrom shinydashboard renderMenu index_server <- function(input, output, session) { # home page - home_page <- callModule(simplified_home_page_server, "home_page") + home_page <- callModule(home_page_server, "home_page") ## new analysis project observeEvent(home_page$new_analysis_project, { req(home_page$new_analysis_project) + + # rendering sidebar menu + output$menu_tabs <- renderMenu({ + sidebarMenu( + id = "dynamic_tabs", + menuItem( + "DIYABC-RF main pipeline", + tabName = "analysis_tab", + icon = icon("flask") + ) + ) + }) + + # update tab item updateTabItems(session, "app_menu", selected = "analysis_tab") - init_diyabcrf_env() + + # reset env + reset_diyabcrf_env() + + # verbosity + observe({ + logging("analysis project directory:", env$ap$proj_dir) + }) }) ## new data generation project observeEvent(home_page$new_datagen_project, { req(home_page$new_datagen_project) + + # rendering sidebar menu + output$menu_tabs <- renderMenu({ + sidebarMenu( + id = "dynamic_tabs", + menuItem( + "Synthetic data file generation", + tabName = "datagen_tab", + icon = icon("dna") + ) + ) + }) + + # update tab item updateTabItems(session, "app_menu", selected = "datagen_tab") - init_datagen_env() + + # reset env + reset_datagen_env() + + # verbosity + observe({ + logging("data generation project directory:", env$dp$proj_dir) + }) }) ## analysis page - analysis_page <- callModule(analysis_page_server, "analysis_page") - # # reset - # observeEvent(analysis_page$reset, { - # req(analysis_page$reset) - # session$reload() - # updateTabItems(session, "app_menu", selected = "analysis_tab") - # }) + analysis_module <- callModule(analysis_module_server, "analysis_module") ## datagen page - datagen_page <- callModule(datagen_page_server, "datagen_page") - # # reset - # observeEvent(datagen_page$reset, { - # req(datagen_page$reset) - # session$reload() - # updateTabItems(session, "app_menu", selected = "datagen_tab") - # }) + # datagen_module <- callModule(datagen_module_server, "datagen_module") ## preferences callModule(pref_page_server, "pref_page") diff --git a/R-pkg/R/app_server.R b/R-pkg/R/app_server.R index c88152e2..3423ea5a 100644 --- a/R-pkg/R/app_server.R +++ b/R-pkg/R/app_server.R @@ -14,6 +14,12 @@ diyabc_server <- function(input, output, session) { ## help observe_helpers(session, help_dir = help_dir(), withMathJax = TRUE) + ## init + local <- reactiveValues(init = NULL) + observeEvent(local$init, { + init_diyabcrf_env() + init_datagen_env() + }, ignoreNULL = FALSE) ## index server function index_server(input, output, session) } \ No newline at end of file diff --git a/R-pkg/R/diyabcGUI-package.R b/R-pkg/R/diyabcGUI-package.R index 8c584dd6..7c6f7018 100644 --- a/R-pkg/R/diyabcGUI-package.R +++ b/R-pkg/R/diyabcGUI-package.R @@ -19,6 +19,7 @@ #' @import shiny #' @importFrom shinybusy add_busy_spinner #' @importFrom shinydashboard box updateTabItems +#' @importFrom shinyFeedback feedbackWarning #' @importFrom shinyhelper helper #' @importFrom shinyjs disable enable hidden hide reset show #' @importFrom shinyWidgets actionBttn actionGroupButtons ask_confirmation progressBar diff --git a/R-pkg/inst/test_input/bad_files/diyabcGUI_proj.txt b/R-pkg/inst/test_input/bad_files/diyabcGUI_proj.txt deleted file mode 100644 index 1fe91afa..00000000 --- a/R-pkg/inst/test_input/bad_files/diyabcGUI_proj.txt +++ /dev/null @@ -1 +0,0 @@ -proj_name:IndSeq_SNP_estim_param diff --git a/R-pkg/tests/testthat/test-01_directory.R b/R-pkg/tests/testthat/test-01_directory.R index 96627880..5af11b8d 100644 --- a/R-pkg/tests/testthat/test-01_directory.R +++ b/R-pkg/tests/testthat/test-01_directory.R @@ -34,4 +34,31 @@ test_that("mk_proj_dir", { path <- mk_proj_dir() expect_true(dir.exists(path)) expect_error(fs::dir_delete(path), NA) -}) \ No newline at end of file +}) + +test_that("clean_proj_dir", { + # test dir + tmp_dir <- mk_proj_dir("test_clean_proj_dir") + # setup test + fs::file_create(file.path(tmp_dir, "file1")) + fs::file_create(file.path(tmp_dir, "file2")) + fs::dir_create(file.path(tmp_dir, "subdir1")) + fs::file_create(file.path(tmp_dir, "subdir1", "file1")) + fs::file_create(file.path(tmp_dir, "subdir1", "file1")) + fs::dir_create(file.path(tmp_dir, "subdir1", "subdir2")) + fs::file_create(file.path(tmp_dir, "subdir1", "subdir2", "file1")) + fs::file_create(file.path(tmp_dir, "subdir1", "subdir2", "file1")) + # run + clean_proj_dir(tmp_dir) + # check + expect_equal(length(list.files(tmp_dir)), 0) + +}) + +test_that("clean_bin_dir", { + # test cleaning + clean_bin_dir() + expect_true(all(list.files(bin_dir()) %in% c("LICENSE", "README.md"))) + # dl latest bin + dl_all_latest_bin() +}) diff --git a/R-pkg/tests/testthat/test-02_regex.R b/R-pkg/tests/testthat/test-02_regex.R new file mode 100644 index 00000000..09097c41 --- /dev/null +++ b/R-pkg/tests/testthat/test-02_regex.R @@ -0,0 +1,29 @@ +context("02_regex") + +test_that("int_regex", { + expect_true(str_detect("12", int_regex())) + expect_false(str_detect("xxx", int_regex())) +}) + +test_that("num_regex", { + expect_true(str_detect("12", num_regex())) + expect_true(str_detect("0.12", num_regex())) + expect_true(str_detect("12.0", num_regex())) + expect_true(str_detect("12.", num_regex())) + expect_true(str_detect(".12", num_regex())) + expect_false(str_detect("xxx", num_regex())) +}) + +test_that("numexp_regex", { + expect_true(str_detect("12", numexp_regex())) + expect_true(str_detect("0.12", numexp_regex())) + expect_true(str_detect("12.0", numexp_regex())) + expect_true(str_detect("12.", numexp_regex())) + expect_true(str_detect(".12", numexp_regex())) + expect_true(str_detect("12E7", numexp_regex())) + expect_true(str_detect("0.12e-6", numexp_regex())) + expect_true(str_detect("12.0E-8", numexp_regex())) + expect_true(str_detect("12.E-001", numexp_regex())) + expect_true(str_detect(".12E10", numexp_regex())) + expect_false(str_detect("xxx", numexp_regex())) +}) diff --git a/R-pkg/tests/testthat/test-03_utils.R b/R-pkg/tests/testthat/test-03_utils.R index 6926674b..d83e2cd8 100644 --- a/R-pkg/tests/testthat/test-03_utils.R +++ b/R-pkg/tests/testthat/test-03_utils.R @@ -22,14 +22,6 @@ test_that("find_bin", { expect_true(file.exists(path)) }) -test_that("clean_bin_dir", { - # test cleaning - clean_bin_dir() - expect_true(all(list.files(bin_dir()) %in% c("LICENSE", "README.md"))) - # dl latest bin - dl_all_latest_bin() -}) - test_that("get_os", { expect_true(is.character(get_os("abcranger"))) }) diff --git a/R-pkg/tests/testthat/test-41_input_read.R b/R-pkg/tests/testthat/test-41_input_read.R new file mode 100644 index 00000000..aef70b20 --- /dev/null +++ b/R-pkg/tests/testthat/test-41_input_read.R @@ -0,0 +1,347 @@ +context("41_input_read") + +test_that("read_header", { + + ## SNP IndSeq + # estim param + test_proj <- "IndSeq_SNP_estim_param" + test_dir <- file.path(data4test_dir(), test_proj) + file_name <- file.path(test_dir, "headerRF.txt") + file_type <- "text/plain" + locus_type <- "snp" + res <- read_header(file_name, file_type, locus_type) + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_equal(res$data_file, "indseq_SNP_sim_dataset_4POP_001.snp") + expect_equal(res$n_param, 8) + expect_equal(res$n_stat, 130) + expect_equal(res$n_scen, 1) + expect_equal(res$n_scen, length(res$scenario_list)) + expect_equal(length(res$n_param_scen), 1) + expect_equal(length(res$n_param_scen), length(res$scenario_list)) + expect_equal(res$n_param_scen, 8) + expect_equal(res$n_cond, 2) + expect_equal(res$n_cond, length(res$cond_list)) + expect_equal(res$simu_mode, "DRAW UNTIL") + expect_equal(res$n_locus_desc, 1) + expect_equal(res$n_locus_desc, length(res$locus_desc)) + expect_equal(res$n_prior, 8) + expect_equal(res$n_prior, length(res$prior_list)) + expect_equal(res$header_file, basename(file_name)) + + # indeseq -> bad header file + test_proj <- "bad_files" + test_dir <- file.path(test_input_dir(), "bad_files") + file_name <- file.path(test_dir, "headerRF.txt") + file_type <- "text/plain" + locus_type = "mss" + res <- read_header(file_name, file_type, locus_type) + expect_false(res$valid) + expect_equal(length(res$msg), 1) + + # model choice + test_proj <- "IndSeq_SNP_model_choice" + test_dir <- file.path(data4test_dir(), test_proj) + file_name <- file.path(test_dir, "headerRF.txt") + file_type <- "text/plain" + locus_type = "snp" + res <- read_header(file_name, file_type, locus_type) + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_equal(res$data_file, "indseq_SNP_sim_dataset_4POP_001.snp") + expect_equal(res$n_param, 13) + expect_equal(res$n_stat, 130) + expect_equal(res$n_scen, 6) + expect_equal(res$n_scen, length(res$scenario_list)) + expect_equal(length(res$n_param_scen), 6) + expect_equal(length(res$n_param_scen), length(res$scenario_list)) + expect_equal(res$n_param_scen, c(8L, 8L, 8L, 7L, 7L, 7L)) + expect_equal(res$n_cond, 6) + expect_equal(res$n_cond, length(res$cond_list)) + expect_equal(res$simu_mode, "DRAW UNTIL") + expect_equal(res$n_locus_desc, 1) + expect_equal(res$n_locus_desc, length(res$locus_desc)) + expect_equal(res$n_prior, 13) + expect_equal(res$n_prior, length(res$prior_list)) + expect_equal(res$header_file, basename(file_name)) + + ## SNP PoolSeq + # estim param + test_proj <- "PoolSeq_SNP_estim_param" + test_dir <- file.path(data4test_dir(), test_proj) + file_name <- file.path(test_dir, "headerRF.txt") + file_type <- "text/plain" + locus_type = "snp" + res <- read_header(file_name, file_type, locus_type) + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_equal(res$data_file, "poolseq_SNP_sim_dataset_4POP_cov100_001.snp") + expect_equal(res$n_param, 8) + expect_equal(res$n_stat, 130) + expect_equal(res$n_scen, 1) + expect_equal(res$n_scen, length(res$scenario_list)) + expect_equal(length(res$n_param_scen), 1) + expect_equal(length(res$n_param_scen), length(res$scenario_list)) + expect_equal(res$n_param_scen, 8) + expect_equal(res$n_cond, 2) + expect_equal(res$n_cond, length(res$cond_list)) + expect_equal(res$simu_mode, "DRAW UNTIL") + expect_equal(res$n_locus_desc, 1) + expect_equal(res$n_locus_desc, length(res$locus_desc)) + expect_equal(res$n_prior, 8) + expect_equal(res$n_prior, length(res$prior_list)) + expect_equal(res$header_file, basename(file_name)) + + # model choice + test_proj <- "PoolSeq_SNP_model_choice" + test_dir <- file.path(data4test_dir(), test_proj) + file_name <- file.path(test_dir, "headerRF.txt") + file_type <- "text/plain" + locus_type = "snp" + res <- read_header(file_name, file_type, locus_type) + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_equal(res$data_file, "poolseq_SNP_sim_dataset_4POP_cov100_001.snp") + expect_equal(res$n_param, 13) + expect_equal(res$n_stat, 130) + expect_equal(res$n_scen, 6) + expect_equal(res$n_scen, length(res$scenario_list)) + expect_equal(length(res$n_param_scen), 6) + expect_equal(length(res$n_param_scen), length(res$scenario_list)) + expect_equal(res$n_param_scen, c(8L, 8L, 8L, 7L, 7L, 7L)) + expect_equal(res$n_cond, 6) + expect_equal(res$n_cond, length(res$cond_list)) + expect_equal(res$simu_mode, "DRAW UNTIL") + expect_equal(res$n_locus_desc, 1) + expect_equal(res$n_locus_desc, length(res$locus_desc)) + expect_equal(res$n_prior, 13) + expect_equal(res$n_prior, length(res$prior_list)) + expect_equal(res$header_file, basename(file_name)) + + ## MSS + # microsat + test_proj <- "Microsat" + test_dir <- file.path(data4test_dir(), test_proj) + file_name <- file.path(test_dir, "header.txt") + file_type <- "text/plain" + locus_type = "mss" + res <- read_header(file_name, file_type, locus_type) + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_equal( + res$data_file, + "simu_dataset_microsat_one_pop_bottleneck_multisamples_001.mss" + ) + expect_equal(res$n_param, 6) + expect_equal(res$n_stat, 40) + expect_equal(res$n_scen, 2) + expect_equal(res$n_scen, length(res$scenario_list)) + expect_equal(length(res$n_param_scen), 2) + expect_equal(length(res$n_param_scen), length(res$scenario_list)) + expect_equal(res$n_param_scen, c(3L, 1L)) + expect_equal(res$n_cond, 0) + expect_equal(res$n_cond, length(res$cond_list)) + expect_null(res$simu_mode) + expect_equal(res$n_locus_desc, 50) + expect_equal(res$n_locus_desc, length(res$locus_desc)) + expect_equal(res$n_prior, 3) + expect_equal(res$n_prior, length(res$prior_list)) + expect_equal(res$n_group, 1) + expect_equal(res$n_group, length(res$group_prior_list)) + expect_equal(res$header_file, basename(file_name)) + + # microsat sequence 1 + test_proj <- "Microsat_Sequences" + test_dir <- file.path(data4test_dir(), test_proj) + file_name <- file.path(test_dir, "header.txt") + file_type <- "text/plain" + locus_type = "mss" + res <- read_header(file_name, file_type, locus_type) + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_equal(res$data_file, "mss_example_001.mss") + expect_equal(res$n_param, 9) + expect_equal(res$n_stat, 2) + expect_equal(res$n_scen, 1) + expect_equal(res$n_scen, length(res$scenario_list)) + expect_equal(length(res$n_param_scen), 1) + expect_equal(length(res$n_param_scen), length(res$scenario_list)) + expect_equal(res$n_param_scen, 9) + expect_equal(res$n_cond, 2) + expect_equal(res$n_cond, length(res$cond_list)) + expect_equal(res$simu_mode, "DRAW UNTIL") + expect_equal(res$n_locus_desc, 28) + expect_equal(res$n_locus_desc, length(res$locus_desc)) + expect_equal(res$n_prior, 9) + expect_equal(res$n_prior, length(res$prior_list)) + expect_equal(res$n_group, 2) + expect_equal(res$n_group, length(res$group_prior_list)) + expect_equal(res$header_file, basename(file_name)) + + # microsat sequence 1 -> bad header file + test_proj <- "bad_files" + test_dir <- file.path(test_input_dir(), "bad_files") + file_name <- file.path(test_dir, "header.txt") + file_type <- "text/plain" + locus_type = "mss" + res <- read_header(file_name, file_type, locus_type) + expect_false(res$valid) + expect_equal(length(res$msg), 1) + + # microsat sequence 2 + test_proj <- "Microsat_Sequences2" + test_dir <- file.path(data4test_dir(), test_proj) + file_name <- file.path(test_dir, "header.txt") + file_type <- "text/plain" + locus_type = "mss" + res <- read_header(file_name, file_type, locus_type) + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_equal(res$data_file, "toytest2_micro_seq_complexe_001.mss") + expect_equal(res$n_param, 22) + expect_equal(res$n_stat, 85) + expect_equal(res$n_scen, 2) + expect_equal(res$n_scen, length(res$scenario_list)) + expect_equal(length(res$n_param_scen), 2) + expect_equal(length(res$n_param_scen), length(res$scenario_list)) + expect_equal(res$n_param_scen, c(9L, 5L)) + expect_equal(res$n_cond, 2) + expect_equal(res$n_cond, length(res$cond_list)) + expect_equal(res$simu_mode, "DRAW UNTIL") + expect_equal(res$n_locus_desc, 28) + expect_equal(res$n_locus_desc, length(res$locus_desc)) + expect_equal(res$n_prior, 9) + expect_equal(res$n_prior, length(res$prior_list)) + expect_equal(res$n_group, 5) + expect_equal(res$n_group, length(res$group_prior_list)) + expect_equal(res$header_file, basename(file_name)) +}) + + +test_that("parse_header_scenario", { + # model choice + test_proj <- "IndSeq_SNP_model_choice" + test_dir <- file.path(data4test_dir(), test_proj) + file_name <- file.path(test_dir, "headerRF.txt") + header <- str_split(read_file(file_name), "\n", simplify = TRUE) + + content <- header[5:13] + res <- parse_header_scenario(content) + expect_true(res$valid) + expect_equal(res$id, 1) + expect_equal(res$n_param, 8) + expect_equal(res$prior, 0.16667) + expect_equal(res$scenario, str_c(header[6:13], collapse = "\n")) + + content <- header[6:13] + res <- parse_header_scenario(content) + + expect_false(res$valid) +}) + + +test_that("parse_header_group_prior", { + # microsat + test_proj <- "Microsat_Sequences" + test_dir <- file.path(data4test_dir(), test_proj) + file_name <- file.path(test_dir, "header.txt") + header <- str_split(read_file(file_name), "\n", simplify = TRUE) + + content <- header[60:74] + n_group <- 2 + current_line <- 59 + res <- parse_header_group_prior(content, n_group, current_line) + + expect_true(res$valid) + expect_equal(length(res$group_prior), 2) + expect_equal(length(res$mss_type), 2) + expect_equal(res$current_line, 74) +}) + + +test_that("read_statobs", { + + # existing file + test_proj <- "IndSeq_SNP_estim_param" + test_dir <- file.path(data4test_dir(), test_proj) + file_name <- file.path(test_dir, "statobsRF.txt") + file_type <- "text/plain" + tmp <- read_statobs(file_name, file_type, n_stat = 130) + expect_true(tmp$valid) + + # bad number of summary stats + test_proj <- "IndSeq_SNP_estim_param" + test_dir <- file.path(data4test_dir(), test_proj) + file_name <- file.path(test_dir, "statobsRF.txt") + file_type <- "text/plain" + res <- read_statobs(file_name, file_type, n_stat = 120) + expect_false(res$valid) + expect_true(length(res$msg) == 1) + + # unexisting file + file_name <- file.path(test_dir, "toto.txt") + tmp <- read_statobs(file_name, file_type, n_stat = 130) + expect_false(tmp$valid) + expect_true(length(tmp$msg) == 1) + + # unexisting file and wrong file type + file_name <- file.path(test_dir, "toto.txt") + file_type <- "toto" + tmp <- read_statobs(file_name, file_type, n_stat = 130) + expect_false(tmp$valid) + expect_true(length(tmp$msg) == 2) +}) + + +test_that("read_reftable", { + + ## SNP estim param + test_proj <- "IndSeq_SNP_estim_param" + test_dir <- file.path(data4test_dir(), test_proj) + file_name <- file.path(test_dir, "reftableRF.bin") + file_type <- "application/octet-stream" + res <- read_reftable(file_name, file_type) + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_equal(res$n_rec, 12000) + expect_equal(res$n_scen, 1) + expect_equal(res$n_rec_scen, 12000) + expect_equal(res$n_param_scen, 8) + expect_equal(res$n_stat, 130) + + ## SNP model choice + test_proj <- "IndSeq_SNP_model_choice" + test_dir <- file.path(data4test_dir(), test_proj) + file_name <- file.path(test_dir, "reftableRF.bin") + file_type <- "application/octet-stream" + res <- read_reftable(file_name, file_type) + expect_equal(length(res$msg), 0) + expect_equal(res$n_rec, 12000) + expect_equal(res$n_scen, 6) + expect_equal(res$n_rec_scen, c(1942L, 1950L, 2028L, 2001L, 1994L, 2085L)) + expect_equal(res$n_param_scen, c(8L, 8L, 8L, 7L, 7L, 7L)) + expect_equal(res$n_stat, 130) + + ## wrong file type + test_proj <- "IndSeq_SNP_model_choice" + test_dir <- file.path(data4test_dir(), test_proj) + file_name <- file.path(test_dir, "reftableRF.bin") + file_type <- "text/plain" + res <- read_reftable(file_name, file_type) + expect_false(res$valid) + expect_true(length(res$msg) == 1) + + ## unexisting file + file_name <- file.path(test_dir, "toto.txt") + file_type <- "application/octet-stream" + res <- read_reftable(file_name, file_type) + expect_false(res$valid) + expect_true(length(res$msg) == 1) + + ## unexisting file and wrong file type + file_name <- file.path(test_dir, "toto.txt") + file_type <- "toto" + res <- read_reftable(file_name, file_type) + expect_false(res$valid) + expect_true(length(res$msg) == 2) +}) diff --git a/R-pkg/tests/testthat/test-41_io_management.R b/R-pkg/tests/testthat/test-41_io_management.R index 42048cd7..db28bf02 100644 --- a/R-pkg/tests/testthat/test-41_io_management.R +++ b/R-pkg/tests/testthat/test-41_io_management.R @@ -1,351 +1 @@ context("io_management") - -test_that("check_data_file", { - # snp indseq - data_file <- "indseq_SNP_sim_dataset_4POP_001.snp" - data_dir <- file.path(example_dir(), - "IndSeq_SNP_estim_param") - locus_type <- "snp" - seq_mode <- "indseq" - out <- check_data_file(data_file, data_dir, locus_type, seq_mode, - expected_data_file = data_file) - expect_true(out$valid) - - # snp poolseq - data_file <- "poolseq_SNP_sim_dataset_4POP_cov100_001.snp" - data_dir <- file.path(example_dir(), - "PoolSeq_SNP_estim_param") - locus_type <- "snp" - seq_mode <- "poolseq" - out <- check_data_file(data_file, data_dir, locus_type, seq_mode, - expected_data_file = data_file) - expect_true(out$valid) - - # mss - data_file <- "mss_example_001.mss" - data_dir <- data4test_dir("mss") - locus_type <- "mss" - seq_mode <- NULL - out <- check_data_file(data_file, data_dir, locus_type, seq_mode, - expected_data_file = data_file) - expect_true(out$valid) -}) - -test_that("check_indseq_snp_data_file", { - - data_file <- "indseq_SNP_sim_dataset_4POP_001.snp" - data_dir <- file.path(example_dir(), - "IndSeq_SNP_estim_param") - out <- check_indseq_snp_data_file( - data_file, data_dir, expected_data_file = NULL - ) - expect_true(out$valid) - - data_file <- "poolseq_SNP_sim_dataset_4POP_cov100_001.snp" - data_dir <- file.path(example_dir(), - "PoolSeq_SNP_estim_param") - out <- check_indseq_snp_data_file( - data_file, data_dir, expected_data_file = NULL - ) - expect_false(out$valid) -}) - -test_that("indseq_locus_filter", { - # generate test data - n_indiv <- 100 - n_pop <- 5 - indiv_id <- str_c("ind", 1:n_indiv) - indiv_sex <- sample(c("F", "M"), size = n_indiv, replace = TRUE) - indiv_pop <- sample(1:n_pop, size = n_indiv, replace = TRUE) - content <- data.frame( - IND = str_c("ind", 1:n_indiv), - SEX = indiv_sex, - POP = indiv_pop, - A = sample(0:2, size = n_indiv, replace = TRUE), - H = sample(0:1, size = n_indiv, replace = TRUE), - X = ifelse( - indiv_sex == "F", - sample(0:2, size = n_indiv, replace = TRUE), - sample(0:1, size = n_indiv, replace = TRUE) - ), - Y = ifelse( - indiv_sex == "F", - rep(9, n_indiv), - sample(0:1, size = n_indiv, replace = TRUE) - ), - M = sample(0:1, size = n_indiv, replace = TRUE), - stringsAsFactors = FALSE - ) - - col_type <- colnames(content) - - locus_details <- data.frame( - count = rep(1, 5), - type = tail(col_type, 5), - stringsAsFactors = FALSE - ) - - maf <- 0.05 - - # test without missing values at random - snp_filter <- lapply( - 1:(ncol(content) - 3) + 3, - function(ind) { - out <- indseq_locus_filter( - snp_data = content[,ind], - sex_data = content[,2], - locus_type = col_type[ind], - maf = maf - ) - } - ) - snp_filter <- Reduce("rbind", snp_filter) - - expect_true(is.data.frame(snp_filter)) - expect_identical(colnames(snp_filter), c("filter", "mono", "issue")) - }) - -test_that("filter_snp_indseq", { - ## generate simulated test data - n_indiv <- 100 - n_pop <- 5 - indiv_id <- str_c("ind", 1:n_indiv) - indiv_sex <- sample(c("F", "M"), size = n_indiv, replace = TRUE) - indiv_pop <- sample(1:n_pop, size = n_indiv, replace = TRUE) - content <- data.frame( - IND = str_c("ind", 1:n_indiv), - SEX = indiv_sex, - POP = indiv_pop, - A = sample(0:2, size = n_indiv, replace = TRUE), - H = sample(0:1, size = n_indiv, replace = TRUE), - X = ifelse( - indiv_sex == "F", - sample(0:2, size = n_indiv, replace = TRUE), - sample(0:1, size = n_indiv, replace = TRUE) - ), - Y = ifelse( - indiv_sex == "F", - rep(9, n_indiv), - sample(0:1, size = n_indiv, replace = TRUE) - ), - M = sample(0:1, size = n_indiv, replace = TRUE), - stringsAsFactors = FALSE - ) - - col_type <- colnames(content) - - locus_details <- data.frame( - count = rep(1, 5), - type = tail(col_type, 5), - stringsAsFactors = FALSE - ) - - maf <- 0.05 - - # test on simulated data - out <- filter_snp_indseq(content, col_type, locus_details, maf) - - expect_true(is.data.frame(out)) - expect_true(all(colnames(out) %in% c("type", "mono", "count", "filter", "issue"))) - - ## test on SNP data file - data_file <- "indseq_SNP_sim_dataset_4POP_001.snp" - data_dir <- file.path(example_dir(), - "IndSeq_SNP_estim_param") - data_path <- file.path(data_dir, data_file) - - # header - col_type <- unname(unlist( - read.table(file = data_path, skip = 1, nrows = 1) - )) - - # locus type - candidate_locus <- c("A", "H", "X", "Y", "M") - locus_encoding <- str_c(header[-(1:3)], collapse = " ") - locus_details <- Reduce("rbind", lapply( - candidate_locus, - function(pttrn) { - count <- str_count(locus_encoding, pttrn) - return(data.frame( - count = count, - type = pttrn, - stringsAsFactors = FALSE - )) - } - )) - - # data - content <- read.table(file = data_path, skip = 2) - - # test on data - out <- filter_snp_indseq(content, col_type, locus_details, maf) - - expect_true(is.data.frame(out)) - expect_true(all(colnames(out) %in% c("type", "mono", "count", "filter", "issue"))) - - -}) - -test_that("check_poolseq_snp_data_file", { - - data_file <- "poolseq_SNP_sim_dataset_4POP_cov100_001.snp" - data_dir <- file.path(example_dir(), - "PoolSeq_SNP_estim_param") - out <- check_poolseq_snp_data_file( - data_file, data_dir, expected_data_file = NULL - ) - expect_true(out$valid) - - data_file <- "indseq_SNP_sim_dataset_4POP_001.snp" - data_dir <- file.path(example_dir(), - "IndSeq_SNP_estim_param") - out <- check_poolseq_snp_data_file( - data_file, data_dir, expected_data_file = NULL - ) - expect_false(out$valid) -}) - -test_that("check_mss_data_file", { - - data_file <- "mss_example_001.mss" - data_dir <- data4test_dir("mss") - out <- check_mss_data_file( - data_file, data_dir, expected_data_file = NULL - ) - expect_true(out$valid) - - data_file <- "indseq_SNP_sim_dataset_4POP_001.snp" - data_dir <- file.path(example_dir(), - "IndSeq_SNP_estim_param") - out <- check_mss_data_file( - data_file, data_dir, expected_data_file = NULL - ) - expect_false(out$valid) -}) - -test_that("check_file_name", { - expect_true(check_file_name(system.file("DESCRIPTION", - package = "diyabcGUI"))) - expect_false(check_file_name(5)) - expect_false(check_file_name(file.path(fs::path_home(), - "not_existing_file"))) -}) - - -test_that("parse_diyabc_header", { - # model choice - file_name <- file.path(example_dir(), - "IndSeq_SNP_model_choice", - "headerRF.txt") - file_type = "text/plain" - locus_type = "snp" - expect_equal( - parse_diyabc_header(file_name, file_type, locus_type), - list( - data_file="indseq_SNP_sim_dataset_4POP_001.snp", - loci_description="5000 G1 from 1", - n_loci_des=1, n_param=13, n_prior=13, n_sumstat=2, - raw_cond_list=c("t21>t32", "t42t423", - "t431t32", "t431t2")) + expect_true(check_header_cond("t1>=t2")) + expect_true(check_header_cond("t1")) + expect_false(check_header_cond("t1>")) + expect_false(check_header_cond(" [M] G1 2 40", type = "mss" + )) + expect_true(check_header_group_prior( + "Locus_yyy [S] G2 1000", type = "mss" + )) + + expect_false(check_header_group_prior( + "Locus_xxx [M G1 2 40", type = "mss" + )) + expect_false(check_header_group_prior( + " [M] G1 2 40", type = "mss" + )) + expect_false(check_header_group_prior( + "Locus_xxx [M] G1 2", type = "mss" + )) + expect_false(check_header_group_prior( + "Locus_yyy G1 from 1", type = "snp" + )) + expect_true(check_header_group_prior( + "5000 G2 from 10", type = "snp" + )) + + expect_false(check_header_group_prior( + " G2 from 10", type = "snp" + )) + expect_false(check_header_group_prior( + "5000 G2 from ", type = "snp" + )) + expect_false(check_header_group_prior( + "5000 G from 10", type = "snp" + )) +}) + + +test_that("check_header_group_prior", { + ## MSS + content <- c( + "MEANMU UN[1.00E-004,1.00E-3,0.0005,2]", + "GAMMU GA[1.00E-005,1.00E-002,Mean_u,2]", + "MEANP UN[1.00E-001,3.00E-001,0.22,2]", + "GAMP GA[1.00e-002,9.00E-001,Mean_P,2]", + "MEANSNI LU[1.00E-008,1.00E-005,1.00E-007,2]", + "GAMSNI GA[1.00E-009,1.00E-004,Mean_u_SNI,2]" + ) + expect_true(check_header_group_prior(content, type = "M")) + + content <- c( + "MEANMU UN[1.00E-004,1.00E-3,0.0005,2]", + "GAMMU GA[1.00E-005,1.00E-002,Mean_u,2]", + "MEANP UN[1.00E-001,3.00E-001,0.22,2]", + "GAMP GA[1.00E-002,9.00E-001,Mean_P,2]" + ) + expect_false(check_header_group_prior(content, type = "M")) + + content <- c( + "MEANMU UN[1.00E-004,1.00E-3,0.0005,2", + "GAMMU GA[1.00E-005,1.00E-002,Mean_u,2]", + "MEANP UN[1.00E-001,3.00E-001,0.22,2]", + "GAMP GA[1.00E-002,9.00E-001,Mean_P,2]", + "MEANSNI LU[1.00E-008,1.00E-005,1.00E-007,2]", + "GAMSNI GA[1.00E-009,1.00E-004,Mean_u_SNI,2]" + ) + expect_false(check_header_group_prior(content, type = "M")) + + content <- c( + "MEANMU UN[1.00E-9,1.00E-7,5E-9,2]", + "GAMMU GA[1.00E-9,1.00E-6,Mean_u,2]", + "MEANK1 UN[0.050,20,10,2]", + "GAMK1 GA[0.050,20,Mean_k1,2]", + "MEANK2 UN[0.050,20,10,2]", + "GAMK2 GA[0.050,20,Mean_k2,2]", + "MODEL TN 10 2.00" + ) + expect_false(check_header_group_prior(content, type = "M")) + + + ## SNP + content <- c( + "MEANMU UN[1.00e-9,1.00E-7,5E-9,2]", + "GAMMU GA[1.00E-9,1.00E-6,Mean_u,2]", + "MEANK1 UN[0.050,20,10,2]", + "GAMK1 GA[0.050,20,Mean_k1,2]", + "MEANK2 UN[0.050,20,10,2]", + "GAMK2 GA[0.050,20,Mean_k2,2]", + "MODEL TN 10 2.00" + ) + expect_true(check_header_group_prior(content, type = "S")) + + content <- c( + "MEANMU UN[1.00E-9,1.00E-7,5E-9,2]", + "GAMMU GA[1.00E-9,1.00E-6,Mean_u,2]", + "MEANK1 UN[0.050,20,10,2]", + "GAMK1 GA[0.050,20,Mean_k1,2]", + "MEANK2 UN[0.050,20,10,2]", + "GAMK2 GA[0.050,20,Mean_k2,2]", + "MODEL TN 10.99 2.00" + ) + expect_false(check_header_group_prior(content, type = "S")) + + content <- c( + "MEANMU UN[1.00E-9,1.00E-7,5E-9,2]", + "GAMMU GA[1.00E-9,1.00E-6,Mean_u,2]", + "MEANK2 UN[0.050,20,10,2]", + "GAMK2 GA[0.050,20,Mean_k2,2]", + "MODEL TN 10 2.00" + ) + expect_false(check_header_group_prior(content, type = "S")) + + content <- c( + "MEANMU UN[1.00E-9,1.00E-7,5E-9,2]", + "GAMMU GA[1.00E-9,1.00E-6,Mean_u,2]", + "MEANK1 UN[0.050,20,10,2]", + "GAMK1 GA[0.050,20,0,2]", + "MEANK2 UN[0.050,20,10,2]", + "GAMK2 GA[0.050,20,Mean_k2,2]", + "MODEL TN 10 2.00" + ) + expect_false(check_header_group_prior(content, type = "S")) + + + content <- c( + "MEANMU UN[1.00E-004,1.00E-3,0.0005,2]", + "GAMMU GA[1.00E-005,1.00E-002,Mean_u,2]", + "MEANP UN[1.00E-001,3.00E-001,0.22,2]", + "GAMP GA[1.00E-002,9.00E-001,Mean_P,2]", + "MEANSNI LU[1.00E-008,1.00E-005,1.00E-007,2]", + "GAMSNI GA[1.00E-009,1.00E-004,Mean_u_SNI,2]" + ) + expect_false(check_header_group_prior(content, type = "S")) +}) + diff --git a/R-pkg/tests/testthat/test-43_data_read.R b/R-pkg/tests/testthat/test-43_data_read.R new file mode 100644 index 00000000..74a67594 --- /dev/null +++ b/R-pkg/tests/testthat/test-43_data_read.R @@ -0,0 +1,340 @@ +context("43_data_read") + +test_that("read_indseq_snp_data", { + + # good file + data_file <- "indseq_SNP_sim_dataset_4POP_001.snp" + data_dir <- file.path(data4test_dir(), "IndSeq_SNP_estim_param") + res <- read_indseq_snp_data(data_file, data_dir) + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_equal(res$data_file, data_file) + expect_equal(res$n_loci, 30000) + expect_true(is.data.frame(res$locus_count)) + expect_equal(res$locus_count$count, 30000) + expect_equal(res$locus_count$filter, 13046) + expect_equal(res$locus_count$mono, 0) + expect_equal(res$n_pop, 4) + expect_equal(res$n_indiv, 40) + expect_equal(res$sex_ratio, "NM=1NF") + expect_equal(res$maf, 0.05) + + # bad file + data_file <- "poolseq_SNP_sim_dataset_4POP_cov100_001.snp" + data_dir <- file.path(data4test_dir(), "PoolSeq_SNP_estim_param") + res <- read_indseq_snp_data(data_file, data_dir) + expect_false(res$valid) + expect_true(length(res$msg) > 0) +}) + +test_that("process_indseq_locus", { + # generate test data + n_indiv <- 100 + n_pop <- 5 + indiv_id <- str_c("ind", 1:n_indiv) + indiv_sex <- sample(c("F", "M"), size = n_indiv, replace = TRUE) + indiv_pop <- sample(1:n_pop, size = n_indiv, replace = TRUE) + content <- data.frame( + IND = str_c("ind", 1:n_indiv), + SEX = indiv_sex, + POP = as.character(indiv_pop), + A = sample(0:2, size = n_indiv, replace = TRUE), + H = sample(0:1, size = n_indiv, replace = TRUE), + X = ifelse( + indiv_sex == "F", + sample(0:2, size = n_indiv, replace = TRUE), + sample(0:1, size = n_indiv, replace = TRUE) + ), + Y = ifelse( + indiv_sex == "F", + rep(9, n_indiv), + sample(0:1, size = n_indiv, replace = TRUE) + ), + M = sample(0:1, size = n_indiv, replace = TRUE), + stringsAsFactors = FALSE + ) + + col_type <- colnames(content) + + locus_count <- data.frame( + count = rep(1, 5), + type = tail(col_type, 5), + stringsAsFactors = FALSE + ) + + maf <- 0.05 + + # test on a single SNP + ind <- 4 + snp_data <- content[,ind] + sex_data <- content[,2] + pop_data <- content[,3] + snp_type <- col_type[ind] + + res <- process_indseq_locus(snp_data, sex_data, pop_data, snp_type, maf) + expect_true(res$valid) + expect_false(res$filter) + expect_false(res$mono) + expect_true(is.na(res$missing_pop)) + expect_true(is.na(res$issue_X)) + expect_true(is.na(res$issue_Y)) + expect_true(is.numeric(res$af)) + expect_true(is.numeric(res$maf)) + expect_false(res$hudson) + + # test on multiple SNP without missing values at random + res <- Reduce("rbind", lapply( + 1:(ncol(content) - 3) + 3, + function(ind) { + out <- process_indseq_locus( + snp_data = content[,ind], + sex_data = content[,2], + pop_data = content[,3], + snp_type = col_type[ind], + maf = maf + ) + } + )) + + expect_true(is.data.frame(res)) + + ## test missing pop and missing values for X and Y chromosome + # generate test data + n_indiv <- 100 + n_pop <- 5 + indiv_id <- str_c("ind", 1:n_indiv) + indiv_sex <- sort(rep(c("F", "M"), length.out = n_indiv)) + indiv_pop <- sort(rep(str_c("pop", 1:n_pop), length.out = n_indiv)) + content <- data.frame( + IND = str_c("ind", 1:n_indiv), + SEX = indiv_sex, + POP = as.character(indiv_pop), + A = c(rep(9, sum(indiv_pop == "pop1")), + sample(0:2, size = n_indiv - sum(indiv_pop == "pop1"), + replace = TRUE)), + H = sample(0:1, size = n_indiv, replace = TRUE), + X = sample(0:2, size = n_indiv, replace = TRUE), + Y = sample(0:1, size = n_indiv, replace = TRUE), + M = sample(0:1, size = n_indiv, replace = TRUE), + stringsAsFactors = FALSE + ) + col_type <- colnames(content) + # run + res <- Reduce("rbind", lapply( + 1:(ncol(content) - 3) + 3, + function(ind) { + out <- process_indseq_locus( + snp_data = content[,ind], + sex_data = content[,2], + pop_data = content[,3], + snp_type = col_type[ind], + maf = maf + ) + } + )) + + expect_equal(res$valid, c(FALSE, TRUE, FALSE, FALSE, TRUE)) +}) + +test_that("check_snp_indseq", { + ## generate simulated test data + n_indiv <- 100 + n_pop <- 5 + indiv_id <- str_c("ind", 1:n_indiv) + indiv_sex <- sample(c("F", "M"), size = n_indiv, replace = TRUE) + indiv_pop <- sample(1:n_pop, size = n_indiv, replace = TRUE) + data_tab <- data.frame( + IND = str_c("ind", 1:n_indiv), + SEX = indiv_sex, + POP = indiv_pop, + A = sample(0:2, size = n_indiv, replace = TRUE), + H = sample(0:1, size = n_indiv, replace = TRUE), + X = ifelse( + indiv_sex == "F", + sample(0:2, size = n_indiv, replace = TRUE), + sample(0:1, size = n_indiv, replace = TRUE) + ), + Y = ifelse( + indiv_sex == "F", + rep(9, n_indiv), + sample(0:1, size = n_indiv, replace = TRUE) + ), + M = sample(0:1, size = n_indiv, replace = TRUE), + stringsAsFactors = FALSE + ) + + indiv_info <- data_tab[,1:3] + content <- t(data_tab[,-(1:3)]) + snp_type <- rownames(content) + + locus_count <- data.frame( + count = rep(1, 5), type = snp_type, + stringsAsFactors = FALSE + ) + + maf <- 0.05 + + # test on simulated data + res <- check_snp_indseq(content, indiv_info, snp_type, locus_count, maf) + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_true(is.data.frame(res$locus_count)) + + ## test missing pop and missing values for X and Y chromosome + # generate test data + n_indiv <- 100 + n_pop <- 5 + indiv_id <- str_c("ind", 1:n_indiv) + indiv_sex <- sort(rep(c("F", "M"), length.out = n_indiv)) + indiv_pop <- sort(rep(str_c("pop", 1:n_pop), length.out = n_indiv)) + data_tab <- data.frame( + IND = str_c("ind", 1:n_indiv), + SEX = indiv_sex, + POP = as.character(indiv_pop), + A = c(rep(9, sum(indiv_pop == "pop1")), + sample(0:2, size = n_indiv - sum(indiv_pop == "pop1"), + replace = TRUE)), + H = sample(0:1, size = n_indiv, replace = TRUE), + X = sample(0:2, size = n_indiv, replace = TRUE), + Y = sample(0:1, size = n_indiv, replace = TRUE), + M = sample(0:1, size = n_indiv, replace = TRUE), + stringsAsFactors = FALSE + ) + + indiv_info <- data_tab[,1:3] + content <- t(data_tab[,-(1:3)]) + snp_type <- rownames(content) + + locus_count <- data.frame( + count = rep(1, 5), type = snp_type, + stringsAsFactors = FALSE + ) + + maf <- 0.05 + + # test on simulated data + res <- check_snp_indseq(content, indiv_info, snp_type, locus_count, maf) + expect_false(res$valid) + expect_equal(length(res$msg), 3) + + ## test on SNP data file + data_file <- "indseq_SNP_sim_dataset_4POP_001.snp" + data_dir <- file.path(data4test_dir(), "IndSeq_SNP_estim_param") + data_path <- file.path(data_dir, data_file) + + # header + header <- unname(unlist( + read.table(file = data_path, skip = 1, nrows = 1) + )) + + # locus type + candidate_locus <- c("A", "H", "X", "Y", "M") + locus_encoding <- str_c(header[-(1:3)], collapse = " ") + locus_count <- Reduce("rbind", lapply( + candidate_locus, + function(pttrn) { + count <- str_count(locus_encoding, pttrn) + return(data.frame( + count = count, + type = pttrn, + stringsAsFactors = FALSE + )) + } + )) + + # data + content <- read.table(file = data_path, skip = 2) + + snp_type <- header[-(1:3)] + indiv_info <- content[,1:3] + colnames(indiv_info) <- header[1:3] + content <- t(content[,-(1:3)]) + + # test on data + res <- check_snp_indseq(content, indiv_info, snp_type, locus_count, maf) + expect_true(res$valid) + expect_true(is.data.frame(res$locus_count)) + expect_equal(res$locus_count$count, 30000) + expect_equal(res$locus_count$filter, 13046) + expect_equal(res$locus_count$mono, 0) +}) + + +test_that("read_poolseq_snp_data", { + + # good file + data_file <- "poolseq_SNP_sim_dataset_4POP_cov100_001.snp" + data_dir <- file.path(data4test_dir(), "PoolSeq_SNP_estim_param") + res <- read_poolseq_snp_data(data_file, data_dir) + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_equal(res$data_file, data_file) + expect_equal(res$n_loci, 30000) + expect_true(is.data.frame(res$locus_count)) + expect_equal(res$locus_count$count, 30000) + expect_equal(res$locus_count$filter, 15612) + expect_equal(res$locus_count$mono, 5918) + expect_equal(res$n_pop, 4) + expect_equal(res$sex_ratio, "NM=1NF") + expect_equal(res$mrc, 5) + + # bad file + data_file <- "indseq_SNP_sim_dataset_4POP_001.snp" + data_dir <- file.path(data4test_dir(), "IndSeq_SNP_estim_param") + res <- read_poolseq_snp_data(data_file, data_dir) + expect_false(res$valid) + expect_true(length(res$msg) > 0) +}) + + +test_that("check_snp_poolseq", { + + ## test on SNP data file + data_file <- "poolseq_SNP_sim_dataset_4POP_cov100_001.snp" + data_dir <- file.path(data4test_dir(), "PoolSeq_SNP_estim_param") + data_path <- file.path(data_dir, data_file) + + # header + header <- unname(unlist( + read.table(file = data_path, skip = 1, nrows = 1) + )) + + # data + content <- read.table(file = data_path, skip = 2) + + # test on data + mrc <- 5 + res <- check_snp_poolseq(content, mrc) + expect_true(is.data.frame(res$locus_count)) + expect_equal(res$locus_count$count, 30000) + expect_equal(res$locus_count$filter, 15612) + expect_equal(res$locus_count$mono, 5918) +}) + +test_that("read_mss_data", { + + ## good file + data_file <- "mss_example_001.mss" + data_dir <- data4test_dir("mss") + res <- read_mss_data(data_file, data_dir) + expect_true(out$valid) + expect_equal(length(out$msg), 0) + expect_equal(res$data_file, data_file) + expect_equal(res$n_loci, 28) + expect_true(is.data.frame(res$locus_count)) + expect_equal(sum(res$locus_count$count), res$n_loci) + expect_equal(res$n_pop, 3) + expect_equal(res$n_indiv, 60) + expect_equal(res$pop_size, c(20,20,20)) + expect_equal(res$sex_ratio, "NM=2.33333NF") + expect_equal(length(res$locus_desc), res$n_loci) + expect_equal(length(res$locus_mode), res$n_loci) + + ## bad file + data_file <- "indseq_SNP_sim_dataset_4POP_001.snp" + data_dir <- file.path(example_dir(), + "IndSeq_SNP_estim_param") + out <- read_mss_data(data_file, data_dir) + expect_false(out$valid) +}) + diff --git a/R-pkg/tests/testthat/test-62_analysis_project_io.R b/R-pkg/tests/testthat/test-62_analysis_project_io.R new file mode 100644 index 00000000..cdaf40bd --- /dev/null +++ b/R-pkg/tests/testthat/test-62_analysis_project_io.R @@ -0,0 +1,383 @@ +context("test-62_analysis_project_io") + +test_that("proj_file_input", { + + ## clean project files + test_proj <- "IndSeq_SNP_estim_param" + + # setup test case + tmp_dir <- file.path(mk_proj_dir("test-62_analysis_project_io"), test_proj) + test_dir <- file.path(data4test_dir(), test_proj) + fs::dir_copy(test_dir, tmp_dir, overwrite = TRUE) + + tmp_file_list <- list.files(tmp_dir) + tmp_file_info <- file.info( + file.path(tmp_dir, tmp_file_list) + ) + + # input parameters + file_input <- data.frame( + name = tmp_file_list, + size = tmp_file_info$size, + type = ifelse( + tmp_file_info$isdir, + "diyabc_dir", + "diyabc_file" + ), + datapath = file.path(tmp_dir, tmp_file_list), + stringsAsFactors = FALSE + ) + + # specific file type + ind <- which(out$file_input$name == "headerRF.txt") + file_input$type[ind] <- "text/plain" + ind <- which(file_input$name == "header.txt") + file_input$type[ind] <- "text/plain" + ind <- which(file_input$name == "reftableRF.bin") + file_input$type[ind] <- "application/octet-stream" + ind <- which(file_input$name == "statobsRF.txt") + file_input$type[ind] <- "text/plain" + + # tmp proj dir + proj_dir <- mk_proj_dir("test-62_analysis_project_io") + + # run + proj_file <- proj_file_input(file_input, proj_dir) + + expect_true(proj_file$valid) + expect_true(length(list.files(proj_dir)) == length(list.files(test_dir))) + + + ## no input + proj_file <- proj_file_input(head(file_input, 0), proj_dir) + + expect_false(proj_file$valid) + expect_true(length(proj_file$msg) == 1) + + + ## clean project zip file with root directory + test_proj <- "IndSeq_SNP_estim_param" + + # setup test case + tmp_dir <- mk_proj_dir("test-62_analysis_project_io") + test_dir <- file.path(data4test_dir(), test_proj) + zip_file <- file.path(tmp_dir, paste0(test_proj, ".zip")) + zip::zip( + zipfile = zip_file, files = test_proj, root = data4test_dir(), + recurse = TRUE, include_directories = TRUE + ) + + tmp_file <- list.files(tmp_dir) + tmp_file_info <- file.info( + file.path(tmp_dir, tmp_file) + ) + + # input parameters + file_input <- data.frame( + name = tmp_file, + size = tmp_file_info$size, + type = "application/zip", + datapath = file.path(tmp_dir, tmp_file), + stringsAsFactors = FALSE + ) + + # run + proj_file <- proj_file_input(file_input, proj_dir) + + expect_true(proj_file$valid) + expect_true(length(list.files(proj_dir)) == length(list.files(test_dir))) +}) + + + +test_that("proj_zip_file_input", { + + ## clean project zip file with root directory + test_proj <- "IndSeq_SNP_estim_param" + + # setup test case + tmp_dir <- mk_proj_dir("test-62_analysis_project_io") + test_dir <- file.path(data4test_dir(), test_proj) + zip_file <- file.path(tmp_dir, paste0(test_proj, ".zip")) + zip::zip( + zipfile = zip_file, files = test_proj, root = data4test_dir(), + recurse = TRUE, include_directories = TRUE + ) + + tmp_file <- list.files(tmp_dir) + tmp_file_info <- file.info( + file.path(tmp_dir, tmp_file) + ) + + # input parameters + file_input <- data.frame( + name = tmp_file, + size = tmp_file_info$size, + type = "application/zip", + datapath = file.path(tmp_dir, tmp_file), + stringsAsFactors = FALSE + ) + + # run + zip_proj <- proj_zip_file_input(file_input) + + expect_true(zip_proj$zip_file) + expect_true(zip_proj$valid) + expect_true(length(zip_proj$msg) == 0) + expect_true(nrow(zip_proj$file_input) == length(list.files(test_dir))) + + ## providing two zip files + file_input <- rbind(file_input, file_input) + + # run + zip_proj <- proj_zip_file_input(file_input) + + expect_true(zip_proj$zip_file) + expect_false(zip_proj$valid) + expect_true(length(zip_proj$msg) == 1) + + + ## clean project zip file without root directory + test_proj <- "IndSeq_SNP_estim_param" + + # setup test case + tmp_dir <- mk_proj_dir("test-62_analysis_project_io") + test_dir <- file.path(data4test_dir(), test_proj) + zip_file <- file.path(tmp_dir, paste0(test_proj, ".zip")) + zip::zip( + zipfile = zip_file, files = list.files(test_dir), root = test_dir, + recurse = TRUE, include_directories = TRUE + ) + + tmp_file <- list.files(tmp_dir) + tmp_file_info <- file.info( + file.path(tmp_dir, tmp_file) + ) + + # input parameters + file_input <- data.frame( + name = tmp_file, + size = tmp_file_info$size, + type = "application/zip", + datapath = file.path(tmp_dir, tmp_file), + stringsAsFactors = FALSE + ) + + # run + zip_proj <- proj_zip_file_input(file_input) + + expect_true(zip_proj$zip_file) + expect_true(zip_proj$valid) + expect_true(length(zip_proj$msg) == 0) + expect_true(nrow(zip_proj$file_input) == length(list.files(test_dir))) + + + ## providing empty zip file + + # setup test case + tmp_dir <- mk_proj_dir("test-62_analysis_project_io") + test_dir <- file.path(tmp_dir, "empty_project") + fs::dir_create(test_dir) + zip_file <- file.path(tmp_dir, "empty_project.zip") + zip::zip( + zipfile = zip_file, files = "empty_project", root = tmp_dir, + recurse = TRUE, include_directories = TRUE + ) + fs::dir_delete(test_dir) + + tmp_file <- list.files(tmp_dir) + tmp_file_info <- file.info( + file.path(tmp_dir, tmp_file) + ) + + # input parameters + file_input <- data.frame( + name = tmp_file, + size = tmp_file_info$size, + type = "application/zip", + datapath = file.path(tmp_dir, tmp_file), + stringsAsFactors = FALSE + ) + + # run + zip_proj <- proj_zip_file_input(file_input) + + expect_true(zip_proj$zip_file) + expect_false(zip_proj$valid) + expect_true(length(zip_proj$msg) == 1) +}) + + +test_that("check_proj_file", { + + ## SNP IndSeq + # estim param + test_proj <- "IndSeq_SNP_estim_param" + test_dir <- file.path(data4test_dir(), test_proj) + res <- check_proj_file(proj_dir = test_dir, locus_type = "snp") + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_true(res$header_check$valid) + expect_true(res$reftable_check$valid) + expect_true(res$statobs_check$valid) + + # bad locus type + res <- check_proj_file(proj_dir = test_dir, locus_type = "mss") + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_false(res$header_check$valid) + expect_true(res$reftable_check$valid) + expect_true(res$statobs_check$valid) + + # model choice + test_proj <- "IndSeq_SNP_model_choice" + test_dir <- file.path(data4test_dir(), test_proj) + res <- check_proj_file(proj_dir = test_dir, locus_type = "snp") + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_true(res$header_check$valid) + expect_true(res$reftable_check$valid) + expect_true(res$statobs_check$valid) + + ## SNP PoolSeq + # estim param + test_proj <- "PoolSeq_SNP_estim_param" + test_dir <- file.path(data4test_dir(), test_proj) + res <- check_proj_file(proj_dir = test_dir, locus_type = "snp") + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_true(res$header_check$valid) + expect_true(res$reftable_check$valid) + expect_true(res$statobs_check$valid) + + # model choice + test_proj <- "PoolSeq_SNP_model_choice" + test_dir <- file.path(data4test_dir(), test_proj) + res <- check_proj_file(proj_dir = test_dir, locus_type = "snp") + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_true(res$header_check$valid) + expect_true(res$reftable_check$valid) + expect_true(res$statobs_check$valid) + + ## MSS + # microsat + test_proj <- "Microsat" + test_dir <- file.path(data4test_dir(), test_proj) + res <- check_proj_file(proj_dir = test_dir, locus_type = "mss") + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_true(res$header_check$valid) + expect_null(res$reftable_check) + expect_null(res$statobs_check) + + # bad locus type + res <- check_proj_file(proj_dir = test_dir, locus_type = "snp") + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_false(res$header_check$valid) + expect_null(res$reftable_check) + expect_null(res$statobs_check) + + # microsat sequence 1 + test_proj <- "Microsat_Sequences" + test_dir <- file.path(data4test_dir(), test_proj) + res <- check_proj_file(proj_dir = test_dir, locus_type = "mss") + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_true(res$header_check$valid) + expect_null(res$reftable_check) + expect_null(res$statobs_check) + + # microsat sequence 2 + test_proj <- "Microsat_Sequences2" + test_dir <- file.path(data4test_dir(), test_proj) + res <- check_proj_file(proj_dir = test_dir, locus_type = "mss") + expect_true(res$valid) + expect_equal(length(res$msg), 0) + expect_true(res$header_check$valid) + expect_null(res$reftable_check) + expect_null(res$statobs_check) +}) + +test_that("check_data_file", { + # snp indseq + data_file <- "indseq_SNP_sim_dataset_4POP_001.snp" + data_dir <- file.path(data4test_dir(), "IndSeq_SNP_estim_param") + locus_type <- "snp" + seq_mode <- "indseq" + out <- check_data_file(data_file, data_dir, locus_type, seq_mode) + expect_true(out$valid) + + # snp poolseq + data_file <- "poolseq_SNP_sim_dataset_4POP_cov100_001.snp" + data_dir <- file.path(data4test_dir(), "PoolSeq_SNP_estim_param") + locus_type <- "snp" + seq_mode <- "poolseq" + out <- check_data_file(data_file, data_dir, locus_type, seq_mode) + expect_true(out$valid) + + # mss + data_file <- "mss_example_001.mss" + data_dir <- data4test_dir("mss") + locus_type <- "mss" + seq_mode <- NULL + out <- check_data_file(data_file, data_dir, locus_type, seq_mode) + expect_true(out$valid) +}) + +test_that("check_data_file", { + # FIXME + # snp indseq + data_file <- "indseq_SNP_sim_dataset_4POP_001.snp" + data_dir <- file.path(data4test_dir(), "IndSeq_SNP_estim_param") + locus_type <- "snp" + seq_mode <- "indseq" + out <- check_data_file_bg(data_file, data_dir, locus_type, seq_mode) + expect_false(resolved(out)) + expect_true(out$valid) + + # snp poolseq + data_file <- "poolseq_SNP_sim_dataset_4POP_cov100_001.snp" + data_dir <- file.path(data4test_dir(), "PoolSeq_SNP_estim_param") + locus_type <- "snp" + seq_mode <- "poolseq" + out <- check_data_file(data_file, data_dir, locus_type, seq_mode) + expect_true(out$valid) + + # mss + data_file <- "mss_example_001.mss" + data_dir <- data4test_dir("mss") + locus_type <- "mss" + seq_mode <- NULL + out <- check_data_file(data_file, data_dir, locus_type, seq_mode) + expect_true(out$valid) +}) + + +test_that("format_data_info", { + # snp indseq + data_file <- "indseq_SNP_sim_dataset_4POP_001.snp" + data_dir <- file.path(example_dir(), + "IndSeq_SNP_estim_param") + locus_type <- "snp" + seq_mode <- "indseq" + data_check <- check_data_file(data_file, data_dir, locus_type, seq_mode) + res <- format_data_info(data_check, locus_type, seq_mode) + + # snp poolseq + data_file <- "poolseq_SNP_sim_dataset_4POP_cov100_001.snp" + data_dir <- file.path(data4test_dir(), "PoolSeq_SNP_estim_param") + locus_type <- "snp" + seq_mode <- "poolseq" + data_check <- check_data_file(data_file, data_dir, locus_type, seq_mode) + res <- format_data_info(data_check, locus_type, seq_mode) + + # mss + data_file <- "mss_example_001.mss" + data_dir <- data4test_dir("mss") + locus_type <- "mss" + seq_mode <- NULL + data_check <- check_data_file(data_file, data_dir, locus_type, seq_mode) + res <- format_data_info(data_check, locus_type, seq_mode) +}) diff --git a/version b/version index 5b09c67c..9084fa2f 100644 --- a/version +++ b/version @@ -1 +1 @@ -1.0.14 +1.1.0