Commit 0579112

Merge pull request #5 from cellgeni/metadata_table
new usage - metadata table and warnings
2 parents 8776d1f + bd627cb commit 0579112

10 files changed (+878 −357 lines)

README.md (+63 −27)
````diff
@@ -2,45 +2,81 @@
 This Nextflow pipeline pulls samples from iRODS and converts them to FASTQ files.
 
 ## Contents of Repo
-* `main.nf` - the Nextflow pipeline that pulls the samples and converts them.
-* `nextflow.config` - the configuration script that controls the cluster scheduler, process and container.
-* `examples/samples.csv` - an example samples.csv file, contains one column with sample names (header 'sample' is required).
+* `main.nf` - the Nextflow pipeline that runs all workflows
+* `modules/metatable.nf` - a collection of processes that retrieve IRODS metadata for the samples listed in the `--findmeta <samples.csv>` file
+* `modules/getfiles.nf` - a collection of processes that download the data (`.cram` or `.bam` files) from IRODS and convert it to `.fastq.gz` files
+* `modules/upload2ftp.nf` - a collection of processes that upload a list of `.fastq.gz` files to an FTP server (specified in `nextflow.config`)
+* `nextflow.config` - the configuration script that controls the cluster scheduler, processes and containers
+* `bin/parser.py` - script that parses metadata from `imeta ls` output and saves it in `.json` format
+* `bin/combine_meta.py` - script that combines all metadata in `.json` format and saves it to a `.tsv` file
+* `examples/samples.csv` - an example samples.csv file, contains one column with sample names (no header required)
 * `examples/run.sh` - an example run script that executes the pipeline.
 
 ## Pipeline Arguments
-* `--meta`: A metadata CSV with the sample IDs, and possibly other iRODS parameters. Header relevant, as it specifies iRODS metadata field names (required)
-* `--publish_dir`: Path to put the output files of the pipeline. (default `'results'`)
-* `--type`: Other potential arguments, though not mandatory. If ATAC, set this to 'ATAC' (default `null`)
+* `--findmeta`: specify a .csv file with sample names to run a metadata search
+* `--cram2fastq`: if specified, runs conversion of the CRAM files found during the `findmeta` step
+* `--meta`: specifies the .tsv file with CRAM files (potentially from the `findmeta` step) on which to run the cram2fastq conversion
+* `--publish_dir`: path to put the output files of the pipeline (default `'results'`)
 * `--index_format`: index-format formula for samtools, only if you really know what you're doing (default `"i*i*"`)
-* `--publish_fastqs`: Whether to publish fastqs - other workflows using this may not want to (default `true`)
-* `--find_crams_only`: For advanced CRAM list manipulation. Only return the found SAMPLE,CRAM list, e.g. for manual curation/manipulation (default `false`)
-* `--cram_list`: Accept SAMPLE,CRAM list on input (default `null`)
-* `--merge`: Concatenate FASTQ files for samples with multiple lanes or sample numbers. Only one R1, R2 (and optionally R3) will be generated per sample. (default `false`)
-* `--ftp_upload`: Upload the resulting files to an FTP (default `false`).
-  * Use in combination with `--ftp_credenials`, `--ftp_host` and `--ftp_path`.
+* `--toftp`: upload the resulting files to the ArrayExpress FTP server (default `false`).
+  * Use in combination with `--ftp_credenials`, `--ftp_host` and `--ftp_path`
+* `--fastqfiles`: specifies the .fastq.gz files (potentially from the `cram2fastq` step) to upload to the ArrayExpress FTP server
 
+## Examples of use
+1. Run a metadata search for a specified list of samples:
+```shell
+nextflow run main.nf --findmeta ./examples/samples.csv
+```
+
+2. Download CRAM files (specified in metadata.tsv) from IRODS and convert them to FASTQ:
+```shell
+nextflow run main.nf --cram2fastq --meta metadata/metadata.tsv
+```
+
+3. Upload FASTQ files to the FTP server (you need to set up the FTP server in `nextflow.config`):
+```shell
+nextflow run main.nf --toftp --fastqfiles ./results/
+```
+
+4. Combine several steps to run them together:
+```shell
+nextflow run main.nf --findmeta ./examples/samples.csv --cram2fastq --toftp
+```
 
 ## Graph
 ```mermaid
 ---
 title: Nextflow pipeline for retrieving CRAM files stored in iRODS and converting them to FASTQ
 ---
-flowchart LR
-subgraph "find and pull CRAM files"
+flowchart TB
+subgraph findmeta["Find CRAM metadata"]
+    direction LR
     v0([findCrams])
-    v1([combineCramLists])
-    v2([downloadCram])
-    v3([renameCram])
+    v1([getMetadata])
+    v2([parseMetadata])
+    v3([combineMetadata])
+end
+subgraph downloadcrams["Convert CRAMs --> FASTQ"]
+    direction LR
+    v4([downloadCram])
+    v5([cramToFastq])
+    v6([calculateReadLength])
+    v7([checkATAC])
+    v8([renameATAC])
+    v9([saveMetaToJson])
+    v10([updateMetadata])
 end
-v4(((cramToFastq)))
-subgraph "optional steps"
-    v5([merge])
-    v6([uploadFTP])
+subgraph uploadtoftp["Upload data to FTP"]
+    direction LR
+    v11([concatFastqs])
+    v12([uploadFTP])
 end
-v0 --> v1
-v1 --> v2
-v2 --> v3
-v3 --> v4
-v4 --> v5
-v5 --> v6
+v0 --> v1 --> v2 --> v3
+v4 --> v5 --> v6 --> v7{10X ATAC}
+v11 --> v12
+v7 --YES--> v8
+v8 --> v9
+v7 --NO--> v9
+v9 --> v10
+findmeta -.-> downloadcrams -.-> uploadtoftp
 ```
````
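
For reference, the `--findmeta` input described in the new README is a single-column list of sample names with no header. A minimal sketch of such a `samples.csv` (the sample names below are placeholders, not real IDs):

```
sample1
sample2
sample3
```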

bin/combine_meta.py (+250, new file)
```python
#!/usr/bin/env python

import os
import sys
import json
import csv
from typing import List, Dict, Any
from collections import defaultdict
import logging
import argparse


PARSER = argparse.ArgumentParser(
    description="Reads metadata from a set of .json files and combines everything into a .tsv file"
)
PARSER.add_argument(
    "dir",
    metavar="dir",
    type=str,
    help="specify a path to the directory with a set of .json files you want to combine",
)
PARSER.add_argument(
    "-a",
    "--validate_all",
    help="if specified, runs all validation steps; if not, runs library type validation only",
    action="store_true",
)


class ColoredFormatter(logging.Formatter):
    blue = "\n\033[94m"
    yellow = "\033[93m"
    reset = "\033[0m"
    format = "%(levelname)s: %(message)s"

    FORMATS = {
        logging.INFO: blue + format + reset,
        logging.WARNING: yellow + format + reset,
    }

    def format(self, record):
        log_fmt = self.FORMATS.get(record.levelno)
        formatter = logging.Formatter(log_fmt)
        return formatter.format(record)


def setup_logging() -> None:
    """
    Set up the logging configuration of the script
    """
    # a basic config to save logs to metadata.log
    logging.basicConfig(
        level=logging.INFO,
        format="%(levelname)s: %(message)s",
        filename="metadata.log",
        filemode="w",
    )

    # define a Handler which writes INFO messages or higher to stdout
    console = logging.StreamHandler(sys.stdout)
    console.setLevel(logging.INFO)
    # tell the handler to use the colored format
    console.setFormatter(ColoredFormatter())
    # add the handler to the root logger
    logging.getLogger("").addHandler(console)


def get_sampleindex(meta_list: List[Dict[str, Any]]) -> Dict[str, List[int]]:
    """
    Get the indexes of all entries belonging to each unique sample
    meta_list (List[Dict[str, Any]]): a list containing metadata for all samples
    return (Dict[str, List[int]]): indexes of all entries for each unique sample in the list
    """
    sample_index = defaultdict(list)
    for i, sample_meta in enumerate(meta_list):
        sample_index[sample_meta["sample"]].append(i)
    return sample_index


def validate_filenames(meta_list: List[Dict[str, Any]]) -> None:
    """
    Check if there are duplicated FASTQ filenames in the metadata list
    meta_list (List[Dict[str, Any]]): a list containing metadata for all samples
    """
    # get duplicated filenames
    filenames = [meta["fastq_prefix"] for meta in meta_list]
    duplicated_filenames = {name for name in filenames if filenames.count(name) > 1}
    # raise a warning
    if len(duplicated_filenames) != 0:
        message = "There are duplicated filenames:\n" + "\n".join(duplicated_filenames)
        logging.warning(message)


def raise_sample_warning(sample: str, warning_messages: List[str]) -> None:
    """
    Print a header with the sample name if any warnings were collected, then log them all
    sample (str): the sample for which the warnings are raised
    warning_messages (List[str]): a list of warnings for a particular sample
    """
    if warning_messages:
        # print a header with the sample name
        logging.info(f"Sample {sample}:")
        # raise all warnings
        for message in warning_messages:
            logging.warning(message)


def validate_consistency(
    sample: str,
    meta_list: List[Dict[str, Any]],
    column: str,
    warning_messages: List[str],
) -> None:
    """
    Check if there are multiple values in `column`
    sample (str): sample name
    meta_list (List[Dict[str, Any]]): a list containing metadata for all files of a particular sample
    column (str): a column of interest in `meta_list`
    warning_messages (List[str]): a list of warning messages from previous validation steps
    """
    # get unique values
    unique_values = {meta.get(column, "NaN") for meta in meta_list}
    if len(unique_values) > 1:
        # make a warning message
        warning_message = (
            f"There are multiple values of {column} available: "
            + ",".join(unique_values)
        )
        # save the warning message to the list
        warning_messages.append(warning_message)


def validate_readcounts(
    sample: str, meta_list: List[Dict[str, Any]], warning_messages: List[str]
) -> None:
    """
    Check if the IRODS total_reads count equals the number of reads processed by samtools
    sample (str): sample name
    meta_list (List[Dict[str, Any]]): a list containing metadata for all files of a particular sample
    warning_messages (List[str]): a list of warning messages from previous validation steps
    """
    # get files with an inconsistent total number of reads
    warning_list = [
        cram_meta["cram_path"]
        for cram_meta in meta_list
        if cram_meta["total_reads"] != cram_meta["num_reads_processed"]
    ]
    if warning_list:
        # make a warning message
        warning_message = (
            "IRODS total_count != num_reads_processed for files: "
            + ",".join(warning_list)
        )
        # save the warning message to the list
        warning_messages.append(warning_message)


def validate_atac(
    sample: str, meta_list: List[Dict[str, Any]], warning_messages: List[str]
) -> None:
    """
    Check if any files are suspected to be 10X ATAC (library type contains 'atac' and index 2 is 24 bp long)
    sample (str): sample name
    meta_list (List[Dict[str, Any]]): a list containing metadata for all files of a particular sample
    warning_messages (List[str]): a list of warning messages from previous validation steps
    """
    warning_list = [
        cram_meta["cram_path"]
        for cram_meta in meta_list
        if "atac" in cram_meta["library_type"].lower() and cram_meta["i2len"] == "24"
    ]
    if warning_list:
        # make a warning message
        warning_title = "The following files are suspected to be 10X ATAC. They were renamed according to the CellRanger naming convention: "
        warning_message = warning_title + ",".join(warning_list)
        # save the warning message to the list
        warning_messages.append(warning_message)


def validate_metalist(meta_list: List[Dict[str, Any]], validate_all: bool) -> None:
    """
    Validate metadata values in a list of columns
    meta_list (List[Dict[str, Any]]): a list containing metadata for all files of all samples
    validate_all (bool): if True, run all validation steps, otherwise library type validation only
    """
    # get sample indexes
    sample_index = get_sampleindex(meta_list)

    # check if there are duplicated filenames
    validate_filenames(meta_list)

    # validate cram files for each sample
    for sample, indexes in sample_index.items():
        warning_messages = list()
        # subset the metadata list to the current sample
        subsample_metalist = [meta_list[idx] for idx in indexes]
        # validate metadata
        validate_consistency(
            sample, subsample_metalist, "library_type", warning_messages
        )
        if validate_all:
            validate_readcounts(sample, subsample_metalist, warning_messages)
            validate_consistency(sample, subsample_metalist, "r1len", warning_messages)
            validate_consistency(sample, subsample_metalist, "r2len", warning_messages)
            validate_atac(sample, subsample_metalist, warning_messages)
        # raise all warning messages
        raise_sample_warning(sample, warning_messages)


def main() -> None:
    # set up logging
    setup_logging()

    # parse arguments
    args = PARSER.parse_args()

    # read the positional argument with the directory path
    dirpath = args.dir.rstrip("/")

    # read all json files into meta_list
    meta_list = list()

    for filename in os.listdir(dirpath):
        # skip anything that is not a .json file
        if not filename.endswith(".json"):
            continue
        with open(f"{dirpath}/{filename}", "r") as file:
            # reading the json file
            meta_list.append(json.load(file))

    # sort the data by sample name
    meta_list = sorted(meta_list, key=lambda x: x["sample"])

    # save the field names (all files are assumed to share the same keys)
    fieldnames = meta_list[0].keys()

    # validate metadata
    validate_metalist(meta_list, args.validate_all)

    # write all metadata to a tsv file
    with open("metadata.tsv", mode="w") as csv_file:
        # create a writer object
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter="\t")

        # write the data
        writer.writeheader()
        for sample_meta in meta_list:
            writer.writerow(sample_meta)


if __name__ == "__main__":
    main()
```
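
For illustration, the script can also be run standalone on a directory of per-sample `.json` files (such as those produced by `bin/parser.py`); the directory path below is hypothetical. It writes the combined table to `metadata.tsv` and logs warnings both to the console and to `metadata.log`:

```shell
# combine all .json metadata files in a directory into metadata.tsv,
# running the full set of validation checks (-a / --validate_all)
python bin/combine_meta.py path/to/json_dir --validate_all
```

Each input `.json` file is expected to contain at least the fields the validators reference: `sample`, `fastq_prefix`, `cram_path`, `total_reads`, `num_reads_processed`, `library_type`, `r1len`, `r2len` and `i2len`.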
