nf-core · abhi18av · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
diff --git a/assets/magma_samplesheet_test.csv b/assets/magma_samplesheet_test.csv
@@ -0,0 +1,4 @@
+Study,Sample,Library,Attempt,R1,R2,Flowcell,Lane,Index Sequence
+MAGMA,SRR26331590,1,1,https://zenodo.org/records/20671479/files/SRR26331590_R1.fastq.gz,https://zenodo.org/records/20671479/files/SRR26331590_R2.fastq.gz,UNNAMED,1,ATGCATGC
+MAGMA,SRR26331595,1,1,https://zenodo.org/records/20671479/files/SRR26331595_R1.fastq.gz,https://zenodo.org/records/20671479/files/SRR26331595_R2.fastq.gz,UNNAMED,1,ATGCATGC
+MAGMA,SRR26331599,1,1,https://zenodo.org/records/20671479/files/SRR26331599_R1.fastq.gz,https://zenodo.org/records/20671479/files/SRR26331599_R2.fastq.gz,UNNAMED,1,ATGCATGC
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -34,6 +34,9 @@
                 "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
             }
         },
-        "required": ["sample", "fastq_1"]
+        "anyOf": [
+            { "required": ["sample", "fastq_1"] },
+            { "required": ["Sample", "R1"] }
+        ]
     }
 }
diff --git a/bin/convert_ismapper_to_vcf.py b/bin/convert_ismapper_to_vcf.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2021-2024 MAGMA pipeline authors, see https://doi.org/10.1371/journal.pcbi.1011648
+#
+# This file is part of MAGMA pipeline, see https://github.com/TORCH-Consortium/MAGMA
+#
+# For quick overview of GPL-3 license, please refer
+# https://www.tldrlegal.com/license/gnu-general-public-license-v3-gpl-3
+#
+# - You MUST keep this license with original authors in your copy
+# - You MUST acknowledge the original source of this software
+# - You MUST state significant changes made to the original software
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program . If not, see <http://www.gnu.org/licenses/>.
+#
+
+import argparse
+import csv
+import glob
+from Bio import SeqIO
+
+# Define the VCF header
+vcf_header = """##fileformat=VCFv4.2
+##source=ISMapper
+##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
+##INFO=<ID=Orientation,Number=1,Type=String,Description="Orientation of the transposable element">
+##INFO=<ID=Gap,Number=1,Type=Integer,Description="Gap between sequences">
+##INFO=<ID=Call,Number=1,Type=String,Description="Call information">
+##INFO=<ID=Percent_ID,Number=1,Type=Float,Description="Percent identity">
+##INFO=<ID=Percent_cov,Number=1,Type=Float,Description="Percent coverage">
+##INFO=<ID=Left_gene,Number=1,Type=String,Description="Left gene">
+##INFO=<ID=Left_description,Number=1,Type=String,Description="Description of the left gene">
+##INFO=<ID=Left_strand,Number=1,Type=String,Description="Strand of the left gene">
+##INFO=<ID=Left_distance,Number=1,Type=Integer,Description="Distance to the left gene">
+##INFO=<ID=Right_gene,Number=1,Type=String,Description="Right gene">
+##INFO=<ID=Right_description,Number=1,Type=String,Description="Description of the right gene">
+##INFO=<ID=Right_strand,Number=1,Type=String,Description="Strand of the right gene">
+##INFO=<ID=Right_distance,Number=1,Type=Integer,Description="Distance to the right gene">
+##INFO=<ID=Gene_interruption,Number=1,Type=String,Description="Gene interruption status">
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
+#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE
+"""
+
+# Function to read the reference genome
+def read_reference_genome(reference_file):
+    reference_sequences = {}
+    reference_sequence_length = 0
+    with open(reference_file, 'r') as ref_file:
+        # Read the reference genome using Biopython
+        for record in SeqIO.parse(ref_file, "fasta"):
+            reference_sequences[record.id] = record.seq
+            reference_sequence_length = len(record.seq)
+    return reference_sequences, reference_sequence_length
+
+# Function to read the transposable element information
+def read_transposable_elements(query_file):
+    te_info = {}
+    with open(query_file, 'r') as query_file:
+        # Read the multifasta using Biopython
+        for record in SeqIO.parse(query_file, "fasta"):
+            te_name = record.id
+            te_info[te_name] = len(record.seq)
+
+        # NOTE: Renable in case we need to dump this on disk.
+        # with open("temp_transposable_elements.csv", mode='w', newline='') as file:
+        #     writer = csv.writer(file)
+        #     writer.writerow(["name", "length"])
+        #     for te_name, te_length in te_info.items():
+        #         writer.writerow([te_name, te_length])
+    return te_info
+
+# Function to convert ISMapper data to VCF format
+def convert_is_mapper_to_vcf(is_mapper_dir, vcf_file, reference_sequences, te_info):
+
+    is_mapper_txt_file = glob.glob(is_mapper_dir + "*.txt")[0]
+
+    with open(is_mapper_txt_file, 'r') as infile_is_mapper, open(vcf_file, 'w') as outfile:
+        # Write the VCF header
+        outfile.write(vcf_header)
+
+        # Read the ISMapper TSV file
+        reader_is_mapper = csv.DictReader(infile_is_mapper, delimiter='\t')
+
+        for row in reader_is_mapper:
+            chrom = "NC-000962-3-H37Rv"  # Using the specified chromosome
+            pos = int(row['x'])  # Convert position to integer
+            region_id = row['region']
+            ref = reference_sequences[chrom][pos - 1]  # Extract the reference allele from the reference sequence
+            orientation = row['orientation']
+            # FIXME Hard-code the name of this specific element
+            te_name = 'IS6110'
+            te_length = te_info.get(te_name, 'NA')
+            alt = f"{ref}[<{te_name},{orientation}>:{te_length}["  # Use transposable element, orientation, and its length
+            qual = '.'
+            filter_status = 'PASS'
+            info = (
+                f"SVTYPE=BND;"
+                f"Orientation={orientation};"
+                f"Gap={row['gap']};"
+                f"Call={row['call']};"
+                f"Percent_ID={row['percent_ID']};"
+                f"Percent_cov={row['percent_cov']};"
+                f"Left_gene={row['left_gene']};"
+                f"Left_description={row['left_description']};"
+                f"Left_strand={row['left_strand']};"
+                f"Left_distance={row['left_distance']};"
+                f"Right_gene={row['right_gene']};"
+                f"Right_description={row['right_description']};"
+                f"Right_strand={row['right_strand']};"
+                f"Right_distance={row['right_distance']};"
+                f"Gene_interruption={row['gene_interruption']}"
+            )
+            format_field = "GT:AD:DP:GQ:PL"
+            sample_field = "1:10,10:10:99:1800"
+
+            # Write the VCF entry
+            vcf_entry = f"{chrom}\t{pos}\t{region_id}\t{ref}\t{alt}\t{qual}\t{filter_status}\t{info}\t{format_field}\t{sample_field}\n"
+            outfile.write(vcf_entry)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Convert ISMapper output to VCF format.')
+    parser.add_argument('--is_mapper_dir', required=True, help='Path to the ISMapper output directory.')
+    parser.add_argument('--reference_file', required=True, help='Path to the reference genome file.')
+    parser.add_argument('--query_file', required=True, help='Path to the transposable elements multifasta file.')
+    parser.add_argument('--output_vcf_file', required=True, help='Path to the output VCF file.')
+
+    # Step 3: Parse the command-line arguments
+    args = parser.parse_args()
+
+    # Step 4: Replace hardcoded file paths with variables
+    is_mapper_dir = args.is_mapper_dir
+    vcf_file = args.output_vcf_file
+    reference_file = args.reference_file
+    query_file = args.query_file
+
+    # Read the reference genome sequence
+    reference_sequences, _ = read_reference_genome(reference_file)
+
+    # Read the transposable element information
+    query_info = read_transposable_elements(query_file)
+
+    # Run the conversion function
+    convert_is_mapper_to_vcf(is_mapper_dir, vcf_file, reference_sequences, query_info)
diff --git a/bin/fastq_cohort_validation.py b/bin/fastq_cohort_validation.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2021-2024 MAGMA pipeline authors, see https://doi.org/10.1371/journal.pcbi.1011648
+#
+# This file is part of MAGMA pipeline, see https://github.com/TORCH-Consortium/MAGMA
+#
+# For quick overview of GPL-3 license, please refer
+# https://www.tldrlegal.com/license/gnu-general-public-license-v3-gpl-3
+#
+# - You MUST keep this license with original authors in your copy
+# - You MUST acknowledge the original source of this software
+# - You MUST state significant changes made to the original software
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program . If not, see <http://www.gnu.org/licenses/>.
+#
+
+import glob
+import argparse
+import csv
+import json
+
+import pandas as pd
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Summarize the input FASTQ validation report')
+
+    parser.add_argument('magma_samplesheet', metavar='magma_samplesheet',  type=str, help='')
+
+    parser.add_argument('merged_fastq_reports', metavar='merged_fastq_reports',  type=str, help='')
+
+    parser.add_argument('magma_analysis', metavar='magma_analysis',  type=str, help='')
+
+
+    args = vars(parser.parse_args())
+
+    # ============================================
+    # Parse the validation reports for exact sample names which passed/failed
+    # ============================================
+
+    # Load the JSON file into a dictionary
+    with open(args['merged_fastq_reports'], 'r') as f:
+        fastq_report_dict = json.load(f)
+
+    with open(args['magma_samplesheet'], 'r') as f:
+        magma_samplesheet_json = json.load(f)
+
+    fastq_report_keys_list = list(fastq_report_dict.keys())
+
+    magma_analysis_json = {}
+
+    for elem in magma_samplesheet_json:
+        elem["fastq_report"] = {}
+
+        if elem['R1'] is not None:
+            fastq_1_name = elem['R1'].split("/")[-1]
+            elem["fastqs_approved"] = True
+            if fastq_1_name in fastq_report_keys_list:
+                elem["fastq_report"][fastq_1_name] = {"file": fastq_report_dict[fastq_1_name]}
+            else:
+                elem["fastq_report"][fastq_1_name] = {"fastq_utils_check": "failed"}
+                elem["fastqs_approved"] = False
+
+        if elem['R2'] != "" :
+            fastq_2_name = elem['R2'].split("/")[-1]
+            if fastq_2_name in fastq_report_keys_list:
+                elem["fastq_report"][fastq_2_name] = {"file": fastq_report_dict[fastq_2_name]}
+            else:
+                elem["fastq_report"][fastq_2_name] = {"fastq_utils_check": "failed"}
+                elem["fastqs_approved"] = False
+
+        magma_analysis_json[elem["magma_sample_name"]] = elem
+
+    with open(args['magma_analysis'], 'w') as f:
+        json.dump(magma_analysis_json, f, indent=4, ensure_ascii= False)
diff --git a/bin/generate_merged_cohort_stats.py b/bin/generate_merged_cohort_stats.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2021-2024 MAGMA pipeline authors, see https://doi.org/10.1371/journal.pcbi.1011648
+#
+# This file is part of MAGMA pipeline, see https://github.com/TORCH-Consortium/MAGMA
+#
+# For quick overview of GPL-3 license, please refer
+# https://www.tldrlegal.com/license/gnu-general-public-license-v3-gpl-3
+#
+# - You MUST keep this license with original authors in your copy
+# - You MUST acknowledge the original source of this software
+# - You MUST state significant changes made to the original software
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program . If not, see <http://www.gnu.org/licenses/>.
+#
+
+import argparse
+import pandas as pd
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Summarize the CALL_WF and MINOR_VARIANTS_ANALYSIS_WF analysis results')
+    parser.add_argument('--relabundance_approved_tsv', default="approved_samples.relabundance.tsv", metavar='relabundance_approved_tsv', type=str, help='File enlisting the approved samples from MINOR_VARIANTS_ANALYSIS_WF')
+    parser.add_argument('--relabundance_rejected_tsv', default="rejected_samples.relabundance.tsv", metavar='relabundance_rejected_tsv', type=str, help='File enlisting the rejected samples from MINOR_VARIANTS_ANALYSIS_WF')
+    parser.add_argument('--call_wf_cohort_stats_tsv', default="joint.cohort_stats.tsv", metavar='call_wf_cohort_stats_tsv', type=str, help='File enlisting the cohort results of CALL_WF')
+    parser.add_argument('--output_file', default="joint.merged_cohort_stats.tsv", metavar='output_file', type=str, help='Name of the output file merged cohort statistics')
+    args = vars(parser.parse_args())
+
+    # Read the TSV files into dataframes
+    df_cohort_stats = pd.read_csv(args['call_wf_cohort_stats_tsv'], sep="\t")
+    df_cohort_stats.columns = df_cohort_stats.columns.str.strip()
+    df_cohort_stats['SAMPLE'] = df_cohort_stats['SAMPLE'].str.strip()
+    df_cohort_stats = df_cohort_stats.set_index('SAMPLE')
+
+    df_approved_relabundance_stats = pd.read_csv(args['relabundance_approved_tsv'], sep="\t")
+    df_approved_relabundance_stats.columns = df_approved_relabundance_stats.columns.str.strip()
+    df_approved_relabundance_stats['SAMPLE'] = df_approved_relabundance_stats['SAMPLE'].str.strip()
+    df_approved_relabundance_stats = df_approved_relabundance_stats.set_index('SAMPLE')
+
+    df_rejected_relabundance_stats = pd.read_csv(args['relabundance_rejected_tsv'], sep="\t")
+    df_rejected_relabundance_stats.columns = df_rejected_relabundance_stats.columns.str.strip()
+    df_rejected_relabundance_stats['SAMPLE'] = df_rejected_relabundance_stats['SAMPLE'].str.strip()
+    df_rejected_relabundance_stats = df_rejected_relabundance_stats.set_index('SAMPLE')
+
+    # Join the datasets
+    df_relabundance_stats_concat = pd.concat([df_approved_relabundance_stats, df_rejected_relabundance_stats])
+    df_joint_cohort_stats = df_cohort_stats.join(df_relabundance_stats_concat, how="outer")
+
+    # Reorder the columns
+    df_joint_cohort_stats.columns = df_joint_cohort_stats.columns.str.strip()
+    new_cols = ['AVG_INSERT_SIZE', 'MAPPED_PERCENTAGE', 'RAW_TOTAL_SEQS', 'AVERAGE_BASE_QUALITY', 'MEAN_COVERAGE', 'SD_COVERAGE', 'MEDIAN_COVERAGE', 'MAD_COVERAGE', 'PCT_EXC_ADAPTER', 'PCT_EXC_MAPQ', 'PCT_EXC_DUPE', 'PCT_EXC_UNPAIRED', 'PCT_EXC_BASEQ', 'PCT_EXC_OVERLAP', 'PCT_EXC_CAPPED', 'PCT_EXC_TOTAL', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X', 'PCT_50X', 'PCT_100X', 'LINEAGES', 'FREQUENCIES', 'MAPPED_NTM_FRACTION_16S', 'MAPPED_NTM_FRACTION_16S_THRESHOLD_MET', 'COVERAGE_THRESHOLD_MET', 'BREADTH_OF_COVERAGE_THRESHOLD_MET', 'RELABUNDANCE_THRESHOLD_MET', 'ALL_THRESHOLDS_MET']
+    df_final_cohort_stats = df_joint_cohort_stats[new_cols]
+
+    # Impute the NaN value after join
+    df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'] = df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'].fillna(0)
+
+    # Prepare for boolean operation
+    df_final_cohort_stats['MAPPED_NTM_FRACTION_16S_THRESHOLD_MET'] = df_final_cohort_stats['MAPPED_NTM_FRACTION_16S_THRESHOLD_MET'].fillna(0).astype('Int64')
+    df_final_cohort_stats['COVERAGE_THRESHOLD_MET'] = df_final_cohort_stats['COVERAGE_THRESHOLD_MET'].fillna(0).astype('Int64')
+    df_final_cohort_stats['BREADTH_OF_COVERAGE_THRESHOLD_MET'] = df_final_cohort_stats['BREADTH_OF_COVERAGE_THRESHOLD_MET'].fillna(0).astype('Int64')
+    df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'] = df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'].fillna(0).astype('Int64')
+
+    # Derive the final threshold using Boolean operations
+    df_final_cohort_stats['ALL_THRESHOLDS_MET'] = (
+        df_final_cohort_stats['MAPPED_NTM_FRACTION_16S_THRESHOLD_MET'].apply(lambda x: bool(x) if pd.notna(x) else False) &
+        df_final_cohort_stats['COVERAGE_THRESHOLD_MET'].astype('bool') &
+        df_final_cohort_stats['BREADTH_OF_COVERAGE_THRESHOLD_MET'].astype('bool') &
+        df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'].astype('bool')
+    )
+    df_final_cohort_stats['ALL_THRESHOLDS_MET'] = df_final_cohort_stats['ALL_THRESHOLDS_MET'].replace({True: 1, False: 0})
+
+    # Write the final dataframe to file
+    df_final_cohort_stats.to_csv(args['output_file'], sep="\t")