Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions assets/magma_samplesheet_test.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Study,Sample,Library,Attempt,R1,R2,Flowcell,Lane,Index Sequence
MAGMA,SRR26331590,1,1,https://zenodo.org/records/20671479/files/SRR26331590_R1.fastq.gz,https://zenodo.org/records/20671479/files/SRR26331590_R2.fastq.gz,UNNAMED,1,ATGCATGC
MAGMA,SRR26331595,1,1,https://zenodo.org/records/20671479/files/SRR26331595_R1.fastq.gz,https://zenodo.org/records/20671479/files/SRR26331595_R2.fastq.gz,UNNAMED,1,ATGCATGC
MAGMA,SRR26331599,1,1,https://zenodo.org/records/20671479/files/SRR26331599_R1.fastq.gz,https://zenodo.org/records/20671479/files/SRR26331599_R2.fastq.gz,UNNAMED,1,ATGCATGC
5 changes: 4 additions & 1 deletion assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@
"errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
}
},
"required": ["sample", "fastq_1"]
"anyOf": [
{ "required": ["sample", "fastq_1"] },
{ "required": ["Sample", "R1"] }
]
}
}
158 changes: 158 additions & 0 deletions bin/convert_ismapper_to_vcf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
#!/usr/bin/env python3
#
# Copyright (c) 2021-2024 MAGMA pipeline authors, see https://doi.org/10.1371/journal.pcbi.1011648
#
# This file is part of MAGMA pipeline, see https://github.com/TORCH-Consortium/MAGMA
#
# For quick overview of GPL-3 license, please refer
# https://www.tldrlegal.com/license/gnu-general-public-license-v3-gpl-3
#
# - You MUST keep this license with original authors in your copy
# - You MUST acknowledge the original source of this software
# - You MUST state significant changes made to the original software
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program . If not, see <http://www.gnu.org/licenses/>.
#

import argparse
import csv
import glob
from Bio import SeqIO

# Define the VCF header
vcf_header = """##fileformat=VCFv4.2
##source=ISMapper
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##INFO=<ID=Orientation,Number=1,Type=String,Description="Orientation of the transposable element">
##INFO=<ID=Gap,Number=1,Type=Integer,Description="Gap between sequences">
##INFO=<ID=Call,Number=1,Type=String,Description="Call information">
##INFO=<ID=Percent_ID,Number=1,Type=Float,Description="Percent identity">
##INFO=<ID=Percent_cov,Number=1,Type=Float,Description="Percent coverage">
##INFO=<ID=Left_gene,Number=1,Type=String,Description="Left gene">
##INFO=<ID=Left_description,Number=1,Type=String,Description="Description of the left gene">
##INFO=<ID=Left_strand,Number=1,Type=String,Description="Strand of the left gene">
##INFO=<ID=Left_distance,Number=1,Type=Integer,Description="Distance to the left gene">
##INFO=<ID=Right_gene,Number=1,Type=String,Description="Right gene">
##INFO=<ID=Right_description,Number=1,Type=String,Description="Description of the right gene">
##INFO=<ID=Right_strand,Number=1,Type=String,Description="Strand of the right gene">
##INFO=<ID=Right_distance,Number=1,Type=Integer,Description="Distance to the right gene">
##INFO=<ID=Gene_interruption,Number=1,Type=String,Description="Gene interruption status">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE
"""

# Function to read the reference genome
def read_reference_genome(reference_file):
reference_sequences = {}
reference_sequence_length = 0
with open(reference_file, 'r') as ref_file:
# Read the reference genome using Biopython
for record in SeqIO.parse(ref_file, "fasta"):
reference_sequences[record.id] = record.seq
reference_sequence_length = len(record.seq)
return reference_sequences, reference_sequence_length

# Function to read the transposable element information
def read_transposable_elements(query_file):
te_info = {}
with open(query_file, 'r') as query_file:
# Read the multifasta using Biopython
for record in SeqIO.parse(query_file, "fasta"):
te_name = record.id
te_info[te_name] = len(record.seq)

# NOTE: Renable in case we need to dump this on disk.
# with open("temp_transposable_elements.csv", mode='w', newline='') as file:
# writer = csv.writer(file)
# writer.writerow(["name", "length"])
# for te_name, te_length in te_info.items():
# writer.writerow([te_name, te_length])
return te_info

# Function to convert ISMapper data to VCF format
def convert_is_mapper_to_vcf(is_mapper_dir, vcf_file, reference_sequences, te_info):

is_mapper_txt_file = glob.glob(is_mapper_dir + "*.txt")[0]

with open(is_mapper_txt_file, 'r') as infile_is_mapper, open(vcf_file, 'w') as outfile:
# Write the VCF header
outfile.write(vcf_header)

# Read the ISMapper TSV file
reader_is_mapper = csv.DictReader(infile_is_mapper, delimiter='\t')

for row in reader_is_mapper:
chrom = "NC-000962-3-H37Rv" # Using the specified chromosome
pos = int(row['x']) # Convert position to integer
region_id = row['region']
ref = reference_sequences[chrom][pos - 1] # Extract the reference allele from the reference sequence
orientation = row['orientation']
# FIXME Hard-code the name of this specific element
te_name = 'IS6110'
te_length = te_info.get(te_name, 'NA')
alt = f"{ref}[<{te_name},{orientation}>:{te_length}[" # Use transposable element, orientation, and its length
qual = '.'
filter_status = 'PASS'
info = (
f"SVTYPE=BND;"
f"Orientation={orientation};"
f"Gap={row['gap']};"
f"Call={row['call']};"
f"Percent_ID={row['percent_ID']};"
f"Percent_cov={row['percent_cov']};"
f"Left_gene={row['left_gene']};"
f"Left_description={row['left_description']};"
f"Left_strand={row['left_strand']};"
f"Left_distance={row['left_distance']};"
f"Right_gene={row['right_gene']};"
f"Right_description={row['right_description']};"
f"Right_strand={row['right_strand']};"
f"Right_distance={row['right_distance']};"
f"Gene_interruption={row['gene_interruption']}"
)
format_field = "GT:AD:DP:GQ:PL"
sample_field = "1:10,10:10:99:1800"

# Write the VCF entry
vcf_entry = f"{chrom}\t{pos}\t{region_id}\t{ref}\t{alt}\t{qual}\t{filter_status}\t{info}\t{format_field}\t{sample_field}\n"
outfile.write(vcf_entry)

if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Convert ISMapper output to VCF format.')
parser.add_argument('--is_mapper_dir', required=True, help='Path to the ISMapper output directory.')
parser.add_argument('--reference_file', required=True, help='Path to the reference genome file.')
parser.add_argument('--query_file', required=True, help='Path to the transposable elements multifasta file.')
parser.add_argument('--output_vcf_file', required=True, help='Path to the output VCF file.')

# Step 3: Parse the command-line arguments
args = parser.parse_args()

# Step 4: Replace hardcoded file paths with variables
is_mapper_dir = args.is_mapper_dir
vcf_file = args.output_vcf_file
reference_file = args.reference_file
query_file = args.query_file

# Read the reference genome sequence
reference_sequences, _ = read_reference_genome(reference_file)

# Read the transposable element information
query_info = read_transposable_elements(query_file)

# Run the conversion function
convert_is_mapper_to_vcf(is_mapper_dir, vcf_file, reference_sequences, query_info)
85 changes: 85 additions & 0 deletions bin/fastq_cohort_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/env python3
#
# Copyright (c) 2021-2024 MAGMA pipeline authors, see https://doi.org/10.1371/journal.pcbi.1011648
#
# This file is part of MAGMA pipeline, see https://github.com/TORCH-Consortium/MAGMA
#
# For quick overview of GPL-3 license, please refer
# https://www.tldrlegal.com/license/gnu-general-public-license-v3-gpl-3
#
# - You MUST keep this license with original authors in your copy
# - You MUST acknowledge the original source of this software
# - You MUST state significant changes made to the original software
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program . If not, see <http://www.gnu.org/licenses/>.
#

import glob
import argparse
import csv
import json

import pandas as pd

if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Summarize the input FASTQ validation report')

parser.add_argument('magma_samplesheet', metavar='magma_samplesheet', type=str, help='')

parser.add_argument('merged_fastq_reports', metavar='merged_fastq_reports', type=str, help='')

parser.add_argument('magma_analysis', metavar='magma_analysis', type=str, help='')


args = vars(parser.parse_args())

# ============================================
# Parse the validation reports for exact sample names which passed/failed
# ============================================

# Load the JSON file into a dictionary
with open(args['merged_fastq_reports'], 'r') as f:
fastq_report_dict = json.load(f)

with open(args['magma_samplesheet'], 'r') as f:
magma_samplesheet_json = json.load(f)

fastq_report_keys_list = list(fastq_report_dict.keys())

magma_analysis_json = {}

for elem in magma_samplesheet_json:
elem["fastq_report"] = {}

if elem['R1'] is not None:
fastq_1_name = elem['R1'].split("/")[-1]
elem["fastqs_approved"] = True
if fastq_1_name in fastq_report_keys_list:
elem["fastq_report"][fastq_1_name] = {"file": fastq_report_dict[fastq_1_name]}
else:
elem["fastq_report"][fastq_1_name] = {"fastq_utils_check": "failed"}
elem["fastqs_approved"] = False

if elem['R2'] != "" :
fastq_2_name = elem['R2'].split("/")[-1]
if fastq_2_name in fastq_report_keys_list:
elem["fastq_report"][fastq_2_name] = {"file": fastq_report_dict[fastq_2_name]}
else:
elem["fastq_report"][fastq_2_name] = {"fastq_utils_check": "failed"}
elem["fastqs_approved"] = False

magma_analysis_json[elem["magma_sample_name"]] = elem

with open(args['magma_analysis'], 'w') as f:
json.dump(magma_analysis_json, f, indent=4, ensure_ascii= False)
83 changes: 83 additions & 0 deletions bin/generate_merged_cohort_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/usr/bin/env python3
#
# Copyright (c) 2021-2024 MAGMA pipeline authors, see https://doi.org/10.1371/journal.pcbi.1011648
#
# This file is part of MAGMA pipeline, see https://github.com/TORCH-Consortium/MAGMA
#
# For quick overview of GPL-3 license, please refer
# https://www.tldrlegal.com/license/gnu-general-public-license-v3-gpl-3
#
# - You MUST keep this license with original authors in your copy
# - You MUST acknowledge the original source of this software
# - You MUST state significant changes made to the original software
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program . If not, see <http://www.gnu.org/licenses/>.
#

import argparse
import pandas as pd

if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Summarize the CALL_WF and MINOR_VARIANTS_ANALYSIS_WF analysis results')
parser.add_argument('--relabundance_approved_tsv', default="approved_samples.relabundance.tsv", metavar='relabundance_approved_tsv', type=str, help='File enlisting the approved samples from MINOR_VARIANTS_ANALYSIS_WF')
parser.add_argument('--relabundance_rejected_tsv', default="rejected_samples.relabundance.tsv", metavar='relabundance_rejected_tsv', type=str, help='File enlisting the rejected samples from MINOR_VARIANTS_ANALYSIS_WF')
parser.add_argument('--call_wf_cohort_stats_tsv', default="joint.cohort_stats.tsv", metavar='call_wf_cohort_stats_tsv', type=str, help='File enlisting the cohort results of CALL_WF')
parser.add_argument('--output_file', default="joint.merged_cohort_stats.tsv", metavar='output_file', type=str, help='Name of the output file merged cohort statistics')
args = vars(parser.parse_args())

# Read the TSV files into dataframes
df_cohort_stats = pd.read_csv(args['call_wf_cohort_stats_tsv'], sep="\t")
df_cohort_stats.columns = df_cohort_stats.columns.str.strip()
df_cohort_stats['SAMPLE'] = df_cohort_stats['SAMPLE'].str.strip()
df_cohort_stats = df_cohort_stats.set_index('SAMPLE')

df_approved_relabundance_stats = pd.read_csv(args['relabundance_approved_tsv'], sep="\t")
df_approved_relabundance_stats.columns = df_approved_relabundance_stats.columns.str.strip()
df_approved_relabundance_stats['SAMPLE'] = df_approved_relabundance_stats['SAMPLE'].str.strip()
df_approved_relabundance_stats = df_approved_relabundance_stats.set_index('SAMPLE')

df_rejected_relabundance_stats = pd.read_csv(args['relabundance_rejected_tsv'], sep="\t")
df_rejected_relabundance_stats.columns = df_rejected_relabundance_stats.columns.str.strip()
df_rejected_relabundance_stats['SAMPLE'] = df_rejected_relabundance_stats['SAMPLE'].str.strip()
df_rejected_relabundance_stats = df_rejected_relabundance_stats.set_index('SAMPLE')

# Join the datasets
df_relabundance_stats_concat = pd.concat([df_approved_relabundance_stats, df_rejected_relabundance_stats])
df_joint_cohort_stats = df_cohort_stats.join(df_relabundance_stats_concat, how="outer")

# Reorder the columns
df_joint_cohort_stats.columns = df_joint_cohort_stats.columns.str.strip()
new_cols = ['AVG_INSERT_SIZE', 'MAPPED_PERCENTAGE', 'RAW_TOTAL_SEQS', 'AVERAGE_BASE_QUALITY', 'MEAN_COVERAGE', 'SD_COVERAGE', 'MEDIAN_COVERAGE', 'MAD_COVERAGE', 'PCT_EXC_ADAPTER', 'PCT_EXC_MAPQ', 'PCT_EXC_DUPE', 'PCT_EXC_UNPAIRED', 'PCT_EXC_BASEQ', 'PCT_EXC_OVERLAP', 'PCT_EXC_CAPPED', 'PCT_EXC_TOTAL', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X', 'PCT_50X', 'PCT_100X', 'LINEAGES', 'FREQUENCIES', 'MAPPED_NTM_FRACTION_16S', 'MAPPED_NTM_FRACTION_16S_THRESHOLD_MET', 'COVERAGE_THRESHOLD_MET', 'BREADTH_OF_COVERAGE_THRESHOLD_MET', 'RELABUNDANCE_THRESHOLD_MET', 'ALL_THRESHOLDS_MET']
df_final_cohort_stats = df_joint_cohort_stats[new_cols]

# Impute the NaN value after join
df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'] = df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'].fillna(0)

# Prepare for boolean operation
df_final_cohort_stats['MAPPED_NTM_FRACTION_16S_THRESHOLD_MET'] = df_final_cohort_stats['MAPPED_NTM_FRACTION_16S_THRESHOLD_MET'].fillna(0).astype('Int64')
df_final_cohort_stats['COVERAGE_THRESHOLD_MET'] = df_final_cohort_stats['COVERAGE_THRESHOLD_MET'].fillna(0).astype('Int64')
df_final_cohort_stats['BREADTH_OF_COVERAGE_THRESHOLD_MET'] = df_final_cohort_stats['BREADTH_OF_COVERAGE_THRESHOLD_MET'].fillna(0).astype('Int64')
df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'] = df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'].fillna(0).astype('Int64')

# Derive the final threshold using Boolean operations
df_final_cohort_stats['ALL_THRESHOLDS_MET'] = (
df_final_cohort_stats['MAPPED_NTM_FRACTION_16S_THRESHOLD_MET'].apply(lambda x: bool(x) if pd.notna(x) else False) &
df_final_cohort_stats['COVERAGE_THRESHOLD_MET'].astype('bool') &
df_final_cohort_stats['BREADTH_OF_COVERAGE_THRESHOLD_MET'].astype('bool') &
df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'].astype('bool')
)
df_final_cohort_stats['ALL_THRESHOLDS_MET'] = df_final_cohort_stats['ALL_THRESHOLDS_MET'].replace({True: 1, False: 0})

# Write the final dataframe to file
df_final_cohort_stats.to_csv(args['output_file'], sep="\t")
Loading
Loading