|
| 1 | +#============================================================= |
| 2 | +# INTRODUCTION |
| 3 | + |
| 4 | +# This script is run in order to organise data from the RELECOV analyses according to their corresponding epidemiological weeks. |
| 5 | +# By default, all data will be stored in a folder called surveillance_files. |
| 6 | +# Within this folder, different subfolders will be created, each one referring to a certain epidemiological week. |
| 7 | +# Inside each subfolder, the following items are stored: |
| 8 | +## - epidemiological_data.xlsx: an excel file containing lineage information for all the samples from a given week. This information is also aggregated in another sheet. |
| 9 | +## - variant_data.csv: a .csv file containing information regarding the variants identified for all the samples associated with a given week.
| 10 | +## - consensus_files: a subfolder containing all the consensus.fa files obtained after the analysis of samples. |
| 11 | + |
| 12 | +#============================================================= |
| 13 | + |
| 14 | +#============================================================= |
| 15 | +# EXAMPLES OF USE |
| 16 | + |
| 17 | +# This script processes bioinfo_lab_metadata*.json and long_table_*.json files. |
| 18 | +# This script can either read these files if they are all stored within the same location, or read .txt files which indicate the paths to these files. |
| 19 | + |
| 20 | +# Use the -i option to indicate the path where these files are. |
| 21 | +## Example: python3 create_summary_tables.py -i ./path |
| 22 | + |
| 23 | +# If your files are located in different locations, use the -b and -l options to indicate the names of the .txt files that must contain the paths to the .json files. |
| 24 | +## Example: python3 create_summary_tables.py -b bioinfo_files.txt -l long_table_files.txt |
| 25 | +## Example of what .txt files look like (considering this script is being run from /data/bioinfoshare/UCCT_Relecov): |
| 26 | +### COD-2402-AND-HUCSC/20240604104459/long_table_20241119092541.json |
| 27 | +### COD-2402-AND-HUCSC/20240911160822/long_table_20241118182618.json |
| 28 | +### COD-2403-CAT-HUVH/20240409103006/long_table_20240912110739.json |
| 29 | + |
| 30 | +# If you want to copy the consensus.fa files into each subfolder, write the -c or --copy-fasta option when running the script. |
| 31 | +## Example: python3 create_summary_tables.py -b bioinfo_files.txt -l long_table_files.txt -c |
| 32 | + |
| 33 | +# If you want to generate data only in relation to a certain epidemiological week, use the -w option (using the YYYY-WW format). |
| 34 | +## Example: python3 create_summary_tables.py -b bioinfo_files.txt -l long_table_files.txt -w 2025-01 |
| 35 | + |
| 36 | +#============================================================= |
| 37 | + |
| 38 | +import os |
| 39 | +import json |
| 40 | +import argparse |
| 41 | +import shutil |
| 42 | +import pandas as pd |
| 43 | +from datetime import datetime |
| 44 | + |
# Function to determine the epidemiological week associated to a certain date.
def get_epi_week(date_str):
    """Return the ISO epidemiological week of a date as a "YYYY-WW" string.

    The ISO-8601 calendar is used, so dates near a year boundary belong to
    the ISO year of their week (e.g. 2024-12-30 maps to "2025-01").

    Raises ValueError if date_str is not a valid YYYY-MM-DD date.
    """
    iso = datetime.strptime(date_str, "%Y-%m-%d").isocalendar()
    return f"{iso[0]}-{iso[1]:02d}"
| 50 | + |
# Function to search .json files in the paths indicated in the provided .txt files
# (specifically, bioinfo_lab_metadata and long_table .json files), read them,
# extract the relevant information and generate tables.
# The work is split into small private helpers; process_json_files() is the
# only public entry point and its signature is unchanged.

def _read_path_list(list_path):
    # Return the non-empty, stripped lines of a .txt file that lists file paths.
    with open(list_path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]


def _find_json_files(input_dir, prefix):
    # Return every file in input_dir whose name matches <prefix>*.json.
    return [
        os.path.join(input_dir, filename)
        for filename in os.listdir(input_dir)
        if filename.startswith(prefix) and filename.endswith(".json")
    ]


def _load_json_file(filepath):
    # Load one JSON file. Returns None (after printing a diagnostic) when the
    # file is missing or corrupt, so the caller can simply skip it.
    if not os.path.exists(filepath):
        print(f"Warning! The file {filepath} could not be found. Please make sure the path is correct.")
        return None
    with open(filepath, "r", encoding="utf-8") as file:
        try:
            return json.load(file)
        except json.JSONDecodeError:
            print(f"Error! Could not read {filepath} properly, please make sure the file is not corrupt.")
            return None


def _collect_variant_rows(week_df, sample_variant_data):
    # Flatten the "variants" entries of every sample in week_df into one row
    # per variant, filling missing fields with "-".
    rows = []
    for sample_id in week_df["SAMPLE_ID"]:
        sample_entry = sample_variant_data.get(sample_id)
        if sample_entry is None:
            continue
        for variant in sample_entry.get("variants", []):
            rows.append({
                "SAMPLE": variant.get("sample", "-"),
                "CHROM": variant.get("chromosome", "-"),
                "POS": variant.get("pos", "-"),
                "ALT": variant.get("alt", "-"),
                "REF": variant.get("ref", "-"),
                "FILTER": variant.get("Filter", "-"),  # NOTE: source key really is capitalised
                "DP": variant.get("dp", "-"),
                "REF_DP": variant.get("ref_dp", "-"),
                "ALT_DP": variant.get("alt_dp", "-"),
                "AF": variant.get("af", "-"),
                "GENE": variant.get("gene", "-"),
                "EFFECT": variant.get("effect", "-"),
                "HGVS_C": variant.get("hgvs_c", "-"),
                "HGVS_P": variant.get("hgvs_p", "-"),
                "HGVS_P_1LETTER": variant.get("hgvs_p_1_letter", "-"),
                "CALLER": variant.get("caller", "-"),
                "LINEAGE": variant.get("lineage", "-")
            })
    return rows


def process_json_files(input_dir=None, metadata_list=None, long_table_list=None, output_dir="surveillance_files", specified_week=None, copy_fasta=False):
    """Organise RELECOV sample data into per-epidemiological-week folders.

    For every week present in the input data, a subfolder of output_dir is
    created containing:
      - epidemiological_data.xlsx: per-sample and aggregated lineage data.
      - variant_data.csv: variant calls for the week's samples.
      - consensus_files/: copies of consensus.fa files (only with copy_fasta).

    Parameters:
        input_dir: directory holding both bioinfo_lab_metadata_*.json and
            long_table_*.json files; used only when the corresponding .txt
            list argument is not given.
        metadata_list: .txt file listing bioinfo_lab_metadata_*.json paths.
        long_table_list: .txt file listing long_table_*.json paths.
        output_dir: root folder for the per-week subfolders.
        specified_week: optional YYYY-WW filter; only that week is processed.
        copy_fasta: when True, copy each sample's consensus.fa file.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Resolve the two input file sets: an explicit .txt list takes precedence
    # over scanning input_dir.
    bioinfo_files = []
    if metadata_list:
        bioinfo_files = _read_path_list(metadata_list)
    elif input_dir:
        bioinfo_files = _find_json_files(input_dir, "bioinfo_lab_metadata_")

    long_table_files = []
    if long_table_list:
        long_table_files = _read_path_list(long_table_list)
    elif input_dir:
        long_table_files = _find_json_files(input_dir, "long_table_")

    all_data = []
    fa_files = []
    sample_variant_data = {}

    # Processing of bioinfo_lab_metadata_*.json.
    for filepath in bioinfo_files:
        data = _load_json_file(filepath)
        if data is None:
            continue
        for sample in data:
            if "sample_collection_date" not in sample:
                continue
            try:
                week = get_epi_week(sample["sample_collection_date"])
            except ValueError:
                # A single malformed date must not abort the whole run
                # (previously it raised and stopped everything).
                print(f"Warning! Invalid sample_collection_date '{sample['sample_collection_date']}' in {filepath}, skipping sample.")
                continue
            if specified_week and week != specified_week:
                continue
            all_data.append({
                "HOSPITAL_ID": sample.get("submitting_institution_id", "-"),
                "HOSPITAL": sample.get("collecting_institution", "-"),
                "PROVINCE": sample.get("geo_loc_region", "-"),
                "SAMPLE_ID": sample.get("sequencing_sample_id", "-"),
                "SAMPLE_COLLECTION_DATE": sample.get("sample_collection_date", "-"),
                "LINEAGE": sample.get("lineage_name", "-"),
                "WEEK": week
            })

            # Search of consensus.fa files.
            fa_path = sample.get("viralrecon_filepath_mapping_consensus")
            if copy_fasta and fa_path and os.path.exists(fa_path):
                fa_files.append((fa_path, week))

    if not all_data:
        # Either no metadata file was readable or the week filter matched no sample.
        print("No sample data was found in the bioinfo_lab_metadata_*.json files.")
        return

    # Processing of long_table_*.json.
    for filepath in long_table_files:
        data = _load_json_file(filepath)
        if data is None:
            continue
        for sample in data:
            sample_id = sample.get("sample_name")
            if sample_id:
                sample_variant_data[sample_id] = sample

    if not sample_variant_data:
        print("No variant data was found in the long_table_*.json files.")
        return

    df = pd.DataFrame(all_data)

    # Creation of epidemiological-week folders, the .xlsx file with lineage
    # information per week and the .csv file with the variant information per week.
    for week in df["WEEK"].unique():
        week_dir = os.path.join(output_dir, week)
        os.makedirs(week_dir, exist_ok=True)

        week_df = df[df["WEEK"] == week]
        aggregated_df = week_df.groupby("LINEAGE").size().reset_index(name="NUMBER_SAMPLES")

        excel_file = os.path.join(week_dir, "epidemiological_data.xlsx")
        with pd.ExcelWriter(excel_file) as writer:
            week_df.to_excel(writer, sheet_name="per_sample_data", index=False)
            aggregated_df.to_excel(writer, sheet_name="aggregated_data", index=False)

        print(f"Tables were stored in {week_dir}")

        # Copy of the consensus.fa files into a subfolder called consensus_files.
        if copy_fasta:
            consensus_dir = os.path.join(week_dir, "consensus_files")
            os.makedirs(consensus_dir, exist_ok=True)
            for fa_path, week_fa in fa_files:
                if week_fa == week:
                    shutil.copy(fa_path, os.path.join(consensus_dir, os.path.basename(fa_path)))
            print("Copy of consensus.fa files completed successfully")

        # Generation of the .csv files with variant data.
        variant_rows = _collect_variant_rows(week_df, sample_variant_data)
        if variant_rows:
            variant_csv = os.path.join(week_dir, "variant_data.csv")
            pd.DataFrame(variant_rows).to_csv(variant_csv, index=False)
            print(f"Variant data stored in {variant_csv}")
| 197 | + |
if __name__ == "__main__":
    # Command-line entry point: collect the user's options and forward them
    # to process_json_files() by keyword.
    cli = argparse.ArgumentParser(description="JSON files are processed in order to generate lineage and variant tables in relation to all samples associated to a given epidemiological week")
    cli.add_argument("-i", "--input", help="Directory that contains bioinfo_lab_metadata_*.json and long_table_*.json files (they all must be stored within the same directory)")
    cli.add_argument("-b", "--metadata-list", help=".txt file with paths pointing to the JSON files needed to create the .xlsx file for lineage data (bioinfo_lab_metadata_*.json)")
    cli.add_argument("-l", "--long-table-list", help=".txt file with paths pointing to the JSON files needed to create the .csv file for variant information (long_table_*.json)")
    cli.add_argument("-o", "--output", default="surveillance_files", help="Directory where tables are stored (surveillance_files by default)")
    cli.add_argument("-w", "--week", help="Epidemiological week of interest (use the YYYY-WW format)")
    cli.add_argument("-c", "--copy-fasta", action="store_true", help="Copy of all consensus.fa files into a subfolder called consensus_files (you must explicitly call this option)")

    options = cli.parse_args()
    process_json_files(
        input_dir=options.input,
        metadata_list=options.metadata_list,
        long_table_list=options.long_table_list,
        output_dir=options.output,
        specified_week=options.week,
        copy_fasta=options.copy_fasta,
    )