
Commit 3faf538

Added create_summary_tables.py to assets
1 parent 41a7d6b commit 3faf538

1 file changed: +208 −0 lines changed

@@ -0,0 +1,208 @@
#=============================================================
# INTRODUCTION

# This script organises data from the RELECOV analyses according to their corresponding epidemiological weeks.
# By default, all data are stored in a folder called surveillance_files.
# Within this folder, a subfolder is created for each epidemiological week.
# Inside each subfolder, the following items are stored:
## - epidemiological_data.xlsx: an Excel file containing lineage information for all the samples from a given week. This information is also aggregated in another sheet.
## - variant_data.csv: a .csv file containing information on the variants identified in all the samples associated with a given week.
## - consensus_files: a subfolder containing all the consensus.fa files obtained after the analysis of the samples.

#=============================================================

#=============================================================
# EXAMPLES OF USE

# This script processes bioinfo_lab_metadata_*.json and long_table_*.json files.
# It can either read these files from a single directory, or read .txt files listing the paths to them.

# Use the -i option to indicate the directory where the files are stored.
## Example: python3 create_summary_tables.py -i ./path

# If your files are spread across different locations, use the -b and -l options to provide the names of the .txt files containing the paths to the .json files.
## Example: python3 create_summary_tables.py -b bioinfo_files.txt -l long_table_files.txt
## Example of what these .txt files look like (assuming the script is run from /data/bioinfoshare/UCCT_Relecov):
### COD-2402-AND-HUCSC/20240604104459/long_table_20241119092541.json
### COD-2402-AND-HUCSC/20240911160822/long_table_20241118182618.json
### COD-2403-CAT-HUVH/20240409103006/long_table_20240912110739.json
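### (The -b list is analogous, with one bioinfo_lab_metadata_*.json path per line; the following is a hypothetical illustration mirroring the paths above.)
### COD-2402-AND-HUCSC/20240604104459/bioinfo_lab_metadata_20241119092541.json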

# If you want to copy the consensus.fa files into each subfolder, add the -c or --copy-fasta option when running the script.
## Example: python3 create_summary_tables.py -b bioinfo_files.txt -l long_table_files.txt -c

# If you want to generate data only for a certain epidemiological week, use the -w option (YYYY-WW format).
## Example: python3 create_summary_tables.py -b bioinfo_files.txt -l long_table_files.txt -w 2025-01

#=============================================================

import os
import json
import argparse
import shutil
import pandas as pd
from datetime import datetime

# Function to determine the epidemiological week associated with a given date.
def get_epi_week(date_str):
    date = datetime.strptime(date_str, "%Y-%m-%d")
    year, week, _ = date.isocalendar()
    return f"{year}-{week:02d}"
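
# Note: isocalendar() returns the ISO year, which can differ from the calendar
# year around 1 January. For instance, 2024-12-30 belongs to ISO week 1 of 2025:
## get_epi_week("2024-12-30")  ->  "2025-01"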

# Function to collect the bioinfo_lab_metadata_*.json and long_table_*.json files
# (either from a directory or from the provided .txt path lists), read them,
# extract the relevant information and generate the tables.
def process_json_files(input_dir=None, metadata_list=None, long_table_list=None, output_dir="surveillance_files", specified_week=None, copy_fasta=False):
    os.makedirs(output_dir, exist_ok=True)

    bioinfo_files = []
    if metadata_list:
        with open(metadata_list, "r", encoding="utf-8") as f:
            bioinfo_files = [line.strip() for line in f if line.strip()]
    elif input_dir:
        bioinfo_files = [
            os.path.join(input_dir, filename)
            for filename in os.listdir(input_dir)
            if filename.startswith("bioinfo_lab_metadata_") and filename.endswith(".json")
        ]

    long_table_files = []
    if long_table_list:
        with open(long_table_list, "r", encoding="utf-8") as f:
            long_table_files = [line.strip() for line in f if line.strip()]
    elif input_dir:
        long_table_files = [
            os.path.join(input_dir, filename)
            for filename in os.listdir(input_dir)
            if filename.startswith("long_table_") and filename.endswith(".json")
        ]
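
    # The paths read from the .txt lists may be absolute or relative; relative
    # paths are resolved from the directory the script is run from (see the
    # examples in the header).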

    all_data = []
    fa_files = []
    sample_variant_data = {}

    # Processing of bioinfo_lab_metadata_*.json.
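    # Each entry is expected to carry at least the fields read below
    # (sequencing_sample_id, sample_collection_date, lineage_name, etc.);
    # any missing field defaults to "-".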
    for filepath in bioinfo_files:
        if not os.path.exists(filepath):
            print(f"Warning! The file {filepath} could not be found. Please make sure the path is correct.")
            continue

        with open(filepath, "r", encoding="utf-8") as file:
            try:
                data = json.load(file)
                for sample in data:
                    if "sample_collection_date" in sample:
                        week = get_epi_week(sample["sample_collection_date"])
                        if specified_week and week != specified_week:
                            continue
                        all_data.append({
                            "HOSPITAL_ID": sample.get("submitting_institution_id", "-"),
                            "HOSPITAL": sample.get("collecting_institution", "-"),
                            "PROVINCE": sample.get("geo_loc_region", "-"),
                            "SAMPLE_ID": sample.get("sequencing_sample_id", "-"),
                            "SAMPLE_COLLECTION_DATE": sample.get("sample_collection_date", "-"),
                            "LINEAGE": sample.get("lineage_name", "-"),
                            "WEEK": week
                        })

                        # Search of consensus.fa files.
                        fa_path = sample.get("viralrecon_filepath_mapping_consensus")
                        if copy_fasta and fa_path and os.path.exists(fa_path):
                            fa_files.append((fa_path, week))

            except json.JSONDecodeError:
                print(f"Error! Could not read {filepath} properly, please make sure the file is not corrupt.")

    if not all_data:
        print("No sample data were found in the provided bioinfo_lab_metadata_*.json files.")
        return

    # Processing of long_table_*.json.
    for filepath in long_table_files:
        if not os.path.exists(filepath):
            print(f"Warning! The file {filepath} could not be found. Please make sure the path is correct.")
            continue

        with open(filepath, "r", encoding="utf-8") as file:
            try:
                data = json.load(file)
                for sample in data:
                    sample_id = sample.get("sample_name")
                    if sample_id:
                        sample_variant_data[sample_id] = sample
            except json.JSONDecodeError:
                print(f"Error! Could not read {filepath} properly, please make sure the file is not corrupt.")

    if not sample_variant_data:
        print("No variant data were found in the provided long_table_*.json files.")
        return
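
    # Note: because of the early returns above, the weekly tables are generated
    # only when both metadata and variant files have been read successfully.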

    df = pd.DataFrame(all_data)
    weeks = df["WEEK"].unique()

    # Creation of the epidemiological-week folders, the .xlsx file with lineage
    # information per week and the .csv file with the variant information per week.
    for week in weeks:
        week_dir = os.path.join(output_dir, week)
        os.makedirs(week_dir, exist_ok=True)

        week_df = df[df["WEEK"] == week]
        aggregated_df = week_df.groupby("LINEAGE").size().reset_index(name="NUMBER_SAMPLES")
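        # aggregated_df holds one row per lineage with its sample count, e.g.
        # (illustrative values):
        ##   LINEAGE  NUMBER_SAMPLES
        ##   BA.2.86               3
        ##   XEC                  12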

        excel_file = os.path.join(week_dir, "epidemiological_data.xlsx")
        with pd.ExcelWriter(excel_file) as writer:
            week_df.to_excel(writer, sheet_name="per_sample_data", index=False)
            aggregated_df.to_excel(writer, sheet_name="aggregated_data", index=False)
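        # Writing .xlsx files requires an Excel engine such as openpyxl to be
        # installed alongside pandas.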

        print(f"Tables were stored in {week_dir}")

        # Copy of the consensus.fa files into a subfolder called consensus_files.
        if copy_fasta:
            consensus_dir = os.path.join(week_dir, "consensus_files")
            os.makedirs(consensus_dir, exist_ok=True)
            for fa_path, week_fa in fa_files:
                if week_fa == week:
                    dest_path = os.path.join(consensus_dir, os.path.basename(fa_path))
                    shutil.copy(fa_path, dest_path)
            print("Copy of consensus.fa files completed successfully")

        # Generation of the .csv file with variant data.
        variant_data = []
        for _, row in week_df.iterrows():
            sample_id = row["SAMPLE_ID"]
            if sample_id in sample_variant_data:
                variant_entries = sample_variant_data[sample_id].get("variants", [])
                for variant in variant_entries:
                    variant_data.append({
                        "SAMPLE": variant.get("sample", "-"),
                        "CHROM": variant.get("chromosome", "-"),
                        "POS": variant.get("pos", "-"),
                        "ALT": variant.get("alt", "-"),
                        "REF": variant.get("ref", "-"),
                        "FILTER": variant.get("Filter", "-"),
                        "DP": variant.get("dp", "-"),
                        "REF_DP": variant.get("ref_dp", "-"),
                        "ALT_DP": variant.get("alt_dp", "-"),
                        "AF": variant.get("af", "-"),
                        "GENE": variant.get("gene", "-"),
                        "EFFECT": variant.get("effect", "-"),
                        "HGVS_C": variant.get("hgvs_c", "-"),
                        "HGVS_P": variant.get("hgvs_p", "-"),
                        "HGVS_P_1LETTER": variant.get("hgvs_p_1_letter", "-"),
                        "CALLER": variant.get("caller", "-"),
                        "LINEAGE": variant.get("lineage", "-")
                    })

        if variant_data:
            variant_df = pd.DataFrame(variant_data)
            variant_csv = os.path.join(week_dir, "variant_data.csv")
            variant_df.to_csv(variant_csv, index=False)
            print(f"Variant data stored in {variant_csv}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process bioinfo_lab_metadata_*.json and long_table_*.json files to generate lineage and variant tables for all samples associated with a given epidemiological week")
    parser.add_argument("-i", "--input", help="Directory containing the bioinfo_lab_metadata_*.json and long_table_*.json files (all must be stored in the same directory)")
    parser.add_argument("-b", "--metadata-list", help=".txt file listing the paths of the JSON files used to build the .xlsx lineage tables (bioinfo_lab_metadata_*.json)")
    parser.add_argument("-l", "--long-table-list", help=".txt file listing the paths of the JSON files used to build the .csv variant tables (long_table_*.json)")
    parser.add_argument("-o", "--output", default="surveillance_files", help="Directory where the tables are stored (surveillance_files by default)")
    parser.add_argument("-w", "--week", help="Epidemiological week of interest (YYYY-WW format)")
    parser.add_argument("-c", "--copy-fasta", action="store_true", help="Copy all consensus.fa files into a subfolder called consensus_files")

    args = parser.parse_args()
    process_json_files(args.input, args.metadata_list, args.long_table_list, args.output, args.week, args.copy_fasta)
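
# Note: if both -i and -b/-l are provided, the .txt lists take precedence, since
# process_json_files checks metadata_list and long_table_list before input_dir.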
