diff --git a/ingest/Snakefile b/ingest/Snakefile index 89aed647ef..3929a5ed4e 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -310,7 +310,7 @@ rule prepare_metadata: sequence_hashes="results/sequence_hashes.ndjson", config="results/config.yaml", output: - metadata="results/metadata_post_prepare.json", + metadata="results/metadata_post_prepare.ndjson", params: log_level=LOG_LEVEL, shell: @@ -328,11 +328,11 @@ rule prepare_metadata: rule group_segments: input: script="scripts/group_segments.py", - metadata="results/metadata_post_prepare.json", + metadata="results/metadata_post_prepare.ndjson", sequences="results/sequences.ndjson", config="results/config.yaml", output: - metadata="results/metadata_post_group.json", + metadata="results/metadata_post_group.ndjson", sequences="results/sequences_post_group.ndjson", params: log_level=LOG_LEVEL, @@ -368,9 +368,9 @@ rule get_previous_submissions: # By delaying the start of the script script="scripts/call_loculus.py", prepped_metadata=( - "results/metadata_post_group.json" + "results/metadata_post_group.ndjson" if SEGMENTED - else "results/metadata_post_prepare.json" + else "results/metadata_post_prepare.ndjson" ), config="results/config.yaml", output: @@ -395,9 +395,9 @@ rule compare_hashes: config="results/config.yaml", old_hashes="results/previous_submissions.json", metadata=( - "results/metadata_post_group.json" + "results/metadata_post_group.ndjson" if SEGMENTED - else "results/metadata_post_prepare.json" + else "results/metadata_post_prepare.ndjson" ), output: to_submit="results/to_submit.json", @@ -431,9 +431,9 @@ rule prepare_files: script="scripts/prepare_files.py", config="results/config.yaml", metadata=( - "results/metadata_post_group.json" + "results/metadata_post_group.ndjson" if SEGMENTED - else "results/metadata_post_prepare.json" + else "results/metadata_post_prepare.ndjson" ), sequences=( "results/sequences_post_group.ndjson" diff --git a/ingest/scripts/call_loculus.py b/ingest/scripts/call_loculus.py 
index 2a1191cded..25a568db8f 100644 --- a/ingest/scripts/call_loculus.py +++ b/ingest/scripts/call_loculus.py @@ -411,8 +411,6 @@ def get_submitted(config: Config): logger.info(f"Backend has status of: {len(statuses)} sequence entries from ingest") logger.info(f"Ingest has submitted: {len(entries)} sequence entries to ingest") - logger.debug(entries) - logger.debug(statuses) for entry in entries: loculus_accession = entry["accession"] submitter = entry["submitter"] diff --git a/ingest/scripts/compare_hashes.py b/ingest/scripts/compare_hashes.py index 60174564e6..7062062f50 100644 --- a/ingest/scripts/compare_hashes.py +++ b/ingest/scripts/compare_hashes.py @@ -6,6 +6,7 @@ from typing import Any import click +import orjsonl import requests import yaml @@ -171,7 +172,6 @@ def main( config.debug_hashes = True submitted: dict = json.load(open(old_hashes, encoding="utf-8")) - new_metadata = json.load(open(metadata, encoding="utf-8")) update_manager = SequenceUpdateManager( submit=[], @@ -184,7 +184,9 @@ def main( config=config, ) - for fasta_id, record in new_metadata.items(): + for field in orjsonl.stream(metadata): + fasta_id = field["id"] + record = field["metadata"] if not config.segmented: insdc_accession_base = record["insdcAccessionBase"] if not insdc_accession_base: diff --git a/ingest/scripts/group_segments.py b/ingest/scripts/group_segments.py index dcfc427c37..ddc1499b81 100644 --- a/ingest/scripts/group_segments.py +++ b/ingest/scripts/group_segments.py @@ -1,6 +1,7 @@ """Script to group segments together into sequence entries prior to submission to Loculus -Example output for a single isolate with 3 segments: -"KJ682796.1.L/KJ682809.1.M/KJ682819.1.S": { +Example ndjson output for a single isolate with 3 segments: +{"id": "KJ682796.1.L/KJ682809.1.M/KJ682819.1.S", +"metadata": { "ncbiReleaseDate": "2014-07-06T00:00:00Z", "ncbiSourceDb": "GenBank", "authors": "D. Goedhals, F.J. Burt, J.T. Bester, R. 
Swanepoel", @@ -15,7 +16,7 @@ "hash_S": "f716ed13dca9c8a033d46da2f3dc2ff1", "hash": "ce7056d0bd7e3d6d3eca38f56b9d10f8", "submissionId": "KJ682796.1.L/KJ682809.1.M/KJ682819.1.S" -},""" +}}""" import hashlib import json @@ -100,9 +101,11 @@ def main( segments = config.nucleotide_sequences number_of_segments = len(segments) - with open(input_metadata, encoding="utf-8") as file: - segment_metadata: dict[str, dict[str, str]] = json.load(file) - number_of_segmented_records = len(segment_metadata.keys()) + number_of_segmented_records = 0 + segment_metadata: dict[str, dict[str, str]] = {} + for record in orjsonl.stream(input_metadata): + segment_metadata[record["id"]] = record["metadata"] + number_of_segmented_records += 1 logger.info(f"Found {number_of_segmented_records} individual segments in metadata file") # Group segments according to isolate, collection date and isolate specific values @@ -174,7 +177,7 @@ def main( number_of_groups = len(grouped_accessions) group_lower_bound = number_of_segmented_records // number_of_segments group_upper_bound = number_of_segmented_records - logging.info(f"Total of {number_of_groups} groups left after merging") + logger.info(f"Total of {number_of_groups} groups left after merging") if number_of_groups < group_lower_bound: raise ValueError( { @@ -192,11 +195,11 @@ def main( } ) - # Add segment specific metadata for the segments - metadata: dict[str, dict[str, str]] = {} # Map from original accession to the new concatenated accession fasta_id_map: dict[Accession, Accession] = {} + count = 0 + for group in grouped_accessions: # Create key by concatenating all accession numbers with their segments # e.g. 
AF1234_S/AF1235_M/AF1236_L @@ -241,12 +244,10 @@ def main( json.dumps(filtered_record, sort_keys=True).encode(), usedforsecurity=False ).hexdigest() - metadata[joint_key] = row + orjsonl.append(output_metadata, {"id": joint_key, "metadata": row}) + count += 1 - Path(output_metadata).write_text( - json.dumps(metadata, indent=4, sort_keys=True), encoding="utf-8" - ) - logging.info(f"Wrote grouped metadata for {len(metadata)} sequences") + logger.info(f"Wrote grouped metadata for {count} sequences") count = 0 count_ignored = 0 @@ -265,8 +266,8 @@ def main( }, ) count += 1 - logging.info(f"Wrote {count} sequences") - logging.info(f"Ignored {count_ignored} sequences as not found in {input_seq}") + logger.info(f"Wrote {count} sequences") + logger.info(f"Ignored {count_ignored} sequences as not found in {input_seq}") if __name__ == "__main__": diff --git a/ingest/scripts/prepare_files.py b/ingest/scripts/prepare_files.py index ac28131285..d1b20e413b 100644 --- a/ingest/scripts/prepare_files.py +++ b/ingest/scripts/prepare_files.py @@ -1,6 +1,7 @@ import csv import json import logging +import os import sys from dataclasses import dataclass from pathlib import Path @@ -101,65 +102,81 @@ def main( relevant_config = {key: full_config[key] for key in Config.__annotations__} config = Config(**relevant_config) - metadata = json.load(open(metadata_path, encoding="utf-8")) to_submit = json.load(open(to_submit_path, encoding="utf-8")) to_revise = json.load(open(to_revise_path, encoding="utf-8")) to_revoke = json.load(open(to_revoke_path, encoding="utf-8")) - metadata_submit = [] - metadata_revise = [] - metadata_submit_prior_to_revoke = [] # Only for multi-segmented case, sequences are revoked - # due to grouping changes and the newly grouped segments must be submitted as new sequences submit_ids = set() revise_ids = set() submit_prior_to_revoke_ids = set() - for fasta_id in to_submit: - metadata_submit.append(metadata[fasta_id]) - submit_ids.update(ids_to_add(fasta_id, config)) + 
def write_to_tsv_stream(data, filename, columns_list=None): + # Check if the file exists + file_exists = os.path.exists(filename) - for fasta_id, loculus_accession in to_revise.items(): - revise_record = metadata[fasta_id] - revise_record["accession"] = loculus_accession - metadata_revise.append(revise_record) - revise_ids.update(ids_to_add(fasta_id, config)) + with open(filename, "a", newline="", encoding="utf-8") as output_file: + keys = columns_list or data.keys() + dict_writer = csv.DictWriter(output_file, keys, delimiter="\t") - found_seq_to_revoke = False - for fasta_id in to_revoke: - metadata_submit_prior_to_revoke.append(metadata[fasta_id]) - submit_prior_to_revoke_ids.update(ids_to_add(fasta_id, config)) + # Write the header only if the file doesn't already exist + if not file_exists: + dict_writer.writeheader() - if found_seq_to_revoke: - revocation_notification(config, to_revoke) + dict_writer.writerow(data) - def write_to_tsv(data, filename): - if not data: - Path(filename).touch() - return - keys = data[0].keys() - with open(filename, "w", newline="", encoding="utf-8") as output_file: - dict_writer = csv.DictWriter(output_file, keys, delimiter="\t") - dict_writer.writeheader() - dict_writer.writerows(data) + columns_list, found_seq_to_revoke = None, False + for field in orjsonl.stream(metadata_path): + fasta_id = field["id"] + record = field["metadata"] + if not columns_list: + columns_list = record.keys() + + if fasta_id in to_submit: + write_to_tsv_stream(record, metadata_submit_path, columns_list) + submit_ids.update(ids_to_add(fasta_id, config)) + continue + + if fasta_id in to_revise: + record["accession"] = to_revise[fasta_id] + write_to_tsv_stream(record, metadata_revise_path, [*columns_list, "accession"]) + revise_ids.update(ids_to_add(fasta_id, config)) + continue + + # NOTE: found_seq_to_revoke is initialised once before the loop; resetting it per record would discard earlier revocations + if fasta_id in to_revoke: + submit_prior_to_revoke_ids.update(ids_to_add(fasta_id, config)) + write_to_tsv_stream(record, metadata_submit_prior_to_revoke_path, columns_list) + 
found_seq_to_revoke = True - write_to_tsv(metadata_submit, metadata_submit_path) - write_to_tsv(metadata_revise, metadata_revise_path) - write_to_tsv(metadata_submit_prior_to_revoke, metadata_submit_prior_to_revoke_path) + if found_seq_to_revoke: + revocation_notification(config, to_revoke) - def stream_filter_to_fasta(input, output, keep): + def stream_filter_to_fasta(input, output, output_metadata, keep): if len(keep) == 0: Path(output).touch() + Path(output_metadata).touch() return with open(output, "w", encoding="utf-8") as output_file: for record in orjsonl.stream(input): if record["id"] in keep: output_file.write(f">{record['id']}\n{record['sequence']}\n") - stream_filter_to_fasta(input=sequences_path, output=sequences_submit_path, keep=submit_ids) - stream_filter_to_fasta(input=sequences_path, output=sequences_revise_path, keep=revise_ids) + stream_filter_to_fasta( + input=sequences_path, + output=sequences_submit_path, + output_metadata=metadata_submit_path, + keep=submit_ids, + ) + stream_filter_to_fasta( + input=sequences_path, + output=sequences_revise_path, + output_metadata=metadata_revise_path, + keep=revise_ids, + ) stream_filter_to_fasta( input=sequences_path, output=sequences_submit_prior_to_revoke_path, + output_metadata=metadata_submit_prior_to_revoke_path, keep=submit_prior_to_revoke_ids, ) diff --git a/ingest/scripts/prepare_metadata.py b/ingest/scripts/prepare_metadata.py index 2f0bfe3e64..170b05bdfa 100644 --- a/ingest/scripts/prepare_metadata.py +++ b/ingest/scripts/prepare_metadata.py @@ -10,7 +10,6 @@ import json import logging from dataclasses import dataclass -from pathlib import Path import click import orjsonl @@ -143,11 +142,9 @@ def main( record["hash"] = hashlib.md5(prehash.encode(), usedforsecurity=False).hexdigest() - meta_dict = {rec[fasta_id_field]: rec for rec in metadata} + orjsonl.append(output, {"id": record[fasta_id_field], "metadata": record}) - Path(output).write_text(json.dumps(meta_dict, indent=4, sort_keys=True), 
encoding="utf-8") - - logging.info(f"Saved metadata for {len(metadata)} sequences") + logger.info(f"Saved metadata for {len(metadata)} sequences") if __name__ == "__main__": diff --git a/ingest/tests/expected_output_cchf/metadata_post_prepare.json b/ingest/tests/expected_output_cchf/metadata_post_prepare.json deleted file mode 100644 index 9a57b572e4..0000000000 --- a/ingest/tests/expected_output_cchf/metadata_post_prepare.json +++ /dev/null @@ -1,152 +0,0 @@ -{ - "KX013462.1": { - "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", - "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", - "bioprojectAccession": "", - "biosampleAccession": "", - "geoLocAdmin1": "Astrakhan", - "geoLocCountry": "Russia", - "hash": "15c5b9d511b1c6c37a3ff43280f3f617", - "hostNameScientific": "Ixodoidea", - "hostTaxonId": "297308", - "insdcAccessionBase": "KX013462", - "insdcAccessionFull": "KX013462.1", - "insdcVersion": "1", - "isLabHost": "", - "ncbiReleaseDate": "2016-12-07T00:00:00Z", - "ncbiSourceDb": "GenBank", - "ncbiUpdateDate": "2016-12-07T00:00:00Z", - "ncbiVirusName": "Orthonairovirus haemorrhagiae", - "ncbiVirusTaxId": "3052518", - "sampleCollectionDate": "1989", - "segment": "L", - "specimenCollectorSampleId": "K229_194", - "insdcRawReadsAccession": "", - "submissionId": "KX013462.1" - }, - "KX013463.1": { - "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", - "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. 
P.", - "bioprojectAccession": "", - "biosampleAccession": "", - "geoLocAdmin1": "Astrakhan", - "geoLocCountry": "Russia", - "hash": "c11e1b7951a73b14f70403bed5cf2d10", - "hostNameScientific": "Ixodoidea", - "hostTaxonId": "297308", - "insdcAccessionBase": "KX013463", - "insdcAccessionFull": "KX013463.1", - "insdcVersion": "1", - "isLabHost": "", - "ncbiReleaseDate": "2016-12-07T00:00:00Z", - "ncbiSourceDb": "GenBank", - "ncbiUpdateDate": "2016-12-07T00:00:00Z", - "ncbiVirusName": "Orthonairovirus haemorrhagiae", - "ncbiVirusTaxId": "3052518", - "sampleCollectionDate": "1989", - "segment": "M", - "specimenCollectorSampleId": "K229_194", - "insdcRawReadsAccession": "", - "submissionId": "KX013463.1" - }, - "KX013464.1": { - "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", - "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", - "bioprojectAccession": "", - "biosampleAccession": "", - "geoLocAdmin1": "Astrakhan", - "geoLocCountry": "Russia", - "hash": "7cd7b3085ef50ad49973b92979a21ee8", - "hostNameScientific": "Ixodoidea", - "hostTaxonId": "297308", - "insdcAccessionBase": "KX013464", - "insdcAccessionFull": "KX013464.1", - "insdcVersion": "1", - "isLabHost": "", - "ncbiReleaseDate": "2016-12-07T00:00:00Z", - "ncbiSourceDb": "GenBank", - "ncbiUpdateDate": "2016-12-07T00:00:00Z", - "ncbiVirusName": "Orthonairovirus haemorrhagiae", - "ncbiVirusTaxId": "3052518", - "sampleCollectionDate": "1989", - "segment": "S", - "specimenCollectorSampleId": "K229_194", - "insdcRawReadsAccession": "", - "submissionId": "KX013464.1" - }, - "KX013483.1": { - "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", - "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. 
P.", - "bioprojectAccession": "", - "biosampleAccession": "", - "geoLocAdmin1": "", - "geoLocCountry": "Uganda", - "hash": "7b10a4e21daa8a2e693958761be17d53", - "hostNameScientific": "Homo sapiens", - "hostTaxonId": "9606", - "insdcAccessionBase": "KX013483", - "insdcAccessionFull": "KX013483.1", - "insdcVersion": "1", - "isLabHost": "", - "ncbiReleaseDate": "2016-12-07T00:00:00Z", - "ncbiSourceDb": "GenBank", - "ncbiUpdateDate": "2016-12-07T00:00:00Z", - "ncbiVirusName": "Orthonairovirus haemorrhagiae", - "ncbiVirusTaxId": "3052518", - "sampleCollectionDate": "1958", - "segment": "L", - "specimenCollectorSampleId": "Nakiwogo", - "insdcRawReadsAccession": "", - "submissionId": "KX013483.1" - }, - "KX013485.1": { - "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", - "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", - "bioprojectAccession": "", - "biosampleAccession": "", - "geoLocAdmin1": "", - "geoLocCountry": "Uganda", - "hash": "70954bc35782b5592858ac3f1a6bbf89", - "hostNameScientific": "Homo sapiens", - "hostTaxonId": "9606", - "insdcAccessionBase": "KX013485", - "insdcAccessionFull": "KX013485.1", - "insdcVersion": "1", - "isLabHost": "", - "ncbiReleaseDate": "2016-12-07T00:00:00Z", - "ncbiSourceDb": "GenBank", - "ncbiUpdateDate": "2016-12-07T00:00:00Z", - "ncbiVirusName": "Orthonairovirus haemorrhagiae", - "ncbiVirusTaxId": "3052518", - "sampleCollectionDate": "1958", - "segment": "S", - "specimenCollectorSampleId": "Nakiwogo", - "insdcRawReadsAccession": "", - "submissionId": "KX013485.1" - }, - "KX096703.1": { - "authorAffiliations": "Public Health England, Research", - "authors": "Deryabin, ; Atshabar, B.; Sansyzbaev, Y.; Berezin, V.; Nurmakhanov, T.; Yeskhojayev, O.; Vilkova, A.; Shevtsov, A.; Hewson, R.; Atkinson, B.", - "bioprojectAccession": "", - "biosampleAccession": "", - "geoLocAdmin1": "Sairam district", - "geoLocCountry": "Kazakhstan", - "hash": 
"cd17a5f4dcc98e7a2afd1ab9f30274bc", - "hostNameScientific": "Hyalomma anatolicum", - "hostTaxonId": "176092", - "insdcAccessionBase": "KX096703", - "insdcAccessionFull": "KX096703.1", - "insdcVersion": "1", - "isLabHost": "", - "ncbiReleaseDate": "2016-04-30T00:00:00Z", - "ncbiSourceDb": "GenBank", - "ncbiUpdateDate": "2016-04-30T00:00:00Z", - "ncbiVirusName": "Orthonairovirus haemorrhagiae", - "ncbiVirusTaxId": "3052518", - "sampleCollectionDate": "2015", - "segment": "S", - "specimenCollectorSampleId": "tick pool #134", - "insdcRawReadsAccession": "", - "submissionId": "KX096703.1" - } -} \ No newline at end of file diff --git a/ingest/tests/expected_output_cchf/metadata_post_prepare.ndjson b/ingest/tests/expected_output_cchf/metadata_post_prepare.ndjson new file mode 100644 index 0000000000..a0763d1ceb --- /dev/null +++ b/ingest/tests/expected_output_cchf/metadata_post_prepare.ndjson @@ -0,0 +1,6 @@ + { "id": "KX013462.1", "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "Astrakhan", "geoLocCountry": "Russia", "hash": "15c5b9d511b1c6c37a3ff43280f3f617", "hostNameScientific": "Ixodoidea", "hostTaxonId": "297308", "insdcAccessionBase": "KX013462", "insdcAccessionFull": "KX013462.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-12-07T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-12-07T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "1989", "segment": "L", "specimenCollectorSampleId": "K229_194", "insdcRawReadsAccession": "", "submissionId": "KX013462.1" } } + { "id": "KX013463.1", "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. 
E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "Astrakhan", "geoLocCountry": "Russia", "hash": "c11e1b7951a73b14f70403bed5cf2d10", "hostNameScientific": "Ixodoidea", "hostTaxonId": "297308", "insdcAccessionBase": "KX013463", "insdcAccessionFull": "KX013463.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-12-07T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-12-07T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "1989", "segment": "M", "specimenCollectorSampleId": "K229_194", "insdcRawReadsAccession": "", "submissionId": "KX013463.1" } } + { "id": "KX013464.1", "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "Astrakhan", "geoLocCountry": "Russia", "hash": "7cd7b3085ef50ad49973b92979a21ee8", "hostNameScientific": "Ixodoidea", "hostTaxonId": "297308", "insdcAccessionBase": "KX013464", "insdcAccessionFull": "KX013464.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-12-07T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-12-07T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "1989", "segment": "S", "specimenCollectorSampleId": "K229_194", "insdcRawReadsAccession": "", "submissionId": "KX013464.1" } } + { "id": "KX013483.1", "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. 
P.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "", "geoLocCountry": "Uganda", "hash": "7b10a4e21daa8a2e693958761be17d53", "hostNameScientific": "Homo sapiens", "hostTaxonId": "9606", "insdcAccessionBase": "KX013483", "insdcAccessionFull": "KX013483.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-12-07T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-12-07T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "1958", "segment": "L", "specimenCollectorSampleId": "Nakiwogo", "insdcRawReadsAccession": "", "submissionId": "KX013483.1" } } + { "id": "KX013485.1", "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "", "geoLocCountry": "Uganda", "hash": "70954bc35782b5592858ac3f1a6bbf89", "hostNameScientific": "Homo sapiens", "hostTaxonId": "9606", "insdcAccessionBase": "KX013485", "insdcAccessionFull": "KX013485.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-12-07T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-12-07T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "1958", "segment": "S", "specimenCollectorSampleId": "Nakiwogo", "insdcRawReadsAccession": "", "submissionId": "KX013485.1" } } + { "id": "KX096703.1", "metadata": { "authorAffiliations": "Public Health England, Research", "authors": "Deryabin, ; Atshabar, B.; Sansyzbaev, Y.; Berezin, V.; Nurmakhanov, T.; Yeskhojayev, O.; Vilkova, A.; Shevtsov, A.; Hewson, R.; Atkinson, B.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "Sairam district", "geoLocCountry": "Kazakhstan", "hash": "cd17a5f4dcc98e7a2afd1ab9f30274bc", "hostNameScientific": "Hyalomma 
anatolicum", "hostTaxonId": "176092", "insdcAccessionBase": "KX096703", "insdcAccessionFull": "KX096703.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-04-30T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-04-30T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "2015", "segment": "S", "specimenCollectorSampleId": "tick pool #134", "insdcRawReadsAccession": "", "submissionId": "KX096703.1" } } \ No newline at end of file diff --git a/ingest/tests/expected_output_cchf/revise_metadata.tsv b/ingest/tests/expected_output_cchf/revise_metadata.tsv new file mode 100644 index 0000000000..77fb9269a5 --- /dev/null +++ b/ingest/tests/expected_output_cchf/revise_metadata.tsv @@ -0,0 +1,2 @@ +authorAffiliations authors bioprojectAccession biosampleAccession geoLocAdmin1 geoLocCountry hostNameScientific hostTaxonId isLabHost ncbiReleaseDate ncbiSourceDb ncbiVirusName ncbiVirusTaxId sampleCollectionDate specimenCollectorSampleId ncbiUpdateDate_L ncbiUpdateDate_M ncbiUpdateDate_S insdcVersion_L insdcVersion_M insdcVersion_S insdcRawReadsAccession_L insdcRawReadsAccession_M insdcRawReadsAccession_S hash_L hash_M hash_S insdcAccessionBase_L insdcAccessionBase_M insdcAccessionBase_S insdcAccessionFull_L insdcAccessionFull_M insdcAccessionFull_S submissionId hash accession +Public Health England, Research Deryabin, ; Atshabar, B.; Sansyzbaev, Y.; Berezin, V.; Nurmakhanov, T.; Yeskhojayev, O.; Vilkova, A.; Shevtsov, A.; Hewson, R.; Atkinson, B. 
Sairam district Kazakhstan Hyalomma anatolicum 176092 2016-04-30T00:00:00Z GenBank Orthonairovirus haemorrhagiae 3052518 2015 tick pool #134 2016-04-30T00:00:00Z 1 cd17a5f4dcc98e7a2afd1ab9f30274bc KX096703 KX096703.1 KX096703.1.S 646cca576b1d24d397f5ecf0905fd5c1 LOC_0000VXA diff --git a/ingest/tests/expected_output_cchf/submit_metadata.tsv b/ingest/tests/expected_output_cchf/submit_metadata.tsv new file mode 100644 index 0000000000..140113d450 --- /dev/null +++ b/ingest/tests/expected_output_cchf/submit_metadata.tsv @@ -0,0 +1,2 @@ +authorAffiliations authors bioprojectAccession biosampleAccession geoLocAdmin1 geoLocCountry hostNameScientific hostTaxonId isLabHost ncbiReleaseDate ncbiSourceDb ncbiVirusName ncbiVirusTaxId sampleCollectionDate specimenCollectorSampleId ncbiUpdateDate_L ncbiUpdateDate_M ncbiUpdateDate_S insdcVersion_L insdcVersion_M insdcVersion_S insdcRawReadsAccession_L insdcRawReadsAccession_M insdcRawReadsAccession_S hash_L hash_M hash_S insdcAccessionBase_L insdcAccessionBase_M insdcAccessionBase_S insdcAccessionFull_L insdcAccessionFull_M insdcAccessionFull_S submissionId hash +Chumakov Institute of Poliomyelitis and Viral Encephalitides Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P. 
Uganda Homo sapiens 9606 2016-12-07T00:00:00Z GenBank Orthonairovirus haemorrhagiae 3052518 1958 Nakiwogo 2016-12-07T00:00:00Z 2016-12-07T00:00:00Z 1 1 7b10a4e21daa8a2e693958761be17d53 70954bc35782b5592858ac3f1a6bbf89 KX013483 KX013485 KX013483.1 KX013485.1 KX013483.1.L/KX013485.1.S bb3a6d8df47cb2891e7b60030a40c335 diff --git a/ingest/tests/test_ingest.py b/ingest/tests/test_ingest.py index 543f66d4aa..fe99b82e67 100644 --- a/ingest/tests/test_ingest.py +++ b/ingest/tests/test_ingest.py @@ -4,6 +4,8 @@ import subprocess from pathlib import Path +import orjsonl +import pandas as pd import pytest # Define the paths to your test data and expected output @@ -35,6 +37,25 @@ def compare_json_files(file1, file2): return json1 == json2 +def compare_ndjson_files(file1, file2): + def create_dict_from_ndjson(file): + records = {} + for record in orjsonl.stream(file): + records[record["id"]] = record["metadata"] + return records + dict1 = create_dict_from_ndjson(file1) + dict2 = create_dict_from_ndjson(file2) + + return dict1 == dict2 + +def compare_tsv_files(file1, file2): + df1 = pd.read_csv(file1, sep="\t") + df2 = pd.read_csv(file2, sep="\t") + + # Compare the contents + return df1.sort_index(axis=1).equals(df2.sort_index(axis=1)) + + def run_snakemake(rule, touch=False): """ Function to run Snakemake with the test data. @@ -67,6 +88,7 @@ def test_snakemake(): run_snakemake("group_segments") run_snakemake("get_previous_submissions", touch=True) # Do not call_loculus run_snakemake("compare_hashes") + run_snakemake("prepare_files") # Check that the output files match the expected files for expected_file in EXPECTED_OUTPUT_DIR.glob("*.json"): output_file = OUTPUT_DIR / expected_file.name assert output_file.exists(), f"{output_file} does not exist." assert compare_json_files( expected_file, output_file, ), f"{output_file} does not match {expected_file}." + for expected_file in EXPECTED_OUTPUT_DIR.glob("*.tsv"): + output_file = OUTPUT_DIR / expected_file.name + assert output_file.exists(), f"{output_file} does not exist." 
+ assert compare_tsv_files( + expected_file, + output_file, + ), f"{output_file} does not match {expected_file}." + + for expected_file in EXPECTED_OUTPUT_DIR.glob("*.ndjson"): + output_file = OUTPUT_DIR / expected_file.name + assert output_file.exists(), f"{output_file} does not exist." + assert compare_ndjson_files( + expected_file, + output_file, + ), f"{output_file} does not match {expected_file}." + if __name__ == "__main__": pytest.main(["-v"])