From 924805876779cdfa93d520392d5fd06d93510de1 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:42:50 +0100 Subject: [PATCH 01/15] fix(ingest): switch to ndjson instead of json to stream grouped metadata --- ingest/Snakefile | 18 +++++++-------- ingest/scripts/compare_hashes.py | 6 +++-- ingest/scripts/group_segments.py | 24 +++++++++++--------- ingest/scripts/prepare_files.py | 36 +++++++++++++++++------------- ingest/scripts/prepare_metadata.py | 7 ++---- 5 files changed, 50 insertions(+), 41 deletions(-) diff --git a/ingest/Snakefile b/ingest/Snakefile index 89aed647ef..3929a5ed4e 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -310,7 +310,7 @@ rule prepare_metadata: sequence_hashes="results/sequence_hashes.ndjson", config="results/config.yaml", output: - metadata="results/metadata_post_prepare.json", + metadata="results/metadata_post_prepare.ndjson", params: log_level=LOG_LEVEL, shell: @@ -328,11 +328,11 @@ rule prepare_metadata: rule group_segments: input: script="scripts/group_segments.py", - metadata="results/metadata_post_prepare.json", + metadata="results/metadata_post_prepare.ndjson", sequences="results/sequences.ndjson", config="results/config.yaml", output: - metadata="results/metadata_post_group.json", + metadata="results/metadata_post_group.ndjson", sequences="results/sequences_post_group.ndjson", params: log_level=LOG_LEVEL, @@ -368,9 +368,9 @@ rule get_previous_submissions: # By delaying the start of the script script="scripts/call_loculus.py", prepped_metadata=( - "results/metadata_post_group.json" + "results/metadata_post_group.ndjson" if SEGMENTED - else "results/metadata_post_prepare.json" + else "results/metadata_post_prepare.ndjson" ), config="results/config.yaml", output: @@ -395,9 +395,9 @@ rule compare_hashes: config="results/config.yaml", old_hashes="results/previous_submissions.json", metadata=( - "results/metadata_post_group.json" + "results/metadata_post_group.ndjson" if SEGMENTED - else "results/metadata_post_prepare.json" + else "results/metadata_post_prepare.ndjson" ), output: to_submit="results/to_submit.json", @@ -431,9 +431,9 @@ rule prepare_files: script="scripts/prepare_files.py", config="results/config.yaml", metadata=( - "results/metadata_post_group.json" + "results/metadata_post_group.ndjson" if SEGMENTED - else "results/metadata_post_prepare.json" + else "results/metadata_post_prepare.ndjson" ), sequences=( "results/sequences_post_group.ndjson" diff --git a/ingest/scripts/compare_hashes.py b/ingest/scripts/compare_hashes.py index 60174564e6..7062062f50 100644 --- a/ingest/scripts/compare_hashes.py +++ b/ingest/scripts/compare_hashes.py @@ -6,6 +6,7 @@ from typing import Any import click +import orjsonl import requests import yaml @@ -171,7 +172,6 @@ def main( config.debug_hashes = True submitted: dict = json.load(open(old_hashes, encoding="utf-8")) - new_metadata = json.load(open(metadata, encoding="utf-8")) update_manager = SequenceUpdateManager( submit=[], @@ -184,7 +184,9 @@ def main( config=config, ) - for fasta_id, record in new_metadata.items(): + for field in orjsonl.stream(metadata): + fasta_id = field["id"] + record = field["metadata"] if not config.segmented: insdc_accession_base = record["insdcAccessionBase"] if not insdc_accession_base: diff --git a/ingest/scripts/group_segments.py b/ingest/scripts/group_segments.py index dcfc427c37..810d7f02bd 100644 --- a/ingest/scripts/group_segments.py +++ b/ingest/scripts/group_segments.py @@ -100,9 +100,11 @@ def main( segments = config.nucleotide_sequences number_of_segments = len(segments) - with open(input_metadata, encoding="utf-8") as file: - segment_metadata: dict[str, dict[str, str]] = json.load(file) - number_of_segmented_records = len(segment_metadata.keys()) + number_of_segmented_records = 0 + segment_metadata: dict[str, dict[str, str]] = {} + for record in orjsonl.stream(input_seq): + segment_metadata[record["id"]] = record["metadata"] + number_of_segmented_records += 1 logger.info(f"Found {number_of_segmented_records} individual segments in metadata file") # Group segments according to isolate, collection date and isolate specific values @@ -174,7 +176,7 @@ def main( number_of_groups = len(grouped_accessions) group_lower_bound = number_of_segmented_records // number_of_segments group_upper_bound = number_of_segmented_records - logging.info(f"Total of {number_of_groups} groups left after merging") + logger.info(f"Total of {number_of_groups} groups left after merging") if number_of_groups < group_lower_bound: raise ValueError( { @@ -197,6 +199,8 @@ def main( # Map from original accession to the new concatenated accession fasta_id_map: dict[Accession, Accession] = {} + count = 0 + for group in grouped_accessions: # Create key by concatenating all accession numbers with their segments # e.g. AF1234_S/AF1235_M/AF1236_L @@ -243,10 +247,10 @@ def main( metadata[joint_key] = row - Path(output_metadata).write_text( - json.dumps(metadata, indent=4, sort_keys=True), encoding="utf-8" - ) - logging.info(f"Wrote grouped metadata for {len(metadata)} sequences") + orjsonl.append(output_metadata, {"id": joint_key, "metadata": row}) + count += 1 + + logger.info(f"Wrote grouped metadata for {count} sequences") count = 0 count_ignored = 0 @@ -265,8 +269,8 @@ def main( }, ) count += 1 - logging.info(f"Wrote {count} sequences") - logging.info(f"Ignored {count_ignored} sequences as not found in {input_seq}") + logger.info(f"Wrote {count} sequences") + logger.info(f"Ignored {count_ignored} sequences as not found in {input_seq}") if __name__ == "__main__": diff --git a/ingest/scripts/prepare_files.py b/ingest/scripts/prepare_files.py index ac28131285..9402b03379 100644 --- a/ingest/scripts/prepare_files.py +++ b/ingest/scripts/prepare_files.py @@ -101,7 +101,6 @@ def main( relevant_config = {key: full_config[key] for key in Config.__annotations__} config = Config(**relevant_config) - metadata = json.load(open(metadata_path, encoding="utf-8")) to_submit = json.load(open(to_submit_path, encoding="utf-8")) to_revise = json.load(open(to_revise_path, encoding="utf-8")) to_revoke = json.load(open(to_revoke_path, encoding="utf-8")) @@ -114,24 +113,31 @@ def main( revise_ids = set() submit_prior_to_revoke_ids = set() - for fasta_id in to_submit: - metadata_submit.append(metadata[fasta_id]) - submit_ids.update(ids_to_add(fasta_id, config)) + for field in orjsonl.stream(metadata_path): + fasta_id = field["id"] + record = field["metadata"] - for fasta_id, loculus_accession in to_revise.items(): - revise_record = metadata[fasta_id] - revise_record["accession"] = loculus_accession - metadata_revise.append(revise_record) - revise_ids.update(ids_to_add(fasta_id, config)) + if fasta_id in to_submit: + metadata_submit.append(record) + submit_ids.update(ids_to_add(fasta_id, config)) + continue - found_seq_to_revoke = False - for fasta_id in to_revoke: - metadata_submit_prior_to_revoke.append(metadata[fasta_id]) - submit_prior_to_revoke_ids.update(ids_to_add(fasta_id, config)) + if fasta_id in to_revise: + record["accession"] = to_revise[fasta_id] + metadata_revise.append(record) + revise_ids.update(ids_to_add(fasta_id, config)) + continue - if found_seq_to_revoke: - revocation_notification(config, to_revoke) + found_seq_to_revoke = False + if fasta_id in to_revoke: + metadata_submit_prior_to_revoke.append(record) + submit_prior_to_revoke_ids.update(ids_to_add(fasta_id, config)) + found_seq_to_revoke = True + if found_seq_to_revoke: + revocation_notification(config, to_revoke) + + # TODO: Convert to streams def write_to_tsv(data, filename): if not data: Path(filename).touch() diff --git a/ingest/scripts/prepare_metadata.py b/ingest/scripts/prepare_metadata.py index 2f0bfe3e64..be7c46985c 100644 --- a/ingest/scripts/prepare_metadata.py +++ b/ingest/scripts/prepare_metadata.py @@ -10,7 +10,6 @@ import json import logging from dataclasses import dataclass -from pathlib import Path import click import orjsonl @@ -143,11 +142,9 @@ def main( record["hash"] = hashlib.md5(prehash.encode(), usedforsecurity=False).hexdigest() - meta_dict = {rec[fasta_id_field]: rec for rec in metadata} + orjsonl.append(output, {"id": record[fasta_id_field], "hash": record}) - Path(output).write_text(json.dumps(meta_dict, indent=4, sort_keys=True), encoding="utf-8") - - logging.info(f"Saved metadata for {len(metadata)} sequences") + logger.info(f"Saved metadata for {len(metadata)} sequences") if __name__ == "__main__": From da4385d282769f40824ab53e3441b134a6bdcf7c Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:51:59 +0100 Subject: [PATCH 02/15] fix --- ingest/scripts/group_segments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest/scripts/group_segments.py b/ingest/scripts/group_segments.py index 810d7f02bd..64131da095 100644 --- a/ingest/scripts/group_segments.py +++ b/ingest/scripts/group_segments.py @@ -102,7 +102,7 @@ def main( number_of_segmented_records = 0 segment_metadata: dict[str, dict[str, str]] = {} - for record in orjsonl.stream(input_seq): + for record in orjsonl.stream(input_metadata): segment_metadata[record["id"]] = record["metadata"] number_of_segmented_records += 1 logger.info(f"Found {number_of_segmented_records} individual segments in metadata file") From ad5a227607868991b641b1d47002f2ac30f3604b Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 8 Jan 2025 10:07:45 +0100 Subject: [PATCH 03/15] fix --- ingest/scripts/prepare_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest/scripts/prepare_metadata.py b/ingest/scripts/prepare_metadata.py index be7c46985c..170b05bdfa 100644 --- a/ingest/scripts/prepare_metadata.py +++ b/ingest/scripts/prepare_metadata.py @@ -142,7 +142,7 @@ def main( record["hash"] = hashlib.md5(prehash.encode(), usedforsecurity=False).hexdigest() - orjsonl.append(output, {"id": record[fasta_id_field], "hash": record}) + orjsonl.append(output, {"id": record[fasta_id_field], "metadata": record}) logger.info(f"Saved metadata for {len(metadata)} sequences") From fce2cff8006bd7d762a521476642775d4e6207b9 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:56:40 +0100 Subject: [PATCH 04/15] change script to stream --- ingest/scripts/prepare_files.py | 36 +++++++++++++-------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/ingest/scripts/prepare_files.py b/ingest/scripts/prepare_files.py index 9402b03379..eb7a159b45 100644 --- a/ingest/scripts/prepare_files.py +++ b/ingest/scripts/prepare_files.py @@ -105,53 +105,45 @@ def main( to_revise = json.load(open(to_revise_path, encoding="utf-8")) to_revoke = json.load(open(to_revoke_path, encoding="utf-8")) - metadata_submit = [] - metadata_revise = [] - metadata_submit_prior_to_revoke = [] # Only for multi-segmented case, sequences are revoked - # due to grouping changes and the newly grouped segments must be submitted as new sequences submit_ids = set() revise_ids = set() submit_prior_to_revoke_ids = set() + def write_to_tsv_stream(data, index, filename): + with open(filename, "w", newline="", encoding="utf-8") as output_file: + dict_writer = None + + if index == 0: + keys = data.keys() + dict_writer = csv.DictWriter(output_file, keys, delimiter="\t") + dict_writer.writeheader() + + dict_writer.writerow(data) + for field in orjsonl.stream(metadata_path): fasta_id = field["id"] record = field["metadata"] if fasta_id in to_submit: - metadata_submit.append(record) + write_to_tsv_stream(record, submit_ids.len(), metadata_submit_path) submit_ids.update(ids_to_add(fasta_id, config)) continue if fasta_id in to_revise: record["accession"] = to_revise[fasta_id] - metadata_revise.append(record) + write_to_tsv_stream(record, revise_ids.len(), metadata_revise_path) revise_ids.update(ids_to_add(fasta_id, config)) continue found_seq_to_revoke = False if fasta_id in to_revoke: - metadata_submit_prior_to_revoke.append(record) submit_prior_to_revoke_ids.update(ids_to_add(fasta_id, config)) + write_to_tsv_stream(record, submit_prior_to_revoke_ids.len(), metadata_submit_prior_to_revoke_path) found_seq_to_revoke = True if found_seq_to_revoke: revocation_notification(config, to_revoke) - # TODO: Convert to streams - def write_to_tsv(data, filename): - if not data: - Path(filename).touch() - return - keys = data[0].keys() - with open(filename, "w", newline="", encoding="utf-8") as output_file: - dict_writer = csv.DictWriter(output_file, keys, delimiter="\t") - dict_writer.writeheader() - dict_writer.writerows(data) - - write_to_tsv(metadata_submit, metadata_submit_path) - write_to_tsv(metadata_revise, metadata_revise_path) - write_to_tsv(metadata_submit_prior_to_revoke, metadata_submit_prior_to_revoke_path) - def stream_filter_to_fasta(input, output, keep): if len(keep) == 0: Path(output).touch() From 9a17ece865e35851185ed2fd79cef01fa01be9a4 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 8 Jan 2025 12:00:03 +0100 Subject: [PATCH 05/15] fix tests --- ...pare.json => metadata_post_prepare.ndjson} | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) rename ingest/tests/expected_output_cchf/{metadata_post_prepare.json => metadata_post_prepare.ndjson} (95%) diff --git a/ingest/tests/expected_output_cchf/metadata_post_prepare.json b/ingest/tests/expected_output_cchf/metadata_post_prepare.ndjson similarity index 95% rename from ingest/tests/expected_output_cchf/metadata_post_prepare.json rename to ingest/tests/expected_output_cchf/metadata_post_prepare.ndjson index 9a57b572e4..2c21990fd1 100644 --- a/ingest/tests/expected_output_cchf/metadata_post_prepare.json +++ b/ingest/tests/expected_output_cchf/metadata_post_prepare.ndjson @@ -1,5 +1,6 @@ { - "KX013462.1": { + "id": "KX013462.1", + "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", @@ -23,8 +24,11 @@ "specimenCollectorSampleId": "K229_194", "insdcRawReadsAccession": "", "submissionId": "KX013462.1" - }, - "KX013463.1": { + } +} +{ + "id": "KX013463.1", + "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", @@ -48,8 +52,11 @@ "specimenCollectorSampleId": "K229_194", "insdcRawReadsAccession": "", "submissionId": "KX013463.1" - }, - "KX013464.1": { + } +} +{ + "id": "KX013464.1", + "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", @@ -73,8 +80,11 @@ "specimenCollectorSampleId": "K229_194", "insdcRawReadsAccession": "", "submissionId": "KX013464.1" - }, - "KX013483.1": { + } +} +{ + "id": "KX013483.1", + "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", @@ -98,8 +108,11 @@ "specimenCollectorSampleId": "Nakiwogo", "insdcRawReadsAccession": "", "submissionId": "KX013483.1" - }, - "KX013485.1": { + } +} +{ + "id": "KX013485.1", + "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", @@ -123,8 +136,11 @@ "specimenCollectorSampleId": "Nakiwogo", "insdcRawReadsAccession": "", "submissionId": "KX013485.1" - }, - "KX096703.1": { + } +} +{ + "id": "KX096703.1", + "metadata": { "authorAffiliations": "Public Health England, Research", "authors": "Deryabin, ; Atshabar, B.; Sansyzbaev, Y.; Berezin, V.; Nurmakhanov, T.; Yeskhojayev, O.; Vilkova, A.; Shevtsov, A.; Hewson, R.; Atkinson, B.", "bioprojectAccession": "", From da2c1b31a701a9567ccf41d267afc60cf1220a50 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 8 Jan 2025 13:03:46 +0100 Subject: [PATCH 06/15] fix --- ingest/scripts/prepare_files.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ingest/scripts/prepare_files.py b/ingest/scripts/prepare_files.py index eb7a159b45..52d08525d5 100644 --- a/ingest/scripts/prepare_files.py +++ b/ingest/scripts/prepare_files.py @@ -125,20 +125,20 @@ def write_to_tsv_stream(data, index, filename): record = field["metadata"] if fasta_id in to_submit: - write_to_tsv_stream(record, submit_ids.len(), metadata_submit_path) + write_to_tsv_stream(record, len(submit_ids), metadata_submit_path) submit_ids.update(ids_to_add(fasta_id, config)) continue if fasta_id in to_revise: record["accession"] = to_revise[fasta_id] - write_to_tsv_stream(record, revise_ids.len(), metadata_revise_path) + write_to_tsv_stream(record, len(revise_ids), metadata_revise_path) revise_ids.update(ids_to_add(fasta_id, config)) continue found_seq_to_revoke = False if fasta_id in to_revoke: submit_prior_to_revoke_ids.update(ids_to_add(fasta_id, config)) - write_to_tsv_stream(record, submit_prior_to_revoke_ids.len(), metadata_submit_prior_to_revoke_path) + write_to_tsv_stream(record, len(submit_prior_to_revoke_ids), metadata_submit_prior_to_revoke_path) found_seq_to_revoke = True if found_seq_to_revoke: From 8424c6bc8ddacfc4dac688fd9d0836d4eb75f7b7 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 9 Jan 2025 15:26:50 +0100 Subject: [PATCH 07/15] fix --- ingest/scripts/prepare_files.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ingest/scripts/prepare_files.py b/ingest/scripts/prepare_files.py index 52d08525d5..3dd7108d3c 100644 --- a/ingest/scripts/prepare_files.py +++ b/ingest/scripts/prepare_files.py @@ -111,11 +111,10 @@ def main( def write_to_tsv_stream(data, index, filename): with open(filename, "w", newline="", encoding="utf-8") as output_file: - dict_writer = None + keys = data.keys() + dict_writer = csv.DictWriter(output_file, keys, delimiter="\t") if index == 0: - keys = data.keys() - dict_writer = csv.DictWriter(output_file, keys, delimiter="\t") dict_writer.writeheader() dict_writer.writerow(data) From a9a32791bdd3a09b868bd0e9762cd0732aaa3b8b Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 9 Jan 2025 15:28:40 +0100 Subject: [PATCH 08/15] update tests --- ingest/tests/expected_output_cchf/revise_metadata.tsv | 2 ++ ingest/tests/expected_output_cchf/submit_metadata.tsv | 2 ++ ingest/tests/test_ingest.py | 1 + 3 files changed, 5 insertions(+) create mode 100644 ingest/tests/expected_output_cchf/revise_metadata.tsv create mode 100644 ingest/tests/expected_output_cchf/submit_metadata.tsv diff --git a/ingest/tests/expected_output_cchf/revise_metadata.tsv b/ingest/tests/expected_output_cchf/revise_metadata.tsv new file mode 100644 index 0000000000..9bcd5c51cf --- /dev/null +++ b/ingest/tests/expected_output_cchf/revise_metadata.tsv @@ -0,0 +1,2 @@ +authorAffiliations authors bioprojectAccession biosampleAccession geoLocAdmin1 geoLocCountry hostNameScientific hostTaxonId isLabHost ncbiReleaseDate ncbiSourceDb ncbiVirusName ncbiVirusTaxId sampleCollectionDate specimenCollectorSampleId ncbiUpdateDate_L ncbiUpdateDate_M ncbiUpdateDate_S insdcVersion_L insdcVersion_M insdcVersion_S sraRunAccession_L sraRunAccession_M sraRunAccession_S hash_L hash_M hash_S insdcAccessionBase_L insdcAccessionBase_M insdcAccessionBase_S insdcAccessionFull_L insdcAccessionFull_M insdcAccessionFull_S submissionId hash accession +Public Health England, Research Deryabin, ; Atshabar, B.; Sansyzbaev, Y.; Berezin, V.; Nurmakhanov, T.; Yeskhojayev, O.; Vilkova, A.; Shevtsov, A.; Hewson, R.; Atkinson, B. Sairam district Kazakhstan Hyalomma anatolicum 176092 2016-04-30T00:00:00Z GenBank Orthonairovirus haemorrhagiae 3052518 2015 tick pool #134 2016-04-30T00:00:00Z 1 cd17a5f4dcc98e7a2afd1ab9f30274bc KX096703 KX096703.1 KX096703.1.S 646cca576b1d24d397f5ecf0905fd5c1 LOC_0000VXA diff --git a/ingest/tests/expected_output_cchf/submit_metadata.tsv b/ingest/tests/expected_output_cchf/submit_metadata.tsv new file mode 100644 index 0000000000..fa8ffea4ff --- /dev/null +++ b/ingest/tests/expected_output_cchf/submit_metadata.tsv @@ -0,0 +1,2 @@ +authorAffiliations authors bioprojectAccession biosampleAccession geoLocAdmin1 geoLocCountry hostNameScientific hostTaxonId isLabHost ncbiReleaseDate ncbiSourceDb ncbiVirusName ncbiVirusTaxId sampleCollectionDate specimenCollectorSampleId ncbiUpdateDate_L ncbiUpdateDate_M ncbiUpdateDate_S insdcVersion_L insdcVersion_M insdcVersion_S sraRunAccession_L sraRunAccession_M sraRunAccession_S hash_L hash_M hash_S insdcAccessionBase_L insdcAccessionBase_M insdcAccessionBase_S insdcAccessionFull_L insdcAccessionFull_M insdcAccessionFull_S submissionId hash +Chumakov Institute of Poliomyelitis and Viral Encephalitides Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P. Uganda Homo sapiens 9606 2016-12-07T00:00:00Z GenBank Orthonairovirus haemorrhagiae 3052518 1958 Nakiwogo 2016-12-07T00:00:00Z 2016-12-07T00:00:00Z 1 1 7b10a4e21daa8a2e693958761be17d53 70954bc35782b5592858ac3f1a6bbf89 KX013483 KX013485 KX013483.1 KX013485.1 KX013483.1.L/KX013485.1.S bb3a6d8df47cb2891e7b60030a40c335 diff --git a/ingest/tests/test_ingest.py b/ingest/tests/test_ingest.py index 543f66d4aa..051cae9a12 100644 --- a/ingest/tests/test_ingest.py +++ b/ingest/tests/test_ingest.py @@ -67,6 +67,7 @@ def test_snakemake(): run_snakemake("group_segments") run_snakemake("get_previous_submissions", touch=True) # Do not call_loculus run_snakemake("compare_hashes") + run_snakemake("prepare_files") # Check that the output files match the expected files for expected_file in EXPECTED_OUTPUT_DIR.glob("*.json"): From b1dff69d3f9d50e2eccb6cab0965f5ab1fe42a4d Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 9 Jan 2025 19:50:58 +0100 Subject: [PATCH 09/15] fix --- ingest/scripts/prepare_files.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/ingest/scripts/prepare_files.py b/ingest/scripts/prepare_files.py index 3dd7108d3c..ffce2c549d 100644 --- a/ingest/scripts/prepare_files.py +++ b/ingest/scripts/prepare_files.py @@ -137,26 +137,40 @@ def write_to_tsv_stream(data, index, filename): found_seq_to_revoke = False if fasta_id in to_revoke: submit_prior_to_revoke_ids.update(ids_to_add(fasta_id, config)) - write_to_tsv_stream(record, len(submit_prior_to_revoke_ids), metadata_submit_prior_to_revoke_path) + write_to_tsv_stream( + record, len(submit_prior_to_revoke_ids), metadata_submit_prior_to_revoke_path + ) found_seq_to_revoke = True if found_seq_to_revoke: revocation_notification(config, to_revoke) - def stream_filter_to_fasta(input, output, keep): + def stream_filter_to_fasta(input, output, output_metadata, keep): if len(keep) == 0: Path(output).touch() + Path(output_metadata).touch() return with open(output, "w", encoding="utf-8") as output_file: for record in orjsonl.stream(input): if record["id"] in keep: output_file.write(f">{record['id']}\n{record['sequence']}\n") - stream_filter_to_fasta(input=sequences_path, output=sequences_submit_path, keep=submit_ids) - stream_filter_to_fasta(input=sequences_path, output=sequences_revise_path, keep=revise_ids) + stream_filter_to_fasta( + input=sequences_path, + output=sequences_submit_path, + output_metadata=metadata_submit_path, + keep=submit_ids, + ) + stream_filter_to_fasta( + input=sequences_path, + output=sequences_revise_path, + output_metadata=metadata_revise_path, + keep=revise_ids, + ) stream_filter_to_fasta( input=sequences_path, output=sequences_submit_prior_to_revoke_path, + output_metadata=metadata_submit_prior_to_revoke_path, keep=submit_prior_to_revoke_ids, ) From 0af5ef2eba484fd5e45dbce3ebd81f30ba8c3414 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 9 Jan 2025 21:01:46 +0100 Subject: [PATCH 10/15] fix --- ingest/scripts/prepare_files.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/ingest/scripts/prepare_files.py b/ingest/scripts/prepare_files.py index ffce2c549d..b35be08606 100644 --- a/ingest/scripts/prepare_files.py +++ b/ingest/scripts/prepare_files.py @@ -1,6 +1,7 @@ import csv import json import logging +import os import sys from dataclasses import dataclass from pathlib import Path @@ -109,12 +110,16 @@ def main( revise_ids = set() submit_prior_to_revoke_ids = set() - def write_to_tsv_stream(data, index, filename): - with open(filename, "w", newline="", encoding="utf-8") as output_file: + def write_to_tsv_stream(data, filename): + # Check if the file exists + file_exists = os.path.exists(filename) + + with open(filename, "a", newline="", encoding="utf-8") as output_file: keys = data.keys() dict_writer = csv.DictWriter(output_file, keys, delimiter="\t") - if index == 0: + # Write the header only if the file doesn't already exist + if not file_exists: dict_writer.writeheader() dict_writer.writerow(data) @@ -124,22 +129,20 @@ def write_to_tsv_stream(data, index, filename): record = field["metadata"] if fasta_id in to_submit: - write_to_tsv_stream(record, len(submit_ids), metadata_submit_path) + write_to_tsv_stream(record, metadata_submit_path) submit_ids.update(ids_to_add(fasta_id, config)) continue if fasta_id in to_revise: record["accession"] = to_revise[fasta_id] - write_to_tsv_stream(record, len(revise_ids), metadata_revise_path) + write_to_tsv_stream(record, metadata_revise_path) revise_ids.update(ids_to_add(fasta_id, config)) continue found_seq_to_revoke = False if fasta_id in to_revoke: submit_prior_to_revoke_ids.update(ids_to_add(fasta_id, config)) - write_to_tsv_stream( - record, len(submit_prior_to_revoke_ids), metadata_submit_prior_to_revoke_path - ) + write_to_tsv_stream(record, metadata_submit_prior_to_revoke_path) found_seq_to_revoke = True if found_seq_to_revoke: From 40f891b7355ea187833d5ade532daf384937dde8 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 9 Jan 2025 21:02:57 +0100 Subject: [PATCH 11/15] dont store unneeded data --- ingest/scripts/group_segments.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/ingest/scripts/group_segments.py b/ingest/scripts/group_segments.py index 64131da095..ddc1499b81 100644 --- a/ingest/scripts/group_segments.py +++ b/ingest/scripts/group_segments.py @@ -1,6 +1,7 @@ """Script to group segments together into sequence entries prior to submission to Loculus -Example output for a single isolate with 3 segments: -"KJ682796.1.L/KJ682809.1.M/KJ682819.1.S": { +Example ndjson output for a single isolate with 3 segments: +{"id": "KJ682796.1.L/KJ682809.1.M/KJ682819.1.S", +"metadata": { "ncbiReleaseDate": "2014-07-06T00:00:00Z", "ncbiSourceDb": "GenBank", "authors": "D. Goedhals, F.J. Burt, J.T. Bester, R. Swanepoel", @@ -15,7 +16,7 @@ "hash_S": "f716ed13dca9c8a033d46da2f3dc2ff1", "hash": "ce7056d0bd7e3d6d3eca38f56b9d10f8", "submissionId": "KJ682796.1.L/KJ682809.1.M/KJ682819.1.S" -},""" +}}""" import hashlib import json @@ -194,8 +195,6 @@ def main( } ) - # Add segment specific metadata for the segments - metadata: dict[str, dict[str, str]] = {} # Map from original accession to the new concatenated accession fasta_id_map: dict[Accession, Accession] = {} @@ -245,8 +244,6 @@ def main( json.dumps(filtered_record, sort_keys=True).encode(), usedforsecurity=False ).hexdigest() - metadata[joint_key] = row - orjsonl.append(output_metadata, {"id": joint_key, "metadata": row}) count += 1 From 8dea5ef83c11cf3d44e70f7b362dcec218dea8db Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Sun, 12 Jan 2025 13:03:24 +0100 Subject: [PATCH 12/15] fix tsv column order bug --- ingest/scripts/call_loculus.py | 2 -- ingest/scripts/prepare_files.py | 13 ++++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/ingest/scripts/call_loculus.py b/ingest/scripts/call_loculus.py index 2a1191cded..25a568db8f 100644 --- a/ingest/scripts/call_loculus.py +++ b/ingest/scripts/call_loculus.py @@ -411,8 +411,6 @@ def get_submitted(config: Config): logger.info(f"Backend has status of: {len(statuses)} sequence entries from ingest") logger.info(f"Ingest has submitted: {len(entries)} sequence entries to ingest") - logger.debug(entries) - logger.debug(statuses) for entry in entries: loculus_accession = entry["accession"] submitter = entry["submitter"] diff --git a/ingest/scripts/prepare_files.py b/ingest/scripts/prepare_files.py index b35be08606..10ef36768e 100644 --- a/ingest/scripts/prepare_files.py +++ b/ingest/scripts/prepare_files.py @@ -110,12 +110,12 @@ def main( revise_ids = set() submit_prior_to_revoke_ids = set() - def write_to_tsv_stream(data, filename): + def write_to_tsv_stream(data, filename, columns_list=None): # Check if the file exists file_exists = os.path.exists(filename) with open(filename, "a", newline="", encoding="utf-8") as output_file: - keys = data.keys() + keys = columns_list or data.keys() dict_writer = csv.DictWriter(output_file, keys, delimiter="\t") # Write the header only if the file doesn't already exist @@ -124,25 +124,28 @@ def write_to_tsv_stream(data, filename): dict_writer.writerow(data) + columns_list = None for field in orjsonl.stream(metadata_path): fasta_id = field["id"] record = field["metadata"] + if not columns_list: + columns_list = record.keys() if fasta_id in to_submit: - write_to_tsv_stream(record, metadata_submit_path) + write_to_tsv_stream(record, metadata_submit_path, columns_list) submit_ids.update(ids_to_add(fasta_id, config)) continue if fasta_id in to_revise: record["accession"] = to_revise[fasta_id] - write_to_tsv_stream(record, metadata_revise_path) + write_to_tsv_stream(record, metadata_revise_path, columns_list) revise_ids.update(ids_to_add(fasta_id, config)) continue found_seq_to_revoke = False if fasta_id in to_revoke: submit_prior_to_revoke_ids.update(ids_to_add(fasta_id, config)) - write_to_tsv_stream(record, metadata_submit_prior_to_revoke_path) + write_to_tsv_stream(record, metadata_submit_prior_to_revoke_path, columns_list) found_seq_to_revoke = True if found_seq_to_revoke: From c7abd5f3c4330e841aaeba51bbbb45dfbc9e30b0 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Sun, 12 Jan 2025 13:07:43 +0100 Subject: [PATCH 13/15] fix --- ingest/scripts/prepare_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest/scripts/prepare_files.py b/ingest/scripts/prepare_files.py index 10ef36768e..d1b20e413b 100644 --- a/ingest/scripts/prepare_files.py +++ b/ingest/scripts/prepare_files.py @@ -138,7 +138,7 @@ def write_to_tsv_stream(data, filename, columns_list=None): if fasta_id in to_revise: record["accession"] = to_revise[fasta_id] - write_to_tsv_stream(record, metadata_revise_path, columns_list) + write_to_tsv_stream(record, metadata_revise_path, [*columns_list, "accession"]) revise_ids.update(ids_to_add(fasta_id, config)) continue From d8c67fb909de58f90328f64b96cb2b0124e53e05 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 16 Jan 2025 16:40:52 +0100 Subject: [PATCH 14/15] add tests --- .../metadata_post_prepare.ndjson | 1 + ingest/tests/test_ingest.py | 37 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/ingest/tests/expected_output_cchf/metadata_post_prepare.ndjson b/ingest/tests/expected_output_cchf/metadata_post_prepare.ndjson index 2c21990fd1..4b8dff0000 100644 --- a/ingest/tests/expected_output_cchf/metadata_post_prepare.ndjson +++ b/ingest/tests/expected_output_cchf/metadata_post_prepare.ndjson @@ -1,3 +1,4 @@ + { "id": "KX013462.1", "metadata": { diff --git a/ingest/tests/test_ingest.py b/ingest/tests/test_ingest.py index 051cae9a12..fe99b82e67 100644 --- a/ingest/tests/test_ingest.py +++ b/ingest/tests/test_ingest.py @@ -4,6 +4,8 @@ import subprocess from pathlib import Path +import orjsonl +import pandas as pd import pytest # Define the paths to your test data and expected output @@ -35,6 +37,25 @@ def compare_json_files(file1, file2): return json1 == json2 +def compare_ndjson_files(file1, file2): + def create_dict_from_ndjson(file): + dict = {} + for record in orjsonl.stream(file): + dict[record["id"]] = record["metadata"] + + dict1 = create_dict_from_ndjson(file1) + dict2 = create_dict_from_ndjson(file2) + + return dict1 == dict2 + +def compare_tsv_files(file1, file2): + df1 = pd.read_csv(file1, sep="\t") + df2 = pd.read_csv(file2, sep="\t") + + # Compare the contents + return df1.sort_index(axis=1).equals(df2.sort_index(axis=1)) + + def run_snakemake(rule, touch=False): """ Function to run Snakemake with the test data. @@ -78,6 +99,22 @@ def test_snakemake(): output_file, ), f"{output_file} does not match {expected_file}." + for expected_file in EXPECTED_OUTPUT_DIR.glob("*.tsv"): + output_file = OUTPUT_DIR / expected_file.name + assert output_file.exists(), f"{output_file} does not exist." + assert compare_tsv_files( + expected_file, + output_file, + ), f"{output_file} does not match {expected_file}." + + for expected_file in EXPECTED_OUTPUT_DIR.glob("*.ndjson"): + output_file = OUTPUT_DIR / expected_file.name + assert output_file.exists(), f"{output_file} does not exist." + assert compare_ndjson_files( + expected_file, + output_file, + ), f"{output_file} does not match {expected_file}." + if __name__ == "__main__": pytest.main(["-v"]) From ecfe6dfc568e129190722f8c90617ec8945869af Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Tue, 28 Jan 2025 14:16:43 +0100 Subject: [PATCH 15/15] fix tests after merge conflict --- .../metadata_post_prepare.ndjson | 175 +----------------- .../expected_output_cchf/revise_metadata.tsv | 2 +- .../expected_output_cchf/submit_metadata.tsv | 2 +- 3 files changed, 8 insertions(+), 171 deletions(-) diff --git a/ingest/tests/expected_output_cchf/metadata_post_prepare.ndjson b/ingest/tests/expected_output_cchf/metadata_post_prepare.ndjson index 4b8dff0000..a0763d1ceb 100644 --- a/ingest/tests/expected_output_cchf/metadata_post_prepare.ndjson +++ b/ingest/tests/expected_output_cchf/metadata_post_prepare.ndjson @@ -1,169 +1,6 @@ - -{ - "id": "KX013462.1", - "metadata": { - "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", - "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", - "bioprojectAccession": "", - "biosampleAccession": "", - "geoLocAdmin1": "Astrakhan", - "geoLocCountry": "Russia", - "hash": "15c5b9d511b1c6c37a3ff43280f3f617", - "hostNameScientific": "Ixodoidea", - "hostTaxonId": "297308", - "insdcAccessionBase": "KX013462", - "insdcAccessionFull": "KX013462.1", - "insdcVersion": "1", - "isLabHost": "", - "ncbiReleaseDate": "2016-12-07T00:00:00Z", - "ncbiSourceDb": "GenBank", - "ncbiUpdateDate": "2016-12-07T00:00:00Z", - "ncbiVirusName": "Orthonairovirus haemorrhagiae", - "ncbiVirusTaxId": "3052518", - "sampleCollectionDate": "1989", - "segment": "L", - "specimenCollectorSampleId": "K229_194", - "insdcRawReadsAccession": "", - "submissionId": "KX013462.1" - } -} -{ - "id": "KX013463.1", - "metadata": { - "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", - "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", - "bioprojectAccession": "", - "biosampleAccession": "", - "geoLocAdmin1": "Astrakhan", - "geoLocCountry": "Russia", - "hash": "c11e1b7951a73b14f70403bed5cf2d10", - "hostNameScientific": "Ixodoidea", - "hostTaxonId": "297308", - "insdcAccessionBase": "KX013463", - "insdcAccessionFull": "KX013463.1", - "insdcVersion": "1", - "isLabHost": "", - "ncbiReleaseDate": "2016-12-07T00:00:00Z", - "ncbiSourceDb": "GenBank", - "ncbiUpdateDate": "2016-12-07T00:00:00Z", - "ncbiVirusName": "Orthonairovirus haemorrhagiae", - "ncbiVirusTaxId": "3052518", - "sampleCollectionDate": "1989", - "segment": "M", - "specimenCollectorSampleId": "K229_194", - "insdcRawReadsAccession": "", - "submissionId": "KX013463.1" - } -} -{ - "id": "KX013464.1", - "metadata": { - "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", - "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", - "bioprojectAccession": "", - "biosampleAccession": "", - "geoLocAdmin1": "Astrakhan", - "geoLocCountry": "Russia", - "hash": "7cd7b3085ef50ad49973b92979a21ee8", - "hostNameScientific": "Ixodoidea", - "hostTaxonId": "297308", - "insdcAccessionBase": "KX013464", - "insdcAccessionFull": "KX013464.1", - "insdcVersion": "1", - "isLabHost": "", - "ncbiReleaseDate": "2016-12-07T00:00:00Z", - "ncbiSourceDb": "GenBank", - "ncbiUpdateDate": "2016-12-07T00:00:00Z", - "ncbiVirusName": "Orthonairovirus haemorrhagiae", - "ncbiVirusTaxId": "3052518", - "sampleCollectionDate": "1989", - "segment": "S", - "specimenCollectorSampleId": "K229_194", - "insdcRawReadsAccession": "", - "submissionId": "KX013464.1" - } -} -{ - "id": "KX013483.1", - "metadata": { - "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", - "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", - "bioprojectAccession": "", - "biosampleAccession": "", - "geoLocAdmin1": "", - "geoLocCountry": "Uganda", - "hash": "7b10a4e21daa8a2e693958761be17d53", - "hostNameScientific": "Homo sapiens", - "hostTaxonId": "9606", - "insdcAccessionBase": "KX013483", - "insdcAccessionFull": "KX013483.1", - "insdcVersion": "1", - "isLabHost": "", - "ncbiReleaseDate": "2016-12-07T00:00:00Z", - "ncbiSourceDb": "GenBank", - "ncbiUpdateDate": "2016-12-07T00:00:00Z", - "ncbiVirusName": "Orthonairovirus haemorrhagiae", - "ncbiVirusTaxId": "3052518", - "sampleCollectionDate": "1958", - "segment": "L", - "specimenCollectorSampleId": "Nakiwogo", - "insdcRawReadsAccession": "", - "submissionId": "KX013483.1" - } -} -{ - "id": "KX013485.1", - "metadata": { - "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", - "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", - "bioprojectAccession": "", - "biosampleAccession": "", - "geoLocAdmin1": "", - "geoLocCountry": "Uganda", - "hash": "70954bc35782b5592858ac3f1a6bbf89", - "hostNameScientific": "Homo sapiens", - "hostTaxonId": "9606", - "insdcAccessionBase": "KX013485", - "insdcAccessionFull": "KX013485.1", - "insdcVersion": "1", - "isLabHost": "", - "ncbiReleaseDate": "2016-12-07T00:00:00Z", - "ncbiSourceDb": "GenBank", - "ncbiUpdateDate": "2016-12-07T00:00:00Z", - "ncbiVirusName": "Orthonairovirus haemorrhagiae", - "ncbiVirusTaxId": "3052518", - "sampleCollectionDate": "1958", - "segment": "S", - "specimenCollectorSampleId": "Nakiwogo", - "insdcRawReadsAccession": "", - "submissionId": "KX013485.1" - } -} -{ - "id": "KX096703.1", - "metadata": { - "authorAffiliations": "Public Health England, Research", - "authors": "Deryabin, ; Atshabar, B.; Sansyzbaev, Y.; Berezin, V.; Nurmakhanov, T.; Yeskhojayev, O.; Vilkova, A.; Shevtsov, A.; Hewson, R.; Atkinson, B.", - "bioprojectAccession": "", - "biosampleAccession": "", - "geoLocAdmin1": "Sairam district", - "geoLocCountry": "Kazakhstan", - "hash": "cd17a5f4dcc98e7a2afd1ab9f30274bc", - "hostNameScientific": "Hyalomma anatolicum", - "hostTaxonId": "176092", - "insdcAccessionBase": "KX096703", - "insdcAccessionFull": "KX096703.1", - "insdcVersion": "1", - "isLabHost": "", - "ncbiReleaseDate": "2016-04-30T00:00:00Z", - "ncbiSourceDb": "GenBank", - "ncbiUpdateDate": "2016-04-30T00:00:00Z", - "ncbiVirusName": "Orthonairovirus haemorrhagiae", - "ncbiVirusTaxId": "3052518", - "sampleCollectionDate": "2015", - "segment": "S", - "specimenCollectorSampleId": "tick pool #134", - "insdcRawReadsAccession": "", - "submissionId": "KX096703.1" - } -} \ No newline at end of file + { "id": "KX013462.1", "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "Astrakhan", "geoLocCountry": "Russia", "hash": "15c5b9d511b1c6c37a3ff43280f3f617", "hostNameScientific": "Ixodoidea", "hostTaxonId": "297308", "insdcAccessionBase": "KX013462", "insdcAccessionFull": "KX013462.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-12-07T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-12-07T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "1989", "segment": "L", "specimenCollectorSampleId": "K229_194", "insdcRawReadsAccession": "", "submissionId": "KX013462.1" } } + { "id": "KX013463.1", "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "Astrakhan", "geoLocCountry": "Russia", "hash": "c11e1b7951a73b14f70403bed5cf2d10", "hostNameScientific": "Ixodoidea", "hostTaxonId": "297308", "insdcAccessionBase": "KX013463", "insdcAccessionFull": "KX013463.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-12-07T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-12-07T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "1989", "segment": "M", "specimenCollectorSampleId": "K229_194", "insdcRawReadsAccession": "", "submissionId": "KX013463.1" } } + { "id": "KX013464.1", "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "Astrakhan", "geoLocCountry": "Russia", "hash": "7cd7b3085ef50ad49973b92979a21ee8", "hostNameScientific": "Ixodoidea", "hostTaxonId": "297308", "insdcAccessionBase": "KX013464", "insdcAccessionFull": "KX013464.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-12-07T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-12-07T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "1989", "segment": "S", "specimenCollectorSampleId": "K229_194", "insdcRawReadsAccession": "", "submissionId": "KX013464.1" } } + { "id": "KX013483.1", "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "", "geoLocCountry": "Uganda", "hash": "7b10a4e21daa8a2e693958761be17d53", "hostNameScientific": "Homo sapiens", "hostTaxonId": "9606", "insdcAccessionBase": "KX013483", "insdcAccessionFull": "KX013483.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-12-07T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-12-07T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "1958", "segment": "L", "specimenCollectorSampleId": "Nakiwogo", "insdcRawReadsAccession": "", "submissionId": "KX013483.1" } } + { "id": "KX013485.1", "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "", "geoLocCountry": "Uganda", "hash": "70954bc35782b5592858ac3f1a6bbf89", "hostNameScientific": "Homo sapiens", "hostTaxonId": "9606", "insdcAccessionBase": "KX013485", "insdcAccessionFull": "KX013485.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-12-07T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-12-07T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "1958", "segment": "S", "specimenCollectorSampleId": "Nakiwogo", "insdcRawReadsAccession": "", "submissionId": "KX013485.1" } } + { "id": "KX096703.1", "metadata": { "authorAffiliations": "Public Health England, Research", "authors": "Deryabin, ; Atshabar, B.; Sansyzbaev, Y.; Berezin, V.; Nurmakhanov, T.; Yeskhojayev, O.; Vilkova, A.; Shevtsov, A.; Hewson, R.; Atkinson, B.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "Sairam district", "geoLocCountry": "Kazakhstan", "hash": "cd17a5f4dcc98e7a2afd1ab9f30274bc", "hostNameScientific": "Hyalomma anatolicum", "hostTaxonId": "176092", "insdcAccessionBase": "KX096703", "insdcAccessionFull": "KX096703.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-04-30T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-04-30T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "2015", "segment": "S", "specimenCollectorSampleId": "tick pool #134", "insdcRawReadsAccession": "", "submissionId": "KX096703.1" } } \ No newline at end of file diff --git a/ingest/tests/expected_output_cchf/revise_metadata.tsv b/ingest/tests/expected_output_cchf/revise_metadata.tsv index 9bcd5c51cf..77fb9269a5 100644 --- a/ingest/tests/expected_output_cchf/revise_metadata.tsv +++ b/ingest/tests/expected_output_cchf/revise_metadata.tsv @@ -1,2 +1,2 @@ -authorAffiliations authors bioprojectAccession biosampleAccession geoLocAdmin1 geoLocCountry hostNameScientific hostTaxonId isLabHost ncbiReleaseDate ncbiSourceDb ncbiVirusName ncbiVirusTaxId sampleCollectionDate specimenCollectorSampleId ncbiUpdateDate_L ncbiUpdateDate_M ncbiUpdateDate_S insdcVersion_L insdcVersion_M insdcVersion_S sraRunAccession_L sraRunAccession_M sraRunAccession_S hash_L hash_M hash_S insdcAccessionBase_L insdcAccessionBase_M insdcAccessionBase_S insdcAccessionFull_L insdcAccessionFull_M insdcAccessionFull_S submissionId hash accession +authorAffiliations authors bioprojectAccession biosampleAccession geoLocAdmin1 geoLocCountry hostNameScientific hostTaxonId isLabHost ncbiReleaseDate ncbiSourceDb ncbiVirusName ncbiVirusTaxId sampleCollectionDate specimenCollectorSampleId ncbiUpdateDate_L ncbiUpdateDate_M ncbiUpdateDate_S insdcVersion_L insdcVersion_M insdcVersion_S insdcRawReadsAccession_L insdcRawReadsAccession_M insdcRawReadsAccession_S hash_L hash_M hash_S insdcAccessionBase_L insdcAccessionBase_M insdcAccessionBase_S insdcAccessionFull_L insdcAccessionFull_M insdcAccessionFull_S submissionId hash accession Public Health England, Research Deryabin, ; Atshabar, B.; Sansyzbaev, Y.; Berezin, V.; Nurmakhanov, T.; Yeskhojayev, O.; Vilkova, A.; Shevtsov, A.; Hewson, R.; Atkinson, B. Sairam district Kazakhstan Hyalomma anatolicum 176092 2016-04-30T00:00:00Z GenBank Orthonairovirus haemorrhagiae 3052518 2015 tick pool #134 2016-04-30T00:00:00Z 1 cd17a5f4dcc98e7a2afd1ab9f30274bc KX096703 KX096703.1 KX096703.1.S 646cca576b1d24d397f5ecf0905fd5c1 LOC_0000VXA diff --git a/ingest/tests/expected_output_cchf/submit_metadata.tsv b/ingest/tests/expected_output_cchf/submit_metadata.tsv index fa8ffea4ff..140113d450 100644 --- a/ingest/tests/expected_output_cchf/submit_metadata.tsv +++ b/ingest/tests/expected_output_cchf/submit_metadata.tsv @@ -1,2 +1,2 @@ -authorAffiliations authors bioprojectAccession biosampleAccession geoLocAdmin1 geoLocCountry hostNameScientific hostTaxonId isLabHost ncbiReleaseDate ncbiSourceDb ncbiVirusName ncbiVirusTaxId sampleCollectionDate specimenCollectorSampleId ncbiUpdateDate_L ncbiUpdateDate_M ncbiUpdateDate_S insdcVersion_L insdcVersion_M insdcVersion_S sraRunAccession_L sraRunAccession_M sraRunAccession_S hash_L hash_M hash_S insdcAccessionBase_L insdcAccessionBase_M insdcAccessionBase_S insdcAccessionFull_L insdcAccessionFull_M insdcAccessionFull_S submissionId hash +authorAffiliations authors bioprojectAccession biosampleAccession geoLocAdmin1 geoLocCountry hostNameScientific hostTaxonId isLabHost ncbiReleaseDate ncbiSourceDb ncbiVirusName ncbiVirusTaxId sampleCollectionDate specimenCollectorSampleId ncbiUpdateDate_L ncbiUpdateDate_M ncbiUpdateDate_S insdcVersion_L insdcVersion_M insdcVersion_S insdcRawReadsAccession_L insdcRawReadsAccession_M insdcRawReadsAccession_S hash_L hash_M hash_S insdcAccessionBase_L insdcAccessionBase_M insdcAccessionBase_S insdcAccessionFull_L insdcAccessionFull_M insdcAccessionFull_S submissionId hash Chumakov Institute of Poliomyelitis and Viral Encephalitides Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P. Uganda Homo sapiens 9606 2016-12-07T00:00:00Z GenBank Orthonairovirus haemorrhagiae 3052518 1958 Nakiwogo 2016-12-07T00:00:00Z 2016-12-07T00:00:00Z 1 1 7b10a4e21daa8a2e693958761be17d53 70954bc35782b5592858ac3f1a6bbf89 KX013483 KX013485 KX013483.1 KX013485.1 KX013483.1.L/KX013485.1.S bb3a6d8df47cb2891e7b60030a40c335