Skip to content

Commit

Permalink
add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
anna-parker committed Jan 16, 2025
1 parent ad081f5 commit d6edd1a
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 168 deletions.
174 changes: 6 additions & 168 deletions ingest/tests/expected_output_cchf/metadata_post_prepare.ndjson
Original file line number Diff line number Diff line change
@@ -1,168 +1,6 @@
{
"id": "KX013462.1",
"metadata": {
"authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides",
"authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.",
"bioprojectAccession": "",
"biosampleAccession": "",
"geoLocAdmin1": "Astrakhan",
"geoLocCountry": "Russia",
"hash": "15c5b9d511b1c6c37a3ff43280f3f617",
"hostNameScientific": "Ixodoidea",
"hostTaxonId": "297308",
"insdcAccessionBase": "KX013462",
"insdcAccessionFull": "KX013462.1",
"insdcVersion": "1",
"isLabHost": "",
"ncbiReleaseDate": "2016-12-07T00:00:00Z",
"ncbiSourceDb": "GenBank",
"ncbiUpdateDate": "2016-12-07T00:00:00Z",
"ncbiVirusName": "Orthonairovirus haemorrhagiae",
"ncbiVirusTaxId": "3052518",
"sampleCollectionDate": "1989",
"segment": "L",
"specimenCollectorSampleId": "K229_194",
"sraRunAccession": "",
"submissionId": "KX013462.1"
}
}
{
"id": "KX013463.1",
"metadata": {
"authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides",
"authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.",
"bioprojectAccession": "",
"biosampleAccession": "",
"geoLocAdmin1": "Astrakhan",
"geoLocCountry": "Russia",
"hash": "c11e1b7951a73b14f70403bed5cf2d10",
"hostNameScientific": "Ixodoidea",
"hostTaxonId": "297308",
"insdcAccessionBase": "KX013463",
"insdcAccessionFull": "KX013463.1",
"insdcVersion": "1",
"isLabHost": "",
"ncbiReleaseDate": "2016-12-07T00:00:00Z",
"ncbiSourceDb": "GenBank",
"ncbiUpdateDate": "2016-12-07T00:00:00Z",
"ncbiVirusName": "Orthonairovirus haemorrhagiae",
"ncbiVirusTaxId": "3052518",
"sampleCollectionDate": "1989",
"segment": "M",
"specimenCollectorSampleId": "K229_194",
"sraRunAccession": "",
"submissionId": "KX013463.1"
}
}
{
"id": "KX013464.1",
"metadata": {
"authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides",
"authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.",
"bioprojectAccession": "",
"biosampleAccession": "",
"geoLocAdmin1": "Astrakhan",
"geoLocCountry": "Russia",
"hash": "7cd7b3085ef50ad49973b92979a21ee8",
"hostNameScientific": "Ixodoidea",
"hostTaxonId": "297308",
"insdcAccessionBase": "KX013464",
"insdcAccessionFull": "KX013464.1",
"insdcVersion": "1",
"isLabHost": "",
"ncbiReleaseDate": "2016-12-07T00:00:00Z",
"ncbiSourceDb": "GenBank",
"ncbiUpdateDate": "2016-12-07T00:00:00Z",
"ncbiVirusName": "Orthonairovirus haemorrhagiae",
"ncbiVirusTaxId": "3052518",
"sampleCollectionDate": "1989",
"segment": "S",
"specimenCollectorSampleId": "K229_194",
"sraRunAccession": "",
"submissionId": "KX013464.1"
}
}
{
"id": "KX013483.1",
"metadata": {
"authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides",
"authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.",
"bioprojectAccession": "",
"biosampleAccession": "",
"geoLocAdmin1": "",
"geoLocCountry": "Uganda",
"hash": "7b10a4e21daa8a2e693958761be17d53",
"hostNameScientific": "Homo sapiens",
"hostTaxonId": "9606",
"insdcAccessionBase": "KX013483",
"insdcAccessionFull": "KX013483.1",
"insdcVersion": "1",
"isLabHost": "",
"ncbiReleaseDate": "2016-12-07T00:00:00Z",
"ncbiSourceDb": "GenBank",
"ncbiUpdateDate": "2016-12-07T00:00:00Z",
"ncbiVirusName": "Orthonairovirus haemorrhagiae",
"ncbiVirusTaxId": "3052518",
"sampleCollectionDate": "1958",
"segment": "L",
"specimenCollectorSampleId": "Nakiwogo",
"sraRunAccession": "",
"submissionId": "KX013483.1"
}
}
{
"id": "KX013485.1",
"metadata": {
"authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides",
"authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.",
"bioprojectAccession": "",
"biosampleAccession": "",
"geoLocAdmin1": "",
"geoLocCountry": "Uganda",
"hash": "70954bc35782b5592858ac3f1a6bbf89",
"hostNameScientific": "Homo sapiens",
"hostTaxonId": "9606",
"insdcAccessionBase": "KX013485",
"insdcAccessionFull": "KX013485.1",
"insdcVersion": "1",
"isLabHost": "",
"ncbiReleaseDate": "2016-12-07T00:00:00Z",
"ncbiSourceDb": "GenBank",
"ncbiUpdateDate": "2016-12-07T00:00:00Z",
"ncbiVirusName": "Orthonairovirus haemorrhagiae",
"ncbiVirusTaxId": "3052518",
"sampleCollectionDate": "1958",
"segment": "S",
"specimenCollectorSampleId": "Nakiwogo",
"sraRunAccession": "",
"submissionId": "KX013485.1"
}
}
{
"id": "KX096703.1",
"metadata": {
"authorAffiliations": "Public Health England, Research",
"authors": "Deryabin, ; Atshabar, B.; Sansyzbaev, Y.; Berezin, V.; Nurmakhanov, T.; Yeskhojayev, O.; Vilkova, A.; Shevtsov, A.; Hewson, R.; Atkinson, B.",
"bioprojectAccession": "",
"biosampleAccession": "",
"geoLocAdmin1": "Sairam district",
"geoLocCountry": "Kazakhstan",
"hash": "cd17a5f4dcc98e7a2afd1ab9f30274bc",
"hostNameScientific": "Hyalomma anatolicum",
"hostTaxonId": "176092",
"insdcAccessionBase": "KX096703",
"insdcAccessionFull": "KX096703.1",
"insdcVersion": "1",
"isLabHost": "",
"ncbiReleaseDate": "2016-04-30T00:00:00Z",
"ncbiSourceDb": "GenBank",
"ncbiUpdateDate": "2016-04-30T00:00:00Z",
"ncbiVirusName": "Orthonairovirus haemorrhagiae",
"ncbiVirusTaxId": "3052518",
"sampleCollectionDate": "2015",
"segment": "S",
"specimenCollectorSampleId": "tick pool #134",
"sraRunAccession": "",
"submissionId": "KX096703.1"
}
}
{ "id": "KX013462.1", "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "Astrakhan", "geoLocCountry": "Russia", "hash": "15c5b9d511b1c6c37a3ff43280f3f617", "hostNameScientific": "Ixodoidea", "hostTaxonId": "297308", "insdcAccessionBase": "KX013462", "insdcAccessionFull": "KX013462.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-12-07T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-12-07T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "1989", "segment": "L", "specimenCollectorSampleId": "K229_194", "sraRunAccession": "", "submissionId": "KX013462.1" } }
{ "id": "KX013463.1", "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "Astrakhan", "geoLocCountry": "Russia", "hash": "c11e1b7951a73b14f70403bed5cf2d10", "hostNameScientific": "Ixodoidea", "hostTaxonId": "297308", "insdcAccessionBase": "KX013463", "insdcAccessionFull": "KX013463.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-12-07T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-12-07T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "1989", "segment": "M", "specimenCollectorSampleId": "K229_194", "sraRunAccession": "", "submissionId": "KX013463.1" } }
{ "id": "KX013464.1", "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "Astrakhan", "geoLocCountry": "Russia", "hash": "7cd7b3085ef50ad49973b92979a21ee8", "hostNameScientific": "Ixodoidea", "hostTaxonId": "297308", "insdcAccessionBase": "KX013464", "insdcAccessionFull": "KX013464.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-12-07T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-12-07T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "1989", "segment": "S", "specimenCollectorSampleId": "K229_194", "sraRunAccession": "", "submissionId": "KX013464.1" } }
{ "id": "KX013483.1", "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "", "geoLocCountry": "Uganda", "hash": "7b10a4e21daa8a2e693958761be17d53", "hostNameScientific": "Homo sapiens", "hostTaxonId": "9606", "insdcAccessionBase": "KX013483", "insdcAccessionFull": "KX013483.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-12-07T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-12-07T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "1958", "segment": "L", "specimenCollectorSampleId": "Nakiwogo", "sraRunAccession": "", "submissionId": "KX013483.1" } }
{ "id": "KX013485.1", "metadata": { "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", "authors": "Lukashev, A. N.; Klimentov, A. S.; Smirnova, S. E.; Dzagurova, T. K.; Drexler, J. F.; Gmyl, A. P.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "", "geoLocCountry": "Uganda", "hash": "70954bc35782b5592858ac3f1a6bbf89", "hostNameScientific": "Homo sapiens", "hostTaxonId": "9606", "insdcAccessionBase": "KX013485", "insdcAccessionFull": "KX013485.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-12-07T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-12-07T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "1958", "segment": "S", "specimenCollectorSampleId": "Nakiwogo", "sraRunAccession": "", "submissionId": "KX013485.1" } }
{ "id": "KX096703.1", "metadata": { "authorAffiliations": "Public Health England, Research", "authors": "Deryabin, ; Atshabar, B.; Sansyzbaev, Y.; Berezin, V.; Nurmakhanov, T.; Yeskhojayev, O.; Vilkova, A.; Shevtsov, A.; Hewson, R.; Atkinson, B.", "bioprojectAccession": "", "biosampleAccession": "", "geoLocAdmin1": "Sairam district", "geoLocCountry": "Kazakhstan", "hash": "cd17a5f4dcc98e7a2afd1ab9f30274bc", "hostNameScientific": "Hyalomma anatolicum", "hostTaxonId": "176092", "insdcAccessionBase": "KX096703", "insdcAccessionFull": "KX096703.1", "insdcVersion": "1", "isLabHost": "", "ncbiReleaseDate": "2016-04-30T00:00:00Z", "ncbiSourceDb": "GenBank", "ncbiUpdateDate": "2016-04-30T00:00:00Z", "ncbiVirusName": "Orthonairovirus haemorrhagiae", "ncbiVirusTaxId": "3052518", "sampleCollectionDate": "2015", "segment": "S", "specimenCollectorSampleId": "tick pool #134", "sraRunAccession": "", "submissionId": "KX096703.1" } }
37 changes: 37 additions & 0 deletions ingest/tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import subprocess
from pathlib import Path

import orjsonl
import pandas as pd
import pytest

# Define the paths to your test data and expected output
Expand Down Expand Up @@ -35,6 +37,25 @@ def compare_json_files(file1, file2):
return json1 == json2


def compare_ndjson_files(file1, file2):
    """Return True when two NDJSON files contain the same records.

    Every record is expected to be an object with an ``"id"`` key and a
    ``"metadata"`` key. Records are indexed by ``"id"``, so the order of
    lines in the files does not affect the result. If an id occurs more
    than once in a file, the last occurrence wins.

    Args:
        file1: Path to the first NDJSON file.
        file2: Path to the second NDJSON file.

    Returns:
        True if both files map the same ids to equal metadata, else False.

    Raises:
        KeyError: If a record lacks an ``"id"`` or ``"metadata"`` key.
    """

    def create_dict_from_ndjson(file):
        # The mapping must be returned: without a return value both sides
        # evaluate to None and the comparison below is vacuously True.
        return {
            record["id"]: record["metadata"] for record in orjsonl.stream(file)
        }

    return create_dict_from_ndjson(file1) == create_dict_from_ndjson(file2)

def compare_tsv_files(file1, file2):
    """Return True when two tab-separated files hold equal tables.

    Both files are loaded with pandas and their columns are sorted by
    label before comparing, so column order is ignored. Row order is
    still significant.
    """
    left, right = (
        pd.read_csv(path, sep="\t").sort_index(axis=1) for path in (file1, file2)
    )
    return left.equals(right)


def run_snakemake(rule, touch=False):
"""
Function to run Snakemake with the test data.
Expand Down Expand Up @@ -78,6 +99,22 @@ def test_snakemake():
output_file,
), f"{output_file} does not match {expected_file}."

for expected_file in EXPECTED_OUTPUT_DIR.glob("*.tsv"):
output_file = OUTPUT_DIR / expected_file.name
assert output_file.exists(), f"{output_file} does not exist."
assert compare_tsv_files(
expected_file,
output_file,
), f"{output_file} does not match {expected_file}."

for expected_file in EXPECTED_OUTPUT_DIR.glob("*.ndjson"):
output_file = OUTPUT_DIR / expected_file.name
assert output_file.exists(), f"{output_file} does not exist."
assert compare_ndjson_files(
expected_file,
output_file,
), f"{output_file} does not match {expected_file}."


# When this file is executed directly, invoke pytest in verbose mode.
if __name__ == "__main__":
pytest.main(["-v"])

0 comments on commit d6edd1a

Please sign in to comment.