diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index b4e91f7..309a19b 100755 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [ '3.8', '3.9', '3.10' ] + python-version: ['3.11'] steps: - name: Checkout code diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 46c9feb..885405f 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - python-version: ['3.8', '3.9', '3.10'] + python-version: ['3.11'] steps: - name: Checkout code diff --git a/.gitignore b/.gitignore index fc2bc44..2abcd34 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,6 @@ out vafator/tests/resources/results .cache .jupyter -.local \ No newline at end of file +.local +run.sh +VAFator.egg-info/* diff --git a/requirements.txt b/requirements.txt old mode 100755 new mode 100644 index bdd5c16..3484f8c --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,11 @@ -pandas~=1.3.3 -pysam~=0.19.1 -cyvcf2~=0.30.14 -logzero~=1.7.0 -pybedtools~=0.9.0 -numpy>=1.20,<2.0 -scipy>=1.0.0,<2.0.0 \ No newline at end of file +pandas>=3.0.1,<4 +# pysam pinned: above 0.21.0 base qualities show up wrong in the presence of soft clipping/insertions/overlapping read pairs or a combination of these factors +pysam==0.21.0 +cyvcf2>=0.32.1,<0.33 +logzero>=1.7.0,<2 +pybedtools>=0.12.0,<0.13 +numpy>=2.4.3,<3 +scipy>=1.17.1,<2 +setuptools +pytest +pytest-cov \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index a2f3748..4f08792 100755 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,63 @@ [metadata] -description-file = README.md \ No newline at end of file +name = VAFator +version = 3.0.0 +description = Annotate variants in a VCF file with technical annotations from one or more BAMs +description-file = README.md +long_description = file: README.md +long_description_content_type = text/markdown +license = MIT +url = https://github.com/TRON-Bioinformatics/vafator +author = Pablo Riesgo Ferreiro, Jonas Ibn-Salem, Luis Kress, Özlem Muslu +classifiers = + Development Status :: 4 - Beta + Intended Audience :: Healthcare Industry + Intended Audience :: Science/Research + Topic :: Scientific/Engineering :: Bio-Informatics + Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.12 + Programming Language :: Python :: 3.13 + Programming Language :: Python :: 3 :: Only + License :: OSI Approved :: MIT License + Operating System :: Unix +author_email = priesgoferreiro@gmail.com + +[options.entry_points] +console_scripts = + vafator=vafator.command_line:annotator + multiallelics-filter=vafator.command_line:multiallelics_filter + vafator2decifer=vafator.command_line:vafator2decifer + hatchet2bed=vafator.command_line:hatchet2bed + +[options] +packages = find: +include_package_data = True +zip_safe = False + +python_requires = >=3.11, <3.12 + +install_requires = + pandas>=3.0.1,<4 + pysam==0.21.0 # above this version base qualities show up wrong in the presence of soft clipping/insertions/both (latest release 0.23.3) + cyvcf2>=0.32.1,<0.33 + logzero>=1.7.0,<2 + pybedtools>=0.12.0,<0.13 + numpy>=2.4.3,<3 + scipy>=1.17.1,<2 + setuptools + +[options.packages.find] +exclude = + tests + tests.* + legacy + legacy.* + +[options.extras_require] +dev = + pytest + ruff + mypy +test = + pytest + pytest-cov + setuptools \ No newline at end of file diff --git a/setup.py b/setup.py index 1025dd9..c29369c 100755 --- a/setup.py +++ b/setup.py @@ -1,52 +1,11 @@ -from setuptools import find_packages, setup +from setuptools import setup import vafator VERSION = vafator.VERSION - -# parses requirements from file -with open("requirements.txt") as f: - required = f.read().splitlines() - with open("README.md", "r", encoding="utf-8") as f: long_description = f.read() # Build the Python package -setup( - name='vafator', - version=VERSION, - packages=find_packages(exclude=["legacy"]), - entry_points={ - 'console_scripts': [ - 'vafator=vafator.command_line:annotator', - 'multiallelics-filter=vafator.command_line:multiallelics_filter', - 'vafator2decifer=vafator.command_line:vafator2decifer', - 'hatchet2bed=vafator.command_line:hatchet2bed' - ], - }, - author="TRON - Translational Oncology at the University Medical Center of the Johannes Gutenberg University Mainz" - "- Computational Medicine group", - author_email='pablo.riesgoferreiro@tron-mainz.de', - description='Annotate a VCF file with AF, AD and DP from tumor and normal BAMs', - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/tron-bioinformatics/vafator", - requires=[], - install_requires=required, - classifiers=[ - 'Development Status :: 4 - Beta', # Chose either "3 - Alpha", "4 - Beta" or "5 - Production/Stable" as the current state of your package - 'Intended Audience :: Healthcare Industry', - 'Intended Audience :: Science/Research', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3 :: Only', - "License :: OSI Approved :: MIT License", - "Operating System :: Unix" - ], - python_requires='>=3.7', - license='MIT' -) \ No newline at end of file +setup() diff --git a/vafator/__init__.py b/vafator/__init__.py index 90a0094..ea9d694 100755 --- a/vafator/__init__.py +++ b/vafator/__init__.py @@ -1,4 +1 @@ -VERSION='2.2.2' - - -AMBIGUOUS_BASES = ['N', 'M', 'R', 'W', 'S', 'Y', 'K', 'V', 'H', 'D', 'B'] +VERSION = "3.0.0" diff --git a/vafator/annotator.py b/vafator/annotator.py index 00f8293..5351e50 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -1,420 +1,500 @@ -from collections import Counter - -import numpy as np -import pysam -from cyvcf2 import VCF, Writer, Variant -import os -import vafator -import datetime -import json -import asyncio -import time - -from vafator.ploidies import DEFAULT_PLOIDY -from vafator.rank_sum_test import calculate_rank_sum_test, get_rank_sum_tests -from vafator.power import PowerCalculator, DEFAULT_ERROR_RATE, DEFAULT_FPR -from vafator.pileups import get_variant_pileup, get_metrics - - -BATCH_SIZE = 10000 - - -def background(f): - def wrapped(*args, **kwargs): - return asyncio.get_event_loop().run_in_executor(None, f, *args, **kwargs) - - return wrapped - - -class Annotator(object): - - vafator_header = { - "name": "vafator", - "version": vafator.VERSION, - "date": datetime.datetime.now().ctime(), - "timestamp": datetime.datetime.now().timestamp(), - } - - def __init__(self, input_vcf: str, output_vcf: str, - input_bams: dict, - purities: dict = {}, - mapping_qual_thr=0, - base_call_qual_thr=29, - tumor_ploidies: dict = {}, - normal_ploidy=2, - fpr=DEFAULT_FPR, - error_rate=DEFAULT_ERROR_RATE, - include_ambiguous_bases=False): - - self.mapping_quality_threshold = mapping_qual_thr - self.base_call_quality_threshold = base_call_qual_thr - self.purities = purities - self.tumor_ploidies = tumor_ploidies - self.normal_ploidy = normal_ploidy - self.include_ambiguous_bases = include_ambiguous_bases - self.power = PowerCalculator( - normal_ploidy=normal_ploidy, tumor_ploidies=tumor_ploidies, purities=purities, - error_rate=error_rate, fpr=fpr) - - self.vcf = VCF(input_vcf) - # sets a line in the header with the command used to annotate the file - self.vafator_header["input_vcf"] = os.path.abspath(input_vcf) - self.vafator_header["output_vcf"] = os.path.abspath(output_vcf) - self.vafator_header["bams"] = ";".join( - ["{}:{}".format(s, ",".join([os.path.abspath(b) for b in bams])) for s, bams in input_bams.items()]) - self.vafator_header["mapping_quality_threshold"] = mapping_qual_thr - self.vafator_header["base_call_quality_threshold"] = base_call_qual_thr - self.vafator_header["purities"] = ";".join(["{}:{}".format(s, p) for s, p in purities.items()]) - self.vafator_header["normal_ploidy"] = normal_ploidy - self.vafator_header["tumor_ploidy"] = ";".join(["{}:{}".format(s, p.report_value) - for s, p in tumor_ploidies.items()]) \ - if tumor_ploidies else DEFAULT_PLOIDY - self.vafator_header["include_ambiguous_bases"] = self.include_ambiguous_bases - self.vcf.add_to_header("##vafator_command_line={}".format(json.dumps(self.vafator_header))) - # adds to the header all the names of the annotations - for a in Annotator._get_headers(input_bams): - self.vcf.add_info_to_header(a) - self.vcf_writer = Writer(output_vcf, self.vcf) - self.bam_readers = {s : [pysam.AlignmentFile(b, "rb") for b in bams] for s, bams in input_bams.items()} - - @staticmethod - def _get_headers(input_bams: dict): - headers = [] - - for s, bams in input_bams.items(): - headers.append({ - 'ID': "{}_af".format(s), - 'Description': "Allele frequency for the alternate alleles in the {} sample/s".format(s), - 'Type': 'Float', - 'Number': 'A' - }) - headers.append({ - 'ID': "{}_dp".format(s), - 'Description': "Total depth of coverage in the {} sample/s (independent of alleles)".format(s), - 'Type': 'Float', - 'Number': '1' - }) - headers.append({ - 'ID': "{}_ac".format(s), - 'Description': "Allele count for the alternate alleles in the {} sample/s".format(s), - 'Type': 'Integer', - 'Number': 'A' - }) - headers.append({ - 'ID': "{}_n".format(s), - 'Description': "Allele count for ambiguous bases (any IUPAC ambiguity code is counted) " - "in the {} sample/s".format(s), - 'Type': 'Integer', - 'Number': '1' - }) - headers.append({ - 'ID': "{}_pu".format(s), - 'Description': "Probability of an undetected mutation given the observed supporting reads (AC), " - "the observed total coverage (DP) and the provided tumor purity in the " - "{} sample/s".format(s), - 'Type': 'Float', - 'Number': 'A' - }) - headers.append({ - 'ID': "{}_pw".format(s), - 'Description': "Power to detect a somatic mutation as described in Absolute " - "given the observed total coverage (DP) " - "and the provided tumor purity and ploidies in the {} sample/s".format(s), - 'Type': 'Float', - 'Number': '1' - }) - headers.append({ - 'ID': "{}_k".format(s), - 'Description': "Minimum number of supporting reads, k, such that the probability of observing " - "k or more non-reference reads due to sequencing error is less than the defined FPR " - "in the {} sample/s".format(s), - 'Type': 'Float', - 'Number': '1' - }) - headers.append({ - 'ID': "{}_eaf".format(s), - 'Description': "Expected VAF considering the purity and ploidy/copy number in the " - "{} sample/s".format(s), - 'Type': 'Float', - 'Number': '1' - }) - headers.append({ - 'ID': "{}_bq".format(s), - 'Description': "Median base call quality of the reads supporting each allele in the " - "{} sample/s".format(s), - 'Type': 'Float', - 'Number': 'R' - }) - headers.append({ - 'ID': "{}_mq".format(s), - 'Description': "Median mapping quality of the reads supporting each allele in the " - "{} sample/s".format(s), - 'Type': 'Float', - 'Number': 'R' - }) - headers.append({ - 'ID': "{}_pos".format(s), - 'Description': "Median position within the read of the reads supporting each allele in the " - "{} sample/s".format(s), - 'Type': 'Float', - 'Number': 'R' - }) - headers.append({ - 'ID': "{}_rsmq".format(s), - 'Description': "Rank sum test comparing the MQ distributions supporting the reference and the " - "alternate in the {} sample/s. Identical distributions will have a value of 0, larger " - "values away from 0 indicate different distributions.".format(s), - 'Type': 'Float', - 'Number': 'A' - }) - headers.append({ - 'ID': "{}_rsmq_pv".format(s), - 'Description': "Rank sum test comparing the mapping quality distributions between alternate " - "and reference p-value in the {} sample/s. , The null hypothesis is that there is no " - "difference between the distributions".format(s), - 'Type': 'Float', - 'Number': 'A' - }) - headers.append({ - 'ID': "{}_rsbq".format(s), - 'Description': "Rank sum test comparing the base call qualities distributions supporting the reference " - "and the alternate in the {} sample/s. Identical distributions will have a value of 0, " - "larger values away from 0 indicate different distributions.".format(s), - 'Type': 'Float', - 'Number': 'A' - }) - headers.append({ - 'ID': "{}_rsbq_pv".format(s), - 'Description': "Rank sum test comparing the base call qualities distributions between alternate " - "and reference p-value in the {} sample/s. , The null hypothesis is that there is no " - "difference between the distributions".format(s), - 'Type': 'Float', - 'Number': 'A' - }) - headers.append({ - 'ID': "{}_rspos".format(s), - 'Description': "Rank sum test comparing the relative position distributions supporting the reference " - "and the alternate in the {} sample/s. Identical distributions will have a value of 0, " - "larger values away from 0 indicate different distributions.".format(s), - 'Type': 'Float', - 'Number': 'A' - }) - headers.append({ - 'ID': "{}_rspos_pv".format(s), - 'Description': "Rank sum test comparing the relative position distributions between alternate " - "and reference p-value in the {} sample/s. , The null hypothesis is that there is no " - "difference between the distributions".format(s), - 'Type': 'Float', - 'Number': 'A' - }) - - if len(bams) > 1: - for i, bam in enumerate(bams, start=1): - n = os.path.basename(bam).split(".")[0] - headers = headers + [ - {'ID': "{}_af_{}".format(s, i), - 'Description': "Allele frequency for the alternate alleles in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'A'}, - {'ID': "{}_dp_{}".format(s, i), - 'Description': "Depth of coverage in the {} sample {} (independent of alleles)".format(s, n), - 'Type': 'Float', 'Number': '1'}, - {'ID': "{}_ac_{}".format(s, i), - 'Description': "Allele count for the alternate alleles in the {} sample {}".format(s, n), - 'Type': 'Integer', 'Number': 'A'}, - {'ID': "{}_n_{}".format(s, i), - 'Description': "Allele count for ambiguous bases (any IUPAC ambiguity code is counted) " - "in the {} sample {}".format(s, n), - 'Type': 'Integer', 'Number': '1'}, - {'ID': "{}_pu_{}".format(s, i), - 'Description': "Probability of an undetected mutation given the observed supporting " - "reads (AC), the observed total coverage (DP) and the provided tumor " - "purity in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'A'}, - {'ID': "{}_pw_{}".format(s, i), - 'Description': "Power to detect a somatic mutation as described in Absolute " - "given the observed total coverage (DP) " - "and the provided tumor purity and ploidies in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': '1'}, - {'ID': "{}_k_{}".format(s, i), - 'Description': "Minimum number of supporting reads, k, such that the probability of observing " - "k or more non-reference reads due to sequencing error is less than the " - "defined FPR in the {} sample {}".format(s, n), - 'Type': 'Float', - 'Number': '1'}, - {'ID': "{}_bq_{}".format(s, i), - 'Description': "Median base call quality of the reads supporting each allele in " - "the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'R'}, - {'ID': "{}_rsbq_{}".format(s, i), - 'Description': "Rank sum test comparing the base call qualities distributions supporting the " - "reference and the alternate in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'A'}, - {'ID': "{}_rsbq_pv_{}".format(s, i), - 'Description': "Significance for the rank sum test comparing the base call qualities " - "distributions supporting the reference and the alternate " - "in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'A'}, - {'ID': "{}_mq_{}".format(s, i), - 'Description': "Median mapping quality of the reads supporting each allele in " - "the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'R'}, - {'ID': "{}_rsmq_{}".format(s, i), - 'Description': "Rank sum test comparing the mapping qualities distributions supporting the " - "reference and the alternate in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'A'}, - {'ID': "{}_rsmq_pv_{}".format(s, i), - 'Description': "Significance for the rank sum test comparing the mapping qualities " - "distributions supporting the reference and the alternate " - "in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'A'}, - {'ID': "{}_pos_{}".format(s, i), - 'Description': "Median position within the read of the reads supporting each allele in " - "the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'R'}, - {'ID': "{}_rspos_{}".format(s, i), - 'Description': "Rank sum test comparing the position distributions supporting the " - "reference and the alternate in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'A'}, - {'ID': "{}_rspos_pv_{}".format(s, i), - 'Description': "Significance for the rank sum test comparing the position " - "distributions supporting the reference and the alternate " - "in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'A'}, - ] - return headers - - @background - def _write_batch(self, batch): - for v in batch: - self.vcf_writer.write_record(v) - - def _add_stats(self, variant: Variant): - for sample, bams in self.bam_readers.items(): - global_dp = 0 - global_ac = Counter() - global_bq = Counter() - global_mq = Counter() - global_pos = Counter() - global_all_mqs = {} - global_all_bqs = {} - global_all_positions = {} - for i, bam in enumerate(bams): - pileups = get_variant_pileup( - variant=variant, bam=bam, - min_base_quality=self.base_call_quality_threshold, - min_mapping_quality=self.mapping_quality_threshold) - coverage_metrics = get_metrics(variant=variant, pileups=pileups, - include_ambiguous_bases=self.include_ambiguous_bases) - if coverage_metrics is not None: - if len(bams) > 1: - variant.INFO["{}_af_{}".format(sample, i + 1)] = \ - ",".join([str(self._calculate_af(coverage_metrics.ac[alt], coverage_metrics.dp)) - for alt in variant.ALT]) - variant.INFO["{}_ac_{}".format(sample, i + 1)] = \ - ",".join([str(coverage_metrics.ac[alt]) for alt in variant.ALT]) - variant.INFO["{}_n_{}".format(sample, i + 1)] = \ - str(sum([coverage_metrics.ac.get(n, 0) for n in vafator.AMBIGUOUS_BASES])) - variant.INFO["{}_dp_{}".format(sample, i + 1)] = coverage_metrics.dp - variant.INFO["{}_pu_{}".format(sample, i + 1)] = ",".join( - [str(self.power.calculate_power( - ac=coverage_metrics.ac[alt], dp=coverage_metrics.dp, sample=sample, variant=variant - )) for alt in variant.ALT]) - power, k = self.power.calculate_absolute_power( - dp=coverage_metrics.dp, sample=sample, variant=variant) - variant.INFO["{}_pw_{}".format(sample, i + 1)] = str(power) - variant.INFO["{}_k_{}".format(sample, i + 1)] = str(k) - variant.INFO["{}_bq_{}".format(sample, i + 1)] = ",".join( - [str(coverage_metrics.bqs[variant.REF])] + - [str(coverage_metrics.bqs[alt]) for alt in variant.ALT]) - variant.INFO["{}_mq_{}".format(sample, i + 1)] = ",".join( - [str(coverage_metrics.mqs[variant.REF])] + - [str(coverage_metrics.mqs[alt]) for alt in variant.ALT]) - variant.INFO["{}_pos_{}".format(sample, i + 1)] = ",".join( - [str(coverage_metrics.positions[variant.REF])] + - [str(coverage_metrics.positions[alt]) for alt in variant.ALT]) - - pvalues, stats = get_rank_sum_tests(coverage_metrics.all_mqs, variant) - if stats: - variant.INFO["{}_rsmq_{}".format(sample, i + 1)] = ",".join(stats) - variant.INFO["{}_rsmq_pv_{}".format(sample, i + 1)] = ",".join(pvalues) - - pvalues, stats = get_rank_sum_tests(coverage_metrics.all_bqs, variant) - if stats: - variant.INFO["{}_rsbq_{}".format(sample, i + 1)] = ",".join(stats) - variant.INFO["{}_rsbq_pv_{}".format(sample, i + 1)] = ",".join(pvalues) - - pvalues, stats = get_rank_sum_tests(coverage_metrics.all_positions, variant) - if stats: - variant.INFO["{}_rspos_{}".format(sample, i + 1)] = ",".join(stats) - variant.INFO["{}_rspos_pv_{}".format(sample, i + 1)] = ",".join(pvalues) - - global_ac.update(coverage_metrics.ac) - global_bq.update(coverage_metrics.bqs) - global_mq.update(coverage_metrics.mqs) - global_pos.update(coverage_metrics.positions) - global_all_mqs.update(coverage_metrics.all_mqs) - global_all_bqs.update(coverage_metrics.all_bqs) - global_all_positions.update(coverage_metrics.all_positions) - global_dp += coverage_metrics.dp - - variant.INFO["{}_af".format(sample)] = ",".join([str(self._calculate_af(global_ac[alt], global_dp)) for alt in variant.ALT]) - variant.INFO["{}_ac".format(sample)] = ",".join([str(global_ac[alt]) for alt in variant.ALT]) - variant.INFO["{}_n".format(sample)] = str(sum([global_ac.get(n, 0) for n in vafator.AMBIGUOUS_BASES])) - variant.INFO["{}_dp".format(sample)] = global_dp - variant.INFO["{}_eaf".format(sample)] = str(self.power.calculate_expected_vaf( - sample=sample, variant=variant)) - variant.INFO["{}_pu".format(sample)] = ",".join( - [str(self.power.calculate_power(ac=global_ac[alt], dp=global_dp, sample=sample, variant=variant)) - for alt in variant.ALT]) - power, k = self.power.calculate_absolute_power( - dp=global_dp, sample=sample, variant=variant) - variant.INFO["{}_pw".format(sample)] = str(power) - variant.INFO["{}_k".format(sample)] = str(k) - variant.INFO["{}_bq".format(sample)] = ",".join( - [str(global_bq[variant.REF])] + [str(global_bq[alt]) for alt in variant.ALT]) - variant.INFO["{}_mq".format(sample)] = ",".join( - [str(global_mq[variant.REF])] + [str(global_mq[alt]) for alt in variant.ALT]) - variant.INFO["{}_pos".format(sample)] = ",".join( - [str(global_pos[variant.REF])] + [str(global_pos[alt]) for alt in variant.ALT]) - - # for these rank sum tests it is required at least one value for the alternate and one value for the - # reference otherwise it cannot be calculated - pvalues, stats = get_rank_sum_tests(global_all_mqs, variant) - if stats: - variant.INFO["{}_rsmq".format(sample)] = ",".join(stats) - variant.INFO["{}_rsmq_pv".format(sample)] = ",".join(pvalues) - - pvalues, stats = get_rank_sum_tests(global_all_bqs, variant) - if stats: - variant.INFO["{}_rsbq".format(sample)] = ",".join(stats) - variant.INFO["{}_rsbq_pv".format(sample)] = ",".join(pvalues) - - pvalues, stats = get_rank_sum_tests(global_all_positions, variant) - if stats: - variant.INFO["{}_rspos".format(sample)] = ",".join(stats) - variant.INFO["{}_rspos_pv".format(sample)] = ",".join(pvalues) - - def _calculate_af(self, ac, dp): - return round(float(ac) / dp, 5) if dp > 0 else 0.0 - - def run(self): - batch = [] - variant: Variant - for variant in self.vcf: - # gets the counts of all bases across all BAMs - self._add_stats(variant) - - batch.append(variant) - if len(batch) >= BATCH_SIZE: - self._write_batch(batch) - batch = [] - if len(batch) > 0: - self._write_batch(batch) - - time.sleep(2) - - self.vcf_writer.close() - self.vcf.close() - for _, bams in self.bam_readers.items(): - for bam in bams: - bam.close() +from collections import Counter +import os +from concurrent.futures import ProcessPoolExecutor + +import pysam +from cyvcf2 import VCF, Writer, Variant +import vafator +import datetime +import json + +from vafator.constants import ( + AMBIGUOUS_BASES, + BATCH_SIZE, + _HEADER_TEMPLATES, + _REPLICATE_HEADER_TEMPLATES, +) +from vafator.ploidies import DEFAULT_PLOIDY +from vafator.rank_sum_test import get_rank_sum_tests +from vafator.power import PowerCalculator, DEFAULT_ERROR_RATE, DEFAULT_FPR +from vafator.pileups import collect_metrics_for_chrom, stream_variants_by_chrom +from vafator.pileup_utils import VariantRecord, EMPTY_METRICS + + +def _collect_metrics_worker( + chrom: str, + variant_tuples: list, + bam_paths: dict, + min_base_quality: int, + min_mapping_quality: int, + include_ambiguous_bases: bool, +) -> dict: + """ + Top-level worker function for ProcessPoolExecutor — must be module-level to be picklable. + Opens its own BAM readers (AlignmentFile objects cannot be shared across processes). + Receives variant data as plain tuples (cyvcf2.Variant objects are not picklable). + + Args: + chrom: chromosome name + variant_tuples: list of (POS, REF, ALT) tuples for variants on this chromosome + bam_paths: {sample: [bam_path, ...]} — picklable BAM file paths + min_base_quality: minimum base call quality threshold + min_mapping_quality: minimum mapping quality threshold + include_ambiguous_bases: whether to include ambiguous bases in depth calculation + + Returns: + {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}} + """ + all_metrics = {} + variants = [ + VariantRecord(CHROM=chrom, POS=pos, REF=ref, ALT=[alt]) + for pos, ref, alt in variant_tuples + ] + for sample, bam_files in bam_paths.items(): + for i, bam_path in enumerate(bam_files): + bam = pysam.AlignmentFile(bam_path, "r") + all_metrics[(sample, i)] = collect_metrics_for_chrom( + chrom=chrom, + variants=variants, + bam=bam, + min_base_quality=min_base_quality, + min_mapping_quality=min_mapping_quality, + include_ambiguous_bases=include_ambiguous_bases, + ) + bam.close() + return all_metrics + + +class Annotator(object): + + def __init__( + self, + input_vcf: str, + output_vcf: str, + input_bams: dict, + purities: dict = None, + mapping_qual_thr: int = 0, + base_call_qual_thr: int = 29, + tumor_ploidies: dict = None, + normal_ploidy: int = 2, + fpr: float = DEFAULT_FPR, + error_rate: float = DEFAULT_ERROR_RATE, + include_ambiguous_bases: bool = True, + num_processes: int = 1, + ): + """ + Args: + input_vcf: path to the input VCF file to annotate + output_vcf: path for the annotated output VCF + input_bams: {sample_name: [bam_path, ...]} — one or more BAMs per sample + purities: {sample_name: purity} — tumor purity per sample (default: 1.0) + mapping_qual_thr: minimum mapping quality; reads below this are excluded + base_call_qual_thr: minimum base call quality; bases below this are excluded + tumor_ploidies: {sample_name: PloidyManager} — tumor ploidy per sample (default: 2) + normal_ploidy: normal ploidy for power calculation (default: 2) + fpr: false positive rate for power calculation + error_rate: sequencing error rate for power calculation + include_ambiguous_bases: if True, ambiguous bases (N and IUPAC codes) are counted in DP + num_processes: number of parallel processes for chromosome-level annotation (default: 1) + """ + + if purities is None: + purities = {} + if tumor_ploidies is None: + tumor_ploidies = {} + + self.mapping_quality_threshold = mapping_qual_thr + self.base_call_quality_threshold = base_call_qual_thr + self.purities = purities + self.tumor_ploidies = tumor_ploidies + self.normal_ploidy = normal_ploidy + self.include_ambiguous_bases = include_ambiguous_bases + self.num_processes = num_processes + self.power = PowerCalculator( + normal_ploidy=normal_ploidy, + tumor_ploidies=tumor_ploidies, + purities=purities, + error_rate=error_rate, + fpr=fpr, + ) + + self.vcf = VCF(input_vcf) + + self.header = { + "name": "vafator", + "version": vafator.VERSION, + "date": datetime.datetime.now().ctime(), + "timestamp": datetime.datetime.now().timestamp(), + "input_vcf": os.path.abspath(input_vcf), + "output_vcf": os.path.abspath(output_vcf), + "bams": ";".join( + [ + "{}:{}".format(s, ",".join([os.path.abspath(b) for b in bams])) + for s, bams in input_bams.items() + ] + ), + "mapping_quality_threshold": mapping_qual_thr, + "base_call_quality_threshold": base_call_qual_thr, + "purities": ";".join(["{}:{}".format(s, p) for s, p in purities.items()]), + "normal_ploidy": normal_ploidy, + "tumor_ploidy": ( + ";".join( + [ + "{}:{}".format(s, p.report_value) + for s, p in tumor_ploidies.items() + ] + ) + if tumor_ploidies + else DEFAULT_PLOIDY + ), + "include_ambiguous_bases": include_ambiguous_bases, + } + self.vcf.add_to_header( + "##vafator_command_line={}".format(json.dumps(self.header)) + ) + + for a in Annotator._get_headers(input_bams): + self.vcf.add_info_to_header(a) + self.vcf_writer = Writer(output_vcf, self.vcf) + + self.bam_paths = input_bams + self.bam_readers = { + s: [pysam.AlignmentFile(b, "r") for b in bams] + for s, bams in input_bams.items() + } + + def run(self) -> None: + """Run the annotation pipeline over all variants in the input VCF, + writing annotated records to the output VCF.""" + batch = [] + if self.num_processes > 1: + self._run_parallel(batch) + else: + self._run_serial(batch) + if batch: + self._write_batch(batch) + self.vcf_writer.close() + self.vcf.close() + for _, bams in self.bam_readers.items(): + for bam in bams: + bam.close() + + def _run_serial(self, batch: list) -> None: + """Annotate variants chromosome by chromosome in the main process.""" + for chrom, chrom_variants in stream_variants_by_chrom(self.vcf): + all_metrics = self._collect_chrom_metrics(chrom, chrom_variants) + self._annotate_and_batch(chrom_variants, all_metrics, batch) + + def _run_parallel(self, batch: list) -> None: + """Annotate variants using a process pool — one worker per chromosome. + Results are collected and written in original VCF chromosome order.""" + chrom_variants_map = {} + futures = {} + with ProcessPoolExecutor(max_workers=self.num_processes) as executor: + for chrom, chrom_variants in stream_variants_by_chrom(self.vcf): + chrom_variants_map[chrom] = chrom_variants + variant_tuples = [(v.POS, v.REF, v.ALT[0]) for v in chrom_variants] + futures[ + executor.submit( + _collect_metrics_worker, + chrom=chrom, + variant_tuples=variant_tuples, + bam_paths=self.bam_paths, + min_base_quality=self.base_call_quality_threshold, + min_mapping_quality=self.mapping_quality_threshold, + include_ambiguous_bases=self.include_ambiguous_bases, + ) + ] = chrom + chrom_results = {futures[f]: f.result() for f in futures} + for chrom, chrom_variants in chrom_variants_map.items(): + self._annotate_and_batch(chrom_variants, chrom_results[chrom], batch) + + def _collect_chrom_metrics(self, chrom: str, chrom_variants: list) -> dict: + """Collect metrics for all BAMs for one chromosome in the main process. + + Args: + chrom: chromosome name + chrom_variants: list of cyvcf2 Variant objects on this chromosome + + Returns: + {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}} + """ + all_metrics = {} + for sample, bams in self.bam_readers.items(): + for i, bam in enumerate(bams): + all_metrics[(sample, i)] = collect_metrics_for_chrom( + chrom=chrom, + variants=chrom_variants, + bam=bam, + min_base_quality=self.base_call_quality_threshold, + min_mapping_quality=self.mapping_quality_threshold, + include_ambiguous_bases=self.include_ambiguous_bases, + ) + return all_metrics + + def _annotate_and_batch( + self, chrom_variants: list, all_metrics: dict, batch: list + ) -> None: + """Annotate variants using pre-computed metrics and append to write batch. + Flushes the batch to disk when it reaches BATCH_SIZE. + + Args: + chrom_variants: list of cyvcf2 Variant objects to annotate + all_metrics: {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}} + batch: accumulator list for annotated variants pending write + """ + for variant in chrom_variants: + metrics_by_bam = { + (sample, i): all_metrics[(sample, i)].get( + (variant.POS, variant.REF, variant.ALT[0]), EMPTY_METRICS + ) + for sample, bams in self.bam_readers.items() + for i in range(len(bams)) + } + self._add_stats(variant, metrics_by_bam) + batch.append(variant) + if len(batch) >= BATCH_SIZE: + self._write_batch(batch) + batch.clear() + + def _add_stats(self, variant: Variant, metrics_by_bam: dict) -> None: + """Annotate a single variant using pre-computed metrics. + + Args: + variant: the cyvcf2 Variant to annotate in place + metrics_by_bam: {(sample, bam_idx): CoverageMetrics} + """ + for sample, bams in self.bam_readers.items(): + global_dp = 0 + global_ac = Counter() + global_bq = Counter() + global_mq = Counter() + global_pos = Counter() + global_all_mqs = {} + global_all_bqs = {} + global_all_positions = {} + + for i in range(len(bams)): + metrics = metrics_by_bam.get((sample, i), EMPTY_METRICS) + if metrics is not None: + if len(bams) > 1: + self._annotate_replicate(variant, sample, i, metrics) + global_ac.update(metrics.ac) + global_bq.update(metrics.bqs) + global_mq.update(metrics.mqs) + global_pos.update(metrics.positions) + global_all_mqs.update(metrics.all_mqs) + global_all_bqs.update(metrics.all_bqs) + global_all_positions.update(metrics.all_positions) + global_dp += metrics.dp + + self._annotate_sample( + variant, + sample, + global_ac, + global_dp, + global_bq, + global_mq, + global_pos, + global_all_mqs, + global_all_bqs, + global_all_positions, + ) + + def _annotate_replicate(self, v: Variant, s: str, i: int, m) -> None: + """Write per-replicate annotations — only called when multiple BAMs are provided for a sample. + + Args: + v: the cyvcf2 Variant being annotated + s: sample name (e.g. 'tumor', 'normal') + i: 0-based index of the BAM within the sample's BAM list + m: pre-computed CoverageMetrics for this variant in this BAM + """ + n = i + 1 + v.INFO["{}_af_{}".format(s, n)] = ",".join( + [str(self._calculate_af(m.ac[alt], m.dp)) for alt in v.ALT] + ) + v.INFO["{}_ac_{}".format(s, n)] = ",".join([str(m.ac[alt]) for alt in v.ALT]) + v.INFO["{}_n_{}".format(s, n)] = str( + sum(m.ac.get(b, 0) for b in AMBIGUOUS_BASES) + ) + v.INFO["{}_dp_{}".format(s, n)] = m.dp + v.INFO["{}_pu_{}".format(s, n)] = ",".join( + [ + str( + self.power.calculate_power( + ac=m.ac[alt], dp=m.dp, sample=s, variant=v + ) + ) + for alt in v.ALT + ] + ) + + power, k = self.power.calculate_absolute_power(dp=m.dp, sample=s, variant=v) + v.INFO["{}_pw_{}".format(s, n)] = str(power) + v.INFO["{}_k_{}".format(s, n)] = str(k) + v.INFO["{}_bq_{}".format(s, n)] = ",".join( + [str(m.bqs[v.REF])] + [str(m.bqs[alt]) for alt in v.ALT] + ) + v.INFO["{}_mq_{}".format(s, n)] = ",".join( + [str(m.mqs[v.REF])] + [str(m.mqs[alt]) for alt in v.ALT] + ) + v.INFO["{}_pos_{}".format(s, n)] = ",".join( + [str(m.positions[v.REF])] + [str(m.positions[alt]) for alt in v.ALT] + ) + for key, tag in [ + (m.all_mqs, "rsmq"), + (m.all_bqs, "rsbq"), + (m.all_positions, "rspos"), + ]: + pvalues, stats = get_rank_sum_tests(key, v) + if stats: + v.INFO["{}_{}_{}".format(s, tag, n)] = ",".join(stats) + v.INFO["{}_{}_pv_{}".format(s, tag, n)] = ",".join(pvalues) + + def _annotate_sample( + self, + v: Variant, + s: str, + gac: Counter, + gdp: int, + gbq: Counter, + gmq: Counter, + gpos: Counter, + gallmq: dict, + gallbq: dict, + gallpos: dict, + ) -> None: + """Write aggregate annotations for a sample, combining metrics across all replicates. + + Args: + v: the cyvcf2 Variant being annotated + s: sample name (e.g. 'tumor', 'normal') + gac: allele counts summed across all BAMs + gdp: total depth summed across all BAMs + gbq: median BQ per allele, summed across all BAMs + gmq: median MQ per allele, summed across all BAMs + gpos: median read position per allele, summed across all BAMs + gallmq: MQ distributions per allele across all BAMs + gallbq: BQ distributions per allele across all BAMs + gallpos: read position distributions per allele across all BAMs + """ + v.INFO["{}_af".format(s)] = ",".join( + [str(self._calculate_af(gac[alt], gdp)) for alt in v.ALT] + ) + v.INFO["{}_ac".format(s)] = ",".join([str(gac[alt]) for alt in v.ALT]) + v.INFO["{}_n".format(s)] = str(sum(gac.get(b, 0) for b in AMBIGUOUS_BASES)) + v.INFO["{}_dp".format(s)] = gdp + v.INFO["{}_eaf".format(s)] = str( + self.power.calculate_expected_vaf(sample=s, variant=v) + ) + v.INFO["{}_pu".format(s)] = ",".join( + [ + str( + self.power.calculate_power(ac=gac[alt], dp=gdp, sample=s, variant=v) + ) + for alt in v.ALT + ] + ) + + power, k = self.power.calculate_absolute_power(dp=gdp, sample=s, variant=v) + v.INFO["{}_pw".format(s)] = str(power) + v.INFO["{}_k".format(s)] = str(k) + v.INFO["{}_bq".format(s)] = ",".join( + [str(gbq[v.REF])] + [str(gbq[alt]) for alt in v.ALT] + ) + v.INFO["{}_mq".format(s)] = ",".join( + [str(gmq[v.REF])] + [str(gmq[alt]) for alt in v.ALT] + ) + v.INFO["{}_pos".format(s)] = ",".join( + [str(gpos[v.REF])] + [str(gpos[alt]) for alt in v.ALT] + ) + for distributions, tag in [ + (gallmq, "rsmq"), + (gallbq, "rsbq"), + (gallpos, "rspos"), + ]: + pvalues, stats = get_rank_sum_tests(distributions, v) + if stats: + v.INFO["{}_{}".format(s, tag)] = ",".join(stats) + v.INFO["{}_{}_pv".format(s, tag)] = ",".join(pvalues) + + def _calculate_af(self, ac: int, dp: int) -> float: + """Return allele frequency, or 0.0 if depth is zero. + + Args: + ac: allele count for this alternate allele + dp: total depth of coverage + + Returns: + allele frequency rounded to 5 decimal places, or 0.0 if dp is zero + """ + return round(float(ac) / dp, 5) if dp > 0 else 0.0 + + def _write_batch(self, batch: list) -> None: + """Write a batch of annotated variants to the output VCF. + + Args: + batch: list of cyvcf2 Variant objects to write + """ + for v in batch: + self.vcf_writer.write_record(v) + + @staticmethod + def _get_headers(input_bams: dict) -> list: + """Build the list of INFO header entries for all samples and replicates. + + Args: + input_bams: {sample_name: [bam_path, ...]} + + Returns: + list of header dicts suitable for cyvcf2's add_info_to_header + """ + headers = [] + for s, bams in input_bams.items(): + for suffix, description, typ, number in _HEADER_TEMPLATES: + headers.append( + Annotator._make_header(suffix, description, typ, number, sample=s) + ) + if len(bams) > 1: + for i in range(1, len(bams) + 1): + # n = os.path.basename(bam).split(".")[0] + for suffix, description, typ, number in _REPLICATE_HEADER_TEMPLATES: + headers.append( + Annotator._make_header( + suffix, description, typ, number, sample=s, index=i + ) + ) + return headers + + @staticmethod + def _make_header( + suffix: str, + description: str, + typ: str, + number: str, + sample: str, + index: int = None, + ) -> dict: + """Build a single INFO header dict for cyvcf2's add_info_to_header. + + Args: + suffix: annotation suffix (e.g. 'af', 'dp') + description: description template with {sample} placeholder + typ: VCF type string ('Float', 'Integer', 'String') + number: VCF number string ('A', 'R', '1', etc.) + sample: sample name to substitute into ID and description + index: optional replicate index appended to the ID + + Returns: + dict with keys ID, Description, Type, Number suitable for cyvcf2's add_info_to_header + """ + header_id = "{}_{}".format(sample, suffix) + if index is not None: + header_id = "{}_{}".format(header_id, index) + return { + "ID": header_id, + "Description": description.format(sample=sample), + "Type": typ, + "Number": number, + } diff --git a/vafator/command_line.py b/vafator/command_line.py index 06cdd85..b8f3ea1 100755 --- a/vafator/command_line.py +++ b/vafator/command_line.py @@ -1,7 +1,7 @@ #!/usr/bin/env python import argparse -import sys import logging +from pathlib import Path import vafator from vafator.power import DEFAULT_FPR, DEFAULT_ERROR_RATE from vafator.hatchet2bed import run_hatchet2bed @@ -13,43 +13,124 @@ epilog = "Copyright (c) 2019-2021 TRON gGmbH (See LICENSE for licensing details)" +def _validate_alignment_file_extension(alignment_file): + suffixes = [suffix.lower() for suffix in Path(alignment_file).suffixes] + if ".sam" in suffixes: + raise ValueError( + "SAM input is not supported: {}. Please provide BAM or CRAM files.".format( + alignment_file + ) + ) + + def annotator(): # set up logger - parser = argparse.ArgumentParser(description="vafator v{}".format(vafator.VERSION), - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - epilog=epilog) - parser.add_argument("--input-vcf", dest="input_vcf", action="store", help="The VCF to annotate", required=True) - parser.add_argument("--output-vcf", dest="output_vcf", action="store", help="The annotated VCF", required=True) - parser.add_argument('--bam', action='append', nargs=2, - metavar=('sample_name', 'bam_file'), default=[], - help='A sample name and a BAM file. Can be used multiple times to input multiple samples and ' - 'multiple BAM files. The same sample name can be used multiple times with different BAMs, ' - 'this will treated as replicates.') - parser.add_argument("--mapping-quality", dest="mapping_quality", action="store", type=int, default=1, - help="All reads with a mapping quality below this threshold will be filtered out") - parser.add_argument("--base-call-quality", dest="base_call_quality", action="store", type=int, default=30, - help="All bases with a base call quality below this threshold will be filtered out") - parser.add_argument('--purity', action='append', nargs=2, - metavar=('sample_name', 'purity'), default=[], - help='A sample name and a tumor purity value. Can be used multiple times to input multiple ' - 'samples in combination with --bam. If no purity is provided for a given sample the ' - 'default value is 1.0') - parser.add_argument("--tumor-ploidy", action='append', nargs=2, - metavar=('sample_name', 'tumor_ploidy'), default=[], - help='A sample name and a tumor ploidy. Can be used multiple times to input multiple ' - 'samples in combination with --bam. The tumor ploidy can be provided as a genome-wide ' - 'value (eg: --tumor-ploidy primary 2) or as local copy numbers in a BED file ' - '(eg: --tumor-ploidy primary /path/to/copy_numbers.bed), see the documentation for ' - 'expected BED format (default: 2)') - parser.add_argument("--normal-ploidy", dest="normal_ploidy", required=False, default=2, type=int, - help="Normal ploidy for the power calculation (default: 2)") - parser.add_argument("--fpr", dest="fpr", required=False, default=DEFAULT_FPR, type=float, - help="False Positive Rate (FPR) to use in the power calculation") - parser.add_argument("--error-rate", dest="error_rate", required=False, default=DEFAULT_ERROR_RATE, type=float, - help="Error rate to use in the power calculation") - parser.add_argument("--include-ambiguous-bases", dest="include_ambiguous_bases", action='store_true', - help="Flag indicating to include ambiguous bases from the DP calculation") + parser = argparse.ArgumentParser( + description="vafator v{}".format(vafator.VERSION), + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + epilog=epilog, + ) + parser.add_argument( + "--input-vcf", + dest="input_vcf", + action="store", + help="The VCF to annotate", + required=True, + ) + parser.add_argument( + "--output-vcf", + dest="output_vcf", + action="store", + help="The annotated VCF", + required=True, + ) + parser.add_argument( + "--bam", + action="append", + nargs=2, + metavar=("sample_name", "bam_file"), + default=[], + help="A sample name and a BAM/CRAM file. Can be used multiple times to input multiple samples and " + "multiple BAM/CRAM files. The same sample name can be used multiple times with different BAMs/CRAMs, " + "this will treated as replicates.", + ) + parser.add_argument( + "--mapping-quality", + dest="mapping_quality", + action="store", + type=int, + default=1, + help="All reads with a mapping quality below this threshold will be filtered out", + ) + parser.add_argument( + "--base-call-quality", + dest="base_call_quality", + action="store", + type=int, + default=30, + help="All bases with a base call quality below this threshold will be filtered out", + ) + parser.add_argument( + "--purity", + action="append", + nargs=2, + metavar=("sample_name", "purity"), + default=[], + help="A sample name and a tumor purity value. Can be used multiple times to input multiple " + "samples in combination with --bam. If no purity is provided for a given sample the " + "default value is 1.0", + ) + parser.add_argument( + "--tumor-ploidy", + action="append", + nargs=2, + metavar=("sample_name", "tumor_ploidy"), + default=[], + help="A sample name and a tumor ploidy. Can be used multiple times to input multiple " + "samples in combination with --bam. The tumor ploidy can be provided as a genome-wide " + "value (eg: --tumor-ploidy primary 2) or as local copy numbers in a BED file " + "(eg: --tumor-ploidy primary /path/to/copy_numbers.bed), see the documentation for " + "expected BED format (default: 2)", + ) + parser.add_argument( + "--normal-ploidy", + dest="normal_ploidy", + required=False, + default=2, + type=int, + help="Normal ploidy for the power calculation (default: 2)", + ) + parser.add_argument( + "--fpr", + dest="fpr", + required=False, + default=DEFAULT_FPR, + type=float, + help="False Positive Rate (FPR) to use in the power calculation", + ) + parser.add_argument( + "--error-rate", + dest="error_rate", + required=False, + default=DEFAULT_ERROR_RATE, + type=float, + help="Error rate to use in the power calculation", + ) + parser.add_argument( + "--exclude-ambiguous-bases", + dest="exclude_ambiguous_bases", + action="store_true", + help="Flag indicating to include ambiguous bases from the DP calculation", + ) + parser.add_argument( + "--num-processes", + dest="num_processes", + required=False, + default=1, + type=int, + help="Number of processes for parallel chromosome-level annotation (default: 1)", + ) args = parser.parse_args() @@ -57,6 +138,7 @@ def annotator(): bams = {} for sample_name, bam in args.bam: + _validate_alignment_file_extension(bam) if sample_name in bams: bams[sample_name].append(bam) else: @@ -65,106 +147,207 @@ def annotator(): purities = {} for sample_name, purity in args.purity: if sample_name in purities: - raise ValueError('Multiple purity values provided for sample: {}'.format(sample_name)) + raise ValueError( + "Multiple purity values provided for sample: {}".format(sample_name) + ) if sample_name not in bams: - raise ValueError('Provided a purity value for a sample for which no BAM is provided: {}'.format(sample_name)) + raise ValueError( + "Provided a purity value for a sample for which no BAM is provided: {}".format( + sample_name + ) + ) purities[sample_name] = float(purity) tumor_ploidies = {} for sample_name, tumor_ploidy in args.tumor_ploidy: if sample_name in tumor_ploidies: - raise ValueError('Multiple tumor ploidy values provided for sample: {}'.format(sample_name)) + raise ValueError( + "Multiple tumor ploidy values provided for sample: {}".format( + sample_name + ) + ) if sample_name not in bams: raise ValueError( - 'Provided a tumor ploidy value for a sample for which no BAM is provided: {}'.format(sample_name)) + "Provided a tumor ploidy value for a sample for which no BAM is provided: {}".format( + sample_name + ) + ) try: # checks if a genome-wide purity value was passed - tumor_ploidies[sample_name] = PloidyManager(genome_wide_ploidy=float(tumor_ploidy)) + tumor_ploidies[sample_name] = PloidyManager( + genome_wide_ploidy=float(tumor_ploidy) + ) except ValueError: # checks if the non float-like value is a path to an existing file tumor_ploidies[sample_name] = PloidyManager(local_copy_numbers=tumor_ploidy) if len(bams) == 0: - raise ValueError("Please, provide at least one bam file with '--bam sample_name /path/to/file.bam'") - - try: - annotator = Annotator( - input_vcf=args.input_vcf, - output_vcf=args.output_vcf, - input_bams=bams, - mapping_qual_thr=args.mapping_quality, - base_call_qual_thr=args.base_call_quality, - purities=purities, - tumor_ploidies=tumor_ploidies, - normal_ploidy=int(args.normal_ploidy), - fpr=args.fpr, - error_rate=args.error_rate, - include_ambiguous_bases=args.include_ambiguous_bases + raise ValueError( + "Please, provide at least one bam file with '--bam sample_name /path/to/file.bam'" ) - annotator.run() - except Exception as e: - logging.error(str(e)) - sys.exit(-1) + + annotator = Annotator( + input_vcf=args.input_vcf, + output_vcf=args.output_vcf, + input_bams=bams, + mapping_qual_thr=args.mapping_quality, + base_call_qual_thr=args.base_call_quality, + purities=purities, + tumor_ploidies=tumor_ploidies, + normal_ploidy=int(args.normal_ploidy), + fpr=args.fpr, + error_rate=args.error_rate, + include_ambiguous_bases=(not args.exclude_ambiguous_bases), + num_processes=args.num_processes, + ) + annotator.run() + logging.info("Vafator finished!") def multiallelics_filter(): # set up logger - parser = argparse.ArgumentParser(description="vafator v{}".format(vafator.VERSION), - formatter_class=argparse.ArgumentDefaultsHelpFormatter, epilog=epilog) - parser.add_argument("--input-vcf", dest="input_vcf", action="store", help="The VCF to annotate", required=True) - parser.add_argument("--output-vcf", dest="output_vcf", action="store", help="The annotated VCF", required=True) - parser.add_argument("--tumor-sample-name", dest="tumor_sample_name", action="store", - help='The tumor sample name (will look for annotation ${SAMPLE_NAME}_af)', default='tumor') + parser = argparse.ArgumentParser( + description="vafator v{}".format(vafator.VERSION), + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + epilog=epilog, + ) + parser.add_argument( + "--input-vcf", + dest="input_vcf", + action="store", + help="The VCF to annotate", + required=True, + ) + parser.add_argument( + "--output-vcf", + dest="output_vcf", + action="store", + help="The annotated VCF", + required=True, + ) + parser.add_argument( + "--tumor-sample-name", + dest="tumor_sample_name", + action="store", + help="The tumor sample name (will look for annotation ${SAMPLE_NAME}_af)", + default="tumor", + ) args = parser.parse_args() logging.info("Vafator multiallelic filter starting...") - try: - filter = MultiallelicFilter( - input_vcf=args.input_vcf, - output_vcf=args.output_vcf, - tumor_sample_name=args.tumor_sample_name - ) - filter.run() - except Exception as e: - logging.error(str(e)) - sys.exit(-1) + filter = MultiallelicFilter( + input_vcf=args.input_vcf, + output_vcf=args.output_vcf, + tumor_sample_name=args.tumor_sample_name, + ) + filter.run() + logging.info("Vafator multiallelic filter finished!") def vafator2decifer(): - parser = argparse.ArgumentParser(description='Generate input for Decifer using VCF file and HATCHet CNA file') - parser.add_argument("-V", "--vcf_file", required=True, type=str, help="single or multi-sample VCF file") - parser.add_argument("-S", "--samples", required=True, type=str, - help="comma separated list of sample name prefixes to use for VAFator annotations, " - "eg: primary_tumor,metastasis_tumor; the annotations primary_tumor_ac, primary_tumor_dp, " - "etc. will be expected to exist") - parser.add_argument("-C", "--cna_file", required=True, type=str, help="HATCHet CNA file: best.seg.ucn ") - parser.add_argument("-O", "--out_dir", required=True, default="./", type=str, - help="directory for printing files; please make unique for each patient!") - parser.add_argument("-M", "--min_depth", required=True, type=int, help="minimum depth PER sample") - parser.add_argument("-A", "--min_alt_depth", required=True, type=int, - help="minimum depth of ALT allele in at least one sample") - parser.add_argument("-F", "--min_vaf", required=True, type=float, - help="minimum VAF of ALT allele in at least one sample") - parser.add_argument("-N", "--max_CN", required=False, default=6, type=int, - help="maximum total copy number for each observed clone") - parser.add_argument("-B", "--exclude_list", required=False, default=None, type=str, - help="BED file of genomic regions to exclude") - parser.add_argument("-p", "--min_purity", required=False, default=0.0, type=float, - help="minimum purity to consider samples") - parser.add_argument("--snp_file", required=False, default=None, type=str, - help="HATCHet file containing germline SNP counts in tumor samples, baf/tumor.1bed") + parser = argparse.ArgumentParser( + description="Generate input for Decifer using VCF file and HATCHet CNA file" + ) + parser.add_argument( + "-V", + "--vcf_file", + required=True, + type=str, + help="single or multi-sample VCF file", + ) + parser.add_argument( + "-S", + "--samples", + required=True, + type=str, + help="comma separated list of sample name prefixes to use for VAFator annotations, " + "eg: primary_tumor,metastasis_tumor; the annotations primary_tumor_ac, primary_tumor_dp, " + "etc. will be expected to exist", + ) + parser.add_argument( + "-C", + "--cna_file", + required=True, + type=str, + help="HATCHet CNA file: best.seg.ucn ", + ) + parser.add_argument( + "-O", + "--out_dir", + required=True, + default="./", + type=str, + help="directory for printing files; please make unique for each patient!", + ) + parser.add_argument( + "-M", "--min_depth", required=True, type=int, help="minimum depth PER sample" + ) + parser.add_argument( + "-A", + "--min_alt_depth", + required=True, + type=int, + help="minimum depth of ALT allele in at least one sample", + ) + parser.add_argument( + "-F", + "--min_vaf", + required=True, + type=float, + help="minimum VAF of ALT allele in at least one sample", + ) + parser.add_argument( + "-N", + "--max_CN", + required=False, + default=6, + type=int, + help="maximum total copy number for each observed clone", + ) + parser.add_argument( + "-B", + "--exclude_list", + required=False, + default=None, + type=str, + help="BED file of genomic regions to exclude", + ) + parser.add_argument( + "-p", + "--min_purity", + required=False, + default=0.0, + type=float, + help="minimum purity to consider samples", + ) + parser.add_argument( + "--snp_file", + required=False, + default=None, + type=str, + help="HATCHet file containing germline SNP counts in tumor samples, baf/tumor.1bed", + ) args = parser.parse_args() run_vafator2decifer(args) def hatchet2bed(): - parser = argparse.ArgumentParser(description='Generate input for Decifer using VCF file and HATCHet CNA file') - parser.add_argument("-i", "--input-file", required=True, type=str, help="input *.ucn hatchet file") - parser.add_argument("-o", "--output-prefix", required=True, type=str, - help="output BED file prefix, one file will be created per sample in the input with the " - "average tumor copy number in each segment") + parser = argparse.ArgumentParser( + description="Generate input for Decifer using VCF file and HATCHet CNA file" + ) + parser.add_argument( + "-i", "--input-file", required=True, type=str, help="input *.ucn hatchet file" + ) + parser.add_argument( + "-o", + "--output-prefix", + required=True, + type=str, + help="output BED file prefix, one file will be created per sample in the input with the " + "average tumor copy number in each segment", + ) args = parser.parse_args() run_hatchet2bed(input_file=args.input_file, output_prefix=args.output_prefix) diff --git a/vafator/constants.py b/vafator/constants.py new file mode 100755 index 0000000..8496c0f --- /dev/null +++ b/vafator/constants.py @@ -0,0 +1,115 @@ +# Annotation batch size for VCF writing +BATCH_SIZE = 10000 + +# IUPAC ambiguity codes treated as ambiguous bases in pileup metrics +AMBIGUOUS_BASES = ["N", "M", "R", "W", "S", "Y", "K", "V", "H", "D", "B"] + +# Header templates: (suffix, description, type, number) +# {sample} is substituted at generation time +_HEADER_TEMPLATES = [ + ( + "af", + "Allele frequency for the alternate alleles in the {sample} sample/s", + "Float", + "A", + ), + ( + "dp", + "Total depth of coverage in the {sample} sample/s (independent of alleles)", + "Float", + "1", + ), + ( + "ac", + "Allele count for the alternate alleles in the {sample} sample/s", + "Integer", + "A", + ), + ( + "n", + "Allele count for ambiguous bases (any IUPAC ambiguity code is counted) in the {sample} sample/s", + "Integer", + "1", + ), + ( + "pu", + "Probability of an undetected mutation given the observed supporting reads (AC), the observed total coverage (DP) and the provided tumor purity in the {sample} sample/s", + "Float", + "A", + ), + ( + "pw", + "Power to detect a somatic mutation as described in Absolute given the observed total coverage (DP) and the provided tumor purity and ploidies in the {sample} sample/s", + "Float", + "1", + ), + ( + "k", + "Minimum number of supporting reads, k, such that the probability of observing k or more non-reference reads due to sequencing error is less than the defined FPR in the {sample} sample/s", + "Float", + "1", + ), + ( + "eaf", + "Expected VAF considering the purity and ploidy/copy number in the {sample} sample/s", + "Float", + "1", + ), + ( + "bq", + "Median base call quality of the reads supporting each allele in the {sample} sample/s", + "Float", + "R", + ), + ( + "mq", + "Median mapping quality of the reads supporting each allele in the {sample} sample/s", + "Float", + "R", + ), + ( + "pos", + "Median position within the read of the reads supporting each allele in the {sample} sample/s", + "Float", + "R", + ), + ( + "rsmq", + "Rank sum test comparing the MQ distributions supporting the reference and the alternate in the {sample} sample/s", + "Float", + "A", + ), + ( + "rsmq_pv", + "Rank sum test p-value for MQ distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", + "Float", + "A", + ), + ( + "rsbq", + "Rank sum test comparing the BQ distributions supporting the reference and the alternate in the {sample} sample/s", + "Float", + "A", + ), + ( + "rsbq_pv", + "Rank sum test p-value for BQ distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", + "Float", + "A", + ), + ( + "rspos", + "Rank sum test comparing the position distributions supporting the reference and the alternate in the {sample} sample/s", + "Float", + "A", + ), + ( + "rspos_pv", + "Rank sum test p-value for position distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", + "Float", + "A", + ), +] + +# eaf is not produced per-replicate +_REPLICATE_HEADER_TEMPLATES = [t for t in _HEADER_TEMPLATES if t[0] != "eaf"] diff --git a/vafator/hatchet2bed.py b/vafator/hatchet2bed.py index d0b45bb..cbc5989 100644 --- a/vafator/hatchet2bed.py +++ b/vafator/hatchet2bed.py @@ -2,9 +2,9 @@ def run_hatchet2bed(input_file, output_prefix): - input_df = pd.read_csv(input_file, sep='\t') - cn_columns = sorted(list(filter(lambda c: c.startswith('cn_clone'), input_df.columns))) - u_columns = sorted(list(filter(lambda c: c.startswith('u_clone'), input_df.columns))) + input_df = pd.read_csv(input_file, sep="\t") + cn_columns = sorted([c for c in input_df.columns if c.startswith("cn_clone")]) + u_columns = sorted([c for c in input_df.columns if c.startswith("u_clone")]) for sample in input_df.SAMPLE.unique(): data = [] @@ -14,9 +14,18 @@ def run_hatchet2bed(input_file, output_prefix): for cn_column, u_column in zip(cn_columns, u_columns): u = float(row[u_column]) total_u += u - cn = sum(map(lambda c: float(c), row[cn_column].split('|'))) + cn = sum(map(float, row[cn_column].split("|"))) numerator.append(u * cn) - data.append([row['#CHR'], row['START'], row['END'], sum(numerator)/total_u]) + data.append( + [row["#CHR"], row["START"], row["END"], sum(numerator) / total_u] + ) - output_df = pd.DataFrame(data=data, columns=['chromosome', 'start', 'end', 'value']) - output_df.to_csv('{}.{}.bed'.format(output_prefix, sample), sep='\t', header=False, index=False) + output_df = pd.DataFrame( + data=data, columns=["chromosome", "start", "end", "value"] + ) + output_df.to_csv( + "{}.{}.bed".format(output_prefix, sample), + sep="\t", + header=False, + index=False, + ) diff --git a/vafator/multiallelic_filter.py b/vafator/multiallelic_filter.py index 6f08d7e..6b998d6 100755 --- a/vafator/multiallelic_filter.py +++ b/vafator/multiallelic_filter.py @@ -16,7 +16,7 @@ class MultiallelicFilter(object): multiallelic_annotation_name = "multiallelic" - def __init__(self, input_vcf, output_vcf, tumor_sample_name='tumor'): + def __init__(self, input_vcf, output_vcf, tumor_sample_name="tumor"): """ :param input_vcf: the input VCF file :param output_vcf: the file path of the output VCF @@ -26,11 +26,20 @@ def __init__(self, input_vcf, output_vcf, tumor_sample_name='tumor'): # sets a line in the header with the command used to annotate the file self.vafator_header["input_vcf"] = input_vcf self.vafator_header["output_vcf"] = output_vcf - self.vcf.add_to_header("##vafator_command_line={}".format(json.dumps(self.vafator_header))) + self.vcf.add_to_header( + "##vafator_command_line={}".format(json.dumps(self.vafator_header)) + ) # adds to the header all the names of the annotations - self.vcf.add_info_to_header({'ID': self.multiallelic_annotation_name, 'Type': 'String', 'Number': '.', - 'Description': "Indicates multiallelic variants filtered and their frequencies if any (e.g.: T,0.12)"}) + self.vcf.add_info_to_header( + { + "ID": self.multiallelic_annotation_name, + "Type": "String", + "Number": ".", + "Description": "Indicates multiallelic variants filtered and their frequencies if any (e.g.: T,0.12)", + } + ) self.vcf_writer = Writer(output_vcf, self.vcf) + self.random = random.Random(42) def run(self): batch = [] @@ -41,8 +50,12 @@ def run(self): prev_variant = variant continue # considers only SNVs with same chromosome, position and reference - if variant.is_snp and variant.CHROM == prev_variant.CHROM and variant.POS == prev_variant.POS and \ - variant.REF == prev_variant.REF: + if ( + variant.is_snp + and variant.CHROM == prev_variant.CHROM + and variant.POS == prev_variant.POS + and variant.REF == prev_variant.REF + ): af1 = self.get_tumor_af(prev_variant) af2 = self.get_tumor_af(variant) # keeps the variant with the highest AF @@ -55,8 +68,10 @@ def run(self): # chooses a variant at random alt1 = variant.ALT[0] alt2 = prev_variant.ALT[0] - prev_variant = random.sample([variant, prev_variant], k=1)[0] - self.set_multiallelic_annotation(prev_variant, alt1 if alt1 != prev_variant.ALT[0] else alt2, af1) + prev_variant = self.random.sample([variant, prev_variant], k=1)[0] + self.set_multiallelic_annotation( + prev_variant, alt1 if alt1 != prev_variant.ALT[0] else alt2, af1 + ) continue # write previous variant and stores current for next iteration @@ -74,8 +89,10 @@ def run(self): self.vcf.close() def set_multiallelic_annotation(self, variant, alt, af): - variant.INFO[self.multiallelic_annotation_name] = \ - ",".join(variant.INFO.get(self.multiallelic_annotation_name, "").split(",") + [alt, str(af)]) + variant.INFO[self.multiallelic_annotation_name] = ",".join( + variant.INFO.get(self.multiallelic_annotation_name, "").split(",") + + [alt, str(af)] + ) def get_tumor_af(self, prev_variant): return prev_variant.INFO.get("{}_af".format(self.tumor_sample_name), 0.0) diff --git a/vafator/pileup_utils.py b/vafator/pileup_utils.py new file mode 100755 index 0000000..22f72a6 --- /dev/null +++ b/vafator/pileup_utils.py @@ -0,0 +1,119 @@ +from collections import Counter +from dataclasses import dataclass +from math import nan +from typing import List, Dict + +import numpy as np + + +@dataclass +class VariantRecord: + """Lightweight, picklable variant representation used by pileup workers. + Mirrors the cyvcf2.Variant fields accessed by pileup and metrics functions. + + Attributes: + CHROM: chromosome name + POS: 1-based variant position + REF: reference allele + ALT: list of alternate alleles + """ + + CHROM: str + POS: int + REF: str + ALT: List[str] + + +@dataclass +class CoverageMetrics: + """Pileup metrics computed for a single variant in a single BAM. + + Attributes: + ac: allele counts per base, including the reference + dp: total depth of coverage + bqs: median base call quality per allele, including the reference + mqs: median mapping quality per allele, including the reference + positions: median read position per allele, including the reference + all_bqs: full base call quality distribution per allele + all_mqs: full mapping quality distribution per allele + all_positions: full read position distribution per allele + """ + + ac: dict + dp: int + bqs: dict = None + mqs: dict = None + positions: dict = None + all_bqs: dict = None + all_mqs: dict = None + all_positions: dict = None + + +EMPTY_METRICS = CoverageMetrics( + ac=Counter(), + dp=0, + bqs=Counter(), + mqs=Counter(), + positions=Counter(), + all_bqs={}, + all_mqs={}, + all_positions={}, +) + + +def aggregate_list_per_base(bases: List[str], values: list) -> Dict[str, list]: + """Group a list of values by their corresponding base. + + Args: + bases: list of base characters (e.g. ['A', 'T', 'A', '']) + values: list of numeric values parallel to bases + + Returns: + dict mapping each base to a list of its associated values + """ + aggregated_values = {} + for b, v in zip(bases, values): + if b not in aggregated_values: + aggregated_values[b] = [] + aggregated_values[b].append(v) + return aggregated_values + + +def safe_median(values: list) -> float: + """Return the median of a list, or nan if the list is empty. + Avoids numpy RuntimeWarning raised by np.median on empty arrays. + + Args: + values: list of numeric values + + Returns: + median as float, or nan if values is empty + """ + return float(np.median(values)) if values else nan + + +def is_snp(variant) -> bool: + """Return True if the variant is a single nucleotide polymorphism. + + Args: + variant: any object with REF and ALT attributes (Variant or VariantRecord) + """ + return len(variant.REF) == 1 and len(variant.ALT[0]) == 1 + + +def is_insertion(variant) -> bool: + """Return True if the variant is an insertion. + + Args: + variant: any object with REF and ALT attributes (Variant or VariantRecord) + """ + return len(variant.REF) == 1 and len(variant.ALT[0]) > 1 + + +def is_deletion(variant) -> bool: + """Return True if the variant is a deletion. + + Args: + variant: any object with REF and ALT attributes (Variant or VariantRecord) + """ + return len(variant.ALT[0]) == 1 and len(variant.REF) > 1 diff --git a/vafator/pileups.py b/vafator/pileups.py index 65160a4..47ec377 100755 --- a/vafator/pileups.py +++ b/vafator/pileups.py @@ -1,227 +1,362 @@ -from collections import Counter -from dataclasses import dataclass -from typing import Union +from collections import Counter, defaultdict +from typing import Union, List, Dict, Iterator, Tuple + from cyvcf2 import Variant from pysam.libcalignmentfile import IteratorColumnRegion, AlignmentFile -from vafator import AMBIGUOUS_BASES -from vafator.tests.utils import VafatorVariant -import numpy as np +from vafator.constants import AMBIGUOUS_BASES +from vafator.pileup_utils import ( + VariantRecord, + CoverageMetrics, + safe_median, + aggregate_list_per_base, + is_snp, + is_insertion, + is_deletion, +) -def is_snp(variant: Variant): - return len(variant.REF) == 1 and len(variant.ALT[0]) == 1 +def get_variant_pileup( + variant: Union[Variant, VariantRecord], + bam: AlignmentFile, + min_base_quality: int, + min_mapping_quality: int, +) -> IteratorColumnRegion: + """Open a pileup iterator at a single variant position. + Kept for backwards compatibility and use in tests. + Args: + variant: variant to query (cyvcf2 Variant or VariantRecord) + bam: open pysam AlignmentFile + min_base_quality: minimum base call quality; bases below this are excluded + min_mapping_quality: minimum mapping quality; reads below this are excluded -def is_insertion(variant: Variant): - return len(variant.REF) == 1 and len(variant.ALT[0]) > 1 + Returns: + pysam IteratorColumnRegion over the variant position + """ + position = variant.POS + return bam.pileup( + contig=variant.CHROM, + start=position - 1, + stop=position, + truncate=True, + max_depth=1000000, + min_base_quality=min_base_quality, + min_mapping_quality=min_mapping_quality, + stepper="samtools", + ) -def is_deletion(variant: Variant): - return len(variant.ALT[0]) == 1 and len(variant.REF) > 1 +def get_region_pileup( + chrom: str, + start: int, + end: int, + bam: AlignmentFile, + min_base_quality: int, + min_mapping_quality: int, +): + """Open a single pileup iterator spanning a genomic region. + Args: + chrom: chromosome name + start: 0-based inclusive start position + end: 1-based exclusive end position (last variant POS) + bam: open pysam AlignmentFile + min_base_quality: minimum base call quality; bases below this are excluded + min_mapping_quality: minimum mapping quality; reads below this are excluded -def get_variant_pileup( - variant: Union[Variant, VafatorVariant], bam: AlignmentFile, min_base_quality, min_mapping_quality) -> IteratorColumnRegion: - position = variant.POS - # this function returns the pileups at all positions covered by reads covered the queried position - # approximately +- read size bp - return bam.pileup(contig=variant.CHROM, start=position - 1, stop=position, - truncate=True, # returns only this column - max_depth=0, # disables maximum depth - min_base_quality=min_base_quality, - min_mapping_quality=min_mapping_quality) - - -@dataclass -class CoverageMetrics: - # number supporting reads of each base, including the reference - ac: dict - # total depth of coverage - dp: int - # median base call quality of each base, including the reference - bqs: dict = None - # median mapping quality of each alternate base, including the reference - mqs: dict = None - # median position within the read of each alternate base, including the reference - positions: dict = None - # base call quality distribution of each base, including the reference - all_bqs: dict = None - # mapping quality distribution of each base, including the reference - all_mqs: dict = None - # position within the read distribution of each base, including the reference - all_positions: dict = None - - -def get_metrics(variant: Variant, pileups: IteratorColumnRegion, include_ambiguous_bases=False) -> CoverageMetrics: + Returns: + pysam pileup iterator over the region + """ + return bam.pileup( + contig=chrom, + start=start, + stop=end, + truncate=True, + max_depth=1000000, + min_base_quality=min_base_quality, + min_mapping_quality=min_mapping_quality, + stepper="samtools", + ) + + +def stream_variants_by_chrom(vcf) -> Iterator[Tuple[str, List[Variant]]]: + """Yield variants grouped by chromosome, one chromosome at a time. + + Args: + vcf: open cyvcf2 VCF iterator + + Returns: + iterator of (chrom, [variants]) tuples + """ + current_chrom = None + current_variants = [] + for variant in vcf: + if variant.CHROM != current_chrom: + if current_variants: + yield current_chrom, current_variants + current_chrom = variant.CHROM + current_variants = [variant] + else: + current_variants.append(variant) + if current_variants: + yield current_chrom, current_variants + + +def collect_metrics_for_chrom( + chrom: str, + variants: List[Variant], + bam: AlignmentFile, + min_base_quality: int, + min_mapping_quality: int, + include_ambiguous_bases: bool = False, +) -> Dict[Tuple, CoverageMetrics]: + """Compute pileup metrics for all variants on a chromosome using a single pileup iterator. + + Metrics are computed immediately for each pileup column while it is still valid — + avoids segfaults from storing PileupColumn C objects after the iterator advances. + + Args: + chrom: chromosome name + variants: list of variants on this chromosome (must be sorted by position) + bam: open pysam AlignmentFile + min_base_quality: minimum base call quality threshold + min_mapping_quality: minimum mapping quality threshold + include_ambiguous_bases: if True, ambiguous bases are counted in depth + + Returns: + {(pos, REF, ALT[0]): CoverageMetrics} for each variant with read support + """ + if not variants: + return {} + + variants_by_pos: Dict[int, List] = defaultdict(list) + for v in variants: + variants_by_pos[v.POS].append(v) + + start = variants[0].POS - 1 # 0-based inclusive + end = variants[-1].POS # exclusive end for pysam + results: Dict[Tuple, CoverageMetrics] = {} + + for pileup_col in get_region_pileup( + chrom=chrom, + start=start, + end=end, + bam=bam, + min_base_quality=min_base_quality, + min_mapping_quality=min_mapping_quality, + ): + ref_pos = pileup_col.reference_pos + 1 # convert to 1-based + if ref_pos not in variants_by_pos: + continue + for variant in variants_by_pos[ref_pos]: + metrics = _get_metrics_from_column( + variant, pileup_col, include_ambiguous_bases + ) + if metrics is not None: + results[(ref_pos, variant.REF, variant.ALT[0])] = metrics + + return results + + +def _get_metrics_from_column( + variant, pileup_col, include_ambiguous_bases: bool = True +) -> CoverageMetrics: + """Dispatch pileup metrics computation based on variant type. + + Args: + variant: Variant or VariantRecord being evaluated + pileup_col: pysam PileupColumn at the variant position + include_ambiguous_bases: if True, ambiguous bases are counted in depth + + Returns: + CoverageMetrics for this variant, or None if the variant type is not supported + """ if is_snp(variant): - return get_snv_metrics(pileups, include_ambiguous_bases) + return _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases) elif is_insertion(variant): - return get_insertion_metrics(variant, pileups) + return _get_insertion_metrics_from_column(variant, pileup_col) elif is_deletion(variant): - return get_deletion_metrics(variant, pileups) + return _get_deletion_metrics_from_column(variant, pileup_col) return None -def get_insertion_metrics(variant: Variant, pileups: IteratorColumnRegion) -> CoverageMetrics: +def _get_snv_metrics_from_column( + pileup_col, include_ambiguous_bases: bool = True +) -> CoverageMetrics: + """Compute SNV metrics from a pileup column. + Deletions at the position are represented as empty string bases and included in depth. + + Args: + pileup_col: pysam PileupColumn at the SNV position + include_ambiguous_bases: if True, IUPAC ambiguous bases are counted in depth + + Returns: + CoverageMetrics with ac, dp, bqs, mqs, positions and their distributions + """ + bases, qualities, mapping_qualities, query_positions = [], [], [], [] + + for read in pileup_col.pileups: + if read.is_refskip: + continue + if read.is_del: + bases.append("") + qualities.append(0) + mapping_qualities.append(read.alignment.mapping_quality) + query_positions.append(read.query_position_or_next) + else: + base = read.alignment.query_sequence[read.query_position].upper() + bases.append(base) + qualities.append(read.alignment.query_qualities[read.query_position]) + mapping_qualities.append(read.alignment.mapping_quality) + query_positions.append(read.query_position) + + all_bqs = aggregate_list_per_base(bases, qualities) + all_mqs = aggregate_list_per_base(bases, mapping_qualities) + all_positions = aggregate_list_per_base(bases, query_positions) + + bqs = Counter({b: safe_median(l) for b, l in all_bqs.items()}) + mqs = Counter({b: safe_median(l) for b, l in all_mqs.items()}) + positions = Counter({b: safe_median(l) for b, l in all_positions.items()}) + + ac = Counter(b for b in bases if b != "") + + if include_ambiguous_bases: + dp = len(bases) + else: + dp = sum(1 for b in bases if b == "" or b not in AMBIGUOUS_BASES) + + return CoverageMetrics( + ac=ac, + dp=dp, + bqs=bqs, + mqs=mqs, + positions=positions, + all_bqs=all_bqs, + all_mqs=all_mqs, + all_positions=all_positions, + ) + + +def _get_insertion_metrics_from_column(variant, pileup_col) -> CoverageMetrics: + """Compute insertion metrics from a pileup column. + Checks both insertion length and sequence to count supporting reads. + Reads with no indel at this position are counted as reference support. + + Args: + variant: Variant or VariantRecord with REF and ALT defining the insertion + pileup_col: pysam PileupColumn at the insertion position + + Returns: + CoverageMetrics with ac, dp, mqs, positions (bqs is empty for indels) + """ ac = {alt.upper(): 0 for alt in variant.ALT} mq = {alt.upper(): [] for alt in variant.ALT} mq[variant.REF] = [] pos = {alt.upper(): [] for alt in variant.ALT} pos[variant.REF] = [] - dp = 0 variant_position = variant.POS insertion_length = len(variant.ALT[0]) - len(variant.REF) insertion = variant.ALT[0][1:] alt_upper = variant.ALT[0].upper() - try: - pileups = next(pileups).pileups - dp += len(pileups) - for pileup_read in pileups: - if pileup_read.indel > 0: - # read with an insertion - index = pileup_read.alignment.reference_start - relative_position = 0 - for cigar_type, cigar_length in pileup_read.alignment.cigartuples: - if cigar_type in [0, 2, 3, 7, 8]: # consumes reference M, D, N, =, X - index += cigar_length - if index > variant_position: - break - if cigar_type in [0, 1, 4, 7, 8]: # consumes query M, I, S, =, X - relative_position += cigar_length - if cigar_type == 1: # does not count I - insertion_in_query = pileup_read.alignment.query[relative_position : relative_position + insertion_length] - if index == variant_position and cigar_length == insertion_length and insertion == insertion_in_query: - # the read contains the insertion - ac[alt_upper] = ac[alt_upper] + 1 - mq[alt_upper].append(pileup_read.alignment.mapping_quality) - pos[alt_upper].append(pileup_read.query_position_or_next) - elif pileup_read.indel == 0: - # NOTE: considers all reads without indels to be the reference! - mq[variant.REF].append(pileup_read.alignment.mapping_quality) - pos[variant.REF].append(pileup_read.query_position_or_next) - - except StopIteration: - # no reads - pass + + pileup_reads = pileup_col.pileups + dp = len(pileup_reads) + + for pileup_read in pileup_reads: + if pileup_read.indel > 0: + # read with an insertion + index = pileup_read.alignment.reference_start + relative_position = 0 + for cigar_type, cigar_length in pileup_read.alignment.cigartuples: + if cigar_type in [0, 2, 3, 7, 8]: # consumes reference M, D, N, =, X + index += cigar_length + if index > variant_position: + break + if cigar_type in [0, 1, 4, 7, 8]: # consumes query M, I, S, =, X + relative_position += cigar_length + if cigar_type == 1: # does not count I + insertion_in_query = pileup_read.alignment.query[ + relative_position : relative_position + insertion_length + ] + if ( + index == variant_position + and cigar_length == insertion_length + and insertion == insertion_in_query + ): + # the read contains the insertion + ac[alt_upper] += 1 + mq[alt_upper].append(pileup_read.alignment.mapping_quality) + pos[alt_upper].append(pileup_read.query_position_or_next) + elif pileup_read.indel == 0: + # NOTE: considers all reads without indels to be the reference! + mq[variant.REF].append(pileup_read.alignment.mapping_quality) + pos[variant.REF].append(pileup_read.query_position_or_next) + return CoverageMetrics( - ac=Counter(ac), dp=dp, - mqs=Counter({k: np.median(l) for k, l in mq.items()}), - positions=Counter({k: np.median(l) for k, l in pos.items()}), + ac=Counter(ac), + dp=dp, + mqs=Counter({k: safe_median(l) for k, l in mq.items()}), + positions=Counter({k: safe_median(l) for k, l in pos.items()}), bqs=Counter(), all_mqs={k: l for k, l in mq.items()}, all_positions={k: l for k, l in pos.items()}, - all_bqs=Counter() + all_bqs=Counter(), ) -def get_deletion_metrics(variant: Variant, pileups: IteratorColumnRegion) -> CoverageMetrics: +def _get_deletion_metrics_from_column(variant, pileup_col) -> CoverageMetrics: + """Compute deletion metrics from a pileup column. + Checks both deletion length and position via CIGAR traversal to count supporting reads. + Reads with no indel at this position are counted as reference support. + + Args: + variant: Variant or VariantRecord with REF and ALT defining the deletion + pileup_col: pysam PileupColumn at the deletion position + + Returns: + CoverageMetrics with ac, dp, mqs, positions (bqs is empty for indels) + """ ac = {alt.upper(): 0 for alt in variant.ALT} mq = {alt.upper(): [] for alt in variant.ALT} mq[variant.REF] = [] pos = {alt.upper(): [] for alt in variant.ALT} pos[variant.REF] = [] - dp = 0 variant_position = variant.POS deletion_length = len(variant.REF) - len(variant.ALT[0]) alt_upper = variant.ALT[0].upper() - try: - pileups = next(pileups).pileups - dp += len(pileups) - for pileup_read in pileups: - if pileup_read.indel < 0: - # read with a deletion - start = pileup_read.alignment.reference_start - match = False - for cigar_type, cigar_length in pileup_read.alignment.cigartuples: - if cigar_type in [0, 3, 7, 8]: # consumes reference M, N, =, X - start += cigar_length - elif cigar_type == 2: # D - if start == variant_position and cigar_length == deletion_length: - ac[alt_upper] = ac[alt_upper] + 1 - mq[alt_upper].append(pileup_read.alignment.mapping_quality) - pos[alt_upper].append(pileup_read.query_position_or_next) - match = True - break - else: - start += cigar_length - if start > variant_position: - break - if not match: - # TODO: when finds a read with an indel not matching our particular indel it counts it - pass - elif pileup_read.indel == 0: - # NOTE: considers all reads without indels to be the reference! - mq[variant.REF].append(pileup_read.alignment.mapping_quality) - pos[variant.REF].append(pileup_read.query_position_or_next) - except StopIteration: - # no reads - pass - return CoverageMetrics( - ac=Counter(ac), dp=dp, - mqs=Counter({k: np.median(l) for k, l in mq.items()}), - positions=Counter({k: np.median(l) for k, l in pos.items()}), - bqs=Counter(), - all_mqs={k: l for k, l in mq.items()}, - all_positions={k: l for k, l in pos.items()}, - all_bqs=Counter() - ) - - -def get_snv_metrics(pileups: IteratorColumnRegion, include_ambiguous_bases=False) -> CoverageMetrics: - try: - pileup = next(pileups) - bases = [s.upper() for s in pileup.get_query_sequences()] - bqs = aggregate_median_per_base(bases, pileup.get_query_qualities()) - mqs = aggregate_median_per_base(bases, pileup.get_mapping_qualities()) - positions = aggregate_median_per_base(bases, pileup.get_query_positions()) - all_bqs = aggregate_list_per_base(bases, pileup.get_query_qualities()) - all_mqs = aggregate_list_per_base(bases, pileup.get_mapping_qualities()) - all_positions = aggregate_list_per_base(bases, pileup.get_query_positions()) + pileup_reads = pileup_col.pileups + dp = len(pileup_reads) - if include_ambiguous_bases: - dp = len([b for b in bases if b!= ""]) - else: - # remove ambiguous bases and reads where the position is spliced out - dp = len([b for b in bases if b not in AMBIGUOUS_BASES and b != ""]) - ac = Counter(bases) - except StopIteration: - # no reads - dp = 0 - ac = Counter() - bqs = Counter() - mqs = Counter() - positions = Counter() - all_bqs = {} - all_mqs = {} - all_positions = {} + for pileup_read in pileup_reads: + if pileup_read.indel < 0: + start = pileup_read.alignment.reference_start + for cigar_type, cigar_length in pileup_read.alignment.cigartuples: + if cigar_type in [0, 3, 7, 8]: # consumes reference M, N, =, X + start += cigar_length + elif cigar_type == 2: + if start == variant_position and cigar_length == deletion_length: + ac[alt_upper] += 1 + mq[alt_upper].append(pileup_read.alignment.mapping_quality) + pos[alt_upper].append(pileup_read.query_position_or_next) + break + else: + start += cigar_length + if start > variant_position: + break + elif pileup_read.indel == 0: + # NOTE: considers all reads without indels to be the reference! + mq[variant.REF].append(pileup_read.alignment.mapping_quality) + pos[variant.REF].append(pileup_read.query_position_or_next) return CoverageMetrics( - ac=ac, + ac=Counter(ac), dp=dp, - bqs=bqs, - mqs=mqs, - positions=positions, - all_bqs=all_bqs, - all_mqs=all_mqs, - all_positions=all_positions + mqs=Counter({k: safe_median(l) for k, l in mq.items()}), + positions=Counter({k: safe_median(l) for k, l in pos.items()}), + bqs=Counter(), + all_mqs={k: l for k, l in mq.items()}, + all_positions={k: l for k, l in pos.items()}, + all_bqs=Counter(), ) - - -def aggregate_median_per_base(bases, values) -> Counter: - aggregated_values = {} - for b, v in zip(bases, values): - if b not in aggregated_values: - aggregated_values[b] = [] - aggregated_values[b].append(v) - return Counter({b: np.median(bq_list) for b, bq_list in aggregated_values.items()}) - - -def aggregate_list_per_base(bases, values) -> dict: - aggregated_values = {} - for b, v in zip(bases, values): - if b not in aggregated_values: - aggregated_values[b] = [] - aggregated_values[b].append(v) - return aggregated_values diff --git a/vafator/ploidies.py b/vafator/ploidies.py index b9f9099..31afc6f 100755 --- a/vafator/ploidies.py +++ b/vafator/ploidies.py @@ -3,32 +3,47 @@ import pandas as pd from cyvcf2 import Variant -from vafator.tests.utils import VafatorVariant +from vafator.pileups import VariantRecord DEFAULT_PLOIDY = 2.0 class PloidyManager: - def __init__(self, local_copy_numbers: str = None, genome_wide_ploidy: float = DEFAULT_PLOIDY): + def __init__( + self, local_copy_numbers: str = None, genome_wide_ploidy: float = DEFAULT_PLOIDY + ): if local_copy_numbers is not None and not os.path.exists(local_copy_numbers): - raise ValueError('The provided tumor ploidy is neither a copy number value or a BED file with copy ' - 'numbers') - self.report_value = local_copy_numbers if local_copy_numbers else genome_wide_ploidy - self.bed = pd.read_csv(local_copy_numbers, sep='\t', names=['chromosome', 'start', 'end', 'copy_number']) \ - if local_copy_numbers is not None else None + raise ValueError( + "The provided tumor ploidy is neither a copy number value or a BED file with copy " + "numbers" + ) + self.report_value = ( + local_copy_numbers if local_copy_numbers else genome_wide_ploidy + ) + self.bed = ( + pd.read_csv( + local_copy_numbers, + sep="\t", + names=["chromosome", "start", "end", "copy_number"], + ) + if local_copy_numbers is not None + else None + ) self.ploidy = genome_wide_ploidy - def get_ploidy(self, variant: Union[Variant, VafatorVariant]) -> float: + def get_ploidy(self, variant: Union[Variant, VariantRecord]) -> float: result = self.ploidy if self.bed is not None: # read from the BED file # NOTE: converts variant position from 1-based into 0-based and considers intervals as half-closed - hits = self.bed[(self.bed.chromosome == variant.CHROM) & - (self.bed.start <= variant.POS - 1) & - (self.bed.end > variant.POS - 1)] + hits = self.bed[ + (self.bed.chromosome == variant.CHROM) + & (self.bed.start <= variant.POS - 1) + & (self.bed.end > variant.POS - 1) + ] if hits.shape[0] > 0: result = float(hits.copy_number.iloc[0]) diff --git a/vafator/power.py b/vafator/power.py index 2430e31..e051e24 100644 --- a/vafator/power.py +++ b/vafator/power.py @@ -7,19 +7,20 @@ DEFAULT_PURITY = 1.0 DEFAULT_NORMAL_PLOIDY = 2 -DEFAULT_FPR = 5*(10**-7) +DEFAULT_FPR = 5 * (10**-7) DEFAULT_ERROR_RATE = 10**-3 class PowerCalculator: def __init__( - self, - tumor_ploidies: dict, - purities: dict, - normal_ploidy: int = DEFAULT_NORMAL_PLOIDY, - fpr: float = DEFAULT_FPR, - error_rate: float = DEFAULT_ERROR_RATE): + self, + tumor_ploidies: dict, + purities: dict, + normal_ploidy: int = DEFAULT_NORMAL_PLOIDY, + fpr: float = DEFAULT_FPR, + error_rate: float = DEFAULT_ERROR_RATE, + ): self.normal_ploidy = normal_ploidy self.purities = purities @@ -27,7 +28,17 @@ def __init__( self.fpr = fpr self.error_rate = error_rate - def calculate_power(self, dp: int, ac: int, sample: str, variant: Optional[Variant]) -> float: + # cache for _calculate_k: dp -> k + # dp values repeat heavily across variants so this avoids repeated binom.cdf loops + self._k_cache: dict = {} + + # cache for calculate_expected_vaf: (sample, pos) -> vaf + # when using genome-wide ploidy (most common case) this is the same value for all variants + self._eaf_cache: dict = {} + + def calculate_power( + self, dp: int, ac: int, sample: str, variant: Optional[Variant] + ) -> float: """ Return the binomial probability of observing ac or less supporting reads, given a total coverage dp and a expected VAF tumor purity / 2. @@ -45,10 +56,29 @@ def calculate_expected_vaf(self, sample: str, variant: Optional[Variant]) -> flo In a scenario with purity = 1, tumor CN = 2 and normal CN = 2 => expected VAF = 0.5 """ + # cache key: use variant position for local copy number lookups, + # or just sample name when genome-wide ploidy is used (most common case) + cache_key = ( + sample, + variant.CHROM if variant else None, + variant.POS if variant else None, + ) + if cache_key in self._eaf_cache: + return self._eaf_cache[cache_key] + purity = self.purities.get(sample, DEFAULT_PURITY) - tumor_ploidy = max(1, self.tumor_ploidies.get(sample, default_ploidy_manager).get_ploidy(variant=variant)) - corrected_tumor_ploidy = purity * tumor_ploidy + ((1 - purity) * self.normal_ploidy) + tumor_ploidy = max( + 1, + self.tumor_ploidies.get(sample, default_ploidy_manager).get_ploidy( + variant=variant + ), + ) + corrected_tumor_ploidy = purity * tumor_ploidy + ( + (1 - purity) * self.normal_ploidy + ) expected_vaf = purity / corrected_tumor_ploidy + + self._eaf_cache[cache_key] = expected_vaf return expected_vaf def _calculate_p(self, m: int, n: int) -> float: @@ -74,10 +104,13 @@ def _calculate_d(self, k: int, n: int) -> float: return (self.fpr - p) / (p_1 - p) def _calculate_k(self, dp: int) -> int: + # cache: same dp value appears across thousands of variants + if dp in self._k_cache: + return self._k_cache[dp] k = 1 while self._calculate_p(m=k, n=dp) > self.fpr: k += 1 - + self._k_cache[dp] = k return k def calculate_absolute_power(self, sample, variant, dp: int) -> float: @@ -89,7 +122,10 @@ def calculate_absolute_power(self, sample, variant, dp: int) -> float: k = self._calculate_k(dp=dp) n = dp f = self.calculate_expected_vaf(sample, variant) - power = 1 - binom.cdf(k=k - 1, n=n, p=f) + self._calculate_d(k=k, n=n) * binom(n, f).pmf(k) + # avoid instantiating a frozen binom distribution object — use module-level functions directly + power = ( + 1 + - binom.cdf(k=k - 1, n=n, p=f) + + self._calculate_d(k=k, n=n) * binom.pmf(k=k, n=n, p=f) + ) return round(power, 5), k - - diff --git a/vafator/rank_sum_test.py b/vafator/rank_sum_test.py index 384bd37..2314572 100644 --- a/vafator/rank_sum_test.py +++ b/vafator/rank_sum_test.py @@ -3,7 +3,11 @@ import numpy as np -def calculate_rank_sum_test(alternate_dist: List[int], reference_dist: List[int]) -> Tuple[float, float]: +def calculate_rank_sum_test( + alternate_dist: List[int], reference_dist: List[int] +) -> Tuple[float, float]: + if not alternate_dist or not reference_dist: # skip empty distributions + return np.nan, np.nan stat, pvalue = scipy.stats.ranksums(x=alternate_dist, y=reference_dist) return round(stat, 3), round(pvalue, 5) @@ -14,10 +18,9 @@ def get_rank_sum_tests(distributions: dict, variant): for alt in variant.ALT: stat, pvalue = calculate_rank_sum_test( alternate_dist=distributions.get(alt, []), - reference_dist=distributions.get(variant.REF, [])) + reference_dist=distributions.get(variant.REF, []), + ) if not np.isnan(stat) and not np.isnan(pvalue): stats.append(str(stat)) pvalues.append(str(pvalue)) return pvalues, stats - - diff --git a/vafator/tests/test_annotator.py b/vafator/tests/test_annotator.py index 7bcb716..8172870 100755 --- a/vafator/tests/test_annotator.py +++ b/vafator/tests/test_annotator.py @@ -13,14 +13,43 @@ EXPECTED_ANNOTATIONS = [ - 'af', 'dp', 'ac', 'n', 'pu', 'pw', 'k', 'eaf', 'bq', 'mq', 'pos', 'rsmq', 'rsmq_pv', 'rsbq', 'rsbq_pv', 'rspos', - 'rspos_pv' + "af", + "dp", + "ac", + "n", + "pu", + "pw", + "k", + "eaf", + "bq", + "mq", + "pos", + "rsmq", + "rsmq_pv", + "rsbq", + "rsbq_pv", + "rspos", + "rspos_pv", ] # replicates do not have EAF annotation EXPECTED_ANNOTATIONS_REPLICATES = [ - 'af', 'dp', 'ac', 'n', 'pu', 'pw', 'k', 'bq', 'mq', 'pos', 'rsmq', 'rsmq_pv', 'rsbq', 'rsbq_pv', 'rspos', - 'rspos_pv' + "af", + "dp", + "ac", + "n", + "pu", + "pw", + "k", + "bq", + "mq", + "pos", + "rsmq", + "rsmq_pv", + "rsbq", + "rsbq_pv", + "rspos", + "rspos_pv", ] @@ -28,11 +57,16 @@ class TestAnnotator(TestCase): def test_annotator(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_annotator1_output.vcf") + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test_annotator1_output.vcf" + ) bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam") bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam") annotator = Annotator( - input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam1], "tumor": [bam2]}) + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"normal": [bam1], "tumor": [bam2]}, + ) annotator.run() self.assertTrue(os.path.exists(output_vcf)) @@ -47,11 +81,16 @@ def test_annotator(self): def test_annotator_with_multiple_bams(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_annotator1_output.vcf") + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test_annotator1_output.vcf" + ) bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam") bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam") annotator = Annotator( - input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam1, bam2], "tumor": [bam1, bam2]}) + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"normal": [bam1, bam2], "tumor": [bam1, bam2]}, + ) annotator.run() self.assertTrue(os.path.exists(output_vcf)) @@ -61,19 +100,27 @@ def test_annotator_with_multiple_bams(self): info_annotations = test_utils._get_info_fields(output_vcf) for a in EXPECTED_ANNOTATIONS_REPLICATES: - self.assertTrue("tumor_{}_1".format(a) in info_annotations, - "Missing annotation tumor_{}_1".format(a)) - self.assertTrue("normal_{}_1".format(a) in info_annotations, - "Missing annotation normal_{}_1".format(a)) + self.assertTrue( + "tumor_{}_1".format(a) in info_annotations, + "Missing annotation tumor_{}_1".format(a), + ) + self.assertTrue( + "normal_{}_1".format(a) in info_annotations, + "Missing annotation normal_{}_1".format(a), + ) def test_annotator_with_prefix(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_annotator1_output.vcf") + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test_annotator1_output.vcf" + ) bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam") bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam") annotator = Annotator( - input_vcf=input_file, output_vcf=output_vcf, - input_bams={"RNA_normal": [bam1, bam2], "RNA_tumor": [bam1, bam2]}) + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"RNA_normal": [bam1, bam2], "RNA_tumor": [bam1, bam2]}, + ) annotator.run() self.assertTrue(os.path.exists(output_vcf)) @@ -83,19 +130,29 @@ def test_annotator_with_prefix(self): info_annotations = test_utils._get_info_fields(output_vcf) for a in EXPECTED_ANNOTATIONS_REPLICATES: - self.assertTrue("RNA_tumor_{}_1".format(a) in info_annotations, - "Missing annotation RNA_tumor_{}_1".format(a)) - self.assertTrue("RNA_normal_{}_1".format(a) in info_annotations, - "Missing annotation RNA_normal_{}_1".format(a)) + self.assertTrue( + "RNA_tumor_{}_1".format(a) in info_annotations, + "Missing annotation RNA_tumor_{}_1".format(a), + ) + self.assertTrue( + "RNA_normal_{}_1".format(a) in info_annotations, + "Missing annotation RNA_normal_{}_1".format(a), + ) def test_annotator_with_mnvs(self): - input_file = pkg_resources.resource_filename(__name__, "resources/test_tumor_normal.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_tumor_normal_output.vcf") + input_file = pkg_resources.resource_filename( + __name__, "resources/test_tumor_normal.vcf" + ) + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test_tumor_normal_output.vcf" + ) bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam") bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam") annotator = Annotator( - input_vcf=input_file, output_vcf=output_vcf, - input_bams={"RNA_normal": [bam1, bam2], "RNA_tumor": [bam1, bam2]}) + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"RNA_normal": [bam1, bam2], "RNA_tumor": [bam1, bam2]}, + ) annotator.run() self.assertTrue(os.path.exists(output_vcf)) @@ -105,10 +162,14 @@ def test_annotator_with_mnvs(self): info_annotations = test_utils._get_info_fields(output_vcf) for a in EXPECTED_ANNOTATIONS_REPLICATES: - self.assertTrue("RNA_tumor_{}_1".format(a) in info_annotations, - "Missing annotation RNA_tumor_{}_1".format(a)) - self.assertTrue("RNA_normal_{}_1".format(a) in info_annotations, - "Missing annotation RNA_normal_{}_1".format(a)) + self.assertTrue( + "RNA_tumor_{}_1".format(a) in info_annotations, + "Missing annotation RNA_tumor_{}_1".format(a), + ) + self.assertTrue( + "RNA_normal_{}_1".format(a) in info_annotations, + "Missing annotation RNA_normal_{}_1".format(a), + ) def _get_info_at(self, input_file, chromosome, position, annotation): vcf = VCF(input_file) @@ -122,19 +183,27 @@ def _get_info_at(self, input_file, chromosome, position, annotation): def test_nist(self): input_file = pkg_resources.resource_filename( - __name__, "resources/project.NIST.hc.snps.indels.chr1_1000000_2000000.vcf") + __name__, "resources/project.NIST.hc.snps.indels.chr1_1000000_2000000.vcf" + ) output_vcf = pkg_resources.resource_filename( - __name__, "resources/results/project.NIST.hc.snps.indels.chr1_1000000_2000000.vaf.vcf") + __name__, + "resources/results/project.NIST.hc.snps.indels.chr1_1000000_2000000.vaf.vcf", + ) bam_file = pkg_resources.resource_filename( __name__, - "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam") + "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam", + ) start = time.time() - annotator = Annotator(input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam_file]}) + annotator = Annotator( + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"normal": [bam_file]}, + ) annotator.run() duration = time.time() - start logger.info("Duration {} seconds".format(round(duration, 3))) - self._assert_vafator_vcf(output_vcf, sample_name='normal') + self._assert_vafator_vcf(output_vcf, sample_name="normal") n_variants_input = test_utils._get_count_variants(input_file) n_variants_output = test_utils._get_count_variants(output_vcf) @@ -145,105 +214,125 @@ def test_nist(self): self.assertTrue("normal_ac" in info_annotations) self.assertTrue("normal_dp" in info_annotations) - variant = test_utils._get_mutation_at_position(output_vcf, 'chr1', 1506035) + variant = test_utils._get_mutation_at_position(output_vcf, "chr1", 1506035) self.assertIsNotNone(variant) - self.assertEqual(variant.INFO['normal_ac'], 3) - self.assertEqual(variant.INFO['normal_dp'], 3) - self.assertEqual(variant.INFO['normal_af'], 1.0) - self.assertEqual(variant.INFO['normal_pu'], 1.0) - self.assertEqual(variant.INFO['normal_eaf'], 0.5) - self.assertEqual(variant.INFO['normal_mq'][0], 0) - self.assertEqual(variant.INFO['normal_mq'][1], 60.0) - self.assertEqual(variant.INFO['normal_bq'][0], 0) - self.assertEqual(variant.INFO['normal_bq'][1], 37.0) - self.assertEqual(variant.INFO['normal_pos'][0], 0) - self.assertEqual(variant.INFO['normal_pos'][1], 56.0) - - variant = test_utils._get_mutation_at_position(output_vcf, 'chr1', 1509825) + self.assertEqual(variant.INFO["normal_ac"], 3) + self.assertEqual(variant.INFO["normal_dp"], 3) + self.assertEqual(variant.INFO["normal_af"], 1.0) + self.assertEqual(variant.INFO["normal_pu"], 1.0) + self.assertEqual(variant.INFO["normal_eaf"], 0.5) + self.assertEqual(variant.INFO["normal_mq"][0], 0) + self.assertEqual(variant.INFO["normal_mq"][1], 60.0) + self.assertEqual(variant.INFO["normal_bq"][0], 0) + self.assertEqual(variant.INFO["normal_bq"][1], 37.0) + self.assertEqual(variant.INFO["normal_pos"][0], 0) + self.assertEqual(variant.INFO["normal_pos"][1], 56.0) + + variant = test_utils._get_mutation_at_position(output_vcf, "chr1", 1509825) self.assertIsNotNone(variant) - self.assertEqual(variant.INFO['normal_ac'], 13) - self.assertEqual(variant.INFO['normal_dp'], 20) + self.assertEqual(variant.INFO["normal_ac"], 13) + self.assertEqual(variant.INFO["normal_dp"], 20) # these values are rounded to six digits inside the VCF, not sure why when read the representation is # different... - self.assertEqual(round(variant.INFO['normal_af'], 5), 0.65) - self.assertEqual(round(variant.INFO['normal_pu'], 5), 0.94234) - self.assertEqual(variant.INFO['normal_eaf'], 0.5) - self.assertEqual(variant.INFO['normal_mq'][0], 60.0) - self.assertEqual(variant.INFO['normal_mq'][1], 60.0) - self.assertEqual(variant.INFO['normal_bq'][0], 37.0) - self.assertEqual(variant.INFO['normal_bq'][1], 35.0) - self.assertEqual(variant.INFO['normal_pos'][0], 31.0) - self.assertEqual(variant.INFO['normal_pos'][1], 41.0) + self.assertEqual(round(variant.INFO["normal_af"], 5), 0.65) + self.assertEqual(round(variant.INFO["normal_pu"], 5), 0.94234) + self.assertEqual(variant.INFO["normal_eaf"], 0.5) + self.assertEqual(variant.INFO["normal_mq"][0], 60.0) + self.assertEqual(variant.INFO["normal_mq"][1], 60.0) + self.assertEqual(variant.INFO["normal_bq"][0], 37.0) + self.assertEqual(variant.INFO["normal_bq"][1], 35.0) + self.assertEqual(variant.INFO["normal_pos"][0], 31.0) + self.assertEqual(variant.INFO["normal_pos"][1], 41.0) # this is a deletion - variant = test_utils._get_mutation_at_position(output_vcf, 'chr1', 1323143) + variant = test_utils._get_mutation_at_position(output_vcf, "chr1", 1323143) self.assertIsNotNone(variant) - self.assertEqual(variant.INFO['normal_ac'], 20) - self.assertEqual(variant.INFO['normal_dp'], 21) + self.assertEqual(variant.INFO["normal_ac"], 20) + self.assertEqual(variant.INFO["normal_dp"], 21) # these values are rounded to six digits inside the VCF, not sure why when read the representation is # different... - self.assertEqual(round(variant.INFO['normal_af'], 5), 0.95238) - self.assertEqual(round(variant.INFO['normal_pu'], 5), 1.0) - self.assertEqual(variant.INFO['normal_eaf'], 0.5) - self.assertEqual(variant.INFO['normal_mq'][0], 60.0) - self.assertEqual(variant.INFO['normal_mq'][1], 60.0) - self.assertEqual(variant.INFO['normal_bq'][0], 0.0) - self.assertEqual(variant.INFO['normal_bq'][1], 0.0) - self.assertEqual(variant.INFO['normal_pos'][0], 50.0) - self.assertEqual(variant.INFO['normal_pos'][1], 21.0) + self.assertEqual(round(variant.INFO["normal_af"], 5), 0.95238) + self.assertEqual(round(variant.INFO["normal_pu"], 5), 1.0) + self.assertEqual(variant.INFO["normal_eaf"], 0.5) + self.assertEqual(variant.INFO["normal_mq"][0], 60.0) + self.assertEqual(variant.INFO["normal_mq"][1], 60.0) + self.assertEqual(variant.INFO["normal_bq"][0], 0.0) + self.assertEqual(variant.INFO["normal_bq"][1], 0.0) + self.assertEqual(variant.INFO["normal_pos"][0], 50.0) + self.assertEqual(variant.INFO["normal_pos"][1], 21.0) # this is an insertion - variant = test_utils._get_mutation_at_position(output_vcf, 'chr1', 1935367) + variant = test_utils._get_mutation_at_position(output_vcf, "chr1", 1935367) self.assertIsNotNone(variant) - self.assertEqual(variant.INFO['normal_ac'], 1) - self.assertEqual(variant.INFO['normal_dp'], 2) + self.assertEqual(variant.INFO["normal_ac"], 1) + self.assertEqual(variant.INFO["normal_dp"], 2) # these values are rounded to six digits inside the VCF, not sure why when read the representation is # different... - self.assertEqual(round(variant.INFO['normal_af'], 5), 0.5) - self.assertEqual(round(variant.INFO['normal_pu'], 5), 0.75) - self.assertEqual(variant.INFO['normal_eaf'], 0.5) - self.assertEqual(variant.INFO['normal_mq'][0], 60.0) - self.assertEqual(variant.INFO['normal_mq'][1], 29.0) - self.assertEqual(variant.INFO['normal_bq'][0], 0.0) - self.assertEqual(variant.INFO['normal_bq'][1], 0.0) - self.assertEqual(variant.INFO['normal_pos'][0], 95.0) - self.assertEqual(variant.INFO['normal_pos'][1], 56.0) + self.assertEqual(round(variant.INFO["normal_af"], 5), 0.5) + self.assertEqual(round(variant.INFO["normal_pu"], 5), 0.75) + self.assertEqual(variant.INFO["normal_eaf"], 0.5) + self.assertEqual(variant.INFO["normal_mq"][0], 60.0) + self.assertEqual(variant.INFO["normal_mq"][1], 29.0) + self.assertEqual(variant.INFO["normal_bq"][0], 0.0) + self.assertEqual(variant.INFO["normal_bq"][1], 0.0) + self.assertEqual(variant.INFO["normal_pos"][0], 95.0) + self.assertEqual(variant.INFO["normal_pos"][1], 56.0) def test_nist_with_replicates(self): input_file = pkg_resources.resource_filename( - __name__, "resources/project.NIST.hc.snps.indels.chr1_1000000_2000000.vcf") + __name__, "resources/project.NIST.hc.snps.indels.chr1_1000000_2000000.vcf" + ) output_vcf = pkg_resources.resource_filename( - __name__, "resources/results/project.NIST.hc.snps.indels.chr1_1000000_2000000.vaf_replicates.vcf") + __name__, + "resources/results/project.NIST.hc.snps.indels.chr1_1000000_2000000.vaf_replicates.vcf", + ) bam_file = pkg_resources.resource_filename( __name__, - "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam") + "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam", + ) start = time.time() - annotator = Annotator(input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam_file]}) + annotator = Annotator( + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"normal": [bam_file]}, + ) annotator.run() duration = time.time() - start logger.info("Duration {} seconds".format(round(duration, 3))) - self._assert_vafator_vcf(output_vcf, sample_name='normal') - self._assert_vafator_vcf(output_vcf, sample_name='normal', replicate=1) - self._assert_vafator_vcf(output_vcf, sample_name='normal', replicate=2) + self._assert_vafator_vcf(output_vcf, sample_name="normal") + self._assert_vafator_vcf(output_vcf, sample_name="normal", replicate=1) + self._assert_vafator_vcf(output_vcf, sample_name="normal", replicate=2) def test_annotator_bams_order(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_annotator1_output.vcf") - output_vcf_2 = pkg_resources.resource_filename(__name__, "resources/results/test_annotator2_output.vcf") + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test_annotator1_output.vcf" + ) + output_vcf_2 = pkg_resources.resource_filename( + __name__, "resources/results/test_annotator2_output.vcf" + ) bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam") bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam") - Annotator(input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam1], "tumor": [bam2]}).run() - Annotator(input_vcf=input_file, output_vcf=output_vcf_2, input_bams={"tumor": [bam2], "normal": [bam1]}).run() + Annotator( + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"normal": [bam1], "tumor": [bam2]}, + ).run() + Annotator( + input_vcf=input_file, + output_vcf=output_vcf_2, + input_bams={"tumor": [bam2], "normal": [bam1]}, + ).run() self.assertTrue(os.path.exists(output_vcf)) self.assertTrue(os.path.exists(output_vcf_2)) - self._assert_vafator_vcf(output_vcf, sample_name='normal') - self._assert_vafator_vcf(output_vcf, sample_name='tumor') - self._assert_vafator_vcf(output_vcf_2, sample_name='normal') - self._assert_vafator_vcf(output_vcf_2, sample_name='tumor') + self._assert_vafator_vcf(output_vcf, sample_name="normal") + self._assert_vafator_vcf(output_vcf, sample_name="tumor") + self._assert_vafator_vcf(output_vcf_2, sample_name="normal") + self._assert_vafator_vcf(output_vcf_2, sample_name="tumor") vcf = VCF(output_vcf) vcf_2 = VCF(output_vcf_2) @@ -253,20 +342,31 @@ def test_annotator_bams_order(self): self.assertEqual( v.INFO.get("normal_{}".format(a), ""), v2.INFO.get("normal_{}".format(a), ""), - "Variant {}:{}:{}>{} is missing annotation normal_{}".format(v.CHROM, v.POS, v.REF, v.ALT[0], a)) + "Variant {}:{}:{}>{} is missing annotation normal_{}".format( + v.CHROM, v.POS, v.REF, v.ALT[0], a + ), + ) self.assertEqual( v.INFO.get("tumor_{}".format(a), ""), v2.INFO.get("tumor_{}".format(a), ""), - "Variant {}:{}:{}>{} is missing annotation tumor_{}".format(v.CHROM, v.POS, v.REF, v.ALT[0], a)) + "Variant {}:{}:{}>{} is missing annotation tumor_{}".format( + v.CHROM, v.POS, v.REF, v.ALT[0], a + ), + ) def test_annotator_with_purities(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_annotator1_output.vcf") + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test_annotator1_output.vcf" + ) bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam") bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam") annotator = Annotator( - input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam1], "tumor": [bam2]}, - purities={"tumor": 0.8}, tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.8)} + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"normal": [bam1], "tumor": [bam2]}, + purities={"tumor": 0.8}, + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.8)}, ) annotator.run() @@ -281,8 +381,11 @@ def test_annotator_with_purities(self): self.assertTrue("normal_{}".format(a) in info_annotations) annotator = Annotator( - input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam1], "tumor": [bam2]}, - purities={"tumor": 0.2}, tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=1.5)} + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"normal": [bam1], "tumor": [bam2]}, + purities={"tumor": 0.2}, + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=1.5)}, ) annotator.run() @@ -291,18 +394,69 @@ def _assert_vafator_vcf(self, vcf_filename, sample_name, replicate=None): vcf = VCF(vcf_filename) for v in vcf: # p-values or VAFs - self._assert_probability(v.INFO.get(self._get_annotation_name('rsbq_pv', sample_name, replicate=replicate), 0)) - self._assert_probability(v.INFO.get(self._get_annotation_name('rsmq_pv', sample_name, replicate=replicate), 0)) - self._assert_probability(v.INFO.get(self._get_annotation_name('rspos_pv', sample_name, replicate=replicate), 0)) - self._assert_probability(v.INFO.get(self._get_annotation_name('eaf', sample_name, replicate=replicate), 0)) - self._assert_probability(v.INFO.get(self._get_annotation_name('af', sample_name, replicate=replicate), 0)) + self._assert_probability( + v.INFO.get( + self._get_annotation_name( + "rsbq_pv", sample_name, replicate=replicate + ), + 0, + ) + ) + self._assert_probability( + v.INFO.get( + self._get_annotation_name( + "rsmq_pv", sample_name, replicate=replicate + ), + 0, + ) + ) + self._assert_probability( + v.INFO.get( + self._get_annotation_name( + "rspos_pv", sample_name, replicate=replicate + ), + 0, + ) + ) + self._assert_probability( + v.INFO.get( + self._get_annotation_name("eaf", sample_name, replicate=replicate), + 0, + ) + ) + self._assert_probability( + v.INFO.get( + self._get_annotation_name("af", sample_name, replicate=replicate), 0 + ) + ) # positive integer annotations - self._assert_positive_integer(v.INFO.get(self._get_annotation_name('ac', sample_name, replicate=replicate), 0)) - self._assert_positive_integer(v.INFO.get(self._get_annotation_name('dp', sample_name, replicate=replicate), 0)) - self._assert_positive_integer(v.INFO.get(self._get_annotation_name('mq', sample_name, replicate=replicate), 0)) - self._assert_positive_integer(v.INFO.get(self._get_annotation_name('bq', sample_name, replicate=replicate), 0)) - self._assert_positive_integer(v.INFO.get(self._get_annotation_name('pos', sample_name, replicate=replicate), 0)) + self._assert_positive_integer( + v.INFO.get( + self._get_annotation_name("ac", sample_name, replicate=replicate), 0 + ) + ) + self._assert_positive_integer( + v.INFO.get( + self._get_annotation_name("dp", sample_name, replicate=replicate), 0 + ) + ) + self._assert_positive_integer( + v.INFO.get( + self._get_annotation_name("mq", sample_name, replicate=replicate), 0 + ) + ) + self._assert_positive_integer( + v.INFO.get( + self._get_annotation_name("bq", sample_name, replicate=replicate), 0 + ) + ) + self._assert_positive_integer( + v.INFO.get( + self._get_annotation_name("pos", sample_name, replicate=replicate), + 0, + ) + ) vcf.close() @staticmethod @@ -315,13 +469,25 @@ def _get_annotation_name(annotation_name, sample_name, replicate=None): def _assert_probability(self, annotation): if isinstance(annotation, list) or isinstance(annotation, tuple): for a in annotation: - self.assertTrue(0.0 <= float(a) <= 1.0, "Expected probability has a value of {}".format(a)) + self.assertTrue( + 0.0 <= float(a) <= 1.0, + "Expected probability has a value of {}".format(a), + ) else: - self.assertTrue(0.0 <= float(annotation) <= 1.0, "Expected probability has a value of {}".format(annotation)) + self.assertTrue( + 0.0 <= float(annotation) <= 1.0, + "Expected probability has a value of {}".format(annotation), + ) def _assert_positive_integer(self, annotation): if isinstance(annotation, list) or isinstance(annotation, tuple): for a in annotation: - self.assertTrue(np.isnan(a) or 0.0 <= a, "Expected positive integer has a value of {}".format(a)) + self.assertTrue( + np.isnan(a) or 0.0 <= a, + "Expected positive integer has a value of {}".format(a), + ) else: - self.assertTrue(np.isnan(annotation) or 0.0 <= annotation, "Expected positive integer has a value of {}".format(annotation)) + self.assertTrue( + np.isnan(annotation) or 0.0 <= annotation, + "Expected positive integer has a value of {}".format(annotation), + ) diff --git a/vafator/tests/test_hatchet2bed.py b/vafator/tests/test_hatchet2bed.py index 814651f..9e5ee47 100644 --- a/vafator/tests/test_hatchet2bed.py +++ b/vafator/tests/test_hatchet2bed.py @@ -8,10 +8,24 @@ class Hatchet2bedTest(TestCase): def test_hatchet2bed(self): run_hatchet2bed( - input_file=pkg_resources.resource_filename(__name__, "resources/best.seg.minimal.ucn"), - output_prefix=pkg_resources.resource_filename(__name__, "resources/best.seg.minimal") + input_file=pkg_resources.resource_filename( + __name__, "resources/best.seg.minimal.ucn" + ), + output_prefix=pkg_resources.resource_filename( + __name__, "resources/best.seg.minimal" + ), ) self.assertTrue( - os.path.exists(pkg_resources.resource_filename(__name__, "resources/best.seg.minimal.my_tumor.bed"))) + os.path.exists( + pkg_resources.resource_filename( + __name__, "resources/best.seg.minimal.my_tumor.bed" + ) + ) + ) self.assertTrue( - os.path.exists(pkg_resources.resource_filename(__name__, "resources/best.seg.minimal.my_metastasis.bed"))) + os.path.exists( + pkg_resources.resource_filename( + __name__, "resources/best.seg.minimal.my_metastasis.bed" + ) + ) + ) diff --git a/vafator/tests/test_multiallelic_filter.py b/vafator/tests/test_multiallelic_filter.py index bdc15a0..1bc8f15 100755 --- a/vafator/tests/test_multiallelic_filter.py +++ b/vafator/tests/test_multiallelic_filter.py @@ -14,8 +14,12 @@ def setUp(self): def test_no_variants_filtered(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test1_output.vcf") - multiallelic_filter = MultiallelicFilter(input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name='tumor') + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test1_output.vcf" + ) + multiallelic_filter = MultiallelicFilter( + input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name="tumor" + ) multiallelic_filter.run() self.assertTrue(os.path.exists(output_vcf)) @@ -25,33 +29,53 @@ def test_no_variants_filtered(self): def test_two_variants_filtered(self): input_file = pkg_resources.resource_filename(__name__, "resources/test2.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test2_output.vcf") - multiallelic_filter = MultiallelicFilter(input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name='tumor') + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test2_output.vcf" + ) + multiallelic_filter = MultiallelicFilter( + input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name="tumor" + ) multiallelic_filter.run() self.assertTrue(os.path.exists(output_vcf)) n_variants_input = test_utils._get_count_variants(input_file) n_variants_output = test_utils._get_count_variants(output_vcf) - self.assertTrue(n_variants_input == n_variants_output + 3, "input:{}; output:{}".format( - n_variants_input, n_variants_output)) - - af1 = self._get_info_at(output_vcf, chromosome="chr4", position=1235, annotation='tumor_af') + self.assertTrue( + n_variants_input == n_variants_output + 3, + "input:{}; output:{}".format(n_variants_input, n_variants_output), + ) + + af1 = self._get_info_at( + output_vcf, chromosome="chr4", position=1235, annotation="tumor_af" + ) self.assertTrue(af1, 0.2) - multiallelic1 = self._get_info_at(output_vcf, chromosome="chr4", position=1235, annotation='multiallelic') + multiallelic1 = self._get_info_at( + output_vcf, chromosome="chr4", position=1235, annotation="multiallelic" + ) self.assertTrue(multiallelic1, "T,0.1") - af2 = self._get_info_at(output_vcf, chromosome="chr6", position=1235, annotation='tumor_af') + af2 = self._get_info_at( + output_vcf, chromosome="chr6", position=1235, annotation="tumor_af" + ) self.assertTrue(af2, 0.2) - multiallelic2 = self._get_info_at(output_vcf, chromosome="chr6", position=1235, annotation='multiallelic') + multiallelic2 = self._get_info_at( + output_vcf, chromosome="chr6", position=1235, annotation="multiallelic" + ) self.assertTrue(multiallelic2, "A,0.01") - af3 = self._get_info_at(output_vcf, chromosome="chr6", position=1234, annotation='tumor_af') + af3 = self._get_info_at( + output_vcf, chromosome="chr6", position=1234, annotation="tumor_af" + ) self.assertTrue(af3, 0.5) def test_different_reference_is_kept(self): input_file = pkg_resources.resource_filename(__name__, "resources/test3.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test3_output.vcf") - multiallelic_filter = MultiallelicFilter(input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name='tumor') + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test3_output.vcf" + ) + multiallelic_filter = MultiallelicFilter( + input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name="tumor" + ) multiallelic_filter.run() self.assertTrue(os.path.exists(output_vcf)) @@ -62,8 +86,12 @@ def test_different_reference_is_kept(self): def test_three_multiallelics(self): input_file = pkg_resources.resource_filename(__name__, "resources/test4.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test4_output.vcf") - multiallelic_filter = MultiallelicFilter(input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name='tumor') + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test4_output.vcf" + ) + multiallelic_filter = MultiallelicFilter( + input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name="tumor" + ) multiallelic_filter.run() self.assertTrue(os.path.exists(output_vcf)) @@ -74,8 +102,12 @@ def test_three_multiallelics(self): def test_equal_af(self): input_file = pkg_resources.resource_filename(__name__, "resources/test5.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test5_output.vcf") - multiallelic_filter = MultiallelicFilter(input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name='tumor') + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test5_output.vcf" + ) + multiallelic_filter = MultiallelicFilter( + input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name="tumor" + ) multiallelic_filter.run() self.assertTrue(os.path.exists(output_vcf)) @@ -92,4 +124,4 @@ def _get_info_at(self, input_file, chromosome, position, annotation): vcf.close() return v.INFO.get(annotation) vcf.close() - return {} \ No newline at end of file + return {} diff --git a/vafator/tests/test_pileups.py b/vafator/tests/test_pileups.py index d7b3c76..6ead2cd 100644 --- a/vafator/tests/test_pileups.py +++ b/vafator/tests/test_pileups.py @@ -4,7 +4,10 @@ import pysam from vafator.tests.utils import VafatorVariant -from vafator.pileups import get_variant_pileup, get_metrics +from vafator.pileups import ( + get_variant_pileup, + _get_metrics_from_column, +) class TestPileups(TestCase): @@ -13,58 +16,127 @@ class TestPileups(TestCase): min_mapping_quality = 0 bam_file = pkg_resources.resource_filename( __name__, - "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam") + "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam", + ) bam_reader = pysam.AlignmentFile(bam_file) def test_snv_metrics(self): - variant = VafatorVariant(chromosome="chr1", position=1017341, reference="G", alternative=["T"]) - self._assert_metrics(variant=variant, expected_ac={'G': 5, 'T': 6}, expected_dp=11) - self._assert_metrics(variant=variant, expected_ac={'G': 1, 'T': 2}, expected_dp=3, min_base_quality=40) + variant = VafatorVariant( + chromosome="chr1", position=1017341, reference="G", alternative=["T"] + ) + self._assert_metrics( + variant=variant, expected_ac={"G": 5, "T": 6}, expected_dp=11 + ) + self._assert_metrics( + variant=variant, + expected_ac={"G": 1, "T": 2}, + expected_dp=3, + min_base_quality=40, + ) def test_snv_metrics_2(self): - variant = VafatorVariant(chromosome="chr1", position=1018144, reference="T", alternative=["C"]) - self._assert_metrics(variant=variant, expected_ac={'C': 9, 'T': 11}, expected_dp=20) - self._assert_metrics(variant=variant, expected_ac={'C': 3, 'T': 4}, expected_dp=7, min_base_quality=40) - self._assert_metrics(variant=variant, expected_ac=Counter(), expected_dp=0, min_mapping_quality=65) + variant = VafatorVariant( + chromosome="chr1", position=1018144, reference="T", alternative=["C"] + ) + self._assert_metrics( + variant=variant, expected_ac={"C": 9, "T": 11}, expected_dp=20 + ) + self._assert_metrics( + variant=variant, + expected_ac={"C": 3, "T": 4}, + expected_dp=7, + min_base_quality=40, + ) + self._assert_metrics( + variant=variant, + expected_ac=Counter(), + expected_dp=0, + min_mapping_quality=65, + ) def test_insertion_metrics(self): # variant called in the VCF shows no read support (!?) - variant = VafatorVariant(chromosome="chr1", position=1247578, reference="T", alternative=["TGG"]) - self._assert_metrics(variant=variant, expected_ac={'TGG': 0}, expected_dp=3) + variant = VafatorVariant( + chromosome="chr1", position=1247578, reference="T", alternative=["TGG"] + ) + self._assert_metrics(variant=variant, expected_ac={"TGG": 0}, expected_dp=3) # there is one read supporting this insertion of 3 Gs - variant = VafatorVariant(chromosome="chr1", position=1247578, reference="T", alternative=["TGGG"]) - self._assert_metrics(variant=variant, expected_ac={'TGGG': 1}, expected_dp=3) + variant = VafatorVariant( + chromosome="chr1", position=1247578, reference="T", alternative=["TGGG"] + ) + self._assert_metrics(variant=variant, expected_ac={"TGGG": 1}, expected_dp=3) # this ensures that the insertion sequence is checked not only the insertion length! - variant = VafatorVariant(chromosome="chr1", position=1247578, reference="T", alternative=["TGGA"]) - self._assert_metrics(variant=variant, expected_ac={'TGGA': 0}, expected_dp=3) + variant = VafatorVariant( + chromosome="chr1", position=1247578, reference="T", alternative=["TGGA"] + ) + self._assert_metrics(variant=variant, expected_ac={"TGGA": 0}, expected_dp=3) # there is one read supporting this insertion of 4 Gs - variant = VafatorVariant(chromosome="chr1", position=1247578, reference="T", alternative=["TGGGG"]) - self._assert_metrics(variant=variant, expected_ac={'TGGGG': 1}, expected_dp=3) + variant = VafatorVariant( + chromosome="chr1", position=1247578, reference="T", alternative=["TGGGG"] + ) + self._assert_metrics(variant=variant, expected_ac={"TGGGG": 1}, expected_dp=3) # there is no read supporting an insertion of 5 Gs - variant = VafatorVariant(chromosome="chr1", position=1247578, reference="T", alternative=["TGGGGG"]) - self._assert_metrics(variant=variant, expected_ac={'TGGGGG': 0}, expected_dp=3) + variant = VafatorVariant( + chromosome="chr1", position=1247578, reference="T", alternative=["TGGGGG"] + ) + self._assert_metrics(variant=variant, expected_ac={"TGGGGG": 0}, expected_dp=3) def test_insertion_metrics_2(self): - variant = VafatorVariant(chromosome="chr1", position=1594199, reference="C", alternative=["CT"]) - self._assert_metrics(variant=variant, expected_ac={'CT': 9}, expected_dp=11) - self._assert_metrics(variant=variant, expected_ac={'CT': 5}, expected_dp=7, min_mapping_quality=40) - self._assert_metrics(variant=variant, expected_ac={'CT': 4}, expected_dp=4, min_base_quality=40) + variant = VafatorVariant( + chromosome="chr1", position=1594199, reference="C", alternative=["CT"] + ) + self._assert_metrics(variant=variant, expected_ac={"CT": 9}, expected_dp=11) + self._assert_metrics( + variant=variant, + expected_ac={"CT": 5}, + expected_dp=7, + min_mapping_quality=40, + ) + self._assert_metrics( + variant=variant, expected_ac={"CT": 4}, expected_dp=4, min_base_quality=40 + ) def test_deletion_metrics(self): - variant = VafatorVariant(chromosome="chr1", position=1510035, reference="GGC", alternative=["G"]) - self._assert_metrics(variant=variant, expected_ac={'G': 12}, expected_dp=13) - self._assert_metrics(variant=variant, expected_ac={'G': 0}, expected_dp=0, min_mapping_quality=61) - self._assert_metrics(variant=variant, expected_ac={'G': 3}, expected_dp=3, min_base_quality=40) + variant = VafatorVariant( + chromosome="chr1", position=1510035, reference="GGC", alternative=["G"] + ) + self._assert_metrics(variant=variant, expected_ac={"G": 12}, expected_dp=13) + self._assert_metrics( + variant=variant, expected_ac={"G": 0}, expected_dp=0, min_mapping_quality=61 + ) + self._assert_metrics( + variant=variant, expected_ac={"G": 3}, expected_dp=3, min_base_quality=40 + ) # deletions with a reference sequence not matching the reference would be matched # vafator expects correct indel calls - variant = VafatorVariant(chromosome="chr1", position=1510035, reference="GCC", alternative=["G"]) - self._assert_metrics(variant=variant, expected_ac={'G': 12}, expected_dp=13) + variant = VafatorVariant( + chromosome="chr1", position=1510035, reference="GCC", alternative=["G"] + ) + self._assert_metrics(variant=variant, expected_ac={"G": 12}, expected_dp=13) - def _assert_metrics(self, variant: VafatorVariant, expected_ac, expected_dp, - min_base_quality=0, min_mapping_quality=0): + def _assert_metrics( + self, + variant, + expected_ac, + expected_dp, + min_base_quality=0, + min_mapping_quality=0, + ): pileups = get_variant_pileup( - variant=variant, bam=self.bam_reader, - min_base_quality=min_base_quality, min_mapping_quality=min_mapping_quality) - coverage_metrics = get_metrics(variant=variant, pileups=pileups) - self.assertEqual(expected_ac, coverage_metrics.ac) - self.assertEqual(expected_dp, coverage_metrics.dp) + variant=variant, + bam=self.bam_reader, + min_base_quality=min_base_quality, + min_mapping_quality=min_mapping_quality, + ) + pileup_col = next(iter(pileups), None) + if pileup_col is None: + coverage_metrics = None + else: + coverage_metrics = _get_metrics_from_column( + variant=variant, pileup_col=pileup_col, include_ambiguous_bases=True + ) + if expected_dp == 0: + self.assertIsNone(coverage_metrics) + else: + self.assertEqual(expected_ac, coverage_metrics.ac) + self.assertEqual(expected_dp, coverage_metrics.dp) diff --git a/vafator/tests/test_ploidy_manager.py b/vafator/tests/test_ploidy_manager.py index 5e68640..4906095 100644 --- a/vafator/tests/test_ploidy_manager.py +++ b/vafator/tests/test_ploidy_manager.py @@ -1,5 +1,4 @@ from unittest import TestCase - import pkg_resources from vafator.tests.utils import VafatorVariant @@ -9,33 +8,107 @@ class PloidyManagerTest(TestCase): def test_default_ploidy_manager(self): - self.assertEqual(PloidyManager().get_ploidy( - variant=VafatorVariant(chromosome="chr1", position=12345, reference="A", alternative="C")), 2.0) - self.assertEqual(default_ploidy_manager.get_ploidy( - variant=VafatorVariant(chromosome="chr1", position=12345, reference="A", alternative="C")), 2.0) + self.assertEqual( + PloidyManager().get_ploidy( + variant=VafatorVariant( + chromosome="chr1", position=12345, reference="A", alternative="C" + ) + ), + 2.0, + ) + self.assertEqual( + default_ploidy_manager.get_ploidy( + variant=VafatorVariant( + chromosome="chr1", position=12345, reference="A", alternative="C" + ) + ), + 2.0, + ) self.assertEqual(default_ploidy_manager.get_ploidy(variant=None), 2.0) def test_genome_wide_ploidy_manager(self): - self.assertEqual(PloidyManager(genome_wide_ploidy=3.2).get_ploidy( - variant=VafatorVariant(chromosome="chr1", position=12345, reference="A", alternative="C")), 3.2) - self.assertEqual(PloidyManager(genome_wide_ploidy=3.2).get_ploidy(variant=None), 3.2) + self.assertEqual( + PloidyManager(genome_wide_ploidy=3.2).get_ploidy( + variant=VafatorVariant( + chromosome="chr1", position=12345, reference="A", alternative="C" + ) + ), + 3.2, + ) + self.assertEqual( + PloidyManager(genome_wide_ploidy=3.2).get_ploidy(variant=None), 3.2 + ) def test_local_copy_numbers_ploidy_manager(self): - input_bed = pkg_resources.resource_filename(__name__, "resources/test_copy_numbers.bed") - self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( - variant=VafatorVariant(chromosome="chr1", position=12345, reference="A", alternative="C")), 1.2) - self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( - variant=VafatorVariant(chromosome="chr2", position=12345, reference="A", alternative="C")), 3.2) + input_bed = pkg_resources.resource_filename( + __name__, "resources/test_copy_numbers.bed" + ) + self.assertEqual( + PloidyManager(local_copy_numbers=input_bed).get_ploidy( + variant=VafatorVariant( + chromosome="chr1", position=12345, reference="A", alternative="C" + ) + ), + 1.2, + ) + self.assertEqual( + PloidyManager(local_copy_numbers=input_bed).get_ploidy( + variant=VafatorVariant( + chromosome="chr2", position=12345, reference="A", alternative="C" + ) + ), + 3.2, + ) # test non existing interval - self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( - variant=VafatorVariant(chromosome="chr3", position=12345, reference="A", alternative="C")), 2.0) - # test lower boundary of interval - self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( - variant=VafatorVariant(chromosome="chr1", position=10000, reference="A", alternative="C")), 2.0) - self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( - variant=VafatorVariant(chromosome="chr1", position=10001, reference="A", alternative="C")), 1.2) - # test upper boundary of interval - self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( - variant=VafatorVariant(chromosome="chr1", position=20000, reference="A", alternative="C")), 1.2) - self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( - variant=VafatorVariant(chromosome="chr1", position=20001, reference="A", alternative="C")), 2.1) + self.assertEqual( + PloidyManager(local_copy_numbers=input_bed).get_ploidy( + variant=VafatorVariant( + chromosome="chr3", position=12345, reference="A", alternative="C" + ) + ), + 2.0, + ) + + def test_interval_boundaries(self): + input_bed = pkg_resources.resource_filename( + __name__, "resources/test_copy_numbers.bed" + ) + # lower boundary — POS 10000 is 0-based 9999, outside interval start 10000 + self.assertEqual( + PloidyManager(local_copy_numbers=input_bed).get_ploidy( + variant=VafatorVariant( + chromosome="chr1", position=10000, reference="A", alternative="C" + ) + ), + 2.0, + ) + # just inside lower boundary + self.assertEqual( + PloidyManager(local_copy_numbers=input_bed).get_ploidy( + variant=VafatorVariant( + chromosome="chr1", position=10001, reference="A", alternative="C" + ) + ), + 1.2, + ) + # upper boundary + self.assertEqual( + PloidyManager(local_copy_numbers=input_bed).get_ploidy( + variant=VafatorVariant( + chromosome="chr1", position=20000, reference="A", alternative="C" + ) + ), + 1.2, + ) + self.assertEqual( + PloidyManager(local_copy_numbers=input_bed).get_ploidy( + variant=VafatorVariant( + chromosome="chr1", position=20001, reference="A", alternative="C" + ) + ), + 2.1, + ) + + def test_invalid_bed_raises(self): + with self.assertRaises(ValueError): + PloidyManager(local_copy_numbers="/nonexistent/path.bed") diff --git a/vafator/tests/test_power_calculator.py b/vafator/tests/test_power_calculator.py index a3c4981..c5371f9 100644 --- a/vafator/tests/test_power_calculator.py +++ b/vafator/tests/test_power_calculator.py @@ -10,118 +10,257 @@ class PowerCalculatorTest(TestCase): def test_power_calculator(self): power = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.5)}, purities={'tumor': 0.8}) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=0, sample='tumor', variant=None), 0.01734) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=1, sample='tumor', variant=None), 0.10404917949499565, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=2, sample='tumor', variant=None), 0.29914139104811255, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=3, sample='tumor', variant=None), 0.5592643397856016, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=4, sample='tumor', variant=None), 0.7868719199309048, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=5, sample='tumor', variant=None), 0.9234364680180867, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=6, sample='tumor', variant=None), 0.9803383630544125, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=7, sample='tumor', variant=None), 0.9965960473505056, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=8, sample='tumor', variant=None), 0.999644363156023, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=9, sample='tumor', variant=None), 0.9999830649121916, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=10, sample='tumor', variant=None), 1.0) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=11, sample='tumor', variant=None), 1.0) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.5)}, + purities={"tumor": 0.8}, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=0, sample="tumor", variant=None), 0.01734 + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=1, sample="tumor", variant=None), + 0.10404917949499565, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + 0.29914139104811255, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=3, sample="tumor", variant=None), + 0.5592643397856016, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=4, sample="tumor", variant=None), + 0.7868719199309048, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=5, sample="tumor", variant=None), + 0.9234364680180867, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=6, sample="tumor", variant=None), + 0.9803383630544125, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=7, sample="tumor", variant=None), + 0.9965960473505056, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=8, sample="tumor", variant=None), + 0.999644363156023, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=9, sample="tumor", variant=None), + 0.9999830649121916, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=10, sample="tumor", variant=None), 1.0 + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=11, sample="tumor", variant=None), 1.0 + ) + + def test_zero_dp_returns_zero_power(self): + power = PowerCalculator( + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.0)}, + purities={"tumor": 0.8}, + ) + self.assertEqual( + power.calculate_power(dp=0, ac=0, sample="tumor", variant=None), 1.0 + ) def test_eaf_copy_number_below_one(self): power = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=0.5)}, purities={'tumor': 0.9}) - self.assertLessEqual(power.calculate_expected_vaf(sample='tumor', variant=None), 1.0) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=0.5)}, + purities={"tumor": 0.9}, + ) + self.assertLessEqual( + power.calculate_expected_vaf(sample="tumor", variant=None), 1.0 + ) + + def test_eaf_is_cached(self): + power = PowerCalculator( + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.0)}, + purities={"tumor": 0.8}, + ) + v1 = VafatorVariant( + chromosome="chr1", position=100, reference="A", alternative="G" + ) + eaf1 = power.calculate_expected_vaf(sample="tumor", variant=v1) + eaf2 = power.calculate_expected_vaf(sample="tumor", variant=v1) + self.assertEqual(eaf1, eaf2) + self.assertEqual(len(power._eaf_cache), 1) + + def test_k_is_cached(self): + power = PowerCalculator( + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.0)}, + purities={"tumor": 0.8}, + ) + k1 = power._calculate_k(100) + k2 = power._calculate_k(100) + self.assertEqual(k1, k2) + self.assertEqual(len(power._k_cache), 1) + + def test_higher_coverage_higher_power(self): + power = PowerCalculator( + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.0)}, + purities={"tumor": 0.8}, + ) + p_low, _ = power.calculate_absolute_power(dp=10, sample="tumor", variant=None) + p_high, _ = power.calculate_absolute_power(dp=100, sample="tumor", variant=None) + self.assertLess(p_low, p_high) def test_varying_purity(self): power1 = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.5)}, purities={'tumor': 0.8}) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.5)}, + purities={"tumor": 0.8}, + ) power2 = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.5)}, purities={'tumor': 0.6}) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.5)}, + purities={"tumor": 0.6}, + ) self.assertLess( - power1.calculate_power(dp=10, ac=2, sample='tumor', variant=None), - power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None)) - + power1.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + power2.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + ) power3 = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.5)}, purities={'tumor': 0.4}) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.5)}, + purities={"tumor": 0.4}, + ) self.assertLess( - power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None), - power3.calculate_power(dp=10, ac=2, sample='tumor', variant=None)) + power2.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + power3.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + ) def test_varying_ploidy(self): power1 = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8}) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.0)}, + purities={"tumor": 0.8}, + ) power2 = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=4.0)}, purities={'tumor': 0.8}) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=4.0)}, + purities={"tumor": 0.8}, + ) self.assertLess( - power1.calculate_power(dp=10, ac=2, sample='tumor', variant=None), - power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None)) + power1.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + power2.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + ) power3 = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=6.0)}, purities={'tumor': 0.8}) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=6.0)}, + purities={"tumor": 0.8}, + ) self.assertLess( - power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None), - power3.calculate_power(dp=10, ac=2, sample='tumor', variant=None)) + power2.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + power3.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + ) def test_local_copy_numbers(self): - input_bed = pkg_resources.resource_filename(__name__, "resources/test_copy_numbers.bed") + input_bed = pkg_resources.resource_filename( + __name__, "resources/test_copy_numbers.bed" + ) power = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(local_copy_numbers=input_bed)}, purities={'tumor': 0.8}) + tumor_ploidies={"tumor": PloidyManager(local_copy_numbers=input_bed)}, + purities={"tumor": 0.8}, + ) self.assertNotEqual( - power.calculate_power(dp=10, ac=2, sample='tumor', - variant=VafatorVariant( - chromosome='chr1', position=10001, reference='A', alternative='G')), - power.calculate_power(dp=10, ac=2, sample='tumor', - variant=VafatorVariant( - chromosome='chr1', position=20001, reference='A', alternative='G'))) + power.calculate_power( + dp=10, + ac=2, + sample="tumor", + variant=VafatorVariant( + chromosome="chr1", position=10001, reference="A", alternative="G" + ), + ), + power.calculate_power( + dp=10, + ac=2, + sample="tumor", + variant=VafatorVariant( + chromosome="chr1", position=20001, reference="A", alternative="G" + ), + ), + ) self.assertEqual( - power.calculate_power(dp=10, ac=2, sample='tumor', - variant=VafatorVariant( - chromosome='chr1', position=10001, reference='A', alternative='G')), - power.calculate_power(dp=10, ac=2, sample='tumor', - variant=VafatorVariant( - chromosome='chr1', position=10002, reference='A', alternative='G'))) + power.calculate_power( + dp=10, + ac=2, + sample="tumor", + variant=VafatorVariant( + chromosome="chr1", position=10001, reference="A", alternative="G" + ), + ), + power.calculate_power( + dp=10, + ac=2, + sample="tumor", + variant=VafatorVariant( + chromosome="chr1", position=10002, reference="A", alternative="G" + ), + ), + ) def test_absolute_power_calculator(self): - ploidy_manager = {'tumor': PloidyManager(genome_wide_ploidy=2)} - - calculator = PowerCalculator(tumor_ploidies=ploidy_manager, purities={'tumor': 0.8}) - p, k = calculator.calculate_absolute_power(dp=100, sample='tumor', variant=None) + ploidy_manager = {"tumor": PloidyManager(genome_wide_ploidy=2)} + calculator = PowerCalculator( + tumor_ploidies=ploidy_manager, purities={"tumor": 0.8} + ) + p, k = calculator.calculate_absolute_power(dp=100, sample="tumor", variant=None) self.assertEqual(p, 1.0) self.assertEqual(k, 4) - p, k = calculator.calculate_absolute_power(dp=50, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=50, sample="tumor", variant=None) self.assertEqual(p, 1.0) self.assertEqual(k, 4) - p, k = calculator.calculate_absolute_power(dp=10, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=10, sample="tumor", variant=None) self.assertEqual(p, 0.85408) self.assertEqual(k, 3) - p, k = calculator.calculate_absolute_power(dp=2, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=2, sample="tumor", variant=None) self.assertEqual(p, 0.16009) self.assertEqual(k, 2) - calculator = PowerCalculator(tumor_ploidies=ploidy_manager, purities={'tumor': 0.6}) - p, k = calculator.calculate_absolute_power( - dp=100, sample='tumor', variant=None) + calculator = PowerCalculator( + tumor_ploidies=ploidy_manager, purities={"tumor": 0.6} + ) + p, k = calculator.calculate_absolute_power(dp=100, sample="tumor", variant=None) self.assertEqual(p, 1.0) self.assertEqual(k, 4) - p, k = calculator.calculate_absolute_power(dp=50, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=50, sample="tumor", variant=None) self.assertEqual(p, 1.00007) self.assertEqual(k, 4) - p, k = calculator.calculate_absolute_power(dp=10, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=10, sample="tumor", variant=None) self.assertEqual(p, 0.64373) self.assertEqual(k, 3) - p, k = calculator.calculate_absolute_power(dp=2, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=2, sample="tumor", variant=None) self.assertEqual(p, 0.09005) self.assertEqual(k, 2) - calculator = PowerCalculator(tumor_ploidies=ploidy_manager, purities={'tumor': 0.1}) - p, k = calculator.calculate_absolute_power( - dp=100, sample='tumor', variant=None) + calculator = PowerCalculator( + tumor_ploidies=ploidy_manager, purities={"tumor": 0.1} + ) + p, k = calculator.calculate_absolute_power(dp=100, sample="tumor", variant=None) self.assertEqual(p, 0.75607) self.assertEqual(k, 4) - p, k = calculator.calculate_absolute_power(dp=50, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=50, sample="tumor", variant=None) self.assertEqual(p, 0.33419) self.assertEqual(k, 4) - p, k = calculator.calculate_absolute_power(dp=10, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=10, sample="tumor", variant=None) self.assertEqual(p, 0.01254) self.assertEqual(k, 3) - p, k = calculator.calculate_absolute_power(dp=2, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=2, sample="tumor", variant=None) self.assertEqual(p, 0.0025) self.assertEqual(k, 2) + + def test_default_purity_is_one(self): + power = PowerCalculator(tumor_ploidies={}, purities={}) + eaf = power.calculate_expected_vaf(sample="tumor", variant=None) + # purity=1, normal_ploidy=2, tumor_ploidy=2 => eaf = 1/(1*2 + 0*2) = 0.5 + self.assertAlmostEqual(eaf, 0.5) diff --git a/vafator/tests/test_rank_sum_test.py b/vafator/tests/test_rank_sum_test.py index 66f4f65..870cdc5 100644 --- a/vafator/tests/test_rank_sum_test.py +++ b/vafator/tests/test_rank_sum_test.py @@ -1,5 +1,7 @@ +from math import isnan from unittest import TestCase -from vafator.rank_sum_test import calculate_rank_sum_test +from vafator.rank_sum_test import calculate_rank_sum_test, get_rank_sum_tests +from vafator.pileups import VariantRecord class TestRankSumTest(TestCase): @@ -31,3 +33,37 @@ def test_gatk_example(self): stat, pvalue = calculate_rank_sum_test(alt_distribution, ref_distribution) self.assertEqual(stat, -2.154) self.assertEqual(pvalue, 0.03121) + + def test_empty_alternate_returns_nan(self): + stat, pvalue = calculate_rank_sum_test([], [1, 2, 3]) + self.assertTrue(isnan(stat)) + self.assertTrue(isnan(pvalue)) + + def test_empty_reference_returns_nan(self): + stat, pvalue = calculate_rank_sum_test([1, 2, 3], []) + self.assertTrue(isnan(stat)) + self.assertTrue(isnan(pvalue)) + + def test_both_empty_returns_nan(self): + stat, pvalue = calculate_rank_sum_test([], []) + self.assertTrue(isnan(stat)) + self.assertTrue(isnan(pvalue)) + + def test_get_rank_sum_tests_snv(self): + variant = VariantRecord(CHROM="chr1", POS=100, REF="A", ALT=["T"]) + distributions = { + "A": [20, 25, 30, 35, 40], # ref is higher + "T": [1, 5, 10, 15, 20], # alt is lower + } + pvalues, stats = get_rank_sum_tests(distributions, variant) + self.assertEqual(len(stats), 1) + self.assertEqual(len(pvalues), 1) + # alt < ref so ranksums(alt, ref) returns negative stat + self.assertLess(float(stats[0]), 0.0) + + def test_get_rank_sum_tests_no_alt_reads(self): + variant = VariantRecord(CHROM="chr1", POS=100, REF="A", ALT=["T"]) + distributions = {"A": [20, 25, 30]} # no T reads + pvalues, stats = get_rank_sum_tests(distributions, variant) + self.assertEqual(stats, []) + self.assertEqual(pvalues, []) diff --git a/vafator/tests/utils.py b/vafator/tests/utils.py index 098f844..2d490d6 100755 --- a/vafator/tests/utils.py +++ b/vafator/tests/utils.py @@ -1,43 +1,42 @@ -from cyvcf2 import VCF - - -def _get_count_variants(input_file): - vcf = VCF(input_file) - n_variants = 0 - for v in vcf: - n_variants += 1 - vcf.close() - return n_variants - - -def _get_mutation_at_position(input_file, chromosome, position): - variant = None - vcf = VCF(input_file) - for v in vcf(): # we cannot query by specific positions as this requires a tabix index - if v.CHROM == chromosome and v.POS == position: - variant = v - break - vcf.close() - return variant - - -def _get_info_fields(input_file): - vcf = VCF(input_file) - return [h.info().get("ID") for h in vcf.header_iter() if h['HeaderType'] == 'INFO'] - - -def _get_annotation_values(input_file, annotation): - vcf = VCF(input_file) - values = [] - for v in vcf: - values.append(v.INFO.get(annotation)) - return values - - -class VafatorVariant: - - def __init__(self, chromosome, position, reference, alternative): - self.CHROM = chromosome - self.POS = position - self.REF = reference - self.ALT = alternative +from cyvcf2 import VCF +from vafator.pileups import VariantRecord + + +def _get_count_variants(input_file): + vcf = VCF(input_file) + n_variants = 0 + for v in vcf: + n_variants += 1 + vcf.close() + return n_variants + + +def _get_mutation_at_position(input_file, chromosome, position): + variant = None + vcf = VCF(input_file) + for ( + v + ) in vcf: # we cannot query by specific positions as this requires a tabix index + if v.CHROM == chromosome and v.POS == position: + variant = v + break + vcf.close() + return variant + + +def _get_info_fields(input_file): + vcf = VCF(input_file) + return [h.info().get("ID") for h in vcf.header_iter() if h["HeaderType"] == "INFO"] + + +def _get_annotation_values(input_file, annotation): + vcf = VCF(input_file) + values = [] + for v in vcf: + values.append(v.INFO.get(annotation)) + return values + + +def VafatorVariant(chromosome, position, reference, alternative): + """Backwards-compatible factory for VariantRecord using legacy positional kwarg names.""" + return VariantRecord(CHROM=chromosome, POS=position, REF=reference, ALT=alternative) diff --git a/vafator/vafator2decifer.py b/vafator/vafator2decifer.py index 32ae4b8..6ebf447 100644 --- a/vafator/vafator2decifer.py +++ b/vafator/vafator2decifer.py @@ -13,7 +13,6 @@ python vcf_2_decifer.py [OPTIONS] """ -import re import sys import pybedtools as pbt from cyvcf2 import VCF, Variant @@ -21,7 +20,6 @@ import pandas as pd import numpy as np from collections import defaultdict -import argparse def filterByDepthAndVaf(variant: Variant, Filter, samples): @@ -31,20 +29,29 @@ def filterByDepthAndVaf(variant: Variant, Filter, samples): for s in samples: # filter if genotype has low depth or is missing - if variant.INFO["{}_dp".format(s)] < Filter['MinDepth']: + if variant.INFO["{}_dp".format(s)] < Filter["MinDepth"]: missing += 1 # filter if alt allele isn't greater than the specified threshold in at least one sample - if not any(np.greater_equal([variant.INFO["{}_ac".format(s)] for s in samples], Filter['MinDepthAltAllele'])): + if not any( + np.greater_equal( + [variant.INFO["{}_ac".format(s)] for s in samples], + Filter["MinDepthAltAllele"], + ) + ): missing += 1 # filter if VAF isn't greater than the specified threshold in at least one sample - if not any(np.greater_equal([variant.INFO["{}_af".format(s)] for s in samples], Filter['MinVAF'])): + if not any( + np.greater_equal( + [variant.INFO["{}_af".format(s)] for s in samples], Filter["MinVAF"] + ) + ): missing += 1 if missing > 0: PASS = 0 - return (PASS) + return PASS def compute_ref_var_depths(vcf, FilterDP, samples): @@ -56,7 +63,9 @@ def compute_ref_var_depths(vcf, FilterDP, samples): PASS = filterByDepthAndVaf(variant, FilterDP, samples) # print(np.greater_equal(variant.gt_alt_depths,FilterDP['MinDepthAltAllele'])) if PASS: - char_label = ".".join(map(str, [variant.CHROM, variant.POS, variant.REF, variant.ALT[0]])) + char_label = ".".join( + map(str, [variant.CHROM, variant.POS, variant.REF, variant.ALT[0]]) + ) for s in samples: alt = variant.INFO["{}_ac".format(s)] ref = variant.INFO["{}_dp".format(s)] - alt @@ -69,14 +78,19 @@ def print_output(ref_var_depths, cna_overlaps, outdir, samples): chars = ref_var_depths.keys() & cna_overlaps.keys() header = [str(len(chars)) + " #characters"] header.append(str(len(samples)) + " #samples") - header.append("#sample_index\tsample_label\tcharacter_index\tcharacter_label\tref\tvar") + header.append( + "#sample_index\tsample_label\tcharacter_index\tcharacter_label\tref\tvar" + ) # print(header) - with open(f"{outdir}/decifer.input.tsv", 'w') as out: + with open(f"{outdir}/decifer.input.tsv", "w") as out: print("\n".join(header), file=out) for char_label in ref_var_depths: if char_label in cna_overlaps: for i in range(len(samples)): - r, v = ref_var_depths[char_label][i][0], ref_var_depths[char_label][i][1] + r, v = ( + ref_var_depths[char_label][i][0], + ref_var_depths[char_label][i][1], + ) to_print = [i, samples[i], char_index, char_label, r, v] cnas = cna_overlaps[char_label][i] to_print.extend(cnas) @@ -88,14 +102,14 @@ def print_output(ref_var_depths, cna_overlaps, outdir, samples): def get_purities(cna_df, num_samples, min_purity): purities = {} for i, row in cna_df.head(num_samples + 1).iterrows(): - purity = 1.0 - row['u_normal'] + purity = 1.0 - row["u_normal"] if purity >= min_purity: - purities[row['SAMPLE']] = purity + purities[row["SAMPLE"]] = purity return purities def print_purities(purities, sample_index, num_samples, outdir): - with open(f"{outdir}/decifer.purity.tsv", 'w') as out: + with open(f"{outdir}/decifer.purity.tsv", "w") as out: for sample in sample_index: print(sample_index[sample], purities[sample], file=out, sep="\t") @@ -113,21 +127,21 @@ def filter_high_CN_sites(cn_states_persite, max_CN): def print_unique_CN_states(cn_states, max_CN, outdir): # print unique copy number states for sites that are below the max_CN threshold cn_states = tuple(set(cn_states)) - with open(f"{outdir}/cn_states.txt", 'w') as out: + with open(f"{outdir}/cn_states.txt", "w") as out: for value in cn_states: PASS = 1 for i in value: if int(i[0]) + int(i[1]) > max_CN: PASS = 0 if PASS: - out.write(';'.join([','.join(i) for i in value]) + '\n') + out.write(";".join([",".join(i) for i in value]) + "\n") def print_filtered_sites(filtered_sites, cna_overlaps, outdir): - with open(f"{outdir}/filtered_sites.txt", 'w') as out: + with open(f"{outdir}/filtered_sites.txt", "w") as out: out.write("\n".join(filtered_sites)) print(file=out) - with open(f"{outdir}/filtered_stats.txt", 'w') as out: + with open(f"{outdir}/filtered_stats.txt", "w") as out: filtered = len(filtered_sites) total = len(cna_overlaps.keys()) print("# sites that were filtered due to copy-number states > max_CN", file=out) @@ -163,7 +177,8 @@ def overlap_cna_snp(vcf_samples, max_CN, snps, out_dir): if filter_high_CN_sites(cn_info.keys(), max_CN): # store results, converting from dict to a list for later printing cna_info = [] - [cna_info.extend([c.split("|")[0], c.split("|")[1], cn_info[c]]) for c in cn_info] + for c in cn_info: + cna_info.extend([c.split("|")[0], c.split("|")[1], cn_info[c]]) cna_overlaps[char_label].append(cna_info) else: filtered_sites.add(char_label) @@ -180,22 +195,24 @@ def run_vafator2decifer(args): num_samples = len(args.samples.split(",")) if args.samples is not None else 0 # Load in CNA information - cna_df = pd.read_csv(args.cna_file, sep='\t', index_col=False) + cna_df = pd.read_csv(args.cna_file, sep="\t", index_col=False) # get purities and filter by min_purity purities = get_purities(cna_df, num_samples, args.min_purity) # restrict samples considered in VCF and CNA file to those that have purity > min_purity restricted_samples = list(purities.keys()) - cna_df = cna_df.loc[cna_df['SAMPLE'].isin(list(purities.keys()))] + cna_df = cna_df.loc[cna_df["SAMPLE"].isin(list(purities.keys()))] # print new CNA file, filtering out samples below min_purity cna_df.to_csv(f"{args.out_dir}/best.seg.ucn", sep="\t", index=False) if args.snp_file: - snp_df = pd.read_csv(args.snp_file, sep='\t', index_col=False, header=None) + snp_df = pd.read_csv(args.snp_file, sep="\t", index_col=False, header=None) snp_df = snp_df.loc[snp_df[2].isin(list(purities.keys()))] # rearrange columns for decifer snp_df = snp_df[[2, 0, 1, 3, 4]] - snp_df.to_csv(f"{args.out_dir}/snpfile.1bed", sep="\t", index=False, header=False) + snp_df.to_csv( + f"{args.out_dir}/snpfile.1bed", sep="\t", index=False, header=False + ) num_samples = len(restricted_samples) # print purity information @@ -204,21 +221,25 @@ def run_vafator2decifer(args): # Filtering criteria Filter = {} - Filter['MinDepth'] = args.min_depth - Filter['MinDepthAltAllele'] = args.min_alt_depth - Filter['MinVAF'] = args.min_vaf + Filter["MinDepth"] = args.min_depth + Filter["MinDepthAltAllele"] = args.min_alt_depth + Filter["MinVAF"] = args.min_vaf # ref_var_depths[char_label] = list of (ref,alt) tuples, one for each sample, in same order as vcf.samples ref_var_depths = compute_ref_var_depths(vcf, Filter, samples=restricted_samples) # print BED file for SNPs - with open(f"{args.out_dir}/snps.bed", 'w') as out: + with open(f"{args.out_dir}/snps.bed", "w") as out: print("chrom\tstart\tend\tREF\tALT", file=out) # sort ref_var_depths by the first two parts of chr_label - for chr_label in sorted(ref_var_depths, key=lambda x: (x.split('.')[0], int(x.split('.')[1]))): + for chr_label in sorted( + ref_var_depths, key=lambda x: (x.split(".")[0], int(x.split(".")[1])) + ): pos = chr_label.split(".") # subtract 1 from position to create interval in BED format - print(pos[0], int(pos[1]) - 1, int(pos[1]), pos[2], pos[3], sep="\t", file=out) + print( + pos[0], int(pos[1]) - 1, int(pos[1]), pos[2], pos[3], sep="\t", file=out + ) snps = pbt.BedTool(f"{args.out_dir}/snps.bed") if args.exclude_list: @@ -227,16 +248,18 @@ def run_vafator2decifer(args): # prepare BED files for CNA intervals for each sample, for overlapping with SNPs for sample in restricted_samples: - df = cna_df[cna_df['SAMPLE'] == sample] + df = cna_df[cna_df["SAMPLE"] == sample] # consider subtracting 1 from start of interval to be compatible with BED format, leave end interval alone - df.loc[:, 'START'] = df['START'] - df = df.drop('SAMPLE', axis=1) + df.loc[:, "START"] = df["START"] + df = df.drop("SAMPLE", axis=1) df.to_csv(f"{args.out_dir}/{sample}_cna.bed", index=False, sep="\t") # overlap SNPs with CNA intervals for each sample # cna_overlaps[char_label] = list of tuples of CNA info (one tuple for each sample, in same order as vcf.samples) # this function also prints the observed CN state trees for the generatestatetrees function - cna_overlaps, cn_states_allsites, filtered_sites = overlap_cna_snp(restricted_samples, args.max_CN, snps, args.out_dir) + cna_overlaps, cn_states_allsites, filtered_sites = overlap_cna_snp( + restricted_samples, args.max_CN, snps, args.out_dir + ) # sites may have unique CN states that are duplicate; set them to find unique CN states across sites print_unique_CN_states(cn_states_allsites, args.max_CN, args.out_dir)