From 34f41a80c18f0677b6d7a2048fadf493ae60a1fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Fri, 13 Mar 2026 10:07:14 +0100
Subject: [PATCH 01/32] Update gitignore

---
 .gitignore | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index fc2bc44..2abcd34 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,4 +14,6 @@ out
 vafator/tests/resources/results
 .cache
 .jupyter
-.local
\ No newline at end of file
+.local
+run.sh
+VAFator.egg-info/*

From 78c870f8ef545b6b308c6bf7618de1762af8c4a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Fri, 13 Mar 2026 15:13:32 +0100
Subject: [PATCH 02/32] Upgrade Python and the Python packages.

* Reprodubility of the output confirmed using WES_EA_1 sample from SEQC2 - a more comprehensive test for INDELs may be necessary
* Due ensure reprodubility, the methods get_variant_pileup and get_snv_metrics are updated.
* pysam could only be upgraded till 0.21.0, as above this version (up to and including 0.23.3) there are inconsistencies in base qualities. Python was therefore set to 3.11
---
 requirements.txt    |  7 ------
 setup.cfg           | 58 ++++++++++++++++++++++++++++++++++++++++++++-
 setup.py            | 43 +--------------------------------
 vafator/__init__.py |  2 +-
 vafator/pileups.py  | 52 ++++++++++++++++++++++++++++++----------
 5 files changed, 98 insertions(+), 64 deletions(-)
 delete mode 100755 requirements.txt

diff --git a/requirements.txt b/requirements.txt
deleted file mode 100755
index bdd5c16..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-pandas~=1.3.3
-pysam~=0.19.1
-cyvcf2~=0.30.14
-logzero~=1.7.0
-pybedtools~=0.9.0
-numpy>=1.20,<2.0
-scipy>=1.0.0,<2.0.0
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index a2f3748..b74831c 100755
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,2 +1,58 @@
 [metadata]
-description-file = README.md
\ No newline at end of file
+name = VAFator
+version = 3.0.0
+description = Annotate variants in a VCF file with technical annotations from one or more BAMs 
+description-file = README.md
+long_description = file: README.md
+long_description_content_type = text/markdown
+license = MIT
+url = https://github.com/TRON-Bioinformatics/vafator
+author = Pablo Riesgo Ferreiro, Jonas Ibn-Salem, Luis Kress, Özlem Muslu
+classifiers =
+    Development Status :: 4 - Beta
+    Intended Audience :: Healthcare Industry
+    Intended Audience :: Science/Research
+    Topic :: Scientific/Engineering :: Bio-Informatics
+    Programming Language :: Python :: 3.11
+    Programming Language :: Python :: 3.12
+    Programming Language :: Python :: 3.13
+    Programming Language :: Python :: 3 :: Only
+    License :: OSI Approved :: MIT License
+    Operating System :: Unix
+author_email = priesgoferreiro@gmail.com
+
+[options.entry_points]
+console_scripts = 
+    vafator=vafator.command_line:annotator
+    multiallelics-filter=vafator.command_line:multiallelics_filter
+    vafator2decifer=vafator.command_line:vafator2decifer
+    hatchet2bed=vafator.command_line:hatchet2bed
+
+[options]
+packages = find:
+include_package_data = True
+zip_safe = False
+
+python_requires = ==3.11
+
+install_requires =
+    pandas>=3.0.1,<4
+    pysam==0.21.0 # above this version base qualities show up wrong in the presence of soft clipping/insertions/both (latest release 0.23.3)
+    cyvcf2>=0.32.1,<0.33
+    logzero>=1.7.0,<2
+    pybedtools>=0.12.0,<0.13
+    numpy>=2.4.3,<3
+    scipy>=1.17.1,<2
+
+[options.packages.find]
+exclude =
+    tests
+    tests.*
+    legacy
+    legacy.*
+
+[options.extras_require]
+dev =
+    pytest
+    ruff
+    mypy
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 1025dd9..398663e 100755
--- a/setup.py
+++ b/setup.py
@@ -4,49 +4,8 @@
 
 VERSION = vafator.VERSION
 
-
-# parses requirements from file
-with open("requirements.txt") as f:
-    required = f.read().splitlines()
-
 with open("README.md", "r", encoding="utf-8") as f:
     long_description = f.read()
 
 # Build the Python package
-setup(
-    name='vafator',
-    version=VERSION,
-    packages=find_packages(exclude=["legacy"]),
-    entry_points={
-        'console_scripts': [
-            'vafator=vafator.command_line:annotator',
-            'multiallelics-filter=vafator.command_line:multiallelics_filter',
-            'vafator2decifer=vafator.command_line:vafator2decifer',
-            'hatchet2bed=vafator.command_line:hatchet2bed'
-        ],
-    },
-    author="TRON - Translational Oncology at the University Medical Center of the Johannes Gutenberg University Mainz"
-    "- Computational Medicine group",
-    author_email='pablo.riesgoferreiro@tron-mainz.de',
-    description='Annotate a VCF file with AF, AD and DP from tumor and normal BAMs',
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/tron-bioinformatics/vafator",
-    requires=[],
-    install_requires=required,
-    classifiers=[
-        'Development Status :: 4 - Beta',      # Chose either "3 - Alpha", "4 - Beta" or "5 - Production/Stable" as the current state of your package
-        'Intended Audience :: Healthcare Industry',
-        'Intended Audience :: Science/Research',
-        'Topic :: Scientific/Engineering :: Bio-Informatics',
-        'Programming Language :: Python :: 3.7',
-        'Programming Language :: Python :: 3.8',
-        'Programming Language :: Python :: 3.9',
-        'Programming Language :: Python :: 3.10',
-        'Programming Language :: Python :: 3 :: Only',
-        "License :: OSI Approved :: MIT License",
-        "Operating System :: Unix"
-      ],
-    python_requires='>=3.7',
-    license='MIT'
-)
\ No newline at end of file
+setup()
\ No newline at end of file
diff --git a/vafator/__init__.py b/vafator/__init__.py
index 90a0094..b41f4d7 100755
--- a/vafator/__init__.py
+++ b/vafator/__init__.py
@@ -1,4 +1,4 @@
-VERSION='2.2.2'
+VERSION='3.0.0'
 
 
 AMBIGUOUS_BASES = ['N', 'M', 'R', 'W', 'S', 'Y', 'K', 'V', 'H', 'D', 'B']
diff --git a/vafator/pileups.py b/vafator/pileups.py
index 65160a4..6c54eac 100755
--- a/vafator/pileups.py
+++ b/vafator/pileups.py
@@ -28,9 +28,11 @@ def get_variant_pileup(
     # approximately +- read size bp
     return bam.pileup(contig=variant.CHROM, start=position - 1, stop=position,
                       truncate=True,                    # returns only this column
-                      max_depth=0,                      # disables maximum depth
+                      max_depth=1000000,
                       min_base_quality=min_base_quality,
-                      min_mapping_quality=min_mapping_quality)
+                      min_mapping_quality=min_mapping_quality,
+                      stepper='samtools',
+                    )
 
 
 @dataclass
@@ -171,21 +173,45 @@ def get_deletion_metrics(variant: Variant, pileups: IteratorColumnRegion) -> Cov
 def get_snv_metrics(pileups: IteratorColumnRegion, include_ambiguous_bases=False) -> CoverageMetrics:
     try:
         pileup = next(pileups)
-        bases = [s.upper() for s in pileup.get_query_sequences()]
 
-        bqs = aggregate_median_per_base(bases, pileup.get_query_qualities())
-        mqs = aggregate_median_per_base(bases, pileup.get_mapping_qualities())
-        positions = aggregate_median_per_base(bases, pileup.get_query_positions())
-        all_bqs = aggregate_list_per_base(bases, pileup.get_query_qualities())
-        all_mqs = aggregate_list_per_base(bases, pileup.get_mapping_qualities())
-        all_positions = aggregate_list_per_base(bases, pileup.get_query_positions())
+        bases = []
+        qualities = []
+        mapping_qualities = []
+        query_positions = []
+
+        # to reproduce the older versions of Vafator, include deletions at query position when computing stats 
+        for read in pileup.pileups:
+            if read.is_refskip:
+                continue
+            if read.is_del:
+                bases.append("")
+                qualities.append(0)  # no base quality for deletions
+                mapping_qualities.append(read.alignment.mapping_quality)
+                query_positions.append(read.query_position_or_next)
+            else:
+                base = read.alignment.query_sequence[read.query_position].upper()
+                bases.append(base)
+                qualities.append(read.alignment.query_qualities[read.query_position])
+                mapping_qualities.append(read.alignment.mapping_quality)
+                query_positions.append(read.query_position)
+
+
+        all_bqs = aggregate_list_per_base(bases, qualities)
+        all_mqs = aggregate_list_per_base(bases, mapping_qualities)
+        all_positions = aggregate_list_per_base(bases, query_positions)
+
+        bqs = Counter({b: np.median(l) for b, l in all_bqs.items()})
+        mqs = Counter({b: np.median(l) for b, l in all_mqs.items()})
+        positions = Counter({b: np.median(l) for b, l in all_positions.items()})
+
+        # print('Summary', bases, qualities, mapping_qualities, '\n\n\n')
+
+        ac = Counter(b for b in bases if b != "")  # deletions don't count as alleles
 
         if include_ambiguous_bases:
-            dp = len([b for b in bases if b!= ""])
+            dp = len(bases)
         else:
-            # remove ambiguous bases and reads where the position is spliced out
-            dp = len([b for b in bases if b not in AMBIGUOUS_BASES and b != ""])
-        ac = Counter(bases)
+            dp = sum(1 for b in bases if b == "" or b not in AMBIGUOUS_BASES)
     except StopIteration:
         # no reads
         dp = 0

From 2a69e4d46f5b8f17be041698ec37ce985b7ccb9d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Fri, 13 Mar 2026 15:17:38 +0100
Subject: [PATCH 03/32] Introduce null check before rank sum test

---
 vafator/rank_sum_test.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vafator/rank_sum_test.py b/vafator/rank_sum_test.py
index 384bd37..b9a3b4d 100644
--- a/vafator/rank_sum_test.py
+++ b/vafator/rank_sum_test.py
@@ -4,6 +4,8 @@
 
 
 def calculate_rank_sum_test(alternate_dist: List[int], reference_dist: List[int]) -> Tuple[float, float]:
+    if not alternate_dist or not reference_dist:  # skip empty distributions
+        return np.nan, np.nan
     stat, pvalue = scipy.stats.ranksums(x=alternate_dist, y=reference_dist)
     return round(stat, 3), round(pvalue, 5)
 

From 055dc23bac88cf35b1900486edd3d10b0600f5e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Fri, 13 Mar 2026 17:01:51 +0100
Subject: [PATCH 04/32] perf: major performance optimizations reducing runtime
 ~4-5x

replace per-variant pileup() calls with a
single streaming pileup iterator per BAM per chromosome. Variants are
buffered by chromosome and metrics are computed immediately as each
pileup column is visited (avoids segfault from storing invalidated
PileupColumn C objects). Reduces pysam pileup __init__/__dealloc__
overhead from ~80% of total runtime to negligible.
---
 vafator/annotator.py |  69 ++++++---
 vafator/pileups.py   | 346 ++++++++++++++++++++++++-------------------
 2 files changed, 242 insertions(+), 173 deletions(-)

diff --git a/vafator/annotator.py b/vafator/annotator.py
index 00f8293..62f311c 100755
--- a/vafator/annotator.py
+++ b/vafator/annotator.py
@@ -13,7 +13,9 @@
 from vafator.ploidies import DEFAULT_PLOIDY
 from vafator.rank_sum_test import calculate_rank_sum_test, get_rank_sum_tests
 from vafator.power import PowerCalculator, DEFAULT_ERROR_RATE, DEFAULT_FPR
-from vafator.pileups import get_variant_pileup, get_metrics
+from vafator.pileups import (
+    collect_metrics_for_chrom, stream_variants_by_chrom, EMPTY_METRICS
+)
 
 
 BATCH_SIZE = 10000
@@ -288,7 +290,11 @@ def _write_batch(self, batch):
         for v in batch:
             self.vcf_writer.write_record(v)
 
-    def _add_stats(self, variant: Variant):
+    def _add_stats(self, variant: Variant, metrics_by_bam: dict):
+        """
+        Annotate a single variant using pre-computed metrics.
+        metrics_by_bam: {(sample, bam_index): CoverageMetrics}
+        """
         for sample, bams in self.bam_readers.items():
             global_dp = 0
             global_ac = Counter()
@@ -298,13 +304,10 @@ def _add_stats(self, variant: Variant):
             global_all_mqs = {}
             global_all_bqs = {}
             global_all_positions = {}
+
             for i, bam in enumerate(bams):
-                pileups = get_variant_pileup(
-                    variant=variant, bam=bam,
-                    min_base_quality=self.base_call_quality_threshold,
-                    min_mapping_quality=self.mapping_quality_threshold)
-                coverage_metrics = get_metrics(variant=variant, pileups=pileups,
-                                               include_ambiguous_bases=self.include_ambiguous_bases)
+                coverage_metrics = metrics_by_bam.get((sample, i), EMPTY_METRICS)
+
                 if coverage_metrics is not None:
                     if len(bams) > 1:
                         variant.INFO["{}_af_{}".format(sample, i + 1)] = \
@@ -377,8 +380,7 @@ def _add_stats(self, variant: Variant):
             variant.INFO["{}_pos".format(sample)] = ",".join(
                 [str(global_pos[variant.REF])] + [str(global_pos[alt]) for alt in variant.ALT])
 
-            # for these rank sum tests it is required at least one value for the alternate and one value for the
-            # reference otherwise it cannot be calculated
+            # rank sum tests require at least one ref and one alt value
             pvalues, stats = get_rank_sum_tests(global_all_mqs, variant)
             if stats:
                 variant.INFO["{}_rsmq".format(sample)] = ",".join(stats)
@@ -399,16 +401,41 @@ def _calculate_af(self, ac, dp):
 
     def run(self):
         batch = []
-        variant: Variant
-        for variant in self.vcf:
-            # gets the counts of all bases across all BAMs
-            self._add_stats(variant)
-
-            batch.append(variant)
-            if len(batch) >= BATCH_SIZE:
-                self._write_batch(batch)
-                batch = []
-        if len(batch) > 0:
+
+        for chrom, chrom_variants in stream_variants_by_chrom(self.vcf):
+            # For each BAM, open ONE pileup iterator per chromosome and compute
+            # metrics immediately for each column — avoids per-variant pileup overhead.
+            # Key: (sample, bam_index, variant_pos, REF, ALT[0]) -> CoverageMetrics
+            all_metrics = {}  # {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}}
+
+            for sample, bams in self.bam_readers.items():
+                for i, bam in enumerate(bams):
+                    all_metrics[(sample, i)] = collect_metrics_for_chrom(
+                        chrom=chrom,
+                        variants=chrom_variants,
+                        bam=bam,
+                        min_base_quality=self.base_call_quality_threshold,
+                        min_mapping_quality=self.mapping_quality_threshold,
+                        include_ambiguous_bases=self.include_ambiguous_bases,
+                    )
+
+            for variant in chrom_variants:
+                # build per-BAM metrics lookup for this specific variant
+                metrics_by_bam = {
+                    (sample, i): all_metrics[(sample, i)].get(
+                        (variant.POS, variant.REF, variant.ALT[0]), EMPTY_METRICS
+                    )
+                    for sample, bams in self.bam_readers.items()
+                    for i in range(len(bams))
+                }
+                self._add_stats(variant, metrics_by_bam)
+
+                batch.append(variant)
+                if len(batch) >= BATCH_SIZE:
+                    self._write_batch(batch)
+                    batch = []
+
+        if batch:
             self._write_batch(batch)
 
         time.sleep(2)
@@ -417,4 +444,4 @@ def run(self):
         self.vcf.close()
         for _, bams in self.bam_readers.items():
             for bam in bams:
-                bam.close()
+                bam.close()
\ No newline at end of file
diff --git a/vafator/pileups.py b/vafator/pileups.py
index 6c54eac..c9efd9e 100755
--- a/vafator/pileups.py
+++ b/vafator/pileups.py
@@ -1,6 +1,6 @@
-from collections import Counter
+from collections import Counter, defaultdict
 from dataclasses import dataclass
-from typing import Union
+from typing import Union, List, Dict, Iterator, Tuple
 from cyvcf2 import Variant
 from pysam.libcalignmentfile import IteratorColumnRegion, AlignmentFile
 
@@ -22,17 +22,97 @@ def is_deletion(variant: Variant):
 
 
 def get_variant_pileup(
-        variant: Union[Variant, VafatorVariant], bam: AlignmentFile, min_base_quality, min_mapping_quality) -> IteratorColumnRegion:
+        variant: Union[Variant, VafatorVariant], bam: AlignmentFile,
+        min_base_quality, min_mapping_quality) -> IteratorColumnRegion:
+    """Single-variant pileup, kept for backwards compatibility and tests."""
     position = variant.POS
-    # this function returns the pileups at all positions covered by reads covered the queried position
-    # approximately +- read size bp
     return bam.pileup(contig=variant.CHROM, start=position - 1, stop=position,
-                      truncate=True,                    # returns only this column
+                      truncate=True,
                       max_depth=1000000,
                       min_base_quality=min_base_quality,
                       min_mapping_quality=min_mapping_quality,
                       stepper='samtools',
-                    )
+                      )
+
+
+def get_region_pileup(chrom: str, start: int, end: int, bam: AlignmentFile,
+                      min_base_quality, min_mapping_quality):
+    """
+    Opens a single pileup iterator spanning a whole region (e.g. one chromosome).
+    start is 0-based inclusive, end is 1-based exclusive (last variant POS).
+    """
+    return bam.pileup(contig=chrom, start=start, stop=end,
+                      truncate=True,
+                      max_depth=1000000,
+                      min_base_quality=min_base_quality,
+                      min_mapping_quality=min_mapping_quality,
+                      stepper='samtools',
+                      )
+
+
+def stream_variants_by_chrom(vcf) -> Iterator[Tuple[str, List[Variant]]]:
+    """
+    Yields (chrom, [variants]) one chromosome at a time.
+    Only one chromosome's variants are held in memory at once.
+    """
+    current_chrom = None
+    current_variants = []
+    for variant in vcf:
+        if variant.CHROM != current_chrom:
+            if current_variants:
+                yield current_chrom, current_variants
+            current_chrom = variant.CHROM
+            current_variants = [variant]
+        else:
+            current_variants.append(variant)
+    if current_variants:
+        yield current_chrom, current_variants
+
+
+def collect_metrics_for_chrom(
+        chrom: str,
+        variants: List[Variant],
+        bam: AlignmentFile,
+        min_base_quality: int,
+        min_mapping_quality: int,
+        include_ambiguous_bases: bool = False) -> Dict[Tuple, 'CoverageMetrics']:
+    """
+    Opens ONE pileup iterator over the entire chromosome region covered by variants.
+    Metrics are computed IMMEDIATELY for each pileup column while it is still valid —
+    avoids segfaults from storing PileupColumn objects after the iterator advances.
+
+    Returns {(pos, REF, ALT[0]): CoverageMetrics}.
+    """
+    if not variants:
+        return {}
+
+    # index variants by 1-based position for O(1) lookup during streaming
+    variants_by_pos: Dict[int, List[Variant]] = defaultdict(list)
+    for v in variants:
+        variants_by_pos[v.POS].append(v)
+
+    start = variants[0].POS - 1   # 0-based inclusive
+    end = variants[-1].POS        # exclusive end for pysam
+
+    results: Dict[Tuple, CoverageMetrics] = {}
+
+    for pileup_col in get_region_pileup(
+            chrom=chrom, start=start, end=end,
+            bam=bam,
+            min_base_quality=min_base_quality,
+            min_mapping_quality=min_mapping_quality,
+    ):
+        ref_pos = pileup_col.reference_pos + 1  # convert to 1-based
+        if ref_pos not in variants_by_pos:
+            continue
+
+        # compute metrics NOW while pileup_col is still valid in C memory
+        for variant in variants_by_pos[ref_pos]:
+            metrics = _get_metrics_from_column(variant, pileup_col, include_ambiguous_bases)
+            if metrics is not None:
+                results[(ref_pos, variant.REF, variant.ALT[0])] = metrics
+
+    return results
 
 
 @dataclass
@@ -55,57 +135,103 @@ class CoverageMetrics:
     all_positions: dict = None
 
 
-def get_metrics(variant: Variant, pileups: IteratorColumnRegion, include_ambiguous_bases=False) -> CoverageMetrics:
+EMPTY_METRICS = CoverageMetrics(
+    ac=Counter(), dp=0, bqs=Counter(), mqs=Counter(), positions=Counter(),
+    all_bqs={}, all_mqs={}, all_positions={}
+)
+
+
+def _get_metrics_from_column(variant: Variant, pileup_col,
+                              include_ambiguous_bases=False) -> 'CoverageMetrics':
+    """Dispatch to the right metrics function based on variant type."""
     if is_snp(variant):
-        return get_snv_metrics(pileups, include_ambiguous_bases)
+        return _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases)
     elif is_insertion(variant):
-        return get_insertion_metrics(variant, pileups)
+        return _get_insertion_metrics_from_column(variant, pileup_col)
     elif is_deletion(variant):
-        return get_deletion_metrics(variant, pileups)
+        return _get_deletion_metrics_from_column(variant, pileup_col)
     return None
 
 
-def get_insertion_metrics(variant: Variant, pileups: IteratorColumnRegion) -> CoverageMetrics:
+def _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases=False) -> CoverageMetrics:
+    bases = []
+    qualities = []
+    mapping_qualities = []
+    query_positions = []
+
+    for read in pileup_col.pileups:
+        if read.is_refskip:
+            continue
+        if read.is_del:
+            bases.append("")
+            qualities.append(0)
+            mapping_qualities.append(read.alignment.mapping_quality)
+            query_positions.append(read.query_position_or_next)
+        else:
+            base = read.alignment.query_sequence[read.query_position].upper()
+            bases.append(base)
+            qualities.append(read.alignment.query_qualities[read.query_position])
+            mapping_qualities.append(read.alignment.mapping_quality)
+            query_positions.append(read.query_position)
+
+    all_bqs = aggregate_list_per_base(bases, qualities)
+    all_mqs = aggregate_list_per_base(bases, mapping_qualities)
+    all_positions = aggregate_list_per_base(bases, query_positions)
+
+    bqs = Counter({b: np.median(l) for b, l in all_bqs.items()})
+    mqs = Counter({b: np.median(l) for b, l in all_mqs.items()})
+    positions = Counter({b: np.median(l) for b, l in all_positions.items()})
+
+    ac = Counter(b for b in bases if b != "")
+
+    if include_ambiguous_bases:
+        dp = len(bases)
+    else:
+        dp = sum(1 for b in bases if b == "" or b not in AMBIGUOUS_BASES)
+
+    return CoverageMetrics(
+        ac=ac, dp=dp, bqs=bqs, mqs=mqs, positions=positions,
+        all_bqs=all_bqs, all_mqs=all_mqs, all_positions=all_positions
+    )
+
+
+def _get_insertion_metrics_from_column(variant: Variant, pileup_col) -> CoverageMetrics:
     ac = {alt.upper(): 0 for alt in variant.ALT}
     mq = {alt.upper(): [] for alt in variant.ALT}
     mq[variant.REF] = []
     pos = {alt.upper(): [] for alt in variant.ALT}
     pos[variant.REF] = []
-    dp = 0
     variant_position = variant.POS
     insertion_length = len(variant.ALT[0]) - len(variant.REF)
     insertion = variant.ALT[0][1:]
     alt_upper = variant.ALT[0].upper()
-    try:
-        pileups = next(pileups).pileups
-        dp += len(pileups)
-        for pileup_read in pileups:
-            if pileup_read.indel > 0:
-                # read with an insertion
-                index = pileup_read.alignment.reference_start
-                relative_position = 0
-                for cigar_type, cigar_length in pileup_read.alignment.cigartuples:
-                    if cigar_type in [0, 2, 3, 7, 8]:  # consumes reference M, D, N, =, X
-                        index += cigar_length
-                        if index > variant_position:
-                            break
-                    if cigar_type in [0, 1, 4, 7, 8]:  # consumes query M, I, S, =, X
-                        relative_position += cigar_length
-                    if cigar_type == 1:  # does not count I
-                        insertion_in_query = pileup_read.alignment.query[relative_position : relative_position + insertion_length]
-                        if index == variant_position and cigar_length == insertion_length and insertion == insertion_in_query:
-                            # the read contains the insertion
-                            ac[alt_upper] = ac[alt_upper] + 1
-                            mq[alt_upper].append(pileup_read.alignment.mapping_quality)
-                            pos[alt_upper].append(pileup_read.query_position_or_next)
-            elif pileup_read.indel == 0:
-                # NOTE: considers all reads without indels to be the reference!
-                mq[variant.REF].append(pileup_read.alignment.mapping_quality)
-                pos[variant.REF].append(pileup_read.query_position_or_next)
-
-    except StopIteration:
-        # no reads
-        pass
+
+    pileup_reads = pileup_col.pileups
+    dp = len(pileup_reads)
+
+    for pileup_read in pileup_reads:
+        if pileup_read.indel > 0:
+            index = pileup_read.alignment.reference_start
+            relative_position = 0
+            for cigar_type, cigar_length in pileup_read.alignment.cigartuples:
+                if cigar_type in [0, 2, 3, 7, 8]:
+                    index += cigar_length
+                    if index > variant_position:
+                        break
+                if cigar_type in [0, 1, 4, 7, 8]:
+                    relative_position += cigar_length
+                if cigar_type == 1:
+                    insertion_in_query = pileup_read.alignment.query[
+                                         relative_position:relative_position + insertion_length]
+                    if index == variant_position and cigar_length == insertion_length \
+                            and insertion == insertion_in_query:
+                        ac[alt_upper] += 1
+                        mq[alt_upper].append(pileup_read.alignment.mapping_quality)
+                        pos[alt_upper].append(pileup_read.query_position_or_next)
+        elif pileup_read.indel == 0:
+            mq[variant.REF].append(pileup_read.alignment.mapping_quality)
+            pos[variant.REF].append(pileup_read.query_position_or_next)
+
     return CoverageMetrics(
         ac=Counter(ac), dp=dp,
         mqs=Counter({k: np.median(l) for k, l in mq.items()}),
@@ -117,48 +243,39 @@ def get_insertion_metrics(variant: Variant, pileups: IteratorColumnRegion) -> Co
     )
 
 
-def get_deletion_metrics(variant: Variant, pileups: IteratorColumnRegion) -> CoverageMetrics:
+def _get_deletion_metrics_from_column(variant: Variant, pileup_col) -> CoverageMetrics:
     ac = {alt.upper(): 0 for alt in variant.ALT}
     mq = {alt.upper(): [] for alt in variant.ALT}
     mq[variant.REF] = []
     pos = {alt.upper(): [] for alt in variant.ALT}
     pos[variant.REF] = []
-    dp = 0
     variant_position = variant.POS
     deletion_length = len(variant.REF) - len(variant.ALT[0])
     alt_upper = variant.ALT[0].upper()
-    try:
-        pileups = next(pileups).pileups
-        dp += len(pileups)
-        for pileup_read in pileups:
-            if pileup_read.indel < 0:
-                # read with a deletion
-                start = pileup_read.alignment.reference_start
-                match = False
-                for cigar_type, cigar_length in pileup_read.alignment.cigartuples:
-                    if cigar_type in [0, 3, 7, 8]:  # consumes reference M, N, =, X
-                        start += cigar_length
-                    elif cigar_type == 2:  # D
-                        if start == variant_position and cigar_length == deletion_length:
-                            ac[alt_upper] = ac[alt_upper] + 1
-                            mq[alt_upper].append(pileup_read.alignment.mapping_quality)
-                            pos[alt_upper].append(pileup_read.query_position_or_next)
-                            match = True
-                            break
-                        else:
-                            start += cigar_length
-                    if start > variant_position:
+
+    pileup_reads = pileup_col.pileups
+    dp = len(pileup_reads)
+
+    for pileup_read in pileup_reads:
+        if pileup_read.indel < 0:
+            start = pileup_read.alignment.reference_start
+            for cigar_type, cigar_length in pileup_read.alignment.cigartuples:
+                if cigar_type in [0, 3, 7, 8]:
+                    start += cigar_length
+                elif cigar_type == 2:
+                    if start == variant_position and cigar_length == deletion_length:
+                        ac[alt_upper] += 1
+                        mq[alt_upper].append(pileup_read.alignment.mapping_quality)
+                        pos[alt_upper].append(pileup_read.query_position_or_next)
                         break
-                if not match:
-                    # TODO: when finds a read with an indel not matching our particular indel it counts it
-                    pass
-            elif pileup_read.indel == 0:
-                # NOTE: considers all reads without indels to be the reference!
-                mq[variant.REF].append(pileup_read.alignment.mapping_quality)
-                pos[variant.REF].append(pileup_read.query_position_or_next)
-    except StopIteration:
-        # no reads
-        pass
+                    else:
+                        start += cigar_length
+                if start > variant_position:
+                    break
+        elif pileup_read.indel == 0:
+            mq[variant.REF].append(pileup_read.alignment.mapping_quality)
+            pos[variant.REF].append(pileup_read.query_position_or_next)
+
     return CoverageMetrics(
         ac=Counter(ac), dp=dp,
         mqs=Counter({k: np.median(l) for k, l in mq.items()}),
@@ -169,85 +286,10 @@ def get_deletion_metrics(variant: Variant, pileups: IteratorColumnRegion) -> Cov
         all_bqs=Counter()
     )
 
-
-def get_snv_metrics(pileups: IteratorColumnRegion, include_ambiguous_bases=False) -> CoverageMetrics:
-    try:
-        pileup = next(pileups)
-
-        bases = []
-        qualities = []
-        mapping_qualities = []
-        query_positions = []
-
-        # to reproduce the older versions of Vafator, include deletions at query position when computing stats 
-        for read in pileup.pileups:
-            if read.is_refskip:
-                continue
-            if read.is_del:
-                bases.append("")
-                qualities.append(0)  # no base quality for deletions
-                mapping_qualities.append(read.alignment.mapping_quality)
-                query_positions.append(read.query_position_or_next)
-            else:
-                base = read.alignment.query_sequence[read.query_position].upper()
-                bases.append(base)
-                qualities.append(read.alignment.query_qualities[read.query_position])
-                mapping_qualities.append(read.alignment.mapping_quality)
-                query_positions.append(read.query_position)
-
-
-        all_bqs = aggregate_list_per_base(bases, qualities)
-        all_mqs = aggregate_list_per_base(bases, mapping_qualities)
-        all_positions = aggregate_list_per_base(bases, query_positions)
-
-        bqs = Counter({b: np.median(l) for b, l in all_bqs.items()})
-        mqs = Counter({b: np.median(l) for b, l in all_mqs.items()})
-        positions = Counter({b: np.median(l) for b, l in all_positions.items()})
-
-        # print('Summary', bases, qualities, mapping_qualities, '\n\n\n')
-
-        ac = Counter(b for b in bases if b != "")  # deletions don't count as alleles
-
-        if include_ambiguous_bases:
-            dp = len(bases)
-        else:
-            dp = sum(1 for b in bases if b == "" or b not in AMBIGUOUS_BASES)
-    except StopIteration:
-        # no reads
-        dp = 0
-        ac = Counter()
-        bqs = Counter()
-        mqs = Counter()
-        positions = Counter()
-        all_bqs = {}
-        all_mqs = {}
-        all_positions = {}
-
-    return CoverageMetrics(
-        ac=ac,
-        dp=dp,
-        bqs=bqs,
-        mqs=mqs,
-        positions=positions,
-        all_bqs=all_bqs,
-        all_mqs=all_mqs,
-        all_positions=all_positions
-    )
-
-
-def aggregate_median_per_base(bases, values) -> Counter:
-    aggregated_values = {}
-    for b, v in zip(bases, values):
-        if b not in aggregated_values:
-            aggregated_values[b] = []
-        aggregated_values[b].append(v)
-    return Counter({b: np.median(bq_list) for b, bq_list in aggregated_values.items()})
-
-
 def aggregate_list_per_base(bases, values) -> dict:
     aggregated_values = {}
     for b, v in zip(bases, values):
         if b not in aggregated_values:
             aggregated_values[b] = []
         aggregated_values[b].append(v)
-    return aggregated_values
+    return aggregated_values
\ No newline at end of file

From 547667b7210fa3f9d89f8c5c970a91e3c8971a38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Fri, 13 Mar 2026 17:02:46 +0100
Subject: [PATCH 05/32] perf: cache in power.py to avoid computing the same
 values repeatedly.

cache _calculate_k() results by dp value (same depth
  repeats across thousands of variants). Cache calculate_expected_vaf()
  by (sample, chrom, pos). Replace frozen binom(n, f).pmf(k) object
  instantiation with direct binom.pmf(k, n, f) call to avoid scipy
  distribution object overhead on every variant.
---
 vafator/power.py | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/vafator/power.py b/vafator/power.py
index 2430e31..f465fe9 100644
--- a/vafator/power.py
+++ b/vafator/power.py
@@ -1,3 +1,4 @@
+from functools import lru_cache
 from typing import Optional
 
 from cyvcf2 import Variant
@@ -27,6 +28,14 @@ def __init__(
         self.fpr = fpr
         self.error_rate = error_rate
 
+        # cache for _calculate_k: dp -> k
+        # dp values repeat heavily across variants so this avoids repeated binom.cdf loops
+        self._k_cache: dict = {}
+
+        # cache for calculate_expected_vaf: (sample, pos) -> vaf
+        # when using genome-wide ploidy (most common case) this is the same value for all variants
+        self._eaf_cache: dict = {}
+
     def calculate_power(self, dp: int, ac: int, sample: str, variant: Optional[Variant]) -> float:
         """
         Return the binomial probability of observing ac or less supporting reads, given a total coverage dp and a
@@ -45,10 +54,18 @@ def calculate_expected_vaf(self, sample: str, variant: Optional[Variant]) -> flo
 
         In a scenario with purity = 1, tumor CN = 2 and normal CN = 2 => expected VAF = 0.5
         """
+        # cache key: use variant position for local copy number lookups,
+        # or just sample name when genome-wide ploidy is used (most common case)
+        cache_key = (sample, variant.CHROM if variant else None, variant.POS if variant else None)
+        if cache_key in self._eaf_cache:
+            return self._eaf_cache[cache_key]
+
         purity = self.purities.get(sample, DEFAULT_PURITY)
         tumor_ploidy = max(1, self.tumor_ploidies.get(sample, default_ploidy_manager).get_ploidy(variant=variant))
         corrected_tumor_ploidy = purity * tumor_ploidy + ((1 - purity) * self.normal_ploidy)
         expected_vaf = purity / corrected_tumor_ploidy
+
+        self._eaf_cache[cache_key] = expected_vaf
         return expected_vaf
 
     def _calculate_p(self, m: int, n: int) -> float:
@@ -74,10 +91,13 @@ def _calculate_d(self, k: int, n: int) -> float:
         return (self.fpr - p) / (p_1 - p)
 
     def _calculate_k(self, dp: int) -> int:
+        # cache: same dp value appears across thousands of variants
+        if dp in self._k_cache:
+            return self._k_cache[dp]
         k = 1
         while self._calculate_p(m=k, n=dp) > self.fpr:
             k += 1
-
+        self._k_cache[dp] = k
         return k
 
     def calculate_absolute_power(self, sample, variant, dp: int) -> float:
@@ -89,7 +109,6 @@ def calculate_absolute_power(self, sample, variant, dp: int) -> float:
         k = self._calculate_k(dp=dp)
         n = dp
         f = self.calculate_expected_vaf(sample, variant)
-        power = 1 - binom.cdf(k=k - 1, n=n, p=f) + self._calculate_d(k=k, n=n) * binom(n, f).pmf(k)
-        return round(power, 5), k
-
-
+        # avoid instantiating a frozen binom distribution object — use module-level functions directly
+        power = 1 - binom.cdf(k=k - 1, n=n, p=f) + self._calculate_d(k=k, n=n) * binom.pmf(k=k, n=n, p=f)
+        return round(power, 5), k
\ No newline at end of file

From 449314030411fba57678bc66eeb2a102956cc8b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Tue, 17 Mar 2026 16:24:53 +0100
Subject: [PATCH 06/32] feat: chromosome-level parallelization via
 ProcessPoolExecutor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- annotator.py: add --num-processes parameter (default: 1, serial behaviour
  unchanged). When >1, _run_parallel() submits one future per chromosome to
  a ProcessPoolExecutor. Workers receive only picklable data: BAM file paths
  and variant tuples (POS, REF, ALT[0]) — cyvcf2.Variant objects and pysam
  AlignmentFile handles are not picklable and stay in the main process.

- annotator.py: add module-level _collect_metrics_worker() — must be
  module-level to be picklable by ProcessPoolExecutor. Workers open their
  own pysam.AlignmentFile instances and reconstruct VariantRecord objects
  from the serialized tuples before calling collect_metrics_for_chrom().

- annotator.py: refactor run() into _run_serial(), _run_parallel(),
  _collect_chrom_metrics(), and _annotate_and_batch() for clarity. VCF
  output order is preserved by collecting futures in submission order before
  annotating.

- pileups.py: replace test utility import (vafator.tests.utils.VafatorVariant)
  with a proper VariantRecord dataclass defined in pileups.py. Picklable by
  default, mirrors the .CHROM/.POS/.REF/.ALT interface of cyvcf2.Variant.

- pileups.py: add safe_median() helper to guard np.median() calls against
  empty lists, eliminating RuntimeWarning: Mean of empty slice on positions
  with no reads supporting a particular allele.

- command_line.py: wire --num-processes argument through to Annotator. remove unnecessary try/catch blocks as they would disable the traceback
---
 vafator/annotator.py    | 149 +++++++++++++++++++++++++++-------------
 vafator/command_line.py |  55 +++++++--------
 vafator/pileups.py      |  36 +++++++---
 3 files changed, 152 insertions(+), 88 deletions(-)

diff --git a/vafator/annotator.py b/vafator/annotator.py
index 62f311c..14c1d27 100755
--- a/vafator/annotator.py
+++ b/vafator/annotator.py
@@ -1,31 +1,50 @@
 from collections import Counter
+import os
+from concurrent.futures import ProcessPoolExecutor
 
 import numpy as np
 import pysam
 from cyvcf2 import VCF, Writer, Variant
-import os
 import vafator
 import datetime
 import json
-import asyncio
-import time
 
 from vafator.ploidies import DEFAULT_PLOIDY
 from vafator.rank_sum_test import calculate_rank_sum_test, get_rank_sum_tests
 from vafator.power import PowerCalculator, DEFAULT_ERROR_RATE, DEFAULT_FPR
 from vafator.pileups import (
-    collect_metrics_for_chrom, stream_variants_by_chrom, EMPTY_METRICS
+    collect_metrics_for_chrom, stream_variants_by_chrom, EMPTY_METRICS, VariantRecord
 )
 
 
 BATCH_SIZE = 10000
 
 
-def background(f):
-    def wrapped(*args, **kwargs):
-        return asyncio.get_event_loop().run_in_executor(None, f, *args, **kwargs)
-
-    return wrapped
+def _collect_metrics_worker(chrom, variant_tuples, bam_paths, min_base_quality,
+                            min_mapping_quality, include_ambiguous_bases):
+    """
+    Top-level worker function for ProcessPoolExecutor — must be module-level to be picklable.
+    Opens its own BAM readers (AlignmentFile objects cannot be shared across processes).
+    Receives variant data as plain tuples (cyvcf2.Variant objects are not picklable).
+
+    Returns {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}}
+    """
+    all_metrics = {}
+    variants = [VariantRecord(CHROM=chrom, POS=pos, REF=ref, ALT=[alt])
+                for pos, ref, alt in variant_tuples]
+    for sample, bam_files in bam_paths.items():
+        for i, bam_path in enumerate(bam_files):
+            bam = pysam.AlignmentFile(bam_path, "rb")
+            all_metrics[(sample, i)] = collect_metrics_for_chrom(
+                chrom=chrom,
+                variants=variants,
+                bam=bam,
+                min_base_quality=min_base_quality,
+                min_mapping_quality=min_mapping_quality,
+                include_ambiguous_bases=include_ambiguous_bases,
+            )
+            bam.close()
+    return all_metrics
 
 
 class Annotator(object):
@@ -46,7 +65,8 @@ def __init__(self, input_vcf: str, output_vcf: str,
                  normal_ploidy=2,
                  fpr=DEFAULT_FPR,
                  error_rate=DEFAULT_ERROR_RATE,
-                 include_ambiguous_bases=False):
+                 include_ambiguous_bases=False,
+                 num_processes: int = 1):
 
         self.mapping_quality_threshold = mapping_qual_thr
         self.base_call_quality_threshold = base_call_qual_thr
@@ -54,6 +74,7 @@ def __init__(self, input_vcf: str, output_vcf: str,
         self.tumor_ploidies = tumor_ploidies
         self.normal_ploidy = normal_ploidy
         self.include_ambiguous_bases = include_ambiguous_bases
+        self.num_processes = num_processes
         self.power = PowerCalculator(
             normal_ploidy=normal_ploidy, tumor_ploidies=tumor_ploidies, purities=purities,
             error_rate=error_rate, fpr=fpr)
@@ -77,7 +98,8 @@ def __init__(self, input_vcf: str, output_vcf: str,
         for a in Annotator._get_headers(input_bams):
             self.vcf.add_info_to_header(a)
         self.vcf_writer = Writer(output_vcf, self.vcf)
-        self.bam_readers = {s : [pysam.AlignmentFile(b, "rb") for b in bams] for s, bams in input_bams.items()}
+        self.bam_paths = input_bams  # {sample: [path, ...]} — picklable, passed to workers
+        self.bam_readers = {s: [pysam.AlignmentFile(b, "rb") for b in bams] for s, bams in input_bams.items()}
 
     @staticmethod
     def _get_headers(input_bams: dict):
@@ -285,7 +307,6 @@ def _get_headers(input_bams: dict):
                     ]
         return headers
 
-    @background
     def _write_batch(self, batch):
         for v in batch:
             self.vcf_writer.write_record(v)
@@ -401,47 +422,79 @@ def _calculate_af(self, ac, dp):
 
     def run(self):
         batch = []
-
-        for chrom, chrom_variants in stream_variants_by_chrom(self.vcf):
-            # For each BAM, open ONE pileup iterator per chromosome and compute
-            # metrics immediately for each column — avoids per-variant pileup overhead.
-            # Key: (sample, bam_index, variant_pos, REF, ALT[0]) -> CoverageMetrics
-            all_metrics = {}  # {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}}
-
-            for sample, bams in self.bam_readers.items():
-                for i, bam in enumerate(bams):
-                    all_metrics[(sample, i)] = collect_metrics_for_chrom(
-                        chrom=chrom,
-                        variants=chrom_variants,
-                        bam=bam,
-                        min_base_quality=self.base_call_quality_threshold,
-                        min_mapping_quality=self.mapping_quality_threshold,
-                        include_ambiguous_bases=self.include_ambiguous_bases,
-                    )
-
-            for variant in chrom_variants:
-                # build per-BAM metrics lookup for this specific variant
-                metrics_by_bam = {
-                    (sample, i): all_metrics[(sample, i)].get(
-                        (variant.POS, variant.REF, variant.ALT[0]), EMPTY_METRICS
-                    )
-                    for sample, bams in self.bam_readers.items()
-                    for i in range(len(bams))
-                }
-                self._add_stats(variant, metrics_by_bam)
-
-                batch.append(variant)
-                if len(batch) >= BATCH_SIZE:
-                    self._write_batch(batch)
-                    batch = []
+        if self.num_processes > 1:
+            self._run_parallel(batch)
+        else:
+            self._run_serial(batch)
 
         if batch:
             self._write_batch(batch)
 
-        time.sleep(2)
-
         self.vcf_writer.close()
         self.vcf.close()
         for _, bams in self.bam_readers.items():
             for bam in bams:
-                bam.close()
\ No newline at end of file
+                bam.close()
+
+    def _run_serial(self, batch):
+        for chrom, chrom_variants in stream_variants_by_chrom(self.vcf):
+            all_metrics = self._collect_chrom_metrics(chrom, chrom_variants)
+            self._annotate_and_batch(chrom_variants, all_metrics, batch)
+
+    def _run_parallel(self, batch):
+        # variant objects are not picklable — pass only (POS, REF, ALT) tuples to workers,
+        # keep the actual Variant objects in the main process for annotation and writing
+        chrom_variants_map = {}
+        futures = {}
+
+        with ProcessPoolExecutor(max_workers=self.num_processes) as executor:
+            for chrom, chrom_variants in stream_variants_by_chrom(self.vcf):
+                chrom_variants_map[chrom] = chrom_variants
+                variant_tuples = [(v.POS, v.REF, v.ALT[0]) for v in chrom_variants]
+                futures[executor.submit(
+                    _collect_metrics_worker,
+                    chrom=chrom,
+                    variant_tuples=variant_tuples,
+                    bam_paths=self.bam_paths,
+                    min_base_quality=self.base_call_quality_threshold,
+                    min_mapping_quality=self.mapping_quality_threshold,
+                    include_ambiguous_bases=self.include_ambiguous_bases,
+                )] = chrom
+
+            # collect in submission order to preserve VCF chromosome order
+            chrom_results = {futures[f]: f.result() for f in futures}
+
+        for chrom, chrom_variants in chrom_variants_map.items():
+            self._annotate_and_batch(chrom_variants, chrom_results[chrom], batch)
+
+
+    def _collect_chrom_metrics(self, chrom, chrom_variants):
+        """Collect metrics for all BAMs for one chromosome in the main process."""
+        all_metrics = {}
+        for sample, bams in self.bam_readers.items():
+            for i, bam in enumerate(bams):
+                all_metrics[(sample, i)] = collect_metrics_for_chrom(
+                    chrom=chrom,
+                    variants=chrom_variants,
+                    bam=bam,
+                    min_base_quality=self.base_call_quality_threshold,
+                    min_mapping_quality=self.mapping_quality_threshold,
+                    include_ambiguous_bases=self.include_ambiguous_bases,
+                )
+        return all_metrics
+
+    def _annotate_and_batch(self, chrom_variants, all_metrics, batch):
+        """Annotate variants using pre-computed metrics and append to write batch."""
+        for variant in chrom_variants:
+            metrics_by_bam = {
+                (sample, i): all_metrics[(sample, i)].get(
+                    (variant.POS, variant.REF, variant.ALT[0]), EMPTY_METRICS
+                )
+                for sample, bams in self.bam_readers.items()
+                for i in range(len(bams))
+            }
+            self._add_stats(variant, metrics_by_bam)
+            batch.append(variant)
+            if len(batch) >= BATCH_SIZE:
+                self._write_batch(batch)
+                batch.clear()
\ No newline at end of file
diff --git a/vafator/command_line.py b/vafator/command_line.py
index 06cdd85..66e617c 100755
--- a/vafator/command_line.py
+++ b/vafator/command_line.py
@@ -50,6 +50,8 @@ def annotator():
                         help="Error rate to use in the power calculation")
     parser.add_argument("--include-ambiguous-bases", dest="include_ambiguous_bases", action='store_true',
                         help="Flag indicating to include ambiguous bases from the DP calculation")
+    parser.add_argument("--num-processes", dest="num_processes", required=False, default=1, type=int,
+                        help="Number of processes for parallel chromosome-level annotation (default: 1)")
 
     args = parser.parse_args()
 
@@ -87,24 +89,22 @@ def annotator():
     if len(bams) == 0:
         raise ValueError("Please, provide at least one bam file with '--bam sample_name /path/to/file.bam'")
 
-    try:
-        annotator = Annotator(
-            input_vcf=args.input_vcf,
-            output_vcf=args.output_vcf,
-            input_bams=bams,
-            mapping_qual_thr=args.mapping_quality,
-            base_call_qual_thr=args.base_call_quality,
-            purities=purities,
-            tumor_ploidies=tumor_ploidies,
-            normal_ploidy=int(args.normal_ploidy),
-            fpr=args.fpr,
-            error_rate=args.error_rate,
-            include_ambiguous_bases=args.include_ambiguous_bases
-        )
-        annotator.run()
-    except Exception as e:
-        logging.error(str(e))
-        sys.exit(-1)
+    annotator = Annotator(
+        input_vcf=args.input_vcf,
+        output_vcf=args.output_vcf,
+        input_bams=bams,
+        mapping_qual_thr=args.mapping_quality,
+        base_call_qual_thr=args.base_call_quality,
+        purities=purities,
+        tumor_ploidies=tumor_ploidies,
+        normal_ploidy=int(args.normal_ploidy),
+        fpr=args.fpr,
+        error_rate=args.error_rate,
+        include_ambiguous_bases=args.include_ambiguous_bases,
+        num_processes=args.num_processes,
+    )
+    annotator.run()
+    
     logging.info("Vafator finished!")
 
 
@@ -120,16 +120,13 @@ def multiallelics_filter():
     args = parser.parse_args()
 
     logging.info("Vafator multiallelic filter starting...")
-    try:
-        filter = MultiallelicFilter(
-            input_vcf=args.input_vcf,
-            output_vcf=args.output_vcf,
-            tumor_sample_name=args.tumor_sample_name
-        )
-        filter.run()
-    except Exception as e:
-        logging.error(str(e))
-        sys.exit(-1)
+    filter = MultiallelicFilter(
+        input_vcf=args.input_vcf,
+        output_vcf=args.output_vcf,
+        tumor_sample_name=args.tumor_sample_name
+    )
+    filter.run()
+    
     logging.info("Vafator multiallelic filter finished!")
 
 
@@ -167,4 +164,4 @@ def hatchet2bed():
                         help="output BED file prefix, one file will be created per sample in the input with the "
                              "average tumor copy number in each segment")
     args = parser.parse_args()
-    run_hatchet2bed(input_file=args.input_file, output_prefix=args.output_prefix)
+    run_hatchet2bed(input_file=args.input_file, output_prefix=args.output_prefix)
\ No newline at end of file
diff --git a/vafator/pileups.py b/vafator/pileups.py
index c9efd9e..a3ad36c 100755
--- a/vafator/pileups.py
+++ b/vafator/pileups.py
@@ -5,10 +5,19 @@
 from pysam.libcalignmentfile import IteratorColumnRegion, AlignmentFile
 
 from vafator import AMBIGUOUS_BASES
-from vafator.tests.utils import VafatorVariant
 import numpy as np
 
 
+@dataclass
+class VariantRecord:
+    """Lightweight, picklable variant representation used by pileup workers.
+    Mirrors the cyvcf2.Variant fields accessed by pileup and metrics functions."""
+    CHROM: str
+    POS: int
+    REF: str
+    ALT: List[str]
+
+
 def is_snp(variant: Variant):
     return len(variant.REF) == 1 and len(variant.ALT[0]) == 1
 
@@ -22,7 +31,7 @@ def is_deletion(variant: Variant):
 
 
 def get_variant_pileup(
-        variant: Union[Variant, VafatorVariant], bam: AlignmentFile,
+        variant: Union[Variant, VariantRecord], bam: AlignmentFile,
         min_base_quality, min_mapping_quality) -> IteratorColumnRegion:
     """Single-variant pileup, kept for backwards compatibility and tests."""
     position = variant.POS
@@ -178,9 +187,9 @@ def _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases=False) -> C
     all_mqs = aggregate_list_per_base(bases, mapping_qualities)
     all_positions = aggregate_list_per_base(bases, query_positions)
 
-    bqs = Counter({b: np.median(l) for b, l in all_bqs.items()})
-    mqs = Counter({b: np.median(l) for b, l in all_mqs.items()})
-    positions = Counter({b: np.median(l) for b, l in all_positions.items()})
+    bqs = Counter({b: safe_median(l) for b, l in all_bqs.items()})
+    mqs = Counter({b: safe_median(l) for b, l in all_mqs.items()})
+    positions = Counter({b: safe_median(l) for b, l in all_positions.items()})
 
     ac = Counter(b for b in bases if b != "")
 
@@ -234,8 +243,8 @@ def _get_insertion_metrics_from_column(variant: Variant, pileup_col) -> Coverage
 
     return CoverageMetrics(
         ac=Counter(ac), dp=dp,
-        mqs=Counter({k: np.median(l) for k, l in mq.items()}),
-        positions=Counter({k: np.median(l) for k, l in pos.items()}),
+        mqs=Counter({k: safe_median(l) for k, l in mq.items()}),
+        positions=Counter({k: safe_median(l) for k, l in pos.items()}),
         bqs=Counter(),
         all_mqs={k: l for k, l in mq.items()},
         all_positions={k: l for k, l in pos.items()},
@@ -278,18 +287,23 @@ def _get_deletion_metrics_from_column(variant: Variant, pileup_col) -> CoverageM
 
     return CoverageMetrics(
         ac=Counter(ac), dp=dp,
-        mqs=Counter({k: np.median(l) for k, l in mq.items()}),
-        positions=Counter({k: np.median(l) for k, l in pos.items()}),
+        mqs=Counter({k: safe_median(l) for k, l in mq.items()}),
+        positions=Counter({k: safe_median(l) for k, l in pos.items()}),
         bqs=Counter(),
         all_mqs={k: l for k, l in mq.items()},
         all_positions={k: l for k, l in pos.items()},
         all_bqs=Counter()
     )
-
+ 
 def aggregate_list_per_base(bases, values) -> dict:
     aggregated_values = {}
     for b, v in zip(bases, values):
         if b not in aggregated_values:
             aggregated_values[b] = []
         aggregated_values[b].append(v)
-    return aggregated_values
\ No newline at end of file
+    return aggregated_values
+ 
+ 
+def safe_median(values) -> float:
+    """Return median of values, or 0.0 for empty lists (avoids numpy RuntimeWarning)."""
+    return float(np.median(values)) if values else 0.0
\ No newline at end of file

From 9d3259e5ae8328d9babf3cfe60eff5f700b50150 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Tue, 17 Mar 2026 16:25:24 +0100
Subject: [PATCH 07/32] return nan in safe_median to repeat previous results

---
 vafator/pileups.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/vafator/pileups.py b/vafator/pileups.py
index a3ad36c..b6afb9f 100755
--- a/vafator/pileups.py
+++ b/vafator/pileups.py
@@ -1,12 +1,11 @@
+import numpy as np
 from collections import Counter, defaultdict
-from dataclasses import dataclass
-from typing import Union, List, Dict, Iterator, Tuple
 from cyvcf2 import Variant
+from dataclasses import dataclass
+from math import nan
 from pysam.libcalignmentfile import IteratorColumnRegion, AlignmentFile
-
+from typing import Union, List, Dict, Iterator, Tuple
 from vafator import AMBIGUOUS_BASES
-import numpy as np
-
 
 @dataclass
 class VariantRecord:
@@ -306,4 +305,4 @@ def aggregate_list_per_base(bases, values) -> dict:
  
 def safe_median(values) -> float:
     """Return median of values, or 0.0 for empty lists (avoids numpy RuntimeWarning)."""
-    return float(np.median(values)) if values else 0.0
\ No newline at end of file
+    return float(np.median(values)) if values else nan

From a4f228835490c9275217aefab0423417cb53d759 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Wed, 18 Mar 2026 13:08:53 +0100
Subject: [PATCH 08/32] breaking: change cli argument include-ambiguous-bases
 to exclude-ambiguous-bases and make inclusion of ambiguous the default
 behaviour for accurate depth calculation.

This replicates the results of version 2.2.0, but makes it more explicit
---
 vafator/command_line.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vafator/command_line.py b/vafator/command_line.py
index 66e617c..f2d639d 100755
--- a/vafator/command_line.py
+++ b/vafator/command_line.py
@@ -48,7 +48,7 @@ def annotator():
                         help="False Positive Rate (FPR) to use in the power calculation")
     parser.add_argument("--error-rate", dest="error_rate", required=False, default=DEFAULT_ERROR_RATE, type=float,
                         help="Error rate to use in the power calculation")
-    parser.add_argument("--include-ambiguous-bases", dest="include_ambiguous_bases", action='store_true',
+    parser.add_argument("--exclude-ambiguous-bases", dest="exclude_ambiguous_bases", action='store_true',
                         help="Flag indicating to include ambiguous bases from the DP calculation")
     parser.add_argument("--num-processes", dest="num_processes", required=False, default=1, type=int,
                         help="Number of processes for parallel chromosome-level annotation (default: 1)")
@@ -100,7 +100,7 @@ def annotator():
         normal_ploidy=int(args.normal_ploidy),
         fpr=args.fpr,
         error_rate=args.error_rate,
-        include_ambiguous_bases=args.include_ambiguous_bases,
+        include_ambiguous_bases=(not args.exclude_ambiguous_bases),
         num_processes=args.num_processes,
     )
     annotator.run()

From 8f62e75fcc44a7f8de5de9d3244f2e81b3a559ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Wed, 18 Mar 2026 14:30:46 +0100
Subject: [PATCH 09/32] Setup

---
 requirements.txt | 11 +++++++++++
 setup.cfg        |  7 ++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3484f8c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,11 @@
+pandas>=3.0.1,<4
+# pysam pinned: above 0.21.0 base qualities show up wrong in the presence of soft clipping/insertions/overlapping read pairs or a combination of these factors 
+pysam==0.21.0
+cyvcf2>=0.32.1,<0.33
+logzero>=1.7.0,<2
+pybedtools>=0.12.0,<0.13
+numpy>=2.4.3,<3
+scipy>=1.17.1,<2
+setuptools
+pytest
+pytest-cov
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index b74831c..9c95fe7 100755
--- a/setup.cfg
+++ b/setup.cfg
@@ -43,6 +43,7 @@ install_requires =
     pybedtools>=0.12.0,<0.13
     numpy>=2.4.3,<3
     scipy>=1.17.1,<2
+    setuptools
 
 [options.packages.find]
 exclude =
@@ -55,4 +56,8 @@ exclude =
 dev =
     pytest
     ruff
-    mypy
\ No newline at end of file
+    mypy
+test =
+    pytest
+    pytest-cov
+    setuptools
\ No newline at end of file

From 8f8f1c3c27b2f5bc1c7e70e935d097fecafca454 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Wed, 18 Mar 2026 14:40:24 +0100
Subject: [PATCH 10/32] Update unit tests

---
 vafator/ploidies.py                    |   4 +-
 vafator/tests/test_pileups.py          | 210 ++++++++++++++++++++++++-
 vafator/tests/test_ploidy_manager.py   |  13 +-
 vafator/tests/test_power_calculator.py |  43 ++++-
 vafator/tests/test_rank_sum_test.py    |  38 ++++-
 vafator/tests/utils.py                 |  83 +++++-----
 6 files changed, 331 insertions(+), 60 deletions(-)

diff --git a/vafator/ploidies.py b/vafator/ploidies.py
index b9f9099..00748db 100755
--- a/vafator/ploidies.py
+++ b/vafator/ploidies.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from cyvcf2 import Variant
 
-from vafator.tests.utils import VafatorVariant
+from vafator.pileups import VariantRecord
 
 DEFAULT_PLOIDY = 2.0
 
@@ -20,7 +20,7 @@ def __init__(self, local_copy_numbers: str = None, genome_wide_ploidy: float = D
             if local_copy_numbers is not None else None
         self.ploidy = genome_wide_ploidy
 
-    def get_ploidy(self, variant: Union[Variant, VafatorVariant]) -> float:
+    def get_ploidy(self, variant: Union[Variant, VariantRecord]) -> float:
 
         result = self.ploidy
         if self.bed is not None:
diff --git a/vafator/tests/test_pileups.py b/vafator/tests/test_pileups.py
index d7b3c76..21546ee 100644
--- a/vafator/tests/test_pileups.py
+++ b/vafator/tests/test_pileups.py
@@ -1,10 +1,216 @@
 from collections import Counter
 from unittest import TestCase
+from unittest.mock import MagicMock
 import pkg_resources
 import pysam
 
 from vafator.tests.utils import VafatorVariant
-from vafator.pileups import get_variant_pileup, get_metrics
+from vafator.pileups import (
+    get_variant_pileup, get_metrics,
+    _get_insertion_metrics_from_column, _get_deletion_metrics_from_column,
+)
+from vafator.pileups import VariantRecord
+
+
+def _make_pileup_col(reads):
+    col = MagicMock()
+    col.pileups = reads
+    return col
+
+
+def _make_insertion_read(indel, reference_start, cigartuples, query, mapping_quality=60,
+                         query_position_or_next=0):
+    """Build a mock pileup read with an insertion (indel > 0)."""
+    read = MagicMock()
+    read.indel = indel
+    read.alignment.reference_start = reference_start
+    read.alignment.cigartuples = cigartuples
+    read.alignment.query = query
+    read.alignment.mapping_quality = mapping_quality
+    read.query_position_or_next = query_position_or_next
+    return read
+
+
+def _make_ref_read(mapping_quality=60, query_position_or_next=0):
+    """Build a mock pileup read with no indel (indel == 0), counted as reference."""
+    read = MagicMock()
+    read.indel = 0
+    read.alignment.mapping_quality = mapping_quality
+    read.query_position_or_next = query_position_or_next
+    return read
+
+
+def _make_deletion_read(indel, reference_start, cigartuples, mapping_quality=60,
+                        query_position_or_next=0):
+    """Build a mock pileup read with a deletion (indel < 0)."""
+    read = MagicMock()
+    read.indel = indel
+    read.alignment.reference_start = reference_start
+    read.alignment.cigartuples = cigartuples
+    read.alignment.mapping_quality = mapping_quality
+    read.query_position_or_next = query_position_or_next
+    return read
+
+
+class TestInsertionMetricsFromColumn(TestCase):
+    # CIGAR op codes: 0=M, 1=I, 2=D, 3=N, 4=S, 7==, 8=X
+
+    def test_no_reads(self):
+        variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT'])
+        col = _make_pileup_col([])
+        metrics = _get_insertion_metrics_from_column(variant, col)
+        self.assertEqual(metrics.dp, 0)
+        self.assertEqual(metrics.ac['ATT'], 0)
+
+    def test_one_matching_insertion(self):
+        # variant: A -> ATT at pos 100 (insertion of "TT", length 2)
+        # read: starts at 98, M2 then I2 of "TT" landing at pos 100
+        # CIGAR: [(0,2), (1,2)] — M2 advances ref to 100, then I2
+        variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT'])
+        read = _make_insertion_read(
+            indel=2,
+            reference_start=98,
+            cigartuples=[(0, 2), (1, 2)],
+            query='AATT',  # query up to insertion point
+            mapping_quality=60,
+            query_position_or_next=2
+        )
+        col = _make_pileup_col([read])
+        metrics = _get_insertion_metrics_from_column(variant, col)
+        self.assertEqual(metrics.dp, 1)
+        self.assertEqual(metrics.ac['ATT'], 1)
+        self.assertEqual(metrics.mqs['ATT'], [60])
+
+    def test_insertion_wrong_sequence_not_counted(self):
+        # same position and length but different bases: "TC" instead of "TT"
+        variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT'])
+        read = _make_insertion_read(
+            indel=2,
+            reference_start=98,
+            cigartuples=[(0, 2), (1, 2)],
+            query='AATC',
+            mapping_quality=60,
+            query_position_or_next=2
+        )
+        col = _make_pileup_col([read])
+        metrics = _get_insertion_metrics_from_column(variant, col)
+        self.assertEqual(metrics.dp, 1)
+        self.assertEqual(metrics.ac['ATT'], 0)
+
+    def test_insertion_wrong_length_not_counted(self):
+        # insertion of length 3 but variant expects length 2
+        variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT'])
+        read = _make_insertion_read(
+            indel=3,
+            reference_start=98,
+            cigartuples=[(0, 2), (1, 3)],
+            query='AATTT',
+            mapping_quality=60,
+            query_position_or_next=2
+        )
+        col = _make_pileup_col([read])
+        metrics = _get_insertion_metrics_from_column(variant, col)
+        self.assertEqual(metrics.ac['ATT'], 0)
+
+    def test_ref_reads_counted_in_mq(self):
+        variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT'])
+        ref_read = _make_ref_read(mapping_quality=55, query_position_or_next=30)
+        col = _make_pileup_col([ref_read])
+        metrics = _get_insertion_metrics_from_column(variant, col)
+        self.assertEqual(metrics.dp, 1)
+        self.assertEqual(metrics.ac['ATT'], 0)
+        self.assertEqual(metrics.mqs['A'], [55])
+
+    def test_mixed_insertion_and_ref_reads(self):
+        variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT'])
+        ins_read = _make_insertion_read(
+            indel=2, reference_start=98,
+            cigartuples=[(0, 2), (1, 2)],
+            query='AATT', mapping_quality=60, query_position_or_next=2
+        )
+        ref_read = _make_ref_read(mapping_quality=55)
+        col = _make_pileup_col([ins_read, ref_read])
+        metrics = _get_insertion_metrics_from_column(variant, col)
+        self.assertEqual(metrics.dp, 2)
+        self.assertEqual(metrics.ac['ATT'], 1)
+
+
+class TestDeletionMetricsFromColumn(TestCase):
+    # CIGAR op codes: 0=M, 2=D, 3=N, 7==, 8=X
+
+    def test_no_reads(self):
+        variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A'])
+        col = _make_pileup_col([])
+        metrics = _get_deletion_metrics_from_column(variant, col)
+        self.assertEqual(metrics.dp, 0)
+        self.assertEqual(metrics.ac['A'], 0)
+
+    def test_one_matching_deletion(self):
+        # variant: ATT -> A at pos 100 (deletion of length 2)
+        # read starts at 98, M2 advances to 100, D2 matches the deletion
+        variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A'])
+        read = _make_deletion_read(
+            indel=-2,
+            reference_start=98,
+            cigartuples=[(0, 2), (2, 2)],
+            mapping_quality=60,
+            query_position_or_next=2
+        )
+        col = _make_pileup_col([read])
+        metrics = _get_deletion_metrics_from_column(variant, col)
+        self.assertEqual(metrics.dp, 1)
+        self.assertEqual(metrics.ac['A'], 1)
+        self.assertEqual(metrics.mqs['A'], [60])
+
+    def test_deletion_wrong_length_not_counted(self):
+        # deletion of length 3 but variant expects length 2
+        variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A'])
+        read = _make_deletion_read(
+            indel=-3,
+            reference_start=98,
+            cigartuples=[(0, 2), (2, 3)],
+            mapping_quality=60,
+            query_position_or_next=2
+        )
+        col = _make_pileup_col([read])
+        metrics = _get_deletion_metrics_from_column(variant, col)
+        self.assertEqual(metrics.ac['A'], 0)
+
+    def test_deletion_wrong_position_not_counted(self):
+        # deletion starts at 99 not 100
+        variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A'])
+        read = _make_deletion_read(
+            indel=-2,
+            reference_start=98,
+            cigartuples=[(0, 1), (2, 2)],  # M1 advances to 99, not 100
+            mapping_quality=60,
+            query_position_or_next=1
+        )
+        col = _make_pileup_col([read])
+        metrics = _get_deletion_metrics_from_column(variant, col)
+        self.assertEqual(metrics.ac['A'], 0)
+
+    def test_ref_reads_counted_in_mq(self):
+        variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A'])
+        ref_read = _make_ref_read(mapping_quality=45, query_position_or_next=10)
+        col = _make_pileup_col([ref_read])
+        metrics = _get_deletion_metrics_from_column(variant, col)
+        self.assertEqual(metrics.dp, 1)
+        self.assertEqual(metrics.ac['A'], 0)
+        self.assertEqual(metrics.mqs['ATT'], [45])
+
+    def test_mixed_deletion_and_ref_reads(self):
+        variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A'])
+        del_read = _make_deletion_read(
+            indel=-2, reference_start=98,
+            cigartuples=[(0, 2), (2, 2)],
+            mapping_quality=60, query_position_or_next=2
+        )
+        ref_read = _make_ref_read(mapping_quality=55)
+        col = _make_pileup_col([del_read, ref_read])
+        metrics = _get_deletion_metrics_from_column(variant, col)
+        self.assertEqual(metrics.dp, 2)
+        self.assertEqual(metrics.ac['A'], 1)
 
 
 class TestPileups(TestCase):
@@ -67,4 +273,4 @@ def _assert_metrics(self, variant: VafatorVariant, expected_ac, expected_dp,
             min_base_quality=min_base_quality, min_mapping_quality=min_mapping_quality)
         coverage_metrics = get_metrics(variant=variant, pileups=pileups)
         self.assertEqual(expected_ac, coverage_metrics.ac)
-        self.assertEqual(expected_dp, coverage_metrics.dp)
+        self.assertEqual(expected_dp, coverage_metrics.dp)
\ No newline at end of file
diff --git a/vafator/tests/test_ploidy_manager.py b/vafator/tests/test_ploidy_manager.py
index 5e68640..f90746f 100644
--- a/vafator/tests/test_ploidy_manager.py
+++ b/vafator/tests/test_ploidy_manager.py
@@ -1,5 +1,4 @@
 from unittest import TestCase
-
 import pkg_resources
 
 from vafator.tests.utils import VafatorVariant
@@ -29,13 +28,21 @@ def test_local_copy_numbers_ploidy_manager(self):
         # test non existing interval
         self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy(
             variant=VafatorVariant(chromosome="chr3", position=12345, reference="A", alternative="C")), 2.0)
-        # test lower boundary of interval
+
+    def test_interval_boundaries(self):
+        input_bed = pkg_resources.resource_filename(__name__, "resources/test_copy_numbers.bed")
+        # lower boundary — POS 10000 is 0-based 9999, outside interval start 10000
         self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy(
             variant=VafatorVariant(chromosome="chr1", position=10000, reference="A", alternative="C")), 2.0)
+        # just inside lower boundary
         self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy(
             variant=VafatorVariant(chromosome="chr1", position=10001, reference="A", alternative="C")), 1.2)
-        # test upper boundary of interval
+        # upper boundary
         self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy(
             variant=VafatorVariant(chromosome="chr1", position=20000, reference="A", alternative="C")), 1.2)
         self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy(
             variant=VafatorVariant(chromosome="chr1", position=20001, reference="A", alternative="C")), 2.1)
+
+    def test_invalid_bed_raises(self):
+        with self.assertRaises(ValueError):
+            PloidyManager(local_copy_numbers="/nonexistent/path.bed")
diff --git a/vafator/tests/test_power_calculator.py b/vafator/tests/test_power_calculator.py
index a3c4981..2e120e7 100644
--- a/vafator/tests/test_power_calculator.py
+++ b/vafator/tests/test_power_calculator.py
@@ -24,11 +24,40 @@ def test_power_calculator(self):
         self.assertAlmostEqual(power.calculate_power(dp=10, ac=10, sample='tumor', variant=None), 1.0)
         self.assertAlmostEqual(power.calculate_power(dp=10, ac=11, sample='tumor', variant=None), 1.0)
 
+    def test_zero_dp_returns_zero_power(self):
+        power = PowerCalculator(
+            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8})
+        self.assertEqual(power.calculate_power(dp=0, ac=0, sample='tumor', variant=None), 1.0)
+
     def test_eaf_copy_number_below_one(self):
         power = PowerCalculator(
             tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=0.5)}, purities={'tumor': 0.9})
         self.assertLessEqual(power.calculate_expected_vaf(sample='tumor', variant=None), 1.0)
 
+    def test_eaf_is_cached(self):
+        power = PowerCalculator(
+            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8})
+        v1 = VafatorVariant(chromosome='chr1', position=100, reference='A', alternative='G')
+        eaf1 = power.calculate_expected_vaf(sample='tumor', variant=v1)
+        eaf2 = power.calculate_expected_vaf(sample='tumor', variant=v1)
+        self.assertEqual(eaf1, eaf2)
+        self.assertEqual(len(power._eaf_cache), 1)
+
+    def test_k_is_cached(self):
+        power = PowerCalculator(
+            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8})
+        k1 = power._calculate_k(100)
+        k2 = power._calculate_k(100)
+        self.assertEqual(k1, k2)
+        self.assertEqual(len(power._k_cache), 1)
+
+    def test_higher_coverage_higher_power(self):
+        power = PowerCalculator(
+            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8})
+        p_low, _ = power.calculate_absolute_power(dp=10, sample='tumor', variant=None)
+        p_high, _ = power.calculate_absolute_power(dp=100, sample='tumor', variant=None)
+        self.assertLess(p_low, p_high)
+
     def test_varying_purity(self):
         power1 = PowerCalculator(
             tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.5)}, purities={'tumor': 0.8})
@@ -37,7 +66,6 @@ def test_varying_purity(self):
         self.assertLess(
             power1.calculate_power(dp=10, ac=2, sample='tumor', variant=None),
             power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None))
-
         power3 = PowerCalculator(
             tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.5)}, purities={'tumor': 0.4})
         self.assertLess(
@@ -53,12 +81,6 @@ def test_varying_ploidy(self):
             power1.calculate_power(dp=10, ac=2, sample='tumor', variant=None),
             power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None))
 
-        power3 = PowerCalculator(
-            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=6.0)}, purities={'tumor': 0.8})
-        self.assertLess(
-            power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None),
-            power3.calculate_power(dp=10, ac=2, sample='tumor', variant=None))
-
     def test_local_copy_numbers(self):
         input_bed = pkg_resources.resource_filename(__name__, "resources/test_copy_numbers.bed")
         power = PowerCalculator(
@@ -81,7 +103,6 @@ def test_local_copy_numbers(self):
 
     def test_absolute_power_calculator(self):
         ploidy_manager = {'tumor': PloidyManager(genome_wide_ploidy=2)}
-
         calculator = PowerCalculator(tumor_ploidies=ploidy_manager, purities={'tumor': 0.8})
         p, k = calculator.calculate_absolute_power(dp=100, sample='tumor', variant=None)
         self.assertEqual(p, 1.0)
@@ -125,3 +146,9 @@ def test_absolute_power_calculator(self):
         p, k = calculator.calculate_absolute_power(dp=2, sample='tumor', variant=None)
         self.assertEqual(p, 0.0025)
         self.assertEqual(k, 2)
+
+    def test_default_purity_is_one(self):
+        power = PowerCalculator(tumor_ploidies={}, purities={})
+        eaf = power.calculate_expected_vaf(sample='tumor', variant=None)
+        # purity=1, normal_ploidy=2, tumor_ploidy=2 => eaf = 1/(1*2 + 0*2) = 0.5
+        self.assertAlmostEqual(eaf, 0.5)
diff --git a/vafator/tests/test_rank_sum_test.py b/vafator/tests/test_rank_sum_test.py
index 66f4f65..788ea09 100644
--- a/vafator/tests/test_rank_sum_test.py
+++ b/vafator/tests/test_rank_sum_test.py
@@ -1,5 +1,7 @@
+from math import isnan
 from unittest import TestCase
-from vafator.rank_sum_test import calculate_rank_sum_test
+from vafator.rank_sum_test import calculate_rank_sum_test, get_rank_sum_tests
+from vafator.pileups import VariantRecord
 
 
 class TestRankSumTest(TestCase):
@@ -19,7 +21,6 @@ def test_direction(self):
         stat2, pvalue2 = calculate_rank_sum_test(distribution2, distribution1)
         self.assertGreater(stat2, 0.0)
         self.assertLess(pvalue2, 1.0)
-
         self.assertEqual(stat1, -stat2)
 
     def test_gatk_example(self):
@@ -31,3 +32,36 @@ def test_gatk_example(self):
         stat, pvalue = calculate_rank_sum_test(alt_distribution, ref_distribution)
         self.assertEqual(stat, -2.154)
         self.assertEqual(pvalue, 0.03121)
+
+    def test_empty_alternate_returns_nan(self):
+        stat, pvalue = calculate_rank_sum_test([], [1, 2, 3])
+        self.assertTrue(isnan(stat))
+        self.assertTrue(isnan(pvalue))
+
+    def test_empty_reference_returns_nan(self):
+        stat, pvalue = calculate_rank_sum_test([1, 2, 3], [])
+        self.assertTrue(isnan(stat))
+        self.assertTrue(isnan(pvalue))
+
+    def test_both_empty_returns_nan(self):
+        stat, pvalue = calculate_rank_sum_test([], [])
+        self.assertTrue(isnan(stat))
+        self.assertTrue(isnan(pvalue))
+
+    def test_get_rank_sum_tests_snv(self):
+        variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['T'])
+        distributions = {
+            'A': [20, 25, 30, 35, 40],
+            'T': [1, 5, 10, 15, 20],
+        }
+        pvalues, stats = get_rank_sum_tests(distributions, variant)
+        self.assertEqual(len(stats), 1)
+        self.assertEqual(len(pvalues), 1)
+        self.assertGreater(float(stats[0]), 0.0)  # alt < ref so stat should be positive
+
+    def test_get_rank_sum_tests_no_alt_reads(self):
+        variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['T'])
+        distributions = {'A': [20, 25, 30]}  # no T reads
+        pvalues, stats = get_rank_sum_tests(distributions, variant)
+        self.assertEqual(stats, [])
+        self.assertEqual(pvalues, [])
diff --git a/vafator/tests/utils.py b/vafator/tests/utils.py
index 098f844..0564cb6 100755
--- a/vafator/tests/utils.py
+++ b/vafator/tests/utils.py
@@ -1,43 +1,40 @@
-from cyvcf2 import VCF
-
-
-def _get_count_variants(input_file):
-    vcf = VCF(input_file)
-    n_variants = 0
-    for v in vcf:
-        n_variants += 1
-    vcf.close()
-    return n_variants
-
-
-def _get_mutation_at_position(input_file, chromosome, position):
-    variant = None
-    vcf = VCF(input_file)
-    for v in vcf(): # we cannot query by specific positions as this requires a tabix index
-        if v.CHROM == chromosome and v.POS == position:
-            variant = v
-            break
-    vcf.close()
-    return variant
-
-
-def _get_info_fields(input_file):
-    vcf = VCF(input_file)
-    return [h.info().get("ID") for h in vcf.header_iter() if h['HeaderType'] == 'INFO']
-
-
-def _get_annotation_values(input_file, annotation):
-    vcf = VCF(input_file)
-    values = []
-    for v in vcf:
-        values.append(v.INFO.get(annotation))
-    return values
-
-
-class VafatorVariant:
-
-    def __init__(self, chromosome, position, reference, alternative):
-        self.CHROM = chromosome
-        self.POS = position
-        self.REF = reference
-        self.ALT = alternative
+from cyvcf2 import VCF
+from vafator.pileups import VariantRecord
+
+
+def _get_count_variants(input_file):
+    vcf = VCF(input_file)
+    n_variants = 0
+    for v in vcf:
+        n_variants += 1
+    vcf.close()
+    return n_variants
+
+
+def _get_mutation_at_position(input_file, chromosome, position):
+    variant = None
+    vcf = VCF(input_file)
+    for v in vcf:  # we cannot query by specific positions as this requires a tabix index
+        if v.CHROM == chromosome and v.POS == position:
+            variant = v
+            break
+    vcf.close()
+    return variant
+
+
+def _get_info_fields(input_file):
+    vcf = VCF(input_file)
+    return [h.info().get("ID") for h in vcf.header_iter() if h['HeaderType'] == 'INFO']
+
+
+def _get_annotation_values(input_file, annotation):
+    vcf = VCF(input_file)
+    values = []
+    for v in vcf:
+        values.append(v.INFO.get(annotation))
+    return values
+
+
+def VafatorVariant(chromosome, position, reference, alternative):
+    """Backwards-compatible factory for VariantRecord using legacy positional kwarg names."""
+    return VariantRecord(CHROM=chromosome, POS=position, REF=reference, ALT=alternative)

From c2370d27fe54c5e3654d92acd4f0477396bad152 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Wed, 18 Mar 2026 15:28:36 +0100
Subject: [PATCH 11/32] Update python version in tests

---
 .github/workflows/integration_tests.yml | 2 +-
 .github/workflows/unit_tests.yml        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml
index b4e91f7..309a19b 100755
--- a/.github/workflows/integration_tests.yml
+++ b/.github/workflows/integration_tests.yml
@@ -7,7 +7,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ '3.8', '3.9', '3.10' ]
+        python-version: ['3.11']
 
     steps:
     - name: Checkout code
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 46c9feb..885405f 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -8,7 +8,7 @@ jobs:
 
     strategy:
       matrix:
-        python-version: ['3.8', '3.9', '3.10']
+        python-version: ['3.11']
 
     steps:
       - name: Checkout code

From 9e3d09913840b96497625569918b653dc6ddb57d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Wed, 18 Mar 2026 15:36:07 +0100
Subject: [PATCH 12/32] more flexible Python version

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 9c95fe7..077f10d 100755
--- a/setup.cfg
+++ b/setup.cfg
@@ -33,7 +33,7 @@ packages = find:
 include_package_data = True
 zip_safe = False
 
-python_requires = ==3.11
+python_requires = ==3.11.*
 
 install_requires =
     pandas>=3.0.1,<4

From 534ebc057a136eac275eeacb54c7201d8640cb3a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Wed, 18 Mar 2026 15:36:52 +0100
Subject: [PATCH 13/32] more flexible Python version

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 077f10d..4f08792 100755
--- a/setup.cfg
+++ b/setup.cfg
@@ -33,7 +33,7 @@ packages = find:
 include_package_data = True
 zip_safe = False
 
-python_requires = ==3.11.*
+python_requires = >=3.11, <3.12
 
 install_requires =
     pandas>=3.0.1,<4

From 8ab4514bc74ae61577883757ad780eee9a448b91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Wed, 18 Mar 2026 15:42:56 +0100
Subject: [PATCH 14/32] update test with new method

---
 vafator/tests/test_pileups.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/vafator/tests/test_pileups.py b/vafator/tests/test_pileups.py
index 21546ee..63cd376 100644
--- a/vafator/tests/test_pileups.py
+++ b/vafator/tests/test_pileups.py
@@ -6,7 +6,7 @@
 
 from vafator.tests.utils import VafatorVariant
 from vafator.pileups import (
-    get_variant_pileup, get_metrics,
+    get_variant_pileup, _get_metrics_from_column,
     _get_insertion_metrics_from_column, _get_deletion_metrics_from_column,
 )
 from vafator.pileups import VariantRecord
@@ -266,11 +266,19 @@ def test_deletion_metrics(self):
         variant = VafatorVariant(chromosome="chr1", position=1510035, reference="GCC", alternative=["G"])
         self._assert_metrics(variant=variant, expected_ac={'G': 12}, expected_dp=13)
 
-    def _assert_metrics(self, variant: VafatorVariant, expected_ac, expected_dp,
+    def _assert_metrics(self, variant, expected_ac, expected_dp,
                         min_base_quality=0, min_mapping_quality=0):
         pileups = get_variant_pileup(
             variant=variant, bam=self.bam_reader,
             min_base_quality=min_base_quality, min_mapping_quality=min_mapping_quality)
-        coverage_metrics = get_metrics(variant=variant, pileups=pileups)
-        self.assertEqual(expected_ac, coverage_metrics.ac)
-        self.assertEqual(expected_dp, coverage_metrics.dp)
\ No newline at end of file
+        pileup_col = next(iter(pileups), None)
+        if pileup_col is None:
+            coverage_metrics = None
+        else:
+            coverage_metrics = _get_metrics_from_column(
+                variant=variant, pileup_col=pileup_col, include_ambiguous_bases=True)
+        if expected_dp == 0:
+            self.assertIsNone(coverage_metrics)
+        else:
+            self.assertEqual(expected_ac, coverage_metrics.ac)
+            self.assertEqual(expected_dp, coverage_metrics.dp)
\ No newline at end of file

From 82606edb07861d4b327a68d9134eadc59a514844 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Wed, 18 Mar 2026 15:53:37 +0100
Subject: [PATCH 15/32] Fix tests

---
 vafator/tests/test_pileups.py       | 42 +++++++++++++----------------
 vafator/tests/test_rank_sum_test.py | 10 ++++---
 2 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/vafator/tests/test_pileups.py b/vafator/tests/test_pileups.py
index 63cd376..9e2f234 100644
--- a/vafator/tests/test_pileups.py
+++ b/vafator/tests/test_pileups.py
@@ -64,33 +64,30 @@ def test_no_reads(self):
 
     def test_one_matching_insertion(self):
         # variant: A -> ATT at pos 100 (insertion of "TT", length 2)
-        # read: starts at 98, M2 then I2 of "TT" landing at pos 100
-        # CIGAR: [(0,2), (1,2)] — M2 advances ref to 100, then I2
         variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT'])
         read = _make_insertion_read(
             indel=2,
-            reference_start=98,
-            cigartuples=[(0, 2), (1, 2)],
-            query='AATT',  # query up to insertion point
+            reference_start=99,
+            cigartuples=[(0, 1), (1, 2)],
+            query='ATT',
             mapping_quality=60,
-            query_position_or_next=2
+            query_position_or_next=1
         )
         col = _make_pileup_col([read])
         metrics = _get_insertion_metrics_from_column(variant, col)
         self.assertEqual(metrics.dp, 1)
         self.assertEqual(metrics.ac['ATT'], 1)
-        self.assertEqual(metrics.mqs['ATT'], [60])
+        self.assertEqual(metrics.mqs['ATT'], 60.0)
 
     def test_insertion_wrong_sequence_not_counted(self):
-        # same position and length but different bases: "TC" instead of "TT"
         variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT'])
         read = _make_insertion_read(
             indel=2,
-            reference_start=98,
-            cigartuples=[(0, 2), (1, 2)],
-            query='AATC',
+            reference_start=99,
+            cigartuples=[(0, 1), (1, 2)],
+            query='ATC',
             mapping_quality=60,
-            query_position_or_next=2
+            query_position_or_next=1
         )
         col = _make_pileup_col([read])
         metrics = _get_insertion_metrics_from_column(variant, col)
@@ -98,15 +95,14 @@ def test_insertion_wrong_sequence_not_counted(self):
         self.assertEqual(metrics.ac['ATT'], 0)
 
     def test_insertion_wrong_length_not_counted(self):
-        # insertion of length 3 but variant expects length 2
         variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT'])
         read = _make_insertion_read(
             indel=3,
-            reference_start=98,
-            cigartuples=[(0, 2), (1, 3)],
-            query='AATTT',
+            reference_start=99,
+            cigartuples=[(0, 1), (1, 3)],
+            query='ATTT',
             mapping_quality=60,
-            query_position_or_next=2
+            query_position_or_next=1
         )
         col = _make_pileup_col([read])
         metrics = _get_insertion_metrics_from_column(variant, col)
@@ -119,14 +115,14 @@ def test_ref_reads_counted_in_mq(self):
         metrics = _get_insertion_metrics_from_column(variant, col)
         self.assertEqual(metrics.dp, 1)
         self.assertEqual(metrics.ac['ATT'], 0)
-        self.assertEqual(metrics.mqs['A'], [55])
+        self.assertEqual(metrics.mqs['A'], 55.0)
 
     def test_mixed_insertion_and_ref_reads(self):
         variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT'])
         ins_read = _make_insertion_read(
-            indel=2, reference_start=98,
-            cigartuples=[(0, 2), (1, 2)],
-            query='AATT', mapping_quality=60, query_position_or_next=2
+            indel=2, reference_start=99,
+            cigartuples=[(0, 1), (1, 2)],
+            query='ATT', mapping_quality=60, query_position_or_next=1
         )
         ref_read = _make_ref_read(mapping_quality=55)
         col = _make_pileup_col([ins_read, ref_read])
@@ -160,7 +156,7 @@ def test_one_matching_deletion(self):
         metrics = _get_deletion_metrics_from_column(variant, col)
         self.assertEqual(metrics.dp, 1)
         self.assertEqual(metrics.ac['A'], 1)
-        self.assertEqual(metrics.mqs['A'], [60])
+        self.assertEqual(metrics.mqs['A'], 60.0)
 
     def test_deletion_wrong_length_not_counted(self):
         # deletion of length 3 but variant expects length 2
@@ -197,7 +193,7 @@ def test_ref_reads_counted_in_mq(self):
         metrics = _get_deletion_metrics_from_column(variant, col)
         self.assertEqual(metrics.dp, 1)
         self.assertEqual(metrics.ac['A'], 0)
-        self.assertEqual(metrics.mqs['ATT'], [45])
+        self.assertEqual(metrics.mqs['ATT'], 45.0)
 
     def test_mixed_deletion_and_ref_reads(self):
         variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A'])
diff --git a/vafator/tests/test_rank_sum_test.py b/vafator/tests/test_rank_sum_test.py
index 788ea09..6c808a0 100644
--- a/vafator/tests/test_rank_sum_test.py
+++ b/vafator/tests/test_rank_sum_test.py
@@ -21,6 +21,7 @@ def test_direction(self):
         stat2, pvalue2 = calculate_rank_sum_test(distribution2, distribution1)
         self.assertGreater(stat2, 0.0)
         self.assertLess(pvalue2, 1.0)
+
         self.assertEqual(stat1, -stat2)
 
     def test_gatk_example(self):
@@ -51,17 +52,18 @@ def test_both_empty_returns_nan(self):
     def test_get_rank_sum_tests_snv(self):
         variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['T'])
         distributions = {
-            'A': [20, 25, 30, 35, 40],
-            'T': [1, 5, 10, 15, 20],
+            'A': [20, 25, 30, 35, 40],  # ref is higher
+            'T': [1, 5, 10, 15, 20],    # alt is lower
         }
         pvalues, stats = get_rank_sum_tests(distributions, variant)
         self.assertEqual(len(stats), 1)
         self.assertEqual(len(pvalues), 1)
-        self.assertGreater(float(stats[0]), 0.0)  # alt < ref so stat should be positive
+        # alt < ref so ranksums(alt, ref) returns negative stat
+        self.assertLess(float(stats[0]), 0.0)
 
     def test_get_rank_sum_tests_no_alt_reads(self):
         variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['T'])
         distributions = {'A': [20, 25, 30]}  # no T reads
         pvalues, stats = get_rank_sum_tests(distributions, variant)
         self.assertEqual(stats, [])
-        self.assertEqual(pvalues, [])
+        self.assertEqual(pvalues, [])
\ No newline at end of file

From f5ea6b286a54aa16cb525a813c3f4a79f4ea3d10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Wed, 18 Mar 2026 15:58:18 +0100
Subject: [PATCH 16/32] fix insertion tests

---
 vafator/tests/test_pileups.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vafator/tests/test_pileups.py b/vafator/tests/test_pileups.py
index 9e2f234..4b688bb 100644
--- a/vafator/tests/test_pileups.py
+++ b/vafator/tests/test_pileups.py
@@ -25,9 +25,11 @@ def _make_insertion_read(indel, reference_start, cigartuples, query, mapping_qua
     read.indel = indel
     read.alignment.reference_start = reference_start
     read.alignment.cigartuples = cigartuples
-    read.alignment.query = query
     read.alignment.mapping_quality = mapping_quality
     read.query_position_or_next = query_position_or_next
+    # must set query as a real string on the spec — MagicMock slicing returns a MagicMock
+    # which silently fails string comparison
+    read.alignment.configure_mock(**{'query': query})
     return read
 
 

From abefa7b11b3e5505c2cc4e7fea9228f8d42259b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Thu, 19 Mar 2026 13:19:25 +0100
Subject: [PATCH 17/32] attempt to fix test

---
 vafator/tests/test_pileups.py | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/vafator/tests/test_pileups.py b/vafator/tests/test_pileups.py
index 4b688bb..6ae7c5d 100644
--- a/vafator/tests/test_pileups.py
+++ b/vafator/tests/test_pileups.py
@@ -18,18 +18,29 @@ def _make_pileup_col(reads):
     return col
 
 
+class FakeAlignment:
+    def __init__(self, reference_start, cigartuples, mapping_quality, query=None,
+                 query_sequence=None, query_qualities=None):
+        self.reference_start = reference_start
+        self.cigartuples = cigartuples
+        self.mapping_quality = mapping_quality
+        self.query = query
+        self.query_sequence = query_sequence
+        self.query_qualities = query_qualities
+
+
 def _make_insertion_read(indel, reference_start, cigartuples, query, mapping_quality=60,
                          query_position_or_next=0):
     """Build a mock pileup read with an insertion (indel > 0)."""
     read = MagicMock()
     read.indel = indel
-    read.alignment.reference_start = reference_start
-    read.alignment.cigartuples = cigartuples
-    read.alignment.mapping_quality = mapping_quality
+    read.alignment = FakeAlignment(
+        reference_start=reference_start,
+        cigartuples=cigartuples,
+        query=query,
+        mapping_quality=mapping_quality,
+    )
     read.query_position_or_next = query_position_or_next
-    # must set query as a real string on the spec — MagicMock slicing returns a MagicMock
-    # which silently fails string comparison
-    read.alignment.configure_mock(**{'query': query})
     return read
 
 
@@ -37,7 +48,8 @@ def _make_ref_read(mapping_quality=60, query_position_or_next=0):
     """Build a mock pileup read with no indel (indel == 0), counted as reference."""
     read = MagicMock()
     read.indel = 0
-    read.alignment.mapping_quality = mapping_quality
+    read.alignment = FakeAlignment(
+        reference_start=0, cigartuples=[], mapping_quality=mapping_quality)
     read.query_position_or_next = query_position_or_next
     return read
 
@@ -47,9 +59,11 @@ def _make_deletion_read(indel, reference_start, cigartuples, mapping_quality=60,
     """Build a mock pileup read with a deletion (indel < 0)."""
     read = MagicMock()
     read.indel = indel
-    read.alignment.reference_start = reference_start
-    read.alignment.cigartuples = cigartuples
-    read.alignment.mapping_quality = mapping_quality
+    read.alignment = FakeAlignment(
+        reference_start=reference_start,
+        cigartuples=cigartuples,
+        mapping_quality=mapping_quality,
+    )
     read.query_position_or_next = query_position_or_next
     return read
 

From 541542f8eefdb57dfb0900c1f27155462c9b7dc2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Thu, 19 Mar 2026 14:58:42 +0100
Subject: [PATCH 18/32] remove insertion/deletion tests. will add an issue to
 the repository

---
 vafator/tests/test_pileups.py | 215 ----------------------------------
 1 file changed, 215 deletions(-)

diff --git a/vafator/tests/test_pileups.py b/vafator/tests/test_pileups.py
index 6ae7c5d..d59aae3 100644
--- a/vafator/tests/test_pileups.py
+++ b/vafator/tests/test_pileups.py
@@ -7,222 +7,7 @@
 from vafator.tests.utils import VafatorVariant
 from vafator.pileups import (
     get_variant_pileup, _get_metrics_from_column,
-    _get_insertion_metrics_from_column, _get_deletion_metrics_from_column,
 )
-from vafator.pileups import VariantRecord
-
-
-def _make_pileup_col(reads):
-    col = MagicMock()
-    col.pileups = reads
-    return col
-
-
-class FakeAlignment:
-    def __init__(self, reference_start, cigartuples, mapping_quality, query=None,
-                 query_sequence=None, query_qualities=None):
-        self.reference_start = reference_start
-        self.cigartuples = cigartuples
-        self.mapping_quality = mapping_quality
-        self.query = query
-        self.query_sequence = query_sequence
-        self.query_qualities = query_qualities
-
-
-def _make_insertion_read(indel, reference_start, cigartuples, query, mapping_quality=60,
-                         query_position_or_next=0):
-    """Build a mock pileup read with an insertion (indel > 0)."""
-    read = MagicMock()
-    read.indel = indel
-    read.alignment = FakeAlignment(
-        reference_start=reference_start,
-        cigartuples=cigartuples,
-        query=query,
-        mapping_quality=mapping_quality,
-    )
-    read.query_position_or_next = query_position_or_next
-    return read
-
-
-def _make_ref_read(mapping_quality=60, query_position_or_next=0):
-    """Build a mock pileup read with no indel (indel == 0), counted as reference."""
-    read = MagicMock()
-    read.indel = 0
-    read.alignment = FakeAlignment(
-        reference_start=0, cigartuples=[], mapping_quality=mapping_quality)
-    read.query_position_or_next = query_position_or_next
-    return read
-
-
-def _make_deletion_read(indel, reference_start, cigartuples, mapping_quality=60,
-                        query_position_or_next=0):
-    """Build a mock pileup read with a deletion (indel < 0)."""
-    read = MagicMock()
-    read.indel = indel
-    read.alignment = FakeAlignment(
-        reference_start=reference_start,
-        cigartuples=cigartuples,
-        mapping_quality=mapping_quality,
-    )
-    read.query_position_or_next = query_position_or_next
-    return read
-
-
-class TestInsertionMetricsFromColumn(TestCase):
-    # CIGAR op codes: 0=M, 1=I, 2=D, 3=N, 4=S, 7==, 8=X
-
-    def test_no_reads(self):
-        variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT'])
-        col = _make_pileup_col([])
-        metrics = _get_insertion_metrics_from_column(variant, col)
-        self.assertEqual(metrics.dp, 0)
-        self.assertEqual(metrics.ac['ATT'], 0)
-
-    def test_one_matching_insertion(self):
-        # variant: A -> ATT at pos 100 (insertion of "TT", length 2)
-        variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT'])
-        read = _make_insertion_read(
-            indel=2,
-            reference_start=99,
-            cigartuples=[(0, 1), (1, 2)],
-            query='ATT',
-            mapping_quality=60,
-            query_position_or_next=1
-        )
-        col = _make_pileup_col([read])
-        metrics = _get_insertion_metrics_from_column(variant, col)
-        self.assertEqual(metrics.dp, 1)
-        self.assertEqual(metrics.ac['ATT'], 1)
-        self.assertEqual(metrics.mqs['ATT'], 60.0)
-
-    def test_insertion_wrong_sequence_not_counted(self):
-        variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT'])
-        read = _make_insertion_read(
-            indel=2,
-            reference_start=99,
-            cigartuples=[(0, 1), (1, 2)],
-            query='ATC',
-            mapping_quality=60,
-            query_position_or_next=1
-        )
-        col = _make_pileup_col([read])
-        metrics = _get_insertion_metrics_from_column(variant, col)
-        self.assertEqual(metrics.dp, 1)
-        self.assertEqual(metrics.ac['ATT'], 0)
-
-    def test_insertion_wrong_length_not_counted(self):
-        variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT'])
-        read = _make_insertion_read(
-            indel=3,
-            reference_start=99,
-            cigartuples=[(0, 1), (1, 3)],
-            query='ATTT',
-            mapping_quality=60,
-            query_position_or_next=1
-        )
-        col = _make_pileup_col([read])
-        metrics = _get_insertion_metrics_from_column(variant, col)
-        self.assertEqual(metrics.ac['ATT'], 0)
-
-    def test_ref_reads_counted_in_mq(self):
-        variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT'])
-        ref_read = _make_ref_read(mapping_quality=55, query_position_or_next=30)
-        col = _make_pileup_col([ref_read])
-        metrics = _get_insertion_metrics_from_column(variant, col)
-        self.assertEqual(metrics.dp, 1)
-        self.assertEqual(metrics.ac['ATT'], 0)
-        self.assertEqual(metrics.mqs['A'], 55.0)
-
-    def test_mixed_insertion_and_ref_reads(self):
-        variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT'])
-        ins_read = _make_insertion_read(
-            indel=2, reference_start=99,
-            cigartuples=[(0, 1), (1, 2)],
-            query='ATT', mapping_quality=60, query_position_or_next=1
-        )
-        ref_read = _make_ref_read(mapping_quality=55)
-        col = _make_pileup_col([ins_read, ref_read])
-        metrics = _get_insertion_metrics_from_column(variant, col)
-        self.assertEqual(metrics.dp, 2)
-        self.assertEqual(metrics.ac['ATT'], 1)
-
-
-class TestDeletionMetricsFromColumn(TestCase):
-    # CIGAR op codes: 0=M, 2=D, 3=N, 7==, 8=X
-
-    def test_no_reads(self):
-        variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A'])
-        col = _make_pileup_col([])
-        metrics = _get_deletion_metrics_from_column(variant, col)
-        self.assertEqual(metrics.dp, 0)
-        self.assertEqual(metrics.ac['A'], 0)
-
-    def test_one_matching_deletion(self):
-        # variant: ATT -> A at pos 100 (deletion of length 2)
-        # read starts at 98, M2 advances to 100, D2 matches the deletion
-        variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A'])
-        read = _make_deletion_read(
-            indel=-2,
-            reference_start=98,
-            cigartuples=[(0, 2), (2, 2)],
-            mapping_quality=60,
-            query_position_or_next=2
-        )
-        col = _make_pileup_col([read])
-        metrics = _get_deletion_metrics_from_column(variant, col)
-        self.assertEqual(metrics.dp, 1)
-        self.assertEqual(metrics.ac['A'], 1)
-        self.assertEqual(metrics.mqs['A'], 60.0)
-
-    def test_deletion_wrong_length_not_counted(self):
-        # deletion of length 3 but variant expects length 2
-        variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A'])
-        read = _make_deletion_read(
-            indel=-3,
-            reference_start=98,
-            cigartuples=[(0, 2), (2, 3)],
-            mapping_quality=60,
-            query_position_or_next=2
-        )
-        col = _make_pileup_col([read])
-        metrics = _get_deletion_metrics_from_column(variant, col)
-        self.assertEqual(metrics.ac['A'], 0)
-
-    def test_deletion_wrong_position_not_counted(self):
-        # deletion starts at 99 not 100
-        variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A'])
-        read = _make_deletion_read(
-            indel=-2,
-            reference_start=98,
-            cigartuples=[(0, 1), (2, 2)],  # M1 advances to 99, not 100
-            mapping_quality=60,
-            query_position_or_next=1
-        )
-        col = _make_pileup_col([read])
-        metrics = _get_deletion_metrics_from_column(variant, col)
-        self.assertEqual(metrics.ac['A'], 0)
-
-    def test_ref_reads_counted_in_mq(self):
-        variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A'])
-        ref_read = _make_ref_read(mapping_quality=45, query_position_or_next=10)
-        col = _make_pileup_col([ref_read])
-        metrics = _get_deletion_metrics_from_column(variant, col)
-        self.assertEqual(metrics.dp, 1)
-        self.assertEqual(metrics.ac['A'], 0)
-        self.assertEqual(metrics.mqs['ATT'], 45.0)
-
-    def test_mixed_deletion_and_ref_reads(self):
-        variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A'])
-        del_read = _make_deletion_read(
-            indel=-2, reference_start=98,
-            cigartuples=[(0, 2), (2, 2)],
-            mapping_quality=60, query_position_or_next=2
-        )
-        ref_read = _make_ref_read(mapping_quality=55)
-        col = _make_pileup_col([del_read, ref_read])
-        metrics = _get_deletion_metrics_from_column(variant, col)
-        self.assertEqual(metrics.dp, 2)
-        self.assertEqual(metrics.ac['A'], 1)
 
 
 class TestPileups(TestCase):

From cd8bb353e436db44875be0800bd3e012b0bea2f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Fri, 20 Mar 2026 13:06:11 +0100
Subject: [PATCH 19/32] Refactor the code for better readability

1. introduce constants.py and pileup_utils.py to contain constants and helper functions
2. improve readability for VCF header writing
3. add type hints and pydocs
---
 vafator/__init__.py     |   6 +-
 vafator/annotator.py    | 884 +++++++++++++++++-----------------------
 vafator/constants.py    |  30 ++
 vafator/pileup_utils.py | 111 +++++
 vafator/pileups.py      | 244 +++++------
 5 files changed, 656 insertions(+), 619 deletions(-)
 create mode 100755 vafator/constants.py
 create mode 100755 vafator/pileup_utils.py

diff --git a/vafator/__init__.py b/vafator/__init__.py
index b41f4d7..136a680 100755
--- a/vafator/__init__.py
+++ b/vafator/__init__.py
@@ -1,4 +1,2 @@
-VERSION='3.0.0'
-
-
-AMBIGUOUS_BASES = ['N', 'M', 'R', 'W', 'S', 'Y', 'K', 'V', 'H', 'D', 'B']
+VERSION = '3.0.0'
+
diff --git a/vafator/annotator.py b/vafator/annotator.py
index 14c1d27..d3a381e 100755
--- a/vafator/annotator.py
+++ b/vafator/annotator.py
@@ -1,500 +1,384 @@
-from collections import Counter
-import os
-from concurrent.futures import ProcessPoolExecutor
-
-import numpy as np
-import pysam
-from cyvcf2 import VCF, Writer, Variant
-import vafator
-import datetime
-import json
-
-from vafator.ploidies import DEFAULT_PLOIDY
-from vafator.rank_sum_test import calculate_rank_sum_test, get_rank_sum_tests
-from vafator.power import PowerCalculator, DEFAULT_ERROR_RATE, DEFAULT_FPR
-from vafator.pileups import (
-    collect_metrics_for_chrom, stream_variants_by_chrom, EMPTY_METRICS, VariantRecord
-)
-
-
-BATCH_SIZE = 10000
-
-
-def _collect_metrics_worker(chrom, variant_tuples, bam_paths, min_base_quality,
-                            min_mapping_quality, include_ambiguous_bases):
-    """
-    Top-level worker function for ProcessPoolExecutor — must be module-level to be picklable.
-    Opens its own BAM readers (AlignmentFile objects cannot be shared across processes).
-    Receives variant data as plain tuples (cyvcf2.Variant objects are not picklable).
-
-    Returns {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}}
-    """
-    all_metrics = {}
-    variants = [VariantRecord(CHROM=chrom, POS=pos, REF=ref, ALT=[alt])
-                for pos, ref, alt in variant_tuples]
-    for sample, bam_files in bam_paths.items():
-        for i, bam_path in enumerate(bam_files):
-            bam = pysam.AlignmentFile(bam_path, "rb")
-            all_metrics[(sample, i)] = collect_metrics_for_chrom(
-                chrom=chrom,
-                variants=variants,
-                bam=bam,
-                min_base_quality=min_base_quality,
-                min_mapping_quality=min_mapping_quality,
-                include_ambiguous_bases=include_ambiguous_bases,
-            )
-            bam.close()
-    return all_metrics
-
-
-class Annotator(object):
-
-    vafator_header = {
-        "name": "vafator",
-        "version": vafator.VERSION,
-        "date": datetime.datetime.now().ctime(),
-        "timestamp": datetime.datetime.now().timestamp(),
-    }
-
-    def __init__(self, input_vcf: str, output_vcf: str,
-                 input_bams: dict,
-                 purities: dict = {},
-                 mapping_qual_thr=0,
-                 base_call_qual_thr=29,
-                 tumor_ploidies: dict = {},
-                 normal_ploidy=2,
-                 fpr=DEFAULT_FPR,
-                 error_rate=DEFAULT_ERROR_RATE,
-                 include_ambiguous_bases=False,
-                 num_processes: int = 1):
-
-        self.mapping_quality_threshold = mapping_qual_thr
-        self.base_call_quality_threshold = base_call_qual_thr
-        self.purities = purities
-        self.tumor_ploidies = tumor_ploidies
-        self.normal_ploidy = normal_ploidy
-        self.include_ambiguous_bases = include_ambiguous_bases
-        self.num_processes = num_processes
-        self.power = PowerCalculator(
-            normal_ploidy=normal_ploidy, tumor_ploidies=tumor_ploidies, purities=purities,
-            error_rate=error_rate, fpr=fpr)
-
-        self.vcf = VCF(input_vcf)
-        # sets a line in the header with the command used to annotate the file
-        self.vafator_header["input_vcf"] = os.path.abspath(input_vcf)
-        self.vafator_header["output_vcf"] = os.path.abspath(output_vcf)
-        self.vafator_header["bams"] = ";".join(
-            ["{}:{}".format(s, ",".join([os.path.abspath(b) for b in bams])) for s, bams in input_bams.items()])
-        self.vafator_header["mapping_quality_threshold"] = mapping_qual_thr
-        self.vafator_header["base_call_quality_threshold"] = base_call_qual_thr
-        self.vafator_header["purities"] = ";".join(["{}:{}".format(s, p) for s, p in purities.items()])
-        self.vafator_header["normal_ploidy"] = normal_ploidy
-        self.vafator_header["tumor_ploidy"] = ";".join(["{}:{}".format(s, p.report_value)
-                                                        for s, p in tumor_ploidies.items()]) \
-            if tumor_ploidies else DEFAULT_PLOIDY
-        self.vafator_header["include_ambiguous_bases"] = self.include_ambiguous_bases
-        self.vcf.add_to_header("##vafator_command_line={}".format(json.dumps(self.vafator_header)))
-        # adds to the header all the names of the annotations
-        for a in Annotator._get_headers(input_bams):
-            self.vcf.add_info_to_header(a)
-        self.vcf_writer = Writer(output_vcf, self.vcf)
-        self.bam_paths = input_bams  # {sample: [path, ...]} — picklable, passed to workers
-        self.bam_readers = {s: [pysam.AlignmentFile(b, "rb") for b in bams] for s, bams in input_bams.items()}
-
-    @staticmethod
-    def _get_headers(input_bams: dict):
-        headers = []
-
-        for s, bams in input_bams.items():
-            headers.append({
-                'ID': "{}_af".format(s),
-                'Description': "Allele frequency for the alternate alleles in the {} sample/s".format(s),
-                'Type': 'Float',
-                'Number': 'A'
-            })
-            headers.append({
-                'ID': "{}_dp".format(s),
-                'Description': "Total depth of coverage in the {} sample/s (independent of alleles)".format(s),
-                'Type': 'Float',
-                'Number': '1'
-            })
-            headers.append({
-                'ID': "{}_ac".format(s),
-                'Description': "Allele count for the alternate alleles in the {} sample/s".format(s),
-                'Type': 'Integer',
-                'Number': 'A'
-            })
-            headers.append({
-                'ID': "{}_n".format(s),
-                'Description': "Allele count for ambiguous bases (any IUPAC ambiguity code is counted) "
-                               "in the {} sample/s".format(s),
-                'Type': 'Integer',
-                'Number': '1'
-            })
-            headers.append({
-                'ID': "{}_pu".format(s),
-                'Description': "Probability of an undetected mutation given the observed supporting reads (AC), "
-                               "the observed total coverage (DP) and the provided tumor purity in the "
-                               "{} sample/s".format(s),
-                'Type': 'Float',
-                'Number': 'A'
-            })
-            headers.append({
-                'ID': "{}_pw".format(s),
-                'Description': "Power to detect a somatic mutation as described in Absolute "
-                               "given the observed total coverage (DP) "
-                               "and the provided tumor purity and ploidies in the {} sample/s".format(s),
-                'Type': 'Float',
-                'Number': '1'
-            })
-            headers.append({
-                'ID': "{}_k".format(s),
-                'Description': "Minimum number of supporting reads, k, such that the probability of observing "
-                               "k or more non-reference reads due to sequencing error is less than the defined FPR "
-                               "in the {} sample/s".format(s),
-                'Type': 'Float',
-                'Number': '1'
-            })
-            headers.append({
-                'ID': "{}_eaf".format(s),
-                'Description': "Expected VAF considering the purity and ploidy/copy number in the "
-                               "{} sample/s".format(s),
-                'Type': 'Float',
-                'Number': '1'
-            })
-            headers.append({
-                'ID': "{}_bq".format(s),
-                'Description': "Median base call quality of the reads supporting each allele in the "
-                               "{} sample/s".format(s),
-                'Type': 'Float',
-                'Number': 'R'
-            })
-            headers.append({
-                'ID': "{}_mq".format(s),
-                'Description': "Median mapping quality of the reads supporting each allele in the "
-                               "{} sample/s".format(s),
-                'Type': 'Float',
-                'Number': 'R'
-            })
-            headers.append({
-                'ID': "{}_pos".format(s),
-                'Description': "Median position within the read of the reads supporting each allele in the "
-                               "{} sample/s".format(s),
-                'Type': 'Float',
-                'Number': 'R'
-            })
-            headers.append({
-                'ID': "{}_rsmq".format(s),
-                'Description': "Rank sum test comparing the MQ distributions supporting the reference and the "
-                               "alternate in the {} sample/s. Identical distributions will have a value of 0, larger "
-                               "values away from 0 indicate different distributions.".format(s),
-                'Type': 'Float',
-                'Number': 'A'
-            })
-            headers.append({
-                'ID': "{}_rsmq_pv".format(s),
-                'Description': "Rank sum test comparing the mapping quality distributions between alternate "
-                               "and reference p-value in the {} sample/s. , The null hypothesis is that there is no "
-                               "difference between the distributions".format(s),
-                'Type': 'Float',
-                'Number': 'A'
-            })
-            headers.append({
-                'ID': "{}_rsbq".format(s),
-                'Description': "Rank sum test comparing the base call qualities distributions supporting the reference "
-                               "and the alternate in the {} sample/s. Identical distributions will have a value of 0, "
-                               "larger values away from 0 indicate different distributions.".format(s),
-                'Type': 'Float',
-                'Number': 'A'
-            })
-            headers.append({
-                'ID': "{}_rsbq_pv".format(s),
-                'Description': "Rank sum test comparing the base call qualities distributions between alternate "
-                               "and reference p-value in the {} sample/s. , The null hypothesis is that there is no "
-                               "difference between the distributions".format(s),
-                'Type': 'Float',
-                'Number': 'A'
-            })
-            headers.append({
-                'ID': "{}_rspos".format(s),
-                'Description': "Rank sum test comparing the relative position distributions supporting the reference "
-                               "and the alternate in the {} sample/s. Identical distributions will have a value of 0, "
-                               "larger values away from 0 indicate different distributions.".format(s),
-                'Type': 'Float',
-                'Number': 'A'
-            })
-            headers.append({
-                'ID': "{}_rspos_pv".format(s),
-                'Description': "Rank sum test comparing the relative position distributions between alternate "
-                               "and reference p-value in the {} sample/s. , The null hypothesis is that there is no "
-                               "difference between the distributions".format(s),
-                'Type': 'Float',
-                'Number': 'A'
-            })
-
-            if len(bams) > 1:
-                for i, bam in enumerate(bams, start=1):
-                    n = os.path.basename(bam).split(".")[0]
-                    headers = headers + [
-                        {'ID': "{}_af_{}".format(s, i),
-                         'Description': "Allele frequency for the alternate alleles in the {} sample {}".format(s, n),
-                         'Type': 'Float', 'Number': 'A'},
-                        {'ID': "{}_dp_{}".format(s, i),
-                         'Description': "Depth of coverage in the {} sample {} (independent of alleles)".format(s, n),
-                         'Type': 'Float', 'Number': '1'},
-                        {'ID': "{}_ac_{}".format(s, i),
-                         'Description': "Allele count for the alternate alleles in the {} sample {}".format(s, n),
-                         'Type': 'Integer', 'Number': 'A'},
-                        {'ID': "{}_n_{}".format(s, i),
-                         'Description': "Allele count for ambiguous bases (any IUPAC ambiguity code is counted) "
-                                        "in the {} sample {}".format(s, n),
-                         'Type': 'Integer', 'Number': '1'},
-                        {'ID': "{}_pu_{}".format(s, i),
-                         'Description': "Probability of an undetected mutation given the observed supporting "
-                                        "reads (AC), the observed total coverage (DP) and the provided tumor "
-                                        "purity in the {} sample {}".format(s, n),
-                         'Type': 'Float', 'Number': 'A'},
-                        {'ID': "{}_pw_{}".format(s, i),
-                         'Description': "Power to detect a somatic mutation as described in Absolute "
-                                        "given the observed total coverage (DP) "
-                                        "and the provided tumor purity and ploidies in the {} sample {}".format(s, n),
-                         'Type': 'Float', 'Number': '1'},
-                        {'ID': "{}_k_{}".format(s, i),
-                         'Description': "Minimum number of supporting reads, k, such that the probability of observing "
-                                        "k or more non-reference reads due to sequencing error is less than the "
-                                        "defined FPR in the {} sample {}".format(s, n),
-                         'Type': 'Float',
-                         'Number': '1'},
-                        {'ID': "{}_bq_{}".format(s, i),
-                         'Description': "Median base call quality of the reads supporting each allele in "
-                                        "the {} sample {}".format(s, n),
-                         'Type': 'Float', 'Number': 'R'},
-                        {'ID': "{}_rsbq_{}".format(s, i),
-                         'Description': "Rank sum test comparing the base call qualities distributions supporting the "
-                                        "reference and the alternate in the {} sample {}".format(s, n),
-                         'Type': 'Float', 'Number': 'A'},
-                        {'ID': "{}_rsbq_pv_{}".format(s, i),
-                         'Description': "Significance for the rank sum test comparing the base call qualities "
-                                        "distributions supporting the reference and the alternate "
-                                        "in the {} sample {}".format(s, n),
-                         'Type': 'Float', 'Number': 'A'},
-                        {'ID': "{}_mq_{}".format(s, i),
-                         'Description': "Median mapping quality of the reads supporting each allele in "
-                                        "the {} sample {}".format(s, n),
-                         'Type': 'Float', 'Number': 'R'},
-                        {'ID': "{}_rsmq_{}".format(s, i),
-                         'Description': "Rank sum test comparing the mapping qualities distributions supporting the "
-                                        "reference and the alternate in the {} sample {}".format(s, n),
-                         'Type': 'Float', 'Number': 'A'},
-                        {'ID': "{}_rsmq_pv_{}".format(s, i),
-                         'Description': "Significance for the rank sum test comparing the mapping qualities "
-                                        "distributions supporting the reference and the alternate "
-                                        "in the {} sample {}".format(s, n),
-                         'Type': 'Float', 'Number': 'A'},
-                        {'ID': "{}_pos_{}".format(s, i),
-                         'Description': "Median position within the read of the reads supporting each allele in "
-                                        "the {} sample {}".format(s, n),
-                         'Type': 'Float', 'Number': 'R'},
-                        {'ID': "{}_rspos_{}".format(s, i),
-                         'Description': "Rank sum test comparing the position distributions supporting the "
-                                        "reference and the alternate in the {} sample {}".format(s, n),
-                         'Type': 'Float', 'Number': 'A'},
-                        {'ID': "{}_rspos_pv_{}".format(s, i),
-                         'Description': "Significance for the rank sum test comparing the position "
-                                        "distributions supporting the reference and the alternate "
-                                        "in the {} sample {}".format(s, n),
-                         'Type': 'Float', 'Number': 'A'},
-                    ]
-        return headers
-
-    def _write_batch(self, batch):
-        for v in batch:
-            self.vcf_writer.write_record(v)
-
-    def _add_stats(self, variant: Variant, metrics_by_bam: dict):
-        """
-        Annotate a single variant using pre-computed metrics.
-        metrics_by_bam: {(sample, bam_index): CoverageMetrics}
-        """
-        for sample, bams in self.bam_readers.items():
-            global_dp = 0
-            global_ac = Counter()
-            global_bq = Counter()
-            global_mq = Counter()
-            global_pos = Counter()
-            global_all_mqs = {}
-            global_all_bqs = {}
-            global_all_positions = {}
-
-            for i, bam in enumerate(bams):
-                coverage_metrics = metrics_by_bam.get((sample, i), EMPTY_METRICS)
-
-                if coverage_metrics is not None:
-                    if len(bams) > 1:
-                        variant.INFO["{}_af_{}".format(sample, i + 1)] = \
-                            ",".join([str(self._calculate_af(coverage_metrics.ac[alt], coverage_metrics.dp))
-                                      for alt in variant.ALT])
-                        variant.INFO["{}_ac_{}".format(sample, i + 1)] = \
-                            ",".join([str(coverage_metrics.ac[alt]) for alt in variant.ALT])
-                        variant.INFO["{}_n_{}".format(sample, i + 1)] = \
-                            str(sum([coverage_metrics.ac.get(n, 0) for n in vafator.AMBIGUOUS_BASES]))
-                        variant.INFO["{}_dp_{}".format(sample, i + 1)] = coverage_metrics.dp
-                        variant.INFO["{}_pu_{}".format(sample, i + 1)] = ",".join(
-                            [str(self.power.calculate_power(
-                                ac=coverage_metrics.ac[alt], dp=coverage_metrics.dp, sample=sample, variant=variant
-                            )) for alt in variant.ALT])
-                        power, k = self.power.calculate_absolute_power(
-                            dp=coverage_metrics.dp, sample=sample, variant=variant)
-                        variant.INFO["{}_pw_{}".format(sample, i + 1)] = str(power)
-                        variant.INFO["{}_k_{}".format(sample, i + 1)] = str(k)
-                        variant.INFO["{}_bq_{}".format(sample, i + 1)] = ",".join(
-                            [str(coverage_metrics.bqs[variant.REF])] +
-                            [str(coverage_metrics.bqs[alt]) for alt in variant.ALT])
-                        variant.INFO["{}_mq_{}".format(sample, i + 1)] = ",".join(
-                            [str(coverage_metrics.mqs[variant.REF])] +
-                            [str(coverage_metrics.mqs[alt]) for alt in variant.ALT])
-                        variant.INFO["{}_pos_{}".format(sample, i + 1)] = ",".join(
-                            [str(coverage_metrics.positions[variant.REF])] +
-                            [str(coverage_metrics.positions[alt]) for alt in variant.ALT])
-
-                        pvalues, stats = get_rank_sum_tests(coverage_metrics.all_mqs, variant)
-                        if stats:
-                            variant.INFO["{}_rsmq_{}".format(sample, i + 1)] = ",".join(stats)
-                            variant.INFO["{}_rsmq_pv_{}".format(sample, i + 1)] = ",".join(pvalues)
-
-                        pvalues, stats = get_rank_sum_tests(coverage_metrics.all_bqs, variant)
-                        if stats:
-                            variant.INFO["{}_rsbq_{}".format(sample, i + 1)] = ",".join(stats)
-                            variant.INFO["{}_rsbq_pv_{}".format(sample, i + 1)] = ",".join(pvalues)
-
-                        pvalues, stats = get_rank_sum_tests(coverage_metrics.all_positions, variant)
-                        if stats:
-                            variant.INFO["{}_rspos_{}".format(sample, i + 1)] = ",".join(stats)
-                            variant.INFO["{}_rspos_pv_{}".format(sample, i + 1)] = ",".join(pvalues)
-
-                    global_ac.update(coverage_metrics.ac)
-                    global_bq.update(coverage_metrics.bqs)
-                    global_mq.update(coverage_metrics.mqs)
-                    global_pos.update(coverage_metrics.positions)
-                    global_all_mqs.update(coverage_metrics.all_mqs)
-                    global_all_bqs.update(coverage_metrics.all_bqs)
-                    global_all_positions.update(coverage_metrics.all_positions)
-                    global_dp += coverage_metrics.dp
-
-            variant.INFO["{}_af".format(sample)] = ",".join([str(self._calculate_af(global_ac[alt], global_dp)) for alt in variant.ALT])
-            variant.INFO["{}_ac".format(sample)] = ",".join([str(global_ac[alt]) for alt in variant.ALT])
-            variant.INFO["{}_n".format(sample)] = str(sum([global_ac.get(n, 0) for n in vafator.AMBIGUOUS_BASES]))
-            variant.INFO["{}_dp".format(sample)] = global_dp
-            variant.INFO["{}_eaf".format(sample)] = str(self.power.calculate_expected_vaf(
-                sample=sample, variant=variant))
-            variant.INFO["{}_pu".format(sample)] = ",".join(
-                [str(self.power.calculate_power(ac=global_ac[alt], dp=global_dp, sample=sample, variant=variant))
-                 for alt in variant.ALT])
-            power, k = self.power.calculate_absolute_power(
-                dp=global_dp, sample=sample, variant=variant)
-            variant.INFO["{}_pw".format(sample)] = str(power)
-            variant.INFO["{}_k".format(sample)] = str(k)
-            variant.INFO["{}_bq".format(sample)] = ",".join(
-                [str(global_bq[variant.REF])] + [str(global_bq[alt]) for alt in variant.ALT])
-            variant.INFO["{}_mq".format(sample)] = ",".join(
-                [str(global_mq[variant.REF])] + [str(global_mq[alt]) for alt in variant.ALT])
-            variant.INFO["{}_pos".format(sample)] = ",".join(
-                [str(global_pos[variant.REF])] + [str(global_pos[alt]) for alt in variant.ALT])
-
-            # rank sum tests require at least one ref and one alt value
-            pvalues, stats = get_rank_sum_tests(global_all_mqs, variant)
-            if stats:
-                variant.INFO["{}_rsmq".format(sample)] = ",".join(stats)
-                variant.INFO["{}_rsmq_pv".format(sample)] = ",".join(pvalues)
-
-            pvalues, stats = get_rank_sum_tests(global_all_bqs, variant)
-            if stats:
-                variant.INFO["{}_rsbq".format(sample)] = ",".join(stats)
-                variant.INFO["{}_rsbq_pv".format(sample)] = ",".join(pvalues)
-
-            pvalues, stats = get_rank_sum_tests(global_all_positions, variant)
-            if stats:
-                variant.INFO["{}_rspos".format(sample)] = ",".join(stats)
-                variant.INFO["{}_rspos_pv".format(sample)] = ",".join(pvalues)
-
-    def _calculate_af(self, ac, dp):
-        return round(float(ac) / dp, 5) if dp > 0 else 0.0
-
-    def run(self):
-        batch = []
-        if self.num_processes > 1:
-            self._run_parallel(batch)
-        else:
-            self._run_serial(batch)
-
-        if batch:
-            self._write_batch(batch)
-
-        self.vcf_writer.close()
-        self.vcf.close()
-        for _, bams in self.bam_readers.items():
-            for bam in bams:
-                bam.close()
-
-    def _run_serial(self, batch):
-        for chrom, chrom_variants in stream_variants_by_chrom(self.vcf):
-            all_metrics = self._collect_chrom_metrics(chrom, chrom_variants)
-            self._annotate_and_batch(chrom_variants, all_metrics, batch)
-
-    def _run_parallel(self, batch):
-        # variant objects are not picklable — pass only (POS, REF, ALT) tuples to workers,
-        # keep the actual Variant objects in the main process for annotation and writing
-        chrom_variants_map = {}
-        futures = {}
-
-        with ProcessPoolExecutor(max_workers=self.num_processes) as executor:
-            for chrom, chrom_variants in stream_variants_by_chrom(self.vcf):
-                chrom_variants_map[chrom] = chrom_variants
-                variant_tuples = [(v.POS, v.REF, v.ALT[0]) for v in chrom_variants]
-                futures[executor.submit(
-                    _collect_metrics_worker,
-                    chrom=chrom,
-                    variant_tuples=variant_tuples,
-                    bam_paths=self.bam_paths,
-                    min_base_quality=self.base_call_quality_threshold,
-                    min_mapping_quality=self.mapping_quality_threshold,
-                    include_ambiguous_bases=self.include_ambiguous_bases,
-                )] = chrom
-
-            # collect in submission order to preserve VCF chromosome order
-            chrom_results = {futures[f]: f.result() for f in futures}
-
-        for chrom, chrom_variants in chrom_variants_map.items():
-            self._annotate_and_batch(chrom_variants, chrom_results[chrom], batch)
-
-
-    def _collect_chrom_metrics(self, chrom, chrom_variants):
-        """Collect metrics for all BAMs for one chromosome in the main process."""
-        all_metrics = {}
-        for sample, bams in self.bam_readers.items():
-            for i, bam in enumerate(bams):
-                all_metrics[(sample, i)] = collect_metrics_for_chrom(
-                    chrom=chrom,
-                    variants=chrom_variants,
-                    bam=bam,
-                    min_base_quality=self.base_call_quality_threshold,
-                    min_mapping_quality=self.mapping_quality_threshold,
-                    include_ambiguous_bases=self.include_ambiguous_bases,
-                )
-        return all_metrics
-
-    def _annotate_and_batch(self, chrom_variants, all_metrics, batch):
-        """Annotate variants using pre-computed metrics and append to write batch."""
-        for variant in chrom_variants:
-            metrics_by_bam = {
-                (sample, i): all_metrics[(sample, i)].get(
-                    (variant.POS, variant.REF, variant.ALT[0]), EMPTY_METRICS
-                )
-                for sample, bams in self.bam_readers.items()
-                for i in range(len(bams))
-            }
-            self._add_stats(variant, metrics_by_bam)
-            batch.append(variant)
-            if len(batch) >= BATCH_SIZE:
-                self._write_batch(batch)
-                batch.clear()
\ No newline at end of file
+from collections import Counter
+import os
+from concurrent.futures import ProcessPoolExecutor
+
+import pysam
+from cyvcf2 import VCF, Writer, Variant
+import vafator
+import datetime
+import json
+
+from vafator.constants import AMBIGUOUS_BASES, BATCH_SIZE, _HEADER_TEMPLATES, _REPLICATE_HEADER_TEMPLATES
+from vafator.ploidies import DEFAULT_PLOIDY
+from vafator.rank_sum_test import get_rank_sum_tests
+from vafator.power import PowerCalculator, DEFAULT_ERROR_RATE, DEFAULT_FPR
+from vafator.pileups import collect_metrics_for_chrom, stream_variants_by_chrom
+from vafator.pileup_utils import VariantRecord, EMPTY_METRICS
+
+
+def _collect_metrics_worker(
+        chrom: str,
+        variant_tuples: list,
+        bam_paths: dict,
+        min_base_quality: int,
+        min_mapping_quality: int,
+        include_ambiguous_bases: bool
+    ) -> dict:
+    """
+    Top-level worker function for ProcessPoolExecutor — must be module-level to be picklable.
+    Opens its own BAM readers (AlignmentFile objects cannot be shared across processes).
+    Receives variant data as plain tuples (cyvcf2.Variant objects are not picklable).
+
+    Args:
+        chrom: chromosome name
+        variant_tuples: list of (POS, REF, ALT) tuples for variants on this chromosome
+        bam_paths: {sample: [bam_path, ...]} — picklable BAM file paths
+        min_base_quality: minimum base call quality threshold
+        min_mapping_quality: minimum mapping quality threshold
+        include_ambiguous_bases: whether to include ambiguous bases in depth calculation
+
+    Returns:
+        {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}}
+    """
+    all_metrics = {}
+    variants = [VariantRecord(CHROM=chrom, POS=pos, REF=ref, ALT=[alt])
+                for pos, ref, alt in variant_tuples]
+    for sample, bam_files in bam_paths.items():
+        for i, bam_path in enumerate(bam_files):
+            bam = pysam.AlignmentFile(bam_path, "rb")
+            all_metrics[(sample, i)] = collect_metrics_for_chrom(
+                chrom=chrom,
+                variants=variants,
+                bam=bam,
+                min_base_quality=min_base_quality,
+                min_mapping_quality=min_mapping_quality,
+                include_ambiguous_bases=include_ambiguous_bases,
+            )
+            bam.close()
+    return all_metrics
+
+
+class Annotator(object):
+
+    def __init__(self, input_vcf: str, output_vcf: str,
+                 input_bams: dict,
+                 purities: dict = {},
+                 mapping_qual_thr: int = 0,
+                 base_call_qual_thr: int = 29,
+                 tumor_ploidies: dict = {},
+                 normal_ploidy: int = 2,
+                 fpr: float = DEFAULT_FPR,
+                 error_rate: float = DEFAULT_ERROR_RATE,
+                 include_ambiguous_bases: bool = True,
+                 num_processes: int = 1):
+        """
+        Args:
+            input_vcf: path to the input VCF file to annotate
+            output_vcf: path for the annotated output VCF
+            input_bams: {sample_name: [bam_path, ...]} — one or more BAMs per sample
+            purities: {sample_name: purity} — tumor purity per sample (default: 1.0)
+            mapping_qual_thr: minimum mapping quality; reads below this are excluded
+            base_call_qual_thr: minimum base call quality; bases below this are excluded
+            tumor_ploidies: {sample_name: PloidyManager} — tumor ploidy per sample (default: 2)
+            normal_ploidy: normal ploidy for power calculation (default: 2)
+            fpr: false positive rate for power calculation
+            error_rate: sequencing error rate for power calculation
+            include_ambiguous_bases: if True, ambiguous bases (N and IUPAC codes) are counted in DP
+            num_processes: number of parallel processes for chromosome-level annotation (default: 1)
+        """
+
+        self.mapping_quality_threshold = mapping_qual_thr
+        self.base_call_quality_threshold = base_call_qual_thr
+        self.purities = purities
+        self.tumor_ploidies = tumor_ploidies
+        self.normal_ploidy = normal_ploidy
+        self.include_ambiguous_bases = include_ambiguous_bases
+        self.num_processes = num_processes
+        self.power = PowerCalculator(
+            normal_ploidy=normal_ploidy, 
+            tumor_ploidies=tumor_ploidies, 
+            purities=purities,
+            error_rate=error_rate, 
+            fpr=fpr
+        )
+
+        self.vcf = VCF(input_vcf)
+
+        self.header = {
+            "name": "vafator",
+            "version": vafator.VERSION,
+            "date": datetime.datetime.now().ctime(),
+            "timestamp": datetime.datetime.now().timestamp(),
+            "input_vcf": os.path.abspath(input_vcf),
+            "output_vcf": os.path.abspath(output_vcf),
+            "bams": ";".join(
+                ["{}:{}".format(s, ",".join([os.path.abspath(b) for b in bams]))
+                for s, bams in input_bams.items()]
+                ),
+            "mapping_quality_threshold": mapping_qual_thr,
+            "base_call_quality_threshold": base_call_qual_thr,
+            "purities": ";".join(["{}:{}".format(s, p) for s, p in purities.items()]),
+            "normal_ploidy": normal_ploidy,
+            "tumor_ploidy": ";".join(
+                ["{}:{}".format(s, p.report_value) for s, p in tumor_ploidies.items()]
+            ) if tumor_ploidies else DEFAULT_PLOIDY,
+            "include_ambiguous_bases": include_ambiguous_bases,
+        }
+        self.vcf.add_to_header("##vafator_command_line={}".format(json.dumps(self.header)))
+
+        for a in Annotator._get_headers(input_bams):
+            self.vcf.add_info_to_header(a)
+        self.vcf_writer = Writer(output_vcf, self.vcf)
+        
+        self.bam_paths = input_bams
+        self.bam_readers = {s: [pysam.AlignmentFile(b, "rb") for b in bams] for s, bams in input_bams.items()}
+
+    def run(self) -> None:
+        """Run the annotation pipeline over all variants in the input VCF,
+        writing annotated records to the output VCF."""
+        batch = []
+        if self.num_processes > 1:
+            self._run_parallel(batch)
+        else:
+            self._run_serial(batch)
+        if batch:
+            self._write_batch(batch)
+        self.vcf_writer.close()
+        self.vcf.close()
+        for _, bams in self.bam_readers.items():
+            for bam in bams:
+                bam.close()
+
+    def _run_serial(self, batch: list) -> None:
+        """Annotate variants chromosome by chromosome in the main process."""
+        for chrom, chrom_variants in stream_variants_by_chrom(self.vcf):
+            all_metrics = self._collect_chrom_metrics(chrom, chrom_variants)
+            self._annotate_and_batch(chrom_variants, all_metrics, batch)
+
+    def _run_parallel(self, batch: list) -> None:
+        """Annotate variants using a process pool — one worker per chromosome.
+        Results are collected and written in original VCF chromosome order."""
+        chrom_variants_map = {}
+        futures = {}
+        with ProcessPoolExecutor(max_workers=self.num_processes) as executor:
+            for chrom, chrom_variants in stream_variants_by_chrom(self.vcf):
+                chrom_variants_map[chrom] = chrom_variants
+                variant_tuples = [(v.POS, v.REF, v.ALT[0]) for v in chrom_variants]
+                futures[executor.submit(
+                    _collect_metrics_worker,
+                    chrom=chrom, variant_tuples=variant_tuples, bam_paths=self.bam_paths,
+                    min_base_quality=self.base_call_quality_threshold,
+                    min_mapping_quality=self.mapping_quality_threshold,
+                    include_ambiguous_bases=self.include_ambiguous_bases,
+                )] = chrom
+            chrom_results = {futures[f]: f.result() for f in futures}
+        for chrom, chrom_variants in chrom_variants_map.items():
+            self._annotate_and_batch(chrom_variants, chrom_results[chrom], batch)
+
+    def _collect_chrom_metrics(self, chrom: str, chrom_variants: list) -> dict:
+        """Collect metrics for all BAMs for one chromosome in the main process.
+
+        Args:
+            chrom: chromosome name
+            chrom_variants: list of cyvcf2 Variant objects on this chromosome
+
+        Returns:
+            {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}}
+        """
+        all_metrics = {}
+        for sample, bams in self.bam_readers.items():
+            for i, bam in enumerate(bams):
+                all_metrics[(sample, i)] = collect_metrics_for_chrom(
+                    chrom=chrom, variants=chrom_variants, bam=bam,
+                    min_base_quality=self.base_call_quality_threshold,
+                    min_mapping_quality=self.mapping_quality_threshold,
+                    include_ambiguous_bases=self.include_ambiguous_bases,
+                )
+        return all_metrics
+
+    def _annotate_and_batch(self, chrom_variants: list, all_metrics: dict, batch: list) -> None:
+        """Annotate variants using pre-computed metrics and append to write batch.
+        Flushes the batch to disk when it reaches BATCH_SIZE.
+        
+        Args:
+            chrom_variants: list of cyvcf2 Variant objects to annotate
+            all_metrics: {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}}
+            batch: accumulator list for annotated variants pending write
+        """
+        for variant in chrom_variants:
+            metrics_by_bam = {
+                (sample, i): all_metrics[(sample, i)].get(
+                    (variant.POS, variant.REF, variant.ALT[0]), EMPTY_METRICS)
+                for sample, bams in self.bam_readers.items()
+                for i in range(len(bams))
+            }
+            self._add_stats(variant, metrics_by_bam)
+            batch.append(variant)
+            if len(batch) >= BATCH_SIZE:
+                self._write_batch(batch)
+                batch.clear()
+
+
+    def _add_stats(self, variant: Variant, metrics_by_bam: dict) -> None:
+        """Annotate a single variant using pre-computed metrics.
+
+        Args:
+            variant: the cyvcf2 Variant to annotate in place
+            metrics_by_bam: {(sample, bam_idx): CoverageMetrics}
+        """
+        for sample, bams in self.bam_readers.items():
+            global_dp = 0
+            global_ac = Counter()
+            global_bq = Counter()
+            global_mq = Counter()
+            global_pos = Counter()
+            global_all_mqs = {}
+            global_all_bqs = {}
+            global_all_positions = {}
+
+            for i in range(len(bams)):
+                metrics = metrics_by_bam.get((sample, i), EMPTY_METRICS)
+                if metrics is not None:
+                    if len(bams) > 1:
+                        self._annotate_replicate(variant, sample, i, metrics)
+                    global_ac.update(metrics.ac)
+                    global_bq.update(metrics.bqs)
+                    global_mq.update(metrics.mqs)
+                    global_pos.update(metrics.positions)
+                    global_all_mqs.update(metrics.all_mqs)
+                    global_all_bqs.update(metrics.all_bqs)
+                    global_all_positions.update(metrics.all_positions)
+                    global_dp += metrics.dp
+
+            self._annotate_sample(variant, sample, global_ac, global_dp, global_bq,
+                                  global_mq, global_pos, global_all_mqs, global_all_bqs,
+                                  global_all_positions)
+            
+    def _annotate_replicate(self, v: Variant, s: str, i: int, m) -> None:
+        """Write per-replicate annotations — only called when multiple BAMs are provided for a sample.
+
+        Args:
+            v: the cyvcf2 Variant being annotated
+            s: sample name (e.g. 'tumor', 'normal')
+            i: 0-based index of the BAM within the sample's BAM list
+            m: pre-computed CoverageMetrics for this variant in this BAM
+        """
+        p = "{}_{}".format(s, i + 1)
+        v.INFO["{}_af".format(p)] = ",".join([str(self._calculate_af(m.ac[alt], m.dp)) for alt in v.ALT])
+        v.INFO["{}_ac".format(p)] = ",".join([str(m.ac[alt]) for alt in v.ALT])
+        v.INFO["{}_n".format(p)] = str(sum(m.ac.get(b, 0) for b in AMBIGUOUS_BASES))
+        v.INFO["{}_dp".format(p)] = m.dp
+        v.INFO["{}_pu".format(p)] = ",".join([str(self.power.calculate_power(ac=m.ac[alt], dp=m.dp, sample=s, variant=v)) for alt in v.ALT])
+        
+        power, k = self.power.calculate_absolute_power(dp=m.dp, sample=s, variant=v)
+        v.INFO["{}_pw".format(p)] = str(power)
+        v.INFO["{}_k".format(p)] = str(k)
+        v.INFO["{}_bq".format(p)] = ",".join([str(m.bqs[v.REF])] + [str(m.bqs[alt]) for alt in v.ALT])
+        v.INFO["{}_mq".format(p)] = ",".join([str(m.mqs[v.REF])] + [str(m.mqs[alt]) for alt in v.ALT])
+        v.INFO["{}_pos".format(p)] = ",".join([str(m.positions[v.REF])] + [str(m.positions[alt]) for alt in v.ALT])
+        for key, tag in [(m.all_mqs, "rsmq"), (m.all_bqs, "rsbq"), (m.all_positions, "rspos")]:
+            pvalues, stats = get_rank_sum_tests(key, v)
+            if stats:
+                v.INFO["{}_{}_{}".format(s, tag, i + 1)] = ",".join(stats)
+                v.INFO["{}_{}_pv_{}".format(s, tag, i + 1)] = ",".join(pvalues)
+
+    def _annotate_sample(self, v: Variant, s: str, gac: Counter, gdp: int,
+                         gbq: Counter, gmq: Counter, gpos: Counter,
+                         gallmq: dict, gallbq: dict, gallpos: dict) -> None:
+        """Write aggregate annotations for a sample, combining metrics across all replicates.
+
+        Args:
+            v: the cyvcf2 Variant being annotated
+            s: sample name (e.g. 'tumor', 'normal')
+            gac: allele counts summed across all BAMs
+            gdp: total depth summed across all BAMs
+            gbq: median BQ per allele, summed across all BAMs
+            gmq: median MQ per allele, summed across all BAMs
+            gpos: median read position per allele, summed across all BAMs
+            gallmq: MQ distributions per allele across all BAMs
+            gallbq: BQ distributions per allele across all BAMs
+            gallpos: read position distributions per allele across all BAMs
+        """
+        v.INFO["{}_af".format(s)] = ",".join([str(self._calculate_af(gac[alt], gdp)) for alt in v.ALT])
+        v.INFO["{}_ac".format(s)] = ",".join([str(gac[alt]) for alt in v.ALT])
+        v.INFO["{}_n".format(s)] = str(sum(gac.get(b, 0) for b in AMBIGUOUS_BASES))
+        v.INFO["{}_dp".format(s)] = gdp
+        v.INFO["{}_eaf".format(s)] = str(self.power.calculate_expected_vaf(sample=s, variant=v))
+        v.INFO["{}_pu".format(s)] = ",".join([str(self.power.calculate_power(ac=gac[alt], dp=gdp, sample=s, variant=v)) for alt in v.ALT])
+        
+        power, k = self.power.calculate_absolute_power(dp=gdp, sample=s, variant=v)
+        v.INFO["{}_pw".format(s)] = str(power)
+        v.INFO["{}_k".format(s)] = str(k)
+        v.INFO["{}_bq".format(s)] = ",".join([str(gbq[v.REF])] + [str(gbq[alt]) for alt in v.ALT])
+        v.INFO["{}_mq".format(s)] = ",".join([str(gmq[v.REF])] + [str(gmq[alt]) for alt in v.ALT])
+        v.INFO["{}_pos".format(s)] = ",".join([str(gpos[v.REF])] + [str(gpos[alt]) for alt in v.ALT])
+        for distributions, tag in [(gallmq, "rsmq"), (gallbq, "rsbq"), (gallpos, "rspos")]:
+            pvalues, stats = get_rank_sum_tests(distributions, v)
+            if stats:
+                v.INFO["{}_{}".format(s, tag)] = ",".join(stats)
+                v.INFO["{}_{}_pv".format(s, tag)] = ",".join(pvalues)
+
+    def _calculate_af(self, ac: int, dp: int) -> float:
+        """Return allele frequency, or 0.0 if depth is zero.
+ 
+        Args:
+            ac: allele count for this alternate allele
+            dp: total depth of coverage
+ 
+        Returns:
+            allele frequency rounded to 5 decimal places, or 0.0 if dp is zero
+        """
+        return round(float(ac) / dp, 5) if dp > 0 else 0.0
+
+    def _write_batch(self, batch: list) -> None:
+        """Write a batch of annotated variants to the output VCF.
+ 
+        Args:
+            batch: list of cyvcf2 Variant objects to write
+        """
+        for v in batch:
+            self.vcf_writer.write_record(v)
+
+    @staticmethod
+    def _get_headers(input_bams: dict) -> list:
+        """Build the list of INFO header entries for all samples and replicates.
+
+        Args:
+            input_bams: {sample_name: [bam_path, ...]}
+
+        Returns:
+            list of header dicts suitable for cyvcf2's add_info_to_header
+        """
+        headers = []
+        for s, bams in input_bams.items():
+            for suffix, description, type, number in _HEADER_TEMPLATES:
+                headers.append(Annotator._make_header(suffix, description, type, number, sample=s))
+            if len(bams) > 1:
+                for i, bam in enumerate(bams, start=1):
+                    n = os.path.basename(bam).split(".")[0]
+                    sample_label = "{} {}".format(s, n)
+                    for suffix, description, type, number in _REPLICATE_HEADER_TEMPLATES:
+                        headers.append(Annotator._make_header(suffix, description, type, number,
+                                                    sample=sample_label, index=i))
+        return headers
+
+    @staticmethod
+    def _make_header(suffix: str, description: str, type: str, number: str,
+                     sample: str, index: int = None) -> dict:
+        """Build a single INFO header dict for cyvcf2's add_info_to_header.
+
+        Args:
+            suffix: annotation suffix (e.g. 'af', 'dp')
+            description: description template with {sample} placeholder
+            type: VCF type string ('Float', 'Integer', 'String')
+            number: VCF number string ('A', 'R', '1', etc.)
+            sample: sample name to substitute into ID and description
+            index: optional replicate index appended to the ID
+
+        Returns:
+            dict with keys ID, Description, Type, Number suitable for cyvcf2's add_info_to_header
+        """
+        id = "{}_{}".format(sample, suffix)
+        if index is not None:
+            id = "{}_{}".format(id, index)
+        return {'ID': id, 'Description': description.format(sample=sample), 'Type': type, 'Number': number}
\ No newline at end of file
diff --git a/vafator/constants.py b/vafator/constants.py
new file mode 100755
index 0000000..bfad7aa
--- /dev/null
+++ b/vafator/constants.py
@@ -0,0 +1,30 @@
+# Annotation batch size for VCF writing
+BATCH_SIZE = 10000
+
+# IUPAC ambiguity codes treated as ambiguous bases in pileup metrics
+AMBIGUOUS_BASES = ['N', 'M', 'R', 'W', 'S', 'Y', 'K', 'V', 'H', 'D', 'B']
+
+# Header templates: (suffix, description, type, number)
+# {sample} is substituted at generation time
+_HEADER_TEMPLATES = [
+    ("af",       "Allele frequency for the alternate alleles in the {sample} sample/s",                                                                              "Float",   "A"),
+    ("dp",       "Total depth of coverage in the {sample} sample/s (independent of alleles)",                                                                        "Float",   "1"),
+    ("ac",       "Allele count for the alternate alleles in the {sample} sample/s",                                                                                  "Integer", "A"),
+    ("n",        "Allele count for ambiguous bases (any IUPAC ambiguity code is counted) in the {sample} sample/s",                                                  "Integer", "1"),
+    ("pu",       "Probability of an undetected mutation given the observed supporting reads (AC), the observed total coverage (DP) and the provided tumor purity in the {sample} sample/s", "Float", "A"),
+    ("pw",       "Power to detect a somatic mutation as described in Absolute given the observed total coverage (DP) and the provided tumor purity and ploidies in the {sample} sample/s",  "Float", "1"),
+    ("k",        "Minimum number of supporting reads, k, such that the probability of observing k or more non-reference reads due to sequencing error is less than the defined FPR in the {sample} sample/s", "Float", "1"),
+    ("eaf",      "Expected VAF considering the purity and ploidy/copy number in the {sample} sample/s",                                                              "Float",   "1"),
+    ("bq",       "Median base call quality of the reads supporting each allele in the {sample} sample/s",                                                            "Float",   "R"),
+    ("mq",       "Median mapping quality of the reads supporting each allele in the {sample} sample/s",                                                              "Float",   "R"),
+    ("pos",      "Median position within the read of the reads supporting each allele in the {sample} sample/s",                                                     "Float",   "R"),
+    ("rsmq",     "Rank sum test comparing the MQ distributions supporting the reference and the alternate in the {sample} sample/s",                                 "Float",   "A"),
+    ("rsmq_pv",  "Rank sum test p-value for MQ distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", "Float",  "A"),
+    ("rsbq",     "Rank sum test comparing the BQ distributions supporting the reference and the alternate in the {sample} sample/s",                                 "Float",   "A"),
+    ("rsbq_pv",  "Rank sum test p-value for BQ distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", "Float",  "A"),
+    ("rspos",    "Rank sum test comparing the position distributions supporting the reference and the alternate in the {sample} sample/s",                            "Float",   "A"),
+    ("rspos_pv", "Rank sum test p-value for position distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", "Float", "A"),
+]
+
+# eaf is not produced per-replicate
+_REPLICATE_HEADER_TEMPLATES = [t for t in _HEADER_TEMPLATES if t[0] != "eaf"]
\ No newline at end of file
diff --git a/vafator/pileup_utils.py b/vafator/pileup_utils.py
new file mode 100755
index 0000000..1048e7c
--- /dev/null
+++ b/vafator/pileup_utils.py
@@ -0,0 +1,111 @@
+from collections import Counter
+from dataclasses import dataclass
+from math import nan
+from typing import List, Dict
+
+import numpy as np
+
+
+@dataclass
+class VariantRecord:
+    """Lightweight, picklable variant representation used by pileup workers.
+    Mirrors the cyvcf2.Variant fields accessed by pileup and metrics functions.
+
+    Attributes:
+        CHROM: chromosome name
+        POS: 1-based variant position
+        REF: reference allele
+        ALT: list of alternate alleles
+    """
+    CHROM: str
+    POS: int
+    REF: str
+    ALT: List[str]
+
+
+@dataclass
+class CoverageMetrics:
+    """Pileup metrics computed for a single variant in a single BAM.
+
+    Attributes:
+        ac: allele counts per base, including the reference
+        dp: total depth of coverage
+        bqs: median base call quality per allele, including the reference
+        mqs: median mapping quality per allele, including the reference
+        positions: median read position per allele, including the reference
+        all_bqs: full base call quality distribution per allele
+        all_mqs: full mapping quality distribution per allele
+        all_positions: full read position distribution per allele
+    """
+    ac: dict
+    dp: int
+    bqs: dict = None
+    mqs: dict = None
+    positions: dict = None
+    all_bqs: dict = None
+    all_mqs: dict = None
+    all_positions: dict = None
+
+
+EMPTY_METRICS = CoverageMetrics(
+    ac=Counter(), dp=0, bqs=Counter(), mqs=Counter(), positions=Counter(),
+    all_bqs={}, all_mqs={}, all_positions={}
+)
+
+
+def aggregate_list_per_base(bases: List[str], values: list) -> Dict[str, list]:
+    """Group a list of values by their corresponding base.
+
+    Args:
+        bases: list of base characters (e.g. ['A', 'T', 'A', ''])
+        values: list of numeric values parallel to bases
+
+    Returns:
+        dict mapping each base to a list of its associated values
+    """
+    aggregated_values = {}
+    for b, v in zip(bases, values):
+        if b not in aggregated_values:
+            aggregated_values[b] = []
+        aggregated_values[b].append(v)
+    return aggregated_values
+
+
+def safe_median(values: list) -> float:
+    """Return the median of a list, or nan if the list is empty.
+    Avoids numpy RuntimeWarning raised by np.median on empty arrays.
+
+    Args:
+        values: list of numeric values
+
+    Returns:
+        median as float, or nan if values is empty
+    """
+    return float(np.median(values)) if values else nan
+
+
+def is_snp(variant) -> bool:
+    """Return True if the variant is a single nucleotide polymorphism.
+
+    Args:
+        variant: any object with REF and ALT attributes (Variant or VariantRecord)
+    """
+    return len(variant.REF) == 1 and len(variant.ALT[0]) == 1
+
+
+def is_insertion(variant) -> bool:
+    """Return True if the variant is an insertion.
+
+    Args:
+        variant: any object with REF and ALT attributes (Variant or VariantRecord)
+    """
+    return len(variant.REF) == 1 and len(variant.ALT[0]) > 1
+
+
+def is_deletion(variant) -> bool:
+    """Return True if the variant is a deletion.
+
+    Args:
+        variant: any object with REF and ALT attributes (Variant or VariantRecord)
+    """
+    return len(variant.ALT[0]) == 1 and len(variant.REF) > 1
\ No newline at end of file
diff --git a/vafator/pileups.py b/vafator/pileups.py
index b6afb9f..c23041f 100755
--- a/vafator/pileups.py
+++ b/vafator/pileups.py
@@ -1,67 +1,75 @@
-import numpy as np
 from collections import Counter, defaultdict
-from cyvcf2 import Variant
-from dataclasses import dataclass
-from math import nan
-from pysam.libcalignmentfile import IteratorColumnRegion, AlignmentFile
 from typing import Union, List, Dict, Iterator, Tuple
-from vafator import AMBIGUOUS_BASES
-
-@dataclass
-class VariantRecord:
-    """Lightweight, picklable variant representation used by pileup workers.
-    Mirrors the cyvcf2.Variant fields accessed by pileup and metrics functions."""
-    CHROM: str
-    POS: int
-    REF: str
-    ALT: List[str]
-
-
-def is_snp(variant: Variant):
-    return len(variant.REF) == 1 and len(variant.ALT[0]) == 1
-
-
-def is_insertion(variant: Variant):
-    return len(variant.REF) == 1 and len(variant.ALT[0]) > 1
-
 
-def is_deletion(variant: Variant):
-    return len(variant.ALT[0]) == 1 and len(variant.REF) > 1
+from cyvcf2 import Variant
+from pysam.libcalignmentfile import IteratorColumnRegion, AlignmentFile
 
+from vafator.constants import AMBIGUOUS_BASES
+from vafator.pileup_utils import *
 
 def get_variant_pileup(
         variant: Union[Variant, VariantRecord], bam: AlignmentFile,
-        min_base_quality, min_mapping_quality) -> IteratorColumnRegion:
-    """Single-variant pileup, kept for backwards compatibility and tests."""
+        min_base_quality: int, min_mapping_quality: int) -> IteratorColumnRegion:
+    """Open a pileup iterator at a single variant position.
+    Kept for backwards compatibility and use in tests.
+
+    Args:
+        variant: variant to query (cyvcf2 Variant or VariantRecord)
+        bam: open pysam AlignmentFile
+        min_base_quality: minimum base call quality; bases below this are excluded
+        min_mapping_quality: minimum mapping quality; reads below this are excluded
+
+    Returns:
+        pysam IteratorColumnRegion over the variant position
+    """
     position = variant.POS
-    return bam.pileup(contig=variant.CHROM, start=position - 1, stop=position,
-                      truncate=True,
-                      max_depth=1000000,
-                      min_base_quality=min_base_quality,
-                      min_mapping_quality=min_mapping_quality,
-                      stepper='samtools',
-                      )
+    return bam.pileup(
+        contig=variant.CHROM, 
+        start=position - 1, 
+        stop=position,
+        truncate=True, 
+        max_depth=1000000,
+        min_base_quality=min_base_quality,
+        min_mapping_quality=min_mapping_quality,
+        stepper='samtools'
+        )
 
 
 def get_region_pileup(chrom: str, start: int, end: int, bam: AlignmentFile,
-                      min_base_quality, min_mapping_quality):
-    """
-    Opens a single pileup iterator spanning a whole region (e.g. one chromosome).
-    start is 0-based inclusive, end is 1-based exclusive (last variant POS).
+                      min_base_quality: int, min_mapping_quality: int):
+    """Open a single pileup iterator spanning a genomic region.
+
+    Args:
+        chrom: chromosome name
+        start: 0-based inclusive start position
+        end: 1-based exclusive end position (last variant POS)
+        bam: open pysam AlignmentFile
+        min_base_quality: minimum base call quality; bases below this are excluded
+        min_mapping_quality: minimum mapping quality; reads below this are excluded
+
+    Returns:
+        pysam pileup iterator over the region
     """
-    return bam.pileup(contig=chrom, start=start, stop=end,
-                      truncate=True,
-                      max_depth=1000000,
-                      min_base_quality=min_base_quality,
-                      min_mapping_quality=min_mapping_quality,
-                      stepper='samtools',
-                      )
+    return bam.pileup(
+        contig=chrom, 
+        start=start, 
+        stop=end,
+        truncate=True, 
+        max_depth=1000000,
+        min_base_quality=min_base_quality,
+        min_mapping_quality=min_mapping_quality,
+        stepper='samtools'
+        )
 
 
 def stream_variants_by_chrom(vcf) -> Iterator[Tuple[str, List[Variant]]]:
-    """
-    Yields (chrom, [variants]) one chromosome at a time.
-    Only one chromosome's variants are held in memory at once.
+    """Yield variants grouped by chromosome, one chromosome at a time.
+
+    Args:
+        vcf: open cyvcf2 VCF iterator
+
+    Returns:
+        iterator of (chrom, [variants]) tuples
     """
     current_chrom = None
     current_variants = []
@@ -83,38 +91,45 @@ def collect_metrics_for_chrom(
         bam: AlignmentFile,
         min_base_quality: int,
         min_mapping_quality: int,
-        include_ambiguous_bases: bool = False) -> Dict[Tuple, 'CoverageMetrics']:
-    """
-    Opens ONE pileup iterator over the entire chromosome region covered by variants.
-    Metrics are computed IMMEDIATELY for each pileup column while it is still valid —
-    avoids segfaults from storing PileupColumn objects after the iterator advances.
-
-    Returns {(pos, REF, ALT[0]): CoverageMetrics}.
+        include_ambiguous_bases: bool = False) -> Dict[Tuple, CoverageMetrics]:
+    """Compute pileup metrics for all variants on a chromosome using a single pileup iterator.
+
+    Metrics are computed immediately for each pileup column while it is still valid —
+    avoids segfaults from storing PileupColumn C objects after the iterator advances.
+
+    Args:
+        chrom: chromosome name
+        variants: list of variants on this chromosome (must be sorted by position)
+        bam: open pysam AlignmentFile
+        min_base_quality: minimum base call quality threshold
+        min_mapping_quality: minimum mapping quality threshold
+        include_ambiguous_bases: if True, ambiguous bases are counted in depth
+
+    Returns:
+        {(pos, REF, ALT[0]): CoverageMetrics} for each variant with read support
     """
     if not variants:
         return {}
 
-    # index variants by 1-based position for O(1) lookup during streaming
-    variants_by_pos: Dict[int, List[Variant]] = defaultdict(list)
+    variants_by_pos: Dict[int, List] = defaultdict(list)
     for v in variants:
         variants_by_pos[v.POS].append(v)
 
     start = variants[0].POS - 1   # 0-based inclusive
     end = variants[-1].POS        # exclusive end for pysam
-
     results: Dict[Tuple, CoverageMetrics] = {}
 
     for pileup_col in get_region_pileup(
-            chrom=chrom, start=start, end=end,
-            bam=bam,
-            min_base_quality=min_base_quality,
-            min_mapping_quality=min_mapping_quality,
+        chrom=chrom, 
+        start=start, 
+        end=end, 
+        bam=bam,
+        min_base_quality=min_base_quality,
+        min_mapping_quality=min_mapping_quality
     ):
         ref_pos = pileup_col.reference_pos + 1  # convert to 1-based
         if ref_pos not in variants_by_pos:
             continue
-
-        # compute metrics NOW while pileup_col is still valid in C memory
         for variant in variants_by_pos[ref_pos]:
             metrics = _get_metrics_from_column(variant, pileup_col, include_ambiguous_bases)
             if metrics is not None:
@@ -123,35 +138,18 @@ def collect_metrics_for_chrom(
     return results
 
 
-@dataclass
-class CoverageMetrics:
-    # number supporting reads of each base, including the reference
-    ac: dict
-    # total depth of coverage
-    dp: int
-    # median base call quality of each base, including the reference
-    bqs: dict = None
-    # median mapping quality of each alternate base, including the reference
-    mqs: dict = None
-    # median position within the read of each alternate base, including the reference
-    positions: dict = None
-    # base call quality distribution of each base, including the reference
-    all_bqs: dict = None
-    # mapping quality distribution of each base, including the reference
-    all_mqs: dict = None
-    # position within the read distribution of each base, including the reference
-    all_positions: dict = None
-
-
-EMPTY_METRICS = CoverageMetrics(
-    ac=Counter(), dp=0, bqs=Counter(), mqs=Counter(), positions=Counter(),
-    all_bqs={}, all_mqs={}, all_positions={}
-)
-
-
-def _get_metrics_from_column(variant: Variant, pileup_col,
-                              include_ambiguous_bases=False) -> 'CoverageMetrics':
-    """Dispatch to the right metrics function based on variant type."""
+def _get_metrics_from_column(variant, pileup_col,
+                              include_ambiguous_bases: bool = True) -> CoverageMetrics:
+    """Dispatch pileup metrics computation based on variant type.
+
+    Args:
+        variant: Variant or VariantRecord being evaluated
+        pileup_col: pysam PileupColumn at the variant position
+        include_ambiguous_bases: if True, ambiguous bases are counted in depth
+
+    Returns:
+        CoverageMetrics for this variant, or None if the variant type is not supported
+    """
     if is_snp(variant):
         return _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases)
     elif is_insertion(variant):
@@ -161,11 +159,18 @@ def _get_metrics_from_column(variant: Variant, pileup_col,
     return None
 
 
-def _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases=False) -> CoverageMetrics:
-    bases = []
-    qualities = []
-    mapping_qualities = []
-    query_positions = []
+def _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases: bool = True) -> CoverageMetrics:
+    """Compute SNV metrics from a pileup column.
+    Deletions at the position are represented as empty string bases and included in depth.
+
+    Args:
+        pileup_col: pysam PileupColumn at the SNV position
+        include_ambiguous_bases: if True, IUPAC ambiguous bases are counted in depth
+
+    Returns:
+        CoverageMetrics with ac, dp, bqs, mqs, positions and their distributions
+    """
+    bases, qualities, mapping_qualities, query_positions = [], [], [], []
 
     for read in pileup_col.pileups:
         if read.is_refskip:
@@ -203,7 +208,18 @@ def _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases=False) -> C
     )
 
 
-def _get_insertion_metrics_from_column(variant: Variant, pileup_col) -> CoverageMetrics:
+def _get_insertion_metrics_from_column(variant, pileup_col) -> CoverageMetrics:
+    """Compute insertion metrics from a pileup column.
+    Checks both insertion length and sequence to count supporting reads.
+    Reads with no indel at this position are counted as reference support.
+
+    Args:
+        variant: Variant or VariantRecord with REF and ALT defining the insertion
+        pileup_col: pysam PileupColumn at the insertion position
+
+    Returns:
+        CoverageMetrics with ac, dp, mqs, positions (bqs is empty for indels)
+    """
     ac = {alt.upper(): 0 for alt in variant.ALT}
     mq = {alt.upper(): [] for alt in variant.ALT}
     mq[variant.REF] = []
@@ -251,7 +267,18 @@ def _get_insertion_metrics_from_column(variant: Variant, pileup_col) -> Coverage
     )
 
 
-def _get_deletion_metrics_from_column(variant: Variant, pileup_col) -> CoverageMetrics:
+def _get_deletion_metrics_from_column(variant, pileup_col) -> CoverageMetrics:
+    """Compute deletion metrics from a pileup column.
+    Checks both deletion length and position via CIGAR traversal to count supporting reads.
+    Reads with no indel at this position are counted as reference support.
+
+    Args:
+        variant: Variant or VariantRecord with REF and ALT defining the deletion
+        pileup_col: pysam PileupColumn at the deletion position
+
+    Returns:
+        CoverageMetrics with ac, dp, mqs, positions (bqs is empty for indels)
+    """
     ac = {alt.upper(): 0 for alt in variant.ALT}
     mq = {alt.upper(): [] for alt in variant.ALT}
     mq[variant.REF] = []
@@ -292,17 +319,4 @@ def _get_deletion_metrics_from_column(variant: Variant, pileup_col) -> CoverageM
         all_mqs={k: l for k, l in mq.items()},
         all_positions={k: l for k, l in pos.items()},
         all_bqs=Counter()
-    )
- 
-def aggregate_list_per_base(bases, values) -> dict:
-    aggregated_values = {}
-    for b, v in zip(bases, values):
-        if b not in aggregated_values:
-            aggregated_values[b] = []
-        aggregated_values[b].append(v)
-    return aggregated_values
- 
- 
-def safe_median(values) -> float:
-    """Return median of values, or 0.0 for empty lists (avoids numpy RuntimeWarning)."""
-    return float(np.median(values)) if values else nan
+    )
\ No newline at end of file

From e198540523fd3e5b624fa337a17be3a544a08ad5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Fri, 20 Mar 2026 13:24:24 +0100
Subject: [PATCH 20/32] Fix VCF INFO field error

---
 vafator/annotator.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/vafator/annotator.py b/vafator/annotator.py
index d3a381e..f0cc2fd 100755
--- a/vafator/annotator.py
+++ b/vafator/annotator.py
@@ -263,24 +263,24 @@ def _annotate_replicate(self, v: Variant, s: str, i: int, m) -> None:
             i: 0-based index of the BAM within the sample's BAM list
             m: pre-computed CoverageMetrics for this variant in this BAM
         """
-        p = "{}_{}".format(s, i + 1)
-        v.INFO["{}_af".format(p)] = ",".join([str(self._calculate_af(m.ac[alt], m.dp)) for alt in v.ALT])
-        v.INFO["{}_ac".format(p)] = ",".join([str(m.ac[alt]) for alt in v.ALT])
-        v.INFO["{}_n".format(p)] = str(sum(m.ac.get(b, 0) for b in AMBIGUOUS_BASES))
-        v.INFO["{}_dp".format(p)] = m.dp
-        v.INFO["{}_pu".format(p)] = ",".join([str(self.power.calculate_power(ac=m.ac[alt], dp=m.dp, sample=s, variant=v)) for alt in v.ALT])
-        
+        n = i + 1
+        v.INFO["{}_af_{}".format(s, n)] = ",".join([str(self._calculate_af(m.ac[alt], m.dp)) for alt in v.ALT])
+        v.INFO["{}_ac_{}".format(s, n)] = ",".join([str(m.ac[alt]) for alt in v.ALT])
+        v.INFO["{}_n_{}".format(s, n)] = str(sum(m.ac.get(b, 0) for b in AMBIGUOUS_BASES))
+        v.INFO["{}_dp_{}".format(s, n)] = m.dp
+        v.INFO["{}_pu_{}".format(s, n)] = ",".join([str(self.power.calculate_power(ac=m.ac[alt], dp=m.dp, sample=s, variant=v)) for alt in v.ALT])
+
         power, k = self.power.calculate_absolute_power(dp=m.dp, sample=s, variant=v)
-        v.INFO["{}_pw".format(p)] = str(power)
-        v.INFO["{}_k".format(p)] = str(k)
-        v.INFO["{}_bq".format(p)] = ",".join([str(m.bqs[v.REF])] + [str(m.bqs[alt]) for alt in v.ALT])
-        v.INFO["{}_mq".format(p)] = ",".join([str(m.mqs[v.REF])] + [str(m.mqs[alt]) for alt in v.ALT])
-        v.INFO["{}_pos".format(p)] = ",".join([str(m.positions[v.REF])] + [str(m.positions[alt]) for alt in v.ALT])
+        v.INFO["{}_pw_{}".format(s, n)] = str(power)
+        v.INFO["{}_k_{}".format(s, n)] = str(k)
+        v.INFO["{}_bq_{}".format(s, n)] = ",".join([str(m.bqs[v.REF])] + [str(m.bqs[alt]) for alt in v.ALT])
+        v.INFO["{}_mq_{}".format(s, n)] = ",".join([str(m.mqs[v.REF])] + [str(m.mqs[alt]) for alt in v.ALT])
+        v.INFO["{}_pos_{}".format(s, n)] = ",".join([str(m.positions[v.REF])] + [str(m.positions[alt]) for alt in v.ALT])
         for key, tag in [(m.all_mqs, "rsmq"), (m.all_bqs, "rsbq"), (m.all_positions, "rspos")]:
             pvalues, stats = get_rank_sum_tests(key, v)
             if stats:
-                v.INFO["{}_{}_{}".format(s, tag, i + 1)] = ",".join(stats)
-                v.INFO["{}_{}_pv_{}".format(s, tag, i + 1)] = ",".join(pvalues)
+                v.INFO["{}_{}_{}".format(s, tag, n)] = ",".join(stats)
+                v.INFO["{}_{}_pv_{}".format(s, tag, n)] = ",".join(pvalues)
 
     def _annotate_sample(self, v: Variant, s: str, gac: Counter, gdp: int,
                          gbq: Counter, gmq: Counter, gpos: Counter,

From 5e3344deacd557bf42ddc2aa5f2b1e03103414df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Fri, 20 Mar 2026 13:36:22 +0100
Subject: [PATCH 21/32] Fix wrong tag

---
 vafator/annotator.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/vafator/annotator.py b/vafator/annotator.py
index f0cc2fd..781c355 100755
--- a/vafator/annotator.py
+++ b/vafator/annotator.py
@@ -356,10 +356,12 @@ def _get_headers(input_bams: dict) -> list:
             if len(bams) > 1:
                 for i, bam in enumerate(bams, start=1):
                     n = os.path.basename(bam).split(".")[0]
-                    sample_label = "{} {}".format(s, n)
                     for suffix, description, type, number in _REPLICATE_HEADER_TEMPLATES:
-                        headers.append(Annotator._make_header(suffix, description, type, number,
-                                                    sample=sample_label, index=i))
+                        headers.append(
+                            Annotator._make_header(
+                            suffix, description, type, number, sample=s, index=i
+                            )
+                        )
         return headers
 
     @staticmethod

From 6db73d6b8c6f9b82c331d909d96d2da2d21bc106 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Fri, 20 Mar 2026 14:34:01 +0100
Subject: [PATCH 22/32] bump version

---
 setup.cfg           | 2 +-
 vafator/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 4f08792..bc22ccb 100755
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = VAFator
-version = 3.0.0
+version = 3.1.0
 description = Annotate variants in a VCF file with technical annotations from one or more BAMs 
 description-file = README.md
 long_description = file: README.md
diff --git a/vafator/__init__.py b/vafator/__init__.py
index 136a680..f52972e 100755
--- a/vafator/__init__.py
+++ b/vafator/__init__.py
@@ -1,2 +1,2 @@
-VERSION = '3.0.0'
+VERSION = '3.1.0'
 

From 12f017af36983292714a4644809224b7d97dcb25 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Fri, 20 Mar 2026 14:46:36 +0100
Subject: [PATCH 23/32] Readd test that was accidentally deleted

---
 vafator/tests/test_power_calculator.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vafator/tests/test_power_calculator.py b/vafator/tests/test_power_calculator.py
index 2e120e7..c079a83 100644
--- a/vafator/tests/test_power_calculator.py
+++ b/vafator/tests/test_power_calculator.py
@@ -80,6 +80,12 @@ def test_varying_ploidy(self):
         self.assertLess(
             power1.calculate_power(dp=10, ac=2, sample='tumor', variant=None),
             power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None))
+        
+        power3 = PowerCalculator(
+            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=6.0)}, purities={'tumor': 0.8})
+        self.assertLess(
+            power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None),
+            power3.calculate_power(dp=10, ac=2, sample='tumor', variant=None))
 
     def test_local_copy_numbers(self):
         input_bed = pkg_resources.resource_filename(__name__, "resources/test_copy_numbers.bed")

From 4b6a272e3c4314539aaaa98cacdb2c866817930e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Wed, 1 Apr 2026 17:20:51 +0200
Subject: [PATCH 24/32] improve code quality based on Codacy output

---
 vafator/annotator.py                   | 26 +++++++++++++-------------
 vafator/command_line.py                |  2 +-
 vafator/pileups.py                     | 20 ++++++++++----------
 vafator/power.py                       |  1 -
 vafator/tests/test_pileups.py          |  1 -
 vafator/tests/test_power_calculator.py |  2 +-
 6 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/vafator/annotator.py b/vafator/annotator.py
index 781c355..48008e4 100755
--- a/vafator/annotator.py
+++ b/vafator/annotator.py
@@ -95,10 +95,10 @@ def __init__(self, input_vcf: str, output_vcf: str,
         self.include_ambiguous_bases = include_ambiguous_bases
         self.num_processes = num_processes
         self.power = PowerCalculator(
-            normal_ploidy=normal_ploidy, 
-            tumor_ploidies=tumor_ploidies, 
+            normal_ploidy=normal_ploidy,
+            tumor_ploidies=tumor_ploidies,
             purities=purities,
-            error_rate=error_rate, 
+            error_rate=error_rate,
             fpr=fpr
         )
 
@@ -129,7 +129,7 @@ def __init__(self, input_vcf: str, output_vcf: str,
         for a in Annotator._get_headers(input_bams):
             self.vcf.add_info_to_header(a)
         self.vcf_writer = Writer(output_vcf, self.vcf)
-        
+
         self.bam_paths = input_bams
         self.bam_readers = {s: [pysam.AlignmentFile(b, "rb") for b in bams] for s, bams in input_bams.items()}
 
@@ -351,28 +351,28 @@ def _get_headers(input_bams: dict) -> list:
         """
         headers = []
         for s, bams in input_bams.items():
-            for suffix, description, type, number in _HEADER_TEMPLATES:
-                headers.append(Annotator._make_header(suffix, description, type, number, sample=s))
+            for suffix, description, typ, number in _HEADER_TEMPLATES:
+                headers.append(Annotator._make_header(suffix, description, typ, number, sample=s))
             if len(bams) > 1:
                 for i, bam in enumerate(bams, start=1):
                     n = os.path.basename(bam).split(".")[0]
-                    for suffix, description, type, number in _REPLICATE_HEADER_TEMPLATES:
+                    for suffix, description, typ, number in _REPLICATE_HEADER_TEMPLATES:
                         headers.append(
                             Annotator._make_header(
-                            suffix, description, type, number, sample=s, index=i
+                            suffix, description, typ, number, sample=s, index=i
                             )
                         )
         return headers
 
     @staticmethod
-    def _make_header(suffix: str, description: str, type: str, number: str,
+    def _make_header(suffix: str, description: str, typ: str, number: str,
                      sample: str, index: int = None) -> dict:
         """Build a single INFO header dict for cyvcf2's add_info_to_header.
 
         Args:
             suffix: annotation suffix (e.g. 'af', 'dp')
             description: description template with {sample} placeholder
-            type: VCF type string ('Float', 'Integer', 'String')
+            typ: VCF type string ('Float', 'Integer', 'String')
             number: VCF number string ('A', 'R', '1', etc.)
             sample: sample name to substitute into ID and description
             index: optional replicate index appended to the ID
@@ -380,7 +380,7 @@ def _make_header(suffix: str, description: str, type: str, number: str,
         Returns:
             dict with keys ID, Description, Type, Number suitable for cyvcf2's add_info_to_header
         """
-        id = "{}_{}".format(sample, suffix)
+        header_id = "{}_{}".format(sample, suffix)
         if index is not None:
-            id = "{}_{}".format(id, index)
-        return {'ID': id, 'Description': description.format(sample=sample), 'Type': type, 'Number': number}
\ No newline at end of file
+            header_id = "{}_{}".format(header_id, index)
+        return {'ID': header_id, 'Description': description.format(sample=sample), 'Type': typ, 'Number': number}
\ No newline at end of file
diff --git a/vafator/command_line.py b/vafator/command_line.py
index f2d639d..6a9a58a 100755
--- a/vafator/command_line.py
+++ b/vafator/command_line.py
@@ -104,7 +104,7 @@ def annotator():
         num_processes=args.num_processes,
     )
     annotator.run()
-    
+
     logging.info("Vafator finished!")
 
 
diff --git a/vafator/pileups.py b/vafator/pileups.py
index c23041f..72b70e8 100755
--- a/vafator/pileups.py
+++ b/vafator/pileups.py
@@ -5,7 +5,7 @@
 from pysam.libcalignmentfile import IteratorColumnRegion, AlignmentFile
 
 from vafator.constants import AMBIGUOUS_BASES
-from vafator.pileup_utils import *
+from vafator.pileup_utils import VariantRecord, CoverageMetrics, safe_median, aggregate_list_per_base, is_snp, is_insertion, is_deletion
 
 def get_variant_pileup(
         variant: Union[Variant, VariantRecord], bam: AlignmentFile,
@@ -24,10 +24,10 @@ def get_variant_pileup(
     """
     position = variant.POS
     return bam.pileup(
-        contig=variant.CHROM, 
-        start=position - 1, 
+        contig=variant.CHROM,
+        start=position - 1,
         stop=position,
-        truncate=True, 
+        truncate=True,
         max_depth=1000000,
         min_base_quality=min_base_quality,
         min_mapping_quality=min_mapping_quality,
@@ -51,10 +51,10 @@ def get_region_pileup(chrom: str, start: int, end: int, bam: AlignmentFile,
         pysam pileup iterator over the region
     """
     return bam.pileup(
-        contig=chrom, 
-        start=start, 
+        contig=chrom,
+        start=start,
         stop=end,
-        truncate=True, 
+        truncate=True,
         max_depth=1000000,
         min_base_quality=min_base_quality,
         min_mapping_quality=min_mapping_quality,
@@ -120,9 +120,9 @@ def collect_metrics_for_chrom(
     results: Dict[Tuple, CoverageMetrics] = {}
 
     for pileup_col in get_region_pileup(
-        chrom=chrom, 
-        start=start, 
-        end=end, 
+        chrom=chrom,
+        start=start,
+        end=end,
         bam=bam,
         min_base_quality=min_base_quality,
         min_mapping_quality=min_mapping_quality
diff --git a/vafator/power.py b/vafator/power.py
index f465fe9..93ee5a3 100644
--- a/vafator/power.py
+++ b/vafator/power.py
@@ -1,4 +1,3 @@
-from functools import lru_cache
 from typing import Optional
 
 from cyvcf2 import Variant
diff --git a/vafator/tests/test_pileups.py b/vafator/tests/test_pileups.py
index d59aae3..c0450e2 100644
--- a/vafator/tests/test_pileups.py
+++ b/vafator/tests/test_pileups.py
@@ -1,6 +1,5 @@
 from collections import Counter
 from unittest import TestCase
-from unittest.mock import MagicMock
 import pkg_resources
 import pysam
 
diff --git a/vafator/tests/test_power_calculator.py b/vafator/tests/test_power_calculator.py
index c079a83..0c7612a 100644
--- a/vafator/tests/test_power_calculator.py
+++ b/vafator/tests/test_power_calculator.py
@@ -80,7 +80,7 @@ def test_varying_ploidy(self):
         self.assertLess(
             power1.calculate_power(dp=10, ac=2, sample='tumor', variant=None),
             power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None))
-        
+
         power3 = PowerCalculator(
             tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=6.0)}, purities={'tumor': 0.8})
         self.assertLess(

From 8b808376f29ed5ee29e558a66cc72a1af8badeb5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Thu, 2 Apr 2026 12:56:37 +0200
Subject: [PATCH 25/32] more codacy improvements

---
 vafator/annotator.py    | 4 ++--
 vafator/command_line.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vafator/annotator.py b/vafator/annotator.py
index 48008e4..ec983d4 100755
--- a/vafator/annotator.py
+++ b/vafator/annotator.py
@@ -253,7 +253,7 @@ def _add_stats(self, variant: Variant, metrics_by_bam: dict) -> None:
             self._annotate_sample(variant, sample, global_ac, global_dp, global_bq,
                                   global_mq, global_pos, global_all_mqs, global_all_bqs,
                                   global_all_positions)
-            
+
     def _annotate_replicate(self, v: Variant, s: str, i: int, m) -> None:
         """Write per-replicate annotations — only called when multiple BAMs are provided for a sample.
 
@@ -355,7 +355,7 @@ def _get_headers(input_bams: dict) -> list:
                 headers.append(Annotator._make_header(suffix, description, typ, number, sample=s))
             if len(bams) > 1:
                 for i, bam in enumerate(bams, start=1):
-                    n = os.path.basename(bam).split(".")[0]
+                    # n = os.path.basename(bam).split(".")[0]
                     for suffix, description, typ, number in _REPLICATE_HEADER_TEMPLATES:
                         headers.append(
                             Annotator._make_header(
diff --git a/vafator/command_line.py b/vafator/command_line.py
index 6a9a58a..e565c01 100755
--- a/vafator/command_line.py
+++ b/vafator/command_line.py
@@ -126,7 +126,7 @@ def multiallelics_filter():
         tumor_sample_name=args.tumor_sample_name
     )
     filter.run()
-    
+
     logging.info("Vafator multiallelic filter finished!")
 
 

From cf31f1e4543c7bd5071e935af14c6e12e8b37b2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Thu, 2 Apr 2026 14:21:24 +0200
Subject: [PATCH 26/32] Re-add comments

---
 vafator/pileups.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/vafator/pileups.py b/vafator/pileups.py
index 72b70e8..e85aa53 100755
--- a/vafator/pileups.py
+++ b/vafator/pileups.py
@@ -235,24 +235,27 @@ def _get_insertion_metrics_from_column(variant, pileup_col) -> CoverageMetrics:
 
     for pileup_read in pileup_reads:
         if pileup_read.indel > 0:
+            # read with an insertion
             index = pileup_read.alignment.reference_start
             relative_position = 0
             for cigar_type, cigar_length in pileup_read.alignment.cigartuples:
-                if cigar_type in [0, 2, 3, 7, 8]:
+                if cigar_type in [0, 2, 3, 7, 8]: # consumes reference M, D, N, =, X
                     index += cigar_length
                     if index > variant_position:
                         break
-                if cigar_type in [0, 1, 4, 7, 8]:
+                if cigar_type in [0, 1, 4, 7, 8]: # consumes query M, I, S, =, X
                     relative_position += cigar_length
-                if cigar_type == 1:
+                if cigar_type == 1: # does not count I
                     insertion_in_query = pileup_read.alignment.query[
                                          relative_position:relative_position + insertion_length]
                     if index == variant_position and cigar_length == insertion_length \
                             and insertion == insertion_in_query:
+                        # the read contains the insertion
                         ac[alt_upper] += 1
                         mq[alt_upper].append(pileup_read.alignment.mapping_quality)
                         pos[alt_upper].append(pileup_read.query_position_or_next)
         elif pileup_read.indel == 0:
+            # NOTE: considers all reads without indels to be the reference!
             mq[variant.REF].append(pileup_read.alignment.mapping_quality)
             pos[variant.REF].append(pileup_read.query_position_or_next)
 
@@ -295,7 +298,7 @@ def _get_deletion_metrics_from_column(variant, pileup_col) -> CoverageMetrics:
         if pileup_read.indel < 0:
             start = pileup_read.alignment.reference_start
             for cigar_type, cigar_length in pileup_read.alignment.cigartuples:
-                if cigar_type in [0, 3, 7, 8]:
+                if cigar_type in [0, 3, 7, 8]: # consumes reference M, N, =, X
                     start += cigar_length
                 elif cigar_type == 2:
                     if start == variant_position and cigar_length == deletion_length:
@@ -308,6 +311,7 @@ def _get_deletion_metrics_from_column(variant, pileup_col) -> CoverageMetrics:
                 if start > variant_position:
                     break
         elif pileup_read.indel == 0:
+            # NOTE: considers all reads without indels to be the reference!
             mq[variant.REF].append(pileup_read.alignment.mapping_quality)
             pos[variant.REF].append(pileup_read.query_position_or_next)
 

From 8803b484594bc10f2eb4c50bb8dc95f3f4a75673 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Wed, 8 Apr 2026 15:20:56 +0200
Subject: [PATCH 27/32] black code formatting

---
 setup.py                                  |   2 +-
 vafator/__init__.py                       |   3 +-
 vafator/annotator.py                      | 251 ++++++++++----
 vafator/command_line.py                   | 329 +++++++++++++-----
 vafator/constants.py                      | 123 ++++++-
 vafator/hatchet2bed.py                    |  27 +-
 vafator/multiallelic_filter.py            |  34 +-
 vafator/pileup_utils.py                   |  14 +-
 vafator/pileups.py                        | 108 ++++--
 vafator/ploidies.py                       |  33 +-
 vafator/power.py                          |  44 ++-
 vafator/rank_sum_test.py                  |   9 +-
 vafator/tests/test_annotator.py           | 400 +++++++++++++++-------
 vafator/tests/test_hatchet2bed.py         |  22 +-
 vafator/tests/test_multiallelic_filter.py |  70 +++-
 vafator/tests/test_pileups.py             | 132 +++++--
 vafator/tests/test_ploidy_manager.py      | 112 ++++--
 vafator/tests/test_power_calculator.py    | 250 ++++++++++----
 vafator/tests/test_rank_sum_test.py       |  12 +-
 vafator/tests/utils.py                    |   6 +-
 vafator/vafator2decifer.py                |  87 +++--
 21 files changed, 1509 insertions(+), 559 deletions(-)

diff --git a/setup.py b/setup.py
index 398663e..0610c88 100755
--- a/setup.py
+++ b/setup.py
@@ -8,4 +8,4 @@
     long_description = f.read()
 
 # Build the Python package
-setup()
\ No newline at end of file
+setup()
diff --git a/vafator/__init__.py b/vafator/__init__.py
index f52972e..273ffca 100755
--- a/vafator/__init__.py
+++ b/vafator/__init__.py
@@ -1,2 +1 @@
-VERSION = '3.1.0'
-
+VERSION = "3.1.0"
diff --git a/vafator/annotator.py b/vafator/annotator.py
index ec983d4..519b8eb 100755
--- a/vafator/annotator.py
+++ b/vafator/annotator.py
@@ -8,7 +8,12 @@
 import datetime
 import json
 
-from vafator.constants import AMBIGUOUS_BASES, BATCH_SIZE, _HEADER_TEMPLATES, _REPLICATE_HEADER_TEMPLATES
+from vafator.constants import (
+    AMBIGUOUS_BASES,
+    BATCH_SIZE,
+    _HEADER_TEMPLATES,
+    _REPLICATE_HEADER_TEMPLATES,
+)
 from vafator.ploidies import DEFAULT_PLOIDY
 from vafator.rank_sum_test import get_rank_sum_tests
 from vafator.power import PowerCalculator, DEFAULT_ERROR_RATE, DEFAULT_FPR
@@ -17,13 +22,13 @@
 
 
 def _collect_metrics_worker(
-        chrom: str,
-        variant_tuples: list,
-        bam_paths: dict,
-        min_base_quality: int,
-        min_mapping_quality: int,
-        include_ambiguous_bases: bool
-    ) -> dict:
+    chrom: str,
+    variant_tuples: list,
+    bam_paths: dict,
+    min_base_quality: int,
+    min_mapping_quality: int,
+    include_ambiguous_bases: bool,
+) -> dict:
     """
     Top-level worker function for ProcessPoolExecutor — must be module-level to be picklable.
     Opens its own BAM readers (AlignmentFile objects cannot be shared across processes).
@@ -41,8 +46,10 @@ def _collect_metrics_worker(
         {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}}
     """
     all_metrics = {}
-    variants = [VariantRecord(CHROM=chrom, POS=pos, REF=ref, ALT=[alt])
-                for pos, ref, alt in variant_tuples]
+    variants = [
+        VariantRecord(CHROM=chrom, POS=pos, REF=ref, ALT=[alt])
+        for pos, ref, alt in variant_tuples
+    ]
     for sample, bam_files in bam_paths.items():
         for i, bam_path in enumerate(bam_files):
             bam = pysam.AlignmentFile(bam_path, "rb")
@@ -60,17 +67,21 @@ def _collect_metrics_worker(
 
 class Annotator(object):
 
-    def __init__(self, input_vcf: str, output_vcf: str,
-                 input_bams: dict,
-                 purities: dict = {},
-                 mapping_qual_thr: int = 0,
-                 base_call_qual_thr: int = 29,
-                 tumor_ploidies: dict = {},
-                 normal_ploidy: int = 2,
-                 fpr: float = DEFAULT_FPR,
-                 error_rate: float = DEFAULT_ERROR_RATE,
-                 include_ambiguous_bases: bool = True,
-                 num_processes: int = 1):
+    def __init__(
+        self,
+        input_vcf: str,
+        output_vcf: str,
+        input_bams: dict,
+        purities: dict = {},
+        mapping_qual_thr: int = 0,
+        base_call_qual_thr: int = 29,
+        tumor_ploidies: dict = {},
+        normal_ploidy: int = 2,
+        fpr: float = DEFAULT_FPR,
+        error_rate: float = DEFAULT_ERROR_RATE,
+        include_ambiguous_bases: bool = True,
+        num_processes: int = 1,
+    ):
         """
         Args:
             input_vcf: path to the input VCF file to annotate
@@ -99,7 +110,7 @@ def __init__(self, input_vcf: str, output_vcf: str,
             tumor_ploidies=tumor_ploidies,
             purities=purities,
             error_rate=error_rate,
-            fpr=fpr
+            fpr=fpr,
         )
 
         self.vcf = VCF(input_vcf)
@@ -112,26 +123,40 @@ def __init__(self, input_vcf: str, output_vcf: str,
             "input_vcf": os.path.abspath(input_vcf),
             "output_vcf": os.path.abspath(output_vcf),
             "bams": ";".join(
-                ["{}:{}".format(s, ",".join([os.path.abspath(b) for b in bams]))
-                for s, bams in input_bams.items()]
-                ),
+                [
+                    "{}:{}".format(s, ",".join([os.path.abspath(b) for b in bams]))
+                    for s, bams in input_bams.items()
+                ]
+            ),
             "mapping_quality_threshold": mapping_qual_thr,
             "base_call_quality_threshold": base_call_qual_thr,
             "purities": ";".join(["{}:{}".format(s, p) for s, p in purities.items()]),
             "normal_ploidy": normal_ploidy,
-            "tumor_ploidy": ";".join(
-                ["{}:{}".format(s, p.report_value) for s, p in tumor_ploidies.items()]
-            ) if tumor_ploidies else DEFAULT_PLOIDY,
+            "tumor_ploidy": (
+                ";".join(
+                    [
+                        "{}:{}".format(s, p.report_value)
+                        for s, p in tumor_ploidies.items()
+                    ]
+                )
+                if tumor_ploidies
+                else DEFAULT_PLOIDY
+            ),
             "include_ambiguous_bases": include_ambiguous_bases,
         }
-        self.vcf.add_to_header("##vafator_command_line={}".format(json.dumps(self.header)))
+        self.vcf.add_to_header(
+            "##vafator_command_line={}".format(json.dumps(self.header))
+        )
 
         for a in Annotator._get_headers(input_bams):
             self.vcf.add_info_to_header(a)
         self.vcf_writer = Writer(output_vcf, self.vcf)
 
         self.bam_paths = input_bams
-        self.bam_readers = {s: [pysam.AlignmentFile(b, "rb") for b in bams] for s, bams in input_bams.items()}
+        self.bam_readers = {
+            s: [pysam.AlignmentFile(b, "rb") for b in bams]
+            for s, bams in input_bams.items()
+        }
 
     def run(self) -> None:
         """Run the annotation pipeline over all variants in the input VCF,
@@ -164,13 +189,17 @@ def _run_parallel(self, batch: list) -> None:
             for chrom, chrom_variants in stream_variants_by_chrom(self.vcf):
                 chrom_variants_map[chrom] = chrom_variants
                 variant_tuples = [(v.POS, v.REF, v.ALT[0]) for v in chrom_variants]
-                futures[executor.submit(
-                    _collect_metrics_worker,
-                    chrom=chrom, variant_tuples=variant_tuples, bam_paths=self.bam_paths,
-                    min_base_quality=self.base_call_quality_threshold,
-                    min_mapping_quality=self.mapping_quality_threshold,
-                    include_ambiguous_bases=self.include_ambiguous_bases,
-                )] = chrom
+                futures[
+                    executor.submit(
+                        _collect_metrics_worker,
+                        chrom=chrom,
+                        variant_tuples=variant_tuples,
+                        bam_paths=self.bam_paths,
+                        min_base_quality=self.base_call_quality_threshold,
+                        min_mapping_quality=self.mapping_quality_threshold,
+                        include_ambiguous_bases=self.include_ambiguous_bases,
+                    )
+                ] = chrom
             chrom_results = {futures[f]: f.result() for f in futures}
         for chrom, chrom_variants in chrom_variants_map.items():
             self._annotate_and_batch(chrom_variants, chrom_results[chrom], batch)
@@ -189,17 +218,21 @@ def _collect_chrom_metrics(self, chrom: str, chrom_variants: list) -> dict:
         for sample, bams in self.bam_readers.items():
             for i, bam in enumerate(bams):
                 all_metrics[(sample, i)] = collect_metrics_for_chrom(
-                    chrom=chrom, variants=chrom_variants, bam=bam,
+                    chrom=chrom,
+                    variants=chrom_variants,
+                    bam=bam,
                     min_base_quality=self.base_call_quality_threshold,
                     min_mapping_quality=self.mapping_quality_threshold,
                     include_ambiguous_bases=self.include_ambiguous_bases,
                 )
         return all_metrics
 
-    def _annotate_and_batch(self, chrom_variants: list, all_metrics: dict, batch: list) -> None:
+    def _annotate_and_batch(
+        self, chrom_variants: list, all_metrics: dict, batch: list
+    ) -> None:
         """Annotate variants using pre-computed metrics and append to write batch.
         Flushes the batch to disk when it reaches BATCH_SIZE.
-        
+
         Args:
             chrom_variants: list of cyvcf2 Variant objects to annotate
             all_metrics: {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}}
@@ -208,7 +241,8 @@ def _annotate_and_batch(self, chrom_variants: list, all_metrics: dict, batch: li
         for variant in chrom_variants:
             metrics_by_bam = {
                 (sample, i): all_metrics[(sample, i)].get(
-                    (variant.POS, variant.REF, variant.ALT[0]), EMPTY_METRICS)
+                    (variant.POS, variant.REF, variant.ALT[0]), EMPTY_METRICS
+                )
                 for sample, bams in self.bam_readers.items()
                 for i in range(len(bams))
             }
@@ -218,7 +252,6 @@ def _annotate_and_batch(self, chrom_variants: list, all_metrics: dict, batch: li
                 self._write_batch(batch)
                 batch.clear()
 
-
     def _add_stats(self, variant: Variant, metrics_by_bam: dict) -> None:
         """Annotate a single variant using pre-computed metrics.
 
@@ -250,9 +283,18 @@ def _add_stats(self, variant: Variant, metrics_by_bam: dict) -> None:
                     global_all_positions.update(metrics.all_positions)
                     global_dp += metrics.dp
 
-            self._annotate_sample(variant, sample, global_ac, global_dp, global_bq,
-                                  global_mq, global_pos, global_all_mqs, global_all_bqs,
-                                  global_all_positions)
+            self._annotate_sample(
+                variant,
+                sample,
+                global_ac,
+                global_dp,
+                global_bq,
+                global_mq,
+                global_pos,
+                global_all_mqs,
+                global_all_bqs,
+                global_all_positions,
+            )
 
     def _annotate_replicate(self, v: Variant, s: str, i: int, m) -> None:
         """Write per-replicate annotations — only called when multiple BAMs are provided for a sample.
@@ -264,27 +306,60 @@ def _annotate_replicate(self, v: Variant, s: str, i: int, m) -> None:
             m: pre-computed CoverageMetrics for this variant in this BAM
         """
         n = i + 1
-        v.INFO["{}_af_{}".format(s, n)] = ",".join([str(self._calculate_af(m.ac[alt], m.dp)) for alt in v.ALT])
+        v.INFO["{}_af_{}".format(s, n)] = ",".join(
+            [str(self._calculate_af(m.ac[alt], m.dp)) for alt in v.ALT]
+        )
         v.INFO["{}_ac_{}".format(s, n)] = ",".join([str(m.ac[alt]) for alt in v.ALT])
-        v.INFO["{}_n_{}".format(s, n)] = str(sum(m.ac.get(b, 0) for b in AMBIGUOUS_BASES))
+        v.INFO["{}_n_{}".format(s, n)] = str(
+            sum(m.ac.get(b, 0) for b in AMBIGUOUS_BASES)
+        )
         v.INFO["{}_dp_{}".format(s, n)] = m.dp
-        v.INFO["{}_pu_{}".format(s, n)] = ",".join([str(self.power.calculate_power(ac=m.ac[alt], dp=m.dp, sample=s, variant=v)) for alt in v.ALT])
+        v.INFO["{}_pu_{}".format(s, n)] = ",".join(
+            [
+                str(
+                    self.power.calculate_power(
+                        ac=m.ac[alt], dp=m.dp, sample=s, variant=v
+                    )
+                )
+                for alt in v.ALT
+            ]
+        )
 
         power, k = self.power.calculate_absolute_power(dp=m.dp, sample=s, variant=v)
         v.INFO["{}_pw_{}".format(s, n)] = str(power)
         v.INFO["{}_k_{}".format(s, n)] = str(k)
-        v.INFO["{}_bq_{}".format(s, n)] = ",".join([str(m.bqs[v.REF])] + [str(m.bqs[alt]) for alt in v.ALT])
-        v.INFO["{}_mq_{}".format(s, n)] = ",".join([str(m.mqs[v.REF])] + [str(m.mqs[alt]) for alt in v.ALT])
-        v.INFO["{}_pos_{}".format(s, n)] = ",".join([str(m.positions[v.REF])] + [str(m.positions[alt]) for alt in v.ALT])
-        for key, tag in [(m.all_mqs, "rsmq"), (m.all_bqs, "rsbq"), (m.all_positions, "rspos")]:
+        v.INFO["{}_bq_{}".format(s, n)] = ",".join(
+            [str(m.bqs[v.REF])] + [str(m.bqs[alt]) for alt in v.ALT]
+        )
+        v.INFO["{}_mq_{}".format(s, n)] = ",".join(
+            [str(m.mqs[v.REF])] + [str(m.mqs[alt]) for alt in v.ALT]
+        )
+        v.INFO["{}_pos_{}".format(s, n)] = ",".join(
+            [str(m.positions[v.REF])] + [str(m.positions[alt]) for alt in v.ALT]
+        )
+        for key, tag in [
+            (m.all_mqs, "rsmq"),
+            (m.all_bqs, "rsbq"),
+            (m.all_positions, "rspos"),
+        ]:
             pvalues, stats = get_rank_sum_tests(key, v)
             if stats:
                 v.INFO["{}_{}_{}".format(s, tag, n)] = ",".join(stats)
                 v.INFO["{}_{}_pv_{}".format(s, tag, n)] = ",".join(pvalues)
 
-    def _annotate_sample(self, v: Variant, s: str, gac: Counter, gdp: int,
-                         gbq: Counter, gmq: Counter, gpos: Counter,
-                         gallmq: dict, gallbq: dict, gallpos: dict) -> None:
+    def _annotate_sample(
+        self,
+        v: Variant,
+        s: str,
+        gac: Counter,
+        gdp: int,
+        gbq: Counter,
+        gmq: Counter,
+        gpos: Counter,
+        gallmq: dict,
+        gallbq: dict,
+        gallpos: dict,
+    ) -> None:
         """Write aggregate annotations for a sample, combining metrics across all replicates.
 
         Args:
@@ -299,20 +374,41 @@ def _annotate_sample(self, v: Variant, s: str, gac: Counter, gdp: int,
             gallbq: BQ distributions per allele across all BAMs
             gallpos: read position distributions per allele across all BAMs
         """
-        v.INFO["{}_af".format(s)] = ",".join([str(self._calculate_af(gac[alt], gdp)) for alt in v.ALT])
+        v.INFO["{}_af".format(s)] = ",".join(
+            [str(self._calculate_af(gac[alt], gdp)) for alt in v.ALT]
+        )
         v.INFO["{}_ac".format(s)] = ",".join([str(gac[alt]) for alt in v.ALT])
         v.INFO["{}_n".format(s)] = str(sum(gac.get(b, 0) for b in AMBIGUOUS_BASES))
         v.INFO["{}_dp".format(s)] = gdp
-        v.INFO["{}_eaf".format(s)] = str(self.power.calculate_expected_vaf(sample=s, variant=v))
-        v.INFO["{}_pu".format(s)] = ",".join([str(self.power.calculate_power(ac=gac[alt], dp=gdp, sample=s, variant=v)) for alt in v.ALT])
-        
+        v.INFO["{}_eaf".format(s)] = str(
+            self.power.calculate_expected_vaf(sample=s, variant=v)
+        )
+        v.INFO["{}_pu".format(s)] = ",".join(
+            [
+                str(
+                    self.power.calculate_power(ac=gac[alt], dp=gdp, sample=s, variant=v)
+                )
+                for alt in v.ALT
+            ]
+        )
+
         power, k = self.power.calculate_absolute_power(dp=gdp, sample=s, variant=v)
         v.INFO["{}_pw".format(s)] = str(power)
         v.INFO["{}_k".format(s)] = str(k)
-        v.INFO["{}_bq".format(s)] = ",".join([str(gbq[v.REF])] + [str(gbq[alt]) for alt in v.ALT])
-        v.INFO["{}_mq".format(s)] = ",".join([str(gmq[v.REF])] + [str(gmq[alt]) for alt in v.ALT])
-        v.INFO["{}_pos".format(s)] = ",".join([str(gpos[v.REF])] + [str(gpos[alt]) for alt in v.ALT])
-        for distributions, tag in [(gallmq, "rsmq"), (gallbq, "rsbq"), (gallpos, "rspos")]:
+        v.INFO["{}_bq".format(s)] = ",".join(
+            [str(gbq[v.REF])] + [str(gbq[alt]) for alt in v.ALT]
+        )
+        v.INFO["{}_mq".format(s)] = ",".join(
+            [str(gmq[v.REF])] + [str(gmq[alt]) for alt in v.ALT]
+        )
+        v.INFO["{}_pos".format(s)] = ",".join(
+            [str(gpos[v.REF])] + [str(gpos[alt]) for alt in v.ALT]
+        )
+        for distributions, tag in [
+            (gallmq, "rsmq"),
+            (gallbq, "rsbq"),
+            (gallpos, "rspos"),
+        ]:
             pvalues, stats = get_rank_sum_tests(distributions, v)
             if stats:
                 v.INFO["{}_{}".format(s, tag)] = ",".join(stats)
@@ -320,11 +416,11 @@ def _annotate_sample(self, v: Variant, s: str, gac: Counter, gdp: int,
 
     def _calculate_af(self, ac: int, dp: int) -> float:
         """Return allele frequency, or 0.0 if depth is zero.
- 
+
         Args:
             ac: allele count for this alternate allele
             dp: total depth of coverage
- 
+
         Returns:
             allele frequency rounded to 5 decimal places, or 0.0 if dp is zero
         """
@@ -332,7 +428,7 @@ def _calculate_af(self, ac: int, dp: int) -> float:
 
     def _write_batch(self, batch: list) -> None:
         """Write a batch of annotated variants to the output VCF.
- 
+
         Args:
             batch: list of cyvcf2 Variant objects to write
         """
@@ -352,21 +448,29 @@ def _get_headers(input_bams: dict) -> list:
         headers = []
         for s, bams in input_bams.items():
             for suffix, description, typ, number in _HEADER_TEMPLATES:
-                headers.append(Annotator._make_header(suffix, description, typ, number, sample=s))
+                headers.append(
+                    Annotator._make_header(suffix, description, typ, number, sample=s)
+                )
             if len(bams) > 1:
                 for i, bam in enumerate(bams, start=1):
                     # n = os.path.basename(bam).split(".")[0]
                     for suffix, description, typ, number in _REPLICATE_HEADER_TEMPLATES:
                         headers.append(
                             Annotator._make_header(
-                            suffix, description, typ, number, sample=s, index=i
+                                suffix, description, typ, number, sample=s, index=i
                             )
                         )
         return headers
 
     @staticmethod
-    def _make_header(suffix: str, description: str, typ: str, number: str,
-                     sample: str, index: int = None) -> dict:
+    def _make_header(
+        suffix: str,
+        description: str,
+        typ: str,
+        number: str,
+        sample: str,
+        index: int = None,
+    ) -> dict:
         """Build a single INFO header dict for cyvcf2's add_info_to_header.
 
         Args:
@@ -383,4 +487,9 @@ def _make_header(suffix: str, description: str, typ: str, number: str,
         header_id = "{}_{}".format(sample, suffix)
         if index is not None:
             header_id = "{}_{}".format(header_id, index)
-        return {'ID': header_id, 'Description': description.format(sample=sample), 'Type': typ, 'Number': number}
\ No newline at end of file
+        return {
+            "ID": header_id,
+            "Description": description.format(sample=sample),
+            "Type": typ,
+            "Number": number,
+        }
diff --git a/vafator/command_line.py b/vafator/command_line.py
index e565c01..e09e9b0 100755
--- a/vafator/command_line.py
+++ b/vafator/command_line.py
@@ -16,42 +16,111 @@
 def annotator():
 
     # set up logger
-    parser = argparse.ArgumentParser(description="vafator v{}".format(vafator.VERSION),
-                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-                                     epilog=epilog)
-    parser.add_argument("--input-vcf", dest="input_vcf", action="store", help="The VCF to annotate", required=True)
-    parser.add_argument("--output-vcf", dest="output_vcf", action="store", help="The annotated VCF", required=True)
-    parser.add_argument('--bam', action='append', nargs=2,
-                        metavar=('sample_name', 'bam_file'), default=[],
-                        help='A sample name and a BAM file. Can be used multiple times to input multiple samples and '
-                             'multiple BAM files. The same sample name can be used multiple times with different BAMs, '
-                             'this will treated as replicates.')
-    parser.add_argument("--mapping-quality", dest="mapping_quality", action="store", type=int, default=1,
-                        help="All reads with a mapping quality below this threshold will be filtered out")
-    parser.add_argument("--base-call-quality", dest="base_call_quality", action="store", type=int, default=30,
-                        help="All bases with a base call quality below this threshold will be filtered out")
-    parser.add_argument('--purity', action='append', nargs=2,
-                        metavar=('sample_name', 'purity'), default=[],
-                        help='A sample name and a tumor purity value. Can be used multiple times to input multiple '
-                             'samples in combination with --bam. If no purity is provided for a given sample the '
-                             'default value is 1.0')
-    parser.add_argument("--tumor-ploidy", action='append', nargs=2,
-                        metavar=('sample_name', 'tumor_ploidy'), default=[],
-                        help='A sample name and a tumor ploidy. Can be used multiple times to input multiple '
-                             'samples in combination with --bam. The tumor ploidy can be provided as a genome-wide '
-                             'value (eg: --tumor-ploidy primary 2) or as local copy numbers in a BED file '
-                             '(eg: --tumor-ploidy primary /path/to/copy_numbers.bed), see the documentation for '
-                             'expected BED format (default: 2)')
-    parser.add_argument("--normal-ploidy", dest="normal_ploidy", required=False, default=2, type=int,
-                        help="Normal ploidy for the power calculation (default: 2)")
-    parser.add_argument("--fpr", dest="fpr", required=False, default=DEFAULT_FPR, type=float,
-                        help="False Positive Rate (FPR) to use in the power calculation")
-    parser.add_argument("--error-rate", dest="error_rate", required=False, default=DEFAULT_ERROR_RATE, type=float,
-                        help="Error rate to use in the power calculation")
-    parser.add_argument("--exclude-ambiguous-bases", dest="exclude_ambiguous_bases", action='store_true',
-                        help="Flag indicating to include ambiguous bases from the DP calculation")
-    parser.add_argument("--num-processes", dest="num_processes", required=False, default=1, type=int,
-                        help="Number of processes for parallel chromosome-level annotation (default: 1)")
+    parser = argparse.ArgumentParser(
+        description="vafator v{}".format(vafator.VERSION),
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        epilog=epilog,
+    )
+    parser.add_argument(
+        "--input-vcf",
+        dest="input_vcf",
+        action="store",
+        help="The VCF to annotate",
+        required=True,
+    )
+    parser.add_argument(
+        "--output-vcf",
+        dest="output_vcf",
+        action="store",
+        help="The annotated VCF",
+        required=True,
+    )
+    parser.add_argument(
+        "--bam",
+        action="append",
+        nargs=2,
+        metavar=("sample_name", "bam_file"),
+        default=[],
+        help="A sample name and a BAM file. Can be used multiple times to input multiple samples and "
+        "multiple BAM files. The same sample name can be used multiple times with different BAMs, "
+        "this will treated as replicates.",
+    )
+    parser.add_argument(
+        "--mapping-quality",
+        dest="mapping_quality",
+        action="store",
+        type=int,
+        default=1,
+        help="All reads with a mapping quality below this threshold will be filtered out",
+    )
+    parser.add_argument(
+        "--base-call-quality",
+        dest="base_call_quality",
+        action="store",
+        type=int,
+        default=30,
+        help="All bases with a base call quality below this threshold will be filtered out",
+    )
+    parser.add_argument(
+        "--purity",
+        action="append",
+        nargs=2,
+        metavar=("sample_name", "purity"),
+        default=[],
+        help="A sample name and a tumor purity value. Can be used multiple times to input multiple "
+        "samples in combination with --bam. If no purity is provided for a given sample the "
+        "default value is 1.0",
+    )
+    parser.add_argument(
+        "--tumor-ploidy",
+        action="append",
+        nargs=2,
+        metavar=("sample_name", "tumor_ploidy"),
+        default=[],
+        help="A sample name and a tumor ploidy. Can be used multiple times to input multiple "
+        "samples in combination with --bam. The tumor ploidy can be provided as a genome-wide "
+        "value (eg: --tumor-ploidy primary 2) or as local copy numbers in a BED file "
+        "(eg: --tumor-ploidy primary /path/to/copy_numbers.bed), see the documentation for "
+        "expected BED format (default: 2)",
+    )
+    parser.add_argument(
+        "--normal-ploidy",
+        dest="normal_ploidy",
+        required=False,
+        default=2,
+        type=int,
+        help="Normal ploidy for the power calculation (default: 2)",
+    )
+    parser.add_argument(
+        "--fpr",
+        dest="fpr",
+        required=False,
+        default=DEFAULT_FPR,
+        type=float,
+        help="False Positive Rate (FPR) to use in the power calculation",
+    )
+    parser.add_argument(
+        "--error-rate",
+        dest="error_rate",
+        required=False,
+        default=DEFAULT_ERROR_RATE,
+        type=float,
+        help="Error rate to use in the power calculation",
+    )
+    parser.add_argument(
+        "--exclude-ambiguous-bases",
+        dest="exclude_ambiguous_bases",
+        action="store_true",
+        help="Flag indicating to include ambiguous bases from the DP calculation",
+    )
+    parser.add_argument(
+        "--num-processes",
+        dest="num_processes",
+        required=False,
+        default=1,
+        type=int,
+        help="Number of processes for parallel chromosome-level annotation (default: 1)",
+    )
 
     args = parser.parse_args()
 
@@ -67,27 +136,44 @@ def annotator():
     purities = {}
     for sample_name, purity in args.purity:
         if sample_name in purities:
-            raise ValueError('Multiple purity values provided for sample: {}'.format(sample_name))
+            raise ValueError(
+                "Multiple purity values provided for sample: {}".format(sample_name)
+            )
         if sample_name not in bams:
-            raise ValueError('Provided a purity value for a sample for which no BAM is provided: {}'.format(sample_name))
+            raise ValueError(
+                "Provided a purity value for a sample for which no BAM is provided: {}".format(
+                    sample_name
+                )
+            )
         purities[sample_name] = float(purity)
 
     tumor_ploidies = {}
     for sample_name, tumor_ploidy in args.tumor_ploidy:
         if sample_name in tumor_ploidies:
-            raise ValueError('Multiple tumor ploidy values provided for sample: {}'.format(sample_name))
+            raise ValueError(
+                "Multiple tumor ploidy values provided for sample: {}".format(
+                    sample_name
+                )
+            )
         if sample_name not in bams:
             raise ValueError(
-                'Provided a tumor ploidy value for a sample for which no BAM is provided: {}'.format(sample_name))
+                "Provided a tumor ploidy value for a sample for which no BAM is provided: {}".format(
+                    sample_name
+                )
+            )
         try:
             # checks if a genome-wide purity value was passed
-            tumor_ploidies[sample_name] = PloidyManager(genome_wide_ploidy=float(tumor_ploidy))
+            tumor_ploidies[sample_name] = PloidyManager(
+                genome_wide_ploidy=float(tumor_ploidy)
+            )
         except ValueError:
             # checks if the non float-like value is a path to an existing file
             tumor_ploidies[sample_name] = PloidyManager(local_copy_numbers=tumor_ploidy)
 
     if len(bams) == 0:
-        raise ValueError("Please, provide at least one bam file with '--bam sample_name /path/to/file.bam'")
+        raise ValueError(
+            "Please, provide at least one bam file with '--bam sample_name /path/to/file.bam'"
+        )
 
     annotator = Annotator(
         input_vcf=args.input_vcf,
@@ -111,19 +197,39 @@ def annotator():
 def multiallelics_filter():
 
     # set up logger
-    parser = argparse.ArgumentParser(description="vafator v{}".format(vafator.VERSION),
-                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter, epilog=epilog)
-    parser.add_argument("--input-vcf", dest="input_vcf", action="store", help="The VCF to annotate", required=True)
-    parser.add_argument("--output-vcf", dest="output_vcf", action="store", help="The annotated VCF", required=True)
-    parser.add_argument("--tumor-sample-name", dest="tumor_sample_name", action="store",
-                        help='The tumor sample name (will look for annotation ${SAMPLE_NAME}_af)', default='tumor')
+    parser = argparse.ArgumentParser(
+        description="vafator v{}".format(vafator.VERSION),
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        epilog=epilog,
+    )
+    parser.add_argument(
+        "--input-vcf",
+        dest="input_vcf",
+        action="store",
+        help="The VCF to annotate",
+        required=True,
+    )
+    parser.add_argument(
+        "--output-vcf",
+        dest="output_vcf",
+        action="store",
+        help="The annotated VCF",
+        required=True,
+    )
+    parser.add_argument(
+        "--tumor-sample-name",
+        dest="tumor_sample_name",
+        action="store",
+        help="The tumor sample name (will look for annotation ${SAMPLE_NAME}_af)",
+        default="tumor",
+    )
     args = parser.parse_args()
 
     logging.info("Vafator multiallelic filter starting...")
     filter = MultiallelicFilter(
         input_vcf=args.input_vcf,
         output_vcf=args.output_vcf,
-        tumor_sample_name=args.tumor_sample_name
+        tumor_sample_name=args.tumor_sample_name,
     )
     filter.run()
 
@@ -131,37 +237,106 @@ def multiallelics_filter():
 
 
 def vafator2decifer():
-    parser = argparse.ArgumentParser(description='Generate input for Decifer using VCF file and HATCHet CNA file')
-    parser.add_argument("-V", "--vcf_file", required=True, type=str, help="single or multi-sample VCF file")
-    parser.add_argument("-S", "--samples", required=True, type=str,
-                        help="comma separated list of sample name prefixes to use for VAFator annotations, "
-                             "eg: primary_tumor,metastasis_tumor; the annotations primary_tumor_ac, primary_tumor_dp, "
-                             "etc. will be expected to exist")
-    parser.add_argument("-C", "--cna_file", required=True, type=str, help="HATCHet CNA file: best.seg.ucn ")
-    parser.add_argument("-O", "--out_dir", required=True, default="./", type=str,
-                        help="directory for printing files; please make unique for each patient!")
-    parser.add_argument("-M", "--min_depth", required=True, type=int, help="minimum depth PER sample")
-    parser.add_argument("-A", "--min_alt_depth", required=True, type=int,
-                        help="minimum depth of ALT allele in at least one sample")
-    parser.add_argument("-F", "--min_vaf", required=True, type=float,
-                        help="minimum VAF of ALT allele in at least one sample")
-    parser.add_argument("-N", "--max_CN", required=False, default=6, type=int,
-                        help="maximum total copy number for each observed clone")
-    parser.add_argument("-B", "--exclude_list", required=False, default=None, type=str,
-                        help="BED file of genomic regions to exclude")
-    parser.add_argument("-p", "--min_purity", required=False, default=0.0, type=float,
-                        help="minimum purity to consider samples")
-    parser.add_argument("--snp_file", required=False, default=None, type=str,
-                        help="HATCHet file containing germline SNP counts in tumor samples, baf/tumor.1bed")
+    parser = argparse.ArgumentParser(
+        description="Generate input for Decifer using VCF file and HATCHet CNA file"
+    )
+    parser.add_argument(
+        "-V",
+        "--vcf_file",
+        required=True,
+        type=str,
+        help="single or multi-sample VCF file",
+    )
+    parser.add_argument(
+        "-S",
+        "--samples",
+        required=True,
+        type=str,
+        help="comma separated list of sample name prefixes to use for VAFator annotations, "
+        "eg: primary_tumor,metastasis_tumor; the annotations primary_tumor_ac, primary_tumor_dp, "
+        "etc. will be expected to exist",
+    )
+    parser.add_argument(
+        "-C",
+        "--cna_file",
+        required=True,
+        type=str,
+        help="HATCHet CNA file: best.seg.ucn ",
+    )
+    parser.add_argument(
+        "-O",
+        "--out_dir",
+        required=True,
+        default="./",
+        type=str,
+        help="directory for printing files; please make unique for each patient!",
+    )
+    parser.add_argument(
+        "-M", "--min_depth", required=True, type=int, help="minimum depth PER sample"
+    )
+    parser.add_argument(
+        "-A",
+        "--min_alt_depth",
+        required=True,
+        type=int,
+        help="minimum depth of ALT allele in at least one sample",
+    )
+    parser.add_argument(
+        "-F",
+        "--min_vaf",
+        required=True,
+        type=float,
+        help="minimum VAF of ALT allele in at least one sample",
+    )
+    parser.add_argument(
+        "-N",
+        "--max_CN",
+        required=False,
+        default=6,
+        type=int,
+        help="maximum total copy number for each observed clone",
+    )
+    parser.add_argument(
+        "-B",
+        "--exclude_list",
+        required=False,
+        default=None,
+        type=str,
+        help="BED file of genomic regions to exclude",
+    )
+    parser.add_argument(
+        "-p",
+        "--min_purity",
+        required=False,
+        default=0.0,
+        type=float,
+        help="minimum purity to consider samples",
+    )
+    parser.add_argument(
+        "--snp_file",
+        required=False,
+        default=None,
+        type=str,
+        help="HATCHet file containing germline SNP counts in tumor samples, baf/tumor.1bed",
+    )
     args = parser.parse_args()
     run_vafator2decifer(args)
 
 
 def hatchet2bed():
-    parser = argparse.ArgumentParser(description='Generate input for Decifer using VCF file and HATCHet CNA file')
-    parser.add_argument("-i", "--input-file", required=True, type=str, help="input *.ucn hatchet file")
-    parser.add_argument("-o", "--output-prefix", required=True, type=str,
-                        help="output BED file prefix, one file will be created per sample in the input with the "
-                             "average tumor copy number in each segment")
+    parser = argparse.ArgumentParser(
+        description="Generate input for Decifer using VCF file and HATCHet CNA file"
+    )
+    parser.add_argument(
+        "-i", "--input-file", required=True, type=str, help="input *.ucn hatchet file"
+    )
+    parser.add_argument(
+        "-o",
+        "--output-prefix",
+        required=True,
+        type=str,
+        help="output BED file prefix, one file will be created per sample in the input with the "
+        "average tumor copy number in each segment",
+    )
     args = parser.parse_args()
-    run_hatchet2bed(input_file=args.input_file, output_prefix=args.output_prefix)
\ No newline at end of file
+    run_hatchet2bed(input_file=args.input_file, output_prefix=args.output_prefix)
diff --git a/vafator/constants.py b/vafator/constants.py
index bfad7aa..8496c0f 100755
--- a/vafator/constants.py
+++ b/vafator/constants.py
@@ -2,29 +2,114 @@
 BATCH_SIZE = 10000
 
 # IUPAC ambiguity codes treated as ambiguous bases in pileup metrics
-AMBIGUOUS_BASES = ['N', 'M', 'R', 'W', 'S', 'Y', 'K', 'V', 'H', 'D', 'B']
+AMBIGUOUS_BASES = ["N", "M", "R", "W", "S", "Y", "K", "V", "H", "D", "B"]
 
 # Header templates: (suffix, description, type, number)
 # {sample} is substituted at generation time
 _HEADER_TEMPLATES = [
-    ("af",       "Allele frequency for the alternate alleles in the {sample} sample/s",                                                                              "Float",   "A"),
-    ("dp",       "Total depth of coverage in the {sample} sample/s (independent of alleles)",                                                                        "Float",   "1"),
-    ("ac",       "Allele count for the alternate alleles in the {sample} sample/s",                                                                                  "Integer", "A"),
-    ("n",        "Allele count for ambiguous bases (any IUPAC ambiguity code is counted) in the {sample} sample/s",                                                  "Integer", "1"),
-    ("pu",       "Probability of an undetected mutation given the observed supporting reads (AC), the observed total coverage (DP) and the provided tumor purity in the {sample} sample/s", "Float", "A"),
-    ("pw",       "Power to detect a somatic mutation as described in Absolute given the observed total coverage (DP) and the provided tumor purity and ploidies in the {sample} sample/s",  "Float", "1"),
-    ("k",        "Minimum number of supporting reads, k, such that the probability of observing k or more non-reference reads due to sequencing error is less than the defined FPR in the {sample} sample/s", "Float", "1"),
-    ("eaf",      "Expected VAF considering the purity and ploidy/copy number in the {sample} sample/s",                                                              "Float",   "1"),
-    ("bq",       "Median base call quality of the reads supporting each allele in the {sample} sample/s",                                                            "Float",   "R"),
-    ("mq",       "Median mapping quality of the reads supporting each allele in the {sample} sample/s",                                                              "Float",   "R"),
-    ("pos",      "Median position within the read of the reads supporting each allele in the {sample} sample/s",                                                     "Float",   "R"),
-    ("rsmq",     "Rank sum test comparing the MQ distributions supporting the reference and the alternate in the {sample} sample/s",                                 "Float",   "A"),
-    ("rsmq_pv",  "Rank sum test p-value for MQ distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", "Float",  "A"),
-    ("rsbq",     "Rank sum test comparing the BQ distributions supporting the reference and the alternate in the {sample} sample/s",                                 "Float",   "A"),
-    ("rsbq_pv",  "Rank sum test p-value for BQ distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", "Float",  "A"),
-    ("rspos",    "Rank sum test comparing the position distributions supporting the reference and the alternate in the {sample} sample/s",                            "Float",   "A"),
-    ("rspos_pv", "Rank sum test p-value for position distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", "Float", "A"),
+    (
+        "af",
+        "Allele frequency for the alternate alleles in the {sample} sample/s",
+        "Float",
+        "A",
+    ),
+    (
+        "dp",
+        "Total depth of coverage in the {sample} sample/s (independent of alleles)",
+        "Float",
+        "1",
+    ),
+    (
+        "ac",
+        "Allele count for the alternate alleles in the {sample} sample/s",
+        "Integer",
+        "A",
+    ),
+    (
+        "n",
+        "Allele count for ambiguous bases (any IUPAC ambiguity code is counted) in the {sample} sample/s",
+        "Integer",
+        "1",
+    ),
+    (
+        "pu",
+        "Probability of an undetected mutation given the observed supporting reads (AC), the observed total coverage (DP) and the provided tumor purity in the {sample} sample/s",
+        "Float",
+        "A",
+    ),
+    (
+        "pw",
+        "Power to detect a somatic mutation as described in Absolute given the observed total coverage (DP) and the provided tumor purity and ploidies in the {sample} sample/s",
+        "Float",
+        "1",
+    ),
+    (
+        "k",
+        "Minimum number of supporting reads, k, such that the probability of observing k or more non-reference reads due to sequencing error is less than the defined FPR in the {sample} sample/s",
+        "Float",
+        "1",
+    ),
+    (
+        "eaf",
+        "Expected VAF considering the purity and ploidy/copy number in the {sample} sample/s",
+        "Float",
+        "1",
+    ),
+    (
+        "bq",
+        "Median base call quality of the reads supporting each allele in the {sample} sample/s",
+        "Float",
+        "R",
+    ),
+    (
+        "mq",
+        "Median mapping quality of the reads supporting each allele in the {sample} sample/s",
+        "Float",
+        "R",
+    ),
+    (
+        "pos",
+        "Median position within the read of the reads supporting each allele in the {sample} sample/s",
+        "Float",
+        "R",
+    ),
+    (
+        "rsmq",
+        "Rank sum test comparing the MQ distributions supporting the reference and the alternate in the {sample} sample/s",
+        "Float",
+        "A",
+    ),
+    (
+        "rsmq_pv",
+        "Rank sum test p-value for MQ distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions",
+        "Float",
+        "A",
+    ),
+    (
+        "rsbq",
+        "Rank sum test comparing the BQ distributions supporting the reference and the alternate in the {sample} sample/s",
+        "Float",
+        "A",
+    ),
+    (
+        "rsbq_pv",
+        "Rank sum test p-value for BQ distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions",
+        "Float",
+        "A",
+    ),
+    (
+        "rspos",
+        "Rank sum test comparing the position distributions supporting the reference and the alternate in the {sample} sample/s",
+        "Float",
+        "A",
+    ),
+    (
+        "rspos_pv",
+        "Rank sum test p-value for position distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions",
+        "Float",
+        "A",
+    ),
 ]
 
 # eaf is not produced per-replicate
-_REPLICATE_HEADER_TEMPLATES = [t for t in _HEADER_TEMPLATES if t[0] != "eaf"]
\ No newline at end of file
+_REPLICATE_HEADER_TEMPLATES = [t for t in _HEADER_TEMPLATES if t[0] != "eaf"]
diff --git a/vafator/hatchet2bed.py b/vafator/hatchet2bed.py
index d0b45bb..bef4ba3 100644
--- a/vafator/hatchet2bed.py
+++ b/vafator/hatchet2bed.py
@@ -2,9 +2,13 @@
 
 
 def run_hatchet2bed(input_file, output_prefix):
-    input_df = pd.read_csv(input_file, sep='\t')
-    cn_columns = sorted(list(filter(lambda c: c.startswith('cn_clone'), input_df.columns)))
-    u_columns = sorted(list(filter(lambda c: c.startswith('u_clone'), input_df.columns)))
+    input_df = pd.read_csv(input_file, sep="\t")
+    cn_columns = sorted(
+        list(filter(lambda c: c.startswith("cn_clone"), input_df.columns))
+    )
+    u_columns = sorted(
+        list(filter(lambda c: c.startswith("u_clone"), input_df.columns))
+    )
 
     for sample in input_df.SAMPLE.unique():
         data = []
@@ -14,9 +18,18 @@ def run_hatchet2bed(input_file, output_prefix):
             for cn_column, u_column in zip(cn_columns, u_columns):
                 u = float(row[u_column])
                 total_u += u
-                cn = sum(map(lambda c: float(c), row[cn_column].split('|')))
+                cn = sum(map(lambda c: float(c), row[cn_column].split("|")))
                 numerator.append(u * cn)
-            data.append([row['#CHR'], row['START'], row['END'], sum(numerator)/total_u])
+            data.append(
+                [row["#CHR"], row["START"], row["END"], sum(numerator) / total_u]
+            )
 
-        output_df = pd.DataFrame(data=data, columns=['chromosome', 'start', 'end', 'value'])
-        output_df.to_csv('{}.{}.bed'.format(output_prefix, sample), sep='\t', header=False, index=False)
+        output_df = pd.DataFrame(
+            data=data, columns=["chromosome", "start", "end", "value"]
+        )
+        output_df.to_csv(
+            "{}.{}.bed".format(output_prefix, sample),
+            sep="\t",
+            header=False,
+            index=False,
+        )
diff --git a/vafator/multiallelic_filter.py b/vafator/multiallelic_filter.py
index 6f08d7e..00b0f26 100755
--- a/vafator/multiallelic_filter.py
+++ b/vafator/multiallelic_filter.py
@@ -16,7 +16,7 @@ class MultiallelicFilter(object):
 
     multiallelic_annotation_name = "multiallelic"
 
-    def __init__(self, input_vcf, output_vcf, tumor_sample_name='tumor'):
+    def __init__(self, input_vcf, output_vcf, tumor_sample_name="tumor"):
         """
         :param input_vcf: the input VCF file
         :param output_vcf: the file path of the output VCF
@@ -26,10 +26,18 @@ def __init__(self, input_vcf, output_vcf, tumor_sample_name='tumor'):
         # sets a line in the header with the command used to annotate the file
         self.vafator_header["input_vcf"] = input_vcf
         self.vafator_header["output_vcf"] = output_vcf
-        self.vcf.add_to_header("##vafator_command_line={}".format(json.dumps(self.vafator_header)))
+        self.vcf.add_to_header(
+            "##vafator_command_line={}".format(json.dumps(self.vafator_header))
+        )
         # adds to the header all the names of the annotations
-        self.vcf.add_info_to_header({'ID': self.multiallelic_annotation_name, 'Type': 'String', 'Number': '.',
-                 'Description': "Indicates multiallelic variants filtered and their frequencies if any (e.g.: T,0.12)"})
+        self.vcf.add_info_to_header(
+            {
+                "ID": self.multiallelic_annotation_name,
+                "Type": "String",
+                "Number": ".",
+                "Description": "Indicates multiallelic variants filtered and their frequencies if any (e.g.: T,0.12)",
+            }
+        )
         self.vcf_writer = Writer(output_vcf, self.vcf)
 
     def run(self):
@@ -41,8 +49,12 @@ def run(self):
                 prev_variant = variant
                 continue
             # considers only SNVs with same chromosome, position and reference
-            if variant.is_snp and variant.CHROM == prev_variant.CHROM and variant.POS == prev_variant.POS and \
-                    variant.REF == prev_variant.REF:
+            if (
+                variant.is_snp
+                and variant.CHROM == prev_variant.CHROM
+                and variant.POS == prev_variant.POS
+                and variant.REF == prev_variant.REF
+            ):
                 af1 = self.get_tumor_af(prev_variant)
                 af2 = self.get_tumor_af(variant)
                 # keeps the variant with the highest AF
@@ -56,7 +68,9 @@ def run(self):
                     alt1 = variant.ALT[0]
                     alt2 = prev_variant.ALT[0]
                     prev_variant = random.sample([variant, prev_variant], k=1)[0]
-                    self.set_multiallelic_annotation(prev_variant, alt1 if alt1 != prev_variant.ALT[0] else alt2, af1)
+                    self.set_multiallelic_annotation(
+                        prev_variant, alt1 if alt1 != prev_variant.ALT[0] else alt2, af1
+                    )
                 continue
 
             # write previous variant and stores current for next iteration
@@ -74,8 +88,10 @@ def run(self):
         self.vcf.close()
 
     def set_multiallelic_annotation(self, variant, alt, af):
-        variant.INFO[self.multiallelic_annotation_name] = \
-            ",".join(variant.INFO.get(self.multiallelic_annotation_name, "").split(",") + [alt, str(af)])
+        variant.INFO[self.multiallelic_annotation_name] = ",".join(
+            variant.INFO.get(self.multiallelic_annotation_name, "").split(",")
+            + [alt, str(af)]
+        )
 
     def get_tumor_af(self, prev_variant):
         return prev_variant.INFO.get("{}_af".format(self.tumor_sample_name), 0.0)
diff --git a/vafator/pileup_utils.py b/vafator/pileup_utils.py
index 1048e7c..22f72a6 100755
--- a/vafator/pileup_utils.py
+++ b/vafator/pileup_utils.py
@@ -17,6 +17,7 @@ class VariantRecord:
         REF: reference allele
         ALT: list of alternate alleles
     """
+
     CHROM: str
     POS: int
     REF: str
@@ -37,6 +38,7 @@ class CoverageMetrics:
         all_mqs: full mapping quality distribution per allele
         all_positions: full read position distribution per allele
     """
+
     ac: dict
     dp: int
     bqs: dict = None
@@ -48,8 +50,14 @@ class CoverageMetrics:
 
 
 EMPTY_METRICS = CoverageMetrics(
-    ac=Counter(), dp=0, bqs=Counter(), mqs=Counter(), positions=Counter(),
-    all_bqs={}, all_mqs={}, all_positions={}
+    ac=Counter(),
+    dp=0,
+    bqs=Counter(),
+    mqs=Counter(),
+    positions=Counter(),
+    all_bqs={},
+    all_mqs={},
+    all_positions={},
 )
 
 
@@ -108,4 +116,4 @@ def is_deletion(variant) -> bool:
     Args:
         variant: any object with REF and ALT attributes (Variant or VariantRecord)
     """
-    return len(variant.ALT[0]) == 1 and len(variant.REF) > 1
\ No newline at end of file
+    return len(variant.ALT[0]) == 1 and len(variant.REF) > 1
diff --git a/vafator/pileups.py b/vafator/pileups.py
index e85aa53..47ec377 100755
--- a/vafator/pileups.py
+++ b/vafator/pileups.py
@@ -5,11 +5,23 @@
 from pysam.libcalignmentfile import IteratorColumnRegion, AlignmentFile
 
 from vafator.constants import AMBIGUOUS_BASES
-from vafator.pileup_utils import VariantRecord, CoverageMetrics, safe_median, aggregate_list_per_base, is_snp, is_insertion, is_deletion
+from vafator.pileup_utils import (
+    VariantRecord,
+    CoverageMetrics,
+    safe_median,
+    aggregate_list_per_base,
+    is_snp,
+    is_insertion,
+    is_deletion,
+)
+
 
 def get_variant_pileup(
-        variant: Union[Variant, VariantRecord], bam: AlignmentFile,
-        min_base_quality: int, min_mapping_quality: int) -> IteratorColumnRegion:
+    variant: Union[Variant, VariantRecord],
+    bam: AlignmentFile,
+    min_base_quality: int,
+    min_mapping_quality: int,
+) -> IteratorColumnRegion:
     """Open a pileup iterator at a single variant position.
     Kept for backwards compatibility and use in tests.
 
@@ -31,12 +43,18 @@ def get_variant_pileup(
         max_depth=1000000,
         min_base_quality=min_base_quality,
         min_mapping_quality=min_mapping_quality,
-        stepper='samtools'
-        )
+        stepper="samtools",
+    )
 
 
-def get_region_pileup(chrom: str, start: int, end: int, bam: AlignmentFile,
-                      min_base_quality: int, min_mapping_quality: int):
+def get_region_pileup(
+    chrom: str,
+    start: int,
+    end: int,
+    bam: AlignmentFile,
+    min_base_quality: int,
+    min_mapping_quality: int,
+):
     """Open a single pileup iterator spanning a genomic region.
 
     Args:
@@ -58,8 +76,8 @@ def get_region_pileup(chrom: str, start: int, end: int, bam: AlignmentFile,
         max_depth=1000000,
         min_base_quality=min_base_quality,
         min_mapping_quality=min_mapping_quality,
-        stepper='samtools'
-        )
+        stepper="samtools",
+    )
 
 
 def stream_variants_by_chrom(vcf) -> Iterator[Tuple[str, List[Variant]]]:
@@ -86,12 +104,13 @@ def stream_variants_by_chrom(vcf) -> Iterator[Tuple[str, List[Variant]]]:
 
 
 def collect_metrics_for_chrom(
-        chrom: str,
-        variants: List[Variant],
-        bam: AlignmentFile,
-        min_base_quality: int,
-        min_mapping_quality: int,
-        include_ambiguous_bases: bool = False) -> Dict[Tuple, CoverageMetrics]:
+    chrom: str,
+    variants: List[Variant],
+    bam: AlignmentFile,
+    min_base_quality: int,
+    min_mapping_quality: int,
+    include_ambiguous_bases: bool = False,
+) -> Dict[Tuple, CoverageMetrics]:
     """Compute pileup metrics for all variants on a chromosome using a single pileup iterator.
 
     Metrics are computed immediately for each pileup column while it is still valid —
@@ -115,8 +134,8 @@ def collect_metrics_for_chrom(
     for v in variants:
         variants_by_pos[v.POS].append(v)
 
-    start = variants[0].POS - 1   # 0-based inclusive
-    end = variants[-1].POS        # exclusive end for pysam
+    start = variants[0].POS - 1  # 0-based inclusive
+    end = variants[-1].POS  # exclusive end for pysam
     results: Dict[Tuple, CoverageMetrics] = {}
 
     for pileup_col in get_region_pileup(
@@ -125,21 +144,24 @@ def collect_metrics_for_chrom(
         end=end,
         bam=bam,
         min_base_quality=min_base_quality,
-        min_mapping_quality=min_mapping_quality
+        min_mapping_quality=min_mapping_quality,
     ):
         ref_pos = pileup_col.reference_pos + 1  # convert to 1-based
         if ref_pos not in variants_by_pos:
             continue
         for variant in variants_by_pos[ref_pos]:
-            metrics = _get_metrics_from_column(variant, pileup_col, include_ambiguous_bases)
+            metrics = _get_metrics_from_column(
+                variant, pileup_col, include_ambiguous_bases
+            )
             if metrics is not None:
                 results[(ref_pos, variant.REF, variant.ALT[0])] = metrics
 
     return results
 
 
-def _get_metrics_from_column(variant, pileup_col,
-                              include_ambiguous_bases: bool = True) -> CoverageMetrics:
+def _get_metrics_from_column(
+    variant, pileup_col, include_ambiguous_bases: bool = True
+) -> CoverageMetrics:
     """Dispatch pileup metrics computation based on variant type.
 
     Args:
@@ -159,7 +181,9 @@ def _get_metrics_from_column(variant, pileup_col,
     return None
 
 
-def _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases: bool = True) -> CoverageMetrics:
+def _get_snv_metrics_from_column(
+    pileup_col, include_ambiguous_bases: bool = True
+) -> CoverageMetrics:
     """Compute SNV metrics from a pileup column.
     Deletions at the position are represented as empty string bases and included in depth.
 
@@ -203,8 +227,14 @@ def _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases: bool = Tru
         dp = sum(1 for b in bases if b == "" or b not in AMBIGUOUS_BASES)
 
     return CoverageMetrics(
-        ac=ac, dp=dp, bqs=bqs, mqs=mqs, positions=positions,
-        all_bqs=all_bqs, all_mqs=all_mqs, all_positions=all_positions
+        ac=ac,
+        dp=dp,
+        bqs=bqs,
+        mqs=mqs,
+        positions=positions,
+        all_bqs=all_bqs,
+        all_mqs=all_mqs,
+        all_positions=all_positions,
     )
 
 
@@ -239,17 +269,21 @@ def _get_insertion_metrics_from_column(variant, pileup_col) -> CoverageMetrics:
             index = pileup_read.alignment.reference_start
             relative_position = 0
             for cigar_type, cigar_length in pileup_read.alignment.cigartuples:
-                if cigar_type in [0, 2, 3, 7, 8]: # consumes reference M, D, N, =, X
+                if cigar_type in [0, 2, 3, 7, 8]:  # consumes reference M, D, N, =, X
                     index += cigar_length
                     if index > variant_position:
                         break
-                if cigar_type in [0, 1, 4, 7, 8]: # consumes query M, I, S, =, X
+                if cigar_type in [0, 1, 4, 7, 8]:  # consumes query M, I, S, =, X
                     relative_position += cigar_length
-                if cigar_type == 1: # does not count I
+                if cigar_type == 1:  # does not count I
                     insertion_in_query = pileup_read.alignment.query[
-                                         relative_position:relative_position + insertion_length]
-                    if index == variant_position and cigar_length == insertion_length \
-                            and insertion == insertion_in_query:
+                        relative_position : relative_position + insertion_length
+                    ]
+                    if (
+                        index == variant_position
+                        and cigar_length == insertion_length
+                        and insertion == insertion_in_query
+                    ):
                         # the read contains the insertion
                         ac[alt_upper] += 1
                         mq[alt_upper].append(pileup_read.alignment.mapping_quality)
@@ -260,13 +294,14 @@ def _get_insertion_metrics_from_column(variant, pileup_col) -> CoverageMetrics:
             pos[variant.REF].append(pileup_read.query_position_or_next)
 
     return CoverageMetrics(
-        ac=Counter(ac), dp=dp,
+        ac=Counter(ac),
+        dp=dp,
         mqs=Counter({k: safe_median(l) for k, l in mq.items()}),
         positions=Counter({k: safe_median(l) for k, l in pos.items()}),
         bqs=Counter(),
         all_mqs={k: l for k, l in mq.items()},
         all_positions={k: l for k, l in pos.items()},
-        all_bqs=Counter()
+        all_bqs=Counter(),
     )
 
 
@@ -298,7 +333,7 @@ def _get_deletion_metrics_from_column(variant, pileup_col) -> CoverageMetrics:
         if pileup_read.indel < 0:
             start = pileup_read.alignment.reference_start
             for cigar_type, cigar_length in pileup_read.alignment.cigartuples:
-                if cigar_type in [0, 3, 7, 8]: # consumes reference M, N, =, X
+                if cigar_type in [0, 3, 7, 8]:  # consumes reference M, N, =, X
                     start += cigar_length
                 elif cigar_type == 2:
                     if start == variant_position and cigar_length == deletion_length:
@@ -316,11 +351,12 @@ def _get_deletion_metrics_from_column(variant, pileup_col) -> CoverageMetrics:
             pos[variant.REF].append(pileup_read.query_position_or_next)
 
     return CoverageMetrics(
-        ac=Counter(ac), dp=dp,
+        ac=Counter(ac),
+        dp=dp,
         mqs=Counter({k: safe_median(l) for k, l in mq.items()}),
         positions=Counter({k: safe_median(l) for k, l in pos.items()}),
         bqs=Counter(),
         all_mqs={k: l for k, l in mq.items()},
         all_positions={k: l for k, l in pos.items()},
-        all_bqs=Counter()
-    )
\ No newline at end of file
+        all_bqs=Counter(),
+    )
diff --git a/vafator/ploidies.py b/vafator/ploidies.py
index 00748db..31afc6f 100755
--- a/vafator/ploidies.py
+++ b/vafator/ploidies.py
@@ -10,14 +10,27 @@
 
 class PloidyManager:
 
-    def __init__(self, local_copy_numbers: str = None, genome_wide_ploidy: float = DEFAULT_PLOIDY):
+    def __init__(
+        self, local_copy_numbers: str = None, genome_wide_ploidy: float = DEFAULT_PLOIDY
+    ):
 
         if local_copy_numbers is not None and not os.path.exists(local_copy_numbers):
-            raise ValueError('The provided tumor ploidy is neither a copy number value or a BED file with copy '
-                             'numbers')
-        self.report_value = local_copy_numbers if local_copy_numbers else genome_wide_ploidy
-        self.bed = pd.read_csv(local_copy_numbers, sep='\t', names=['chromosome', 'start', 'end', 'copy_number']) \
-            if local_copy_numbers is not None else None
+            raise ValueError(
+                "The provided tumor ploidy is neither a copy number value or a BED file with copy "
+                "numbers"
+            )
+        self.report_value = (
+            local_copy_numbers if local_copy_numbers else genome_wide_ploidy
+        )
+        self.bed = (
+            pd.read_csv(
+                local_copy_numbers,
+                sep="\t",
+                names=["chromosome", "start", "end", "copy_number"],
+            )
+            if local_copy_numbers is not None
+            else None
+        )
         self.ploidy = genome_wide_ploidy
 
     def get_ploidy(self, variant: Union[Variant, VariantRecord]) -> float:
@@ -26,9 +39,11 @@ def get_ploidy(self, variant: Union[Variant, VariantRecord]) -> float:
         if self.bed is not None:
             # read from the BED file
             # NOTE: converts variant position from 1-based into 0-based and considers intervals as half-closed
-            hits = self.bed[(self.bed.chromosome == variant.CHROM) &
-                            (self.bed.start <= variant.POS - 1) &
-                            (self.bed.end > variant.POS - 1)]
+            hits = self.bed[
+                (self.bed.chromosome == variant.CHROM)
+                & (self.bed.start <= variant.POS - 1)
+                & (self.bed.end > variant.POS - 1)
+            ]
             if hits.shape[0] > 0:
                 result = float(hits.copy_number.iloc[0])
 
diff --git a/vafator/power.py b/vafator/power.py
index 93ee5a3..e051e24 100644
--- a/vafator/power.py
+++ b/vafator/power.py
@@ -7,19 +7,20 @@
 
 DEFAULT_PURITY = 1.0
 DEFAULT_NORMAL_PLOIDY = 2
-DEFAULT_FPR = 5*(10**-7)
+DEFAULT_FPR = 5 * (10**-7)
 DEFAULT_ERROR_RATE = 10**-3
 
 
 class PowerCalculator:
 
     def __init__(
-            self,
-            tumor_ploidies: dict,
-            purities: dict,
-            normal_ploidy: int = DEFAULT_NORMAL_PLOIDY,
-            fpr: float = DEFAULT_FPR,
-            error_rate: float = DEFAULT_ERROR_RATE):
+        self,
+        tumor_ploidies: dict,
+        purities: dict,
+        normal_ploidy: int = DEFAULT_NORMAL_PLOIDY,
+        fpr: float = DEFAULT_FPR,
+        error_rate: float = DEFAULT_ERROR_RATE,
+    ):
 
         self.normal_ploidy = normal_ploidy
         self.purities = purities
@@ -35,7 +36,9 @@ def __init__(
         # when using genome-wide ploidy (most common case) this is the same value for all variants
         self._eaf_cache: dict = {}
 
-    def calculate_power(self, dp: int, ac: int, sample: str, variant: Optional[Variant]) -> float:
+    def calculate_power(
+        self, dp: int, ac: int, sample: str, variant: Optional[Variant]
+    ) -> float:
         """
         Return the binomial probability of observing ac or less supporting reads, given a total coverage dp and a
         expected VAF tumor purity / 2.
@@ -55,13 +58,24 @@ def calculate_expected_vaf(self, sample: str, variant: Optional[Variant]) -> flo
         """
         # cache key: use variant position for local copy number lookups,
         # or just sample name when genome-wide ploidy is used (most common case)
-        cache_key = (sample, variant.CHROM if variant else None, variant.POS if variant else None)
+        cache_key = (
+            sample,
+            variant.CHROM if variant else None,
+            variant.POS if variant else None,
+        )
         if cache_key in self._eaf_cache:
             return self._eaf_cache[cache_key]
 
         purity = self.purities.get(sample, DEFAULT_PURITY)
-        tumor_ploidy = max(1, self.tumor_ploidies.get(sample, default_ploidy_manager).get_ploidy(variant=variant))
-        corrected_tumor_ploidy = purity * tumor_ploidy + ((1 - purity) * self.normal_ploidy)
+        tumor_ploidy = max(
+            1,
+            self.tumor_ploidies.get(sample, default_ploidy_manager).get_ploidy(
+                variant=variant
+            ),
+        )
+        corrected_tumor_ploidy = purity * tumor_ploidy + (
+            (1 - purity) * self.normal_ploidy
+        )
         expected_vaf = purity / corrected_tumor_ploidy
 
         self._eaf_cache[cache_key] = expected_vaf
@@ -109,5 +123,9 @@ def calculate_absolute_power(self, sample, variant, dp: int) -> float:
         n = dp
         f = self.calculate_expected_vaf(sample, variant)
         # avoid instantiating a frozen binom distribution object — use module-level functions directly
-        power = 1 - binom.cdf(k=k - 1, n=n, p=f) + self._calculate_d(k=k, n=n) * binom.pmf(k=k, n=n, p=f)
-        return round(power, 5), k
\ No newline at end of file
+        power = (
+            1
+            - binom.cdf(k=k - 1, n=n, p=f)
+            + self._calculate_d(k=k, n=n) * binom.pmf(k=k, n=n, p=f)
+        )
+        return round(power, 5), k
diff --git a/vafator/rank_sum_test.py b/vafator/rank_sum_test.py
index b9a3b4d..2314572 100644
--- a/vafator/rank_sum_test.py
+++ b/vafator/rank_sum_test.py
@@ -3,7 +3,9 @@
 import numpy as np
 
 
-def calculate_rank_sum_test(alternate_dist: List[int], reference_dist: List[int]) -> Tuple[float, float]:
+def calculate_rank_sum_test(
+    alternate_dist: List[int], reference_dist: List[int]
+) -> Tuple[float, float]:
     if not alternate_dist or not reference_dist:  # skip empty distributions
         return np.nan, np.nan
     stat, pvalue = scipy.stats.ranksums(x=alternate_dist, y=reference_dist)
@@ -16,10 +18,9 @@ def get_rank_sum_tests(distributions: dict, variant):
     for alt in variant.ALT:
         stat, pvalue = calculate_rank_sum_test(
             alternate_dist=distributions.get(alt, []),
-            reference_dist=distributions.get(variant.REF, []))
+            reference_dist=distributions.get(variant.REF, []),
+        )
         if not np.isnan(stat) and not np.isnan(pvalue):
             stats.append(str(stat))
             pvalues.append(str(pvalue))
     return pvalues, stats
-
-
diff --git a/vafator/tests/test_annotator.py b/vafator/tests/test_annotator.py
index 7bcb716..8172870 100755
--- a/vafator/tests/test_annotator.py
+++ b/vafator/tests/test_annotator.py
@@ -13,14 +13,43 @@
 
 
 EXPECTED_ANNOTATIONS = [
-    'af', 'dp', 'ac', 'n', 'pu', 'pw', 'k', 'eaf', 'bq', 'mq', 'pos', 'rsmq', 'rsmq_pv', 'rsbq', 'rsbq_pv', 'rspos',
-    'rspos_pv'
+    "af",
+    "dp",
+    "ac",
+    "n",
+    "pu",
+    "pw",
+    "k",
+    "eaf",
+    "bq",
+    "mq",
+    "pos",
+    "rsmq",
+    "rsmq_pv",
+    "rsbq",
+    "rsbq_pv",
+    "rspos",
+    "rspos_pv",
 ]
 
 # replicates do not have EAF annotation
 EXPECTED_ANNOTATIONS_REPLICATES = [
-    'af', 'dp', 'ac', 'n', 'pu', 'pw', 'k', 'bq', 'mq', 'pos', 'rsmq', 'rsmq_pv', 'rsbq', 'rsbq_pv', 'rspos',
-    'rspos_pv'
+    "af",
+    "dp",
+    "ac",
+    "n",
+    "pu",
+    "pw",
+    "k",
+    "bq",
+    "mq",
+    "pos",
+    "rsmq",
+    "rsmq_pv",
+    "rsbq",
+    "rsbq_pv",
+    "rspos",
+    "rspos_pv",
 ]
 
 
@@ -28,11 +57,16 @@ class TestAnnotator(TestCase):
 
     def test_annotator(self):
         input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf")
-        output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_annotator1_output.vcf")
+        output_vcf = pkg_resources.resource_filename(
+            __name__, "resources/results/test_annotator1_output.vcf"
+        )
         bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam")
         bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam")
         annotator = Annotator(
-            input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam1], "tumor": [bam2]})
+            input_vcf=input_file,
+            output_vcf=output_vcf,
+            input_bams={"normal": [bam1], "tumor": [bam2]},
+        )
         annotator.run()
 
         self.assertTrue(os.path.exists(output_vcf))
@@ -47,11 +81,16 @@ def test_annotator(self):
 
     def test_annotator_with_multiple_bams(self):
         input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf")
-        output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_annotator1_output.vcf")
+        output_vcf = pkg_resources.resource_filename(
+            __name__, "resources/results/test_annotator1_output.vcf"
+        )
         bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam")
         bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam")
         annotator = Annotator(
-            input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam1, bam2], "tumor": [bam1, bam2]})
+            input_vcf=input_file,
+            output_vcf=output_vcf,
+            input_bams={"normal": [bam1, bam2], "tumor": [bam1, bam2]},
+        )
         annotator.run()
 
         self.assertTrue(os.path.exists(output_vcf))
@@ -61,19 +100,27 @@ def test_annotator_with_multiple_bams(self):
 
         info_annotations = test_utils._get_info_fields(output_vcf)
         for a in EXPECTED_ANNOTATIONS_REPLICATES:
-            self.assertTrue("tumor_{}_1".format(a) in info_annotations,
-                            "Missing annotation tumor_{}_1".format(a))
-            self.assertTrue("normal_{}_1".format(a) in info_annotations,
-                            "Missing annotation normal_{}_1".format(a))
+            self.assertTrue(
+                "tumor_{}_1".format(a) in info_annotations,
+                "Missing annotation tumor_{}_1".format(a),
+            )
+            self.assertTrue(
+                "normal_{}_1".format(a) in info_annotations,
+                "Missing annotation normal_{}_1".format(a),
+            )
 
     def test_annotator_with_prefix(self):
         input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf")
-        output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_annotator1_output.vcf")
+        output_vcf = pkg_resources.resource_filename(
+            __name__, "resources/results/test_annotator1_output.vcf"
+        )
         bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam")
         bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam")
         annotator = Annotator(
-            input_vcf=input_file, output_vcf=output_vcf,
-            input_bams={"RNA_normal": [bam1, bam2], "RNA_tumor": [bam1, bam2]})
+            input_vcf=input_file,
+            output_vcf=output_vcf,
+            input_bams={"RNA_normal": [bam1, bam2], "RNA_tumor": [bam1, bam2]},
+        )
         annotator.run()
 
         self.assertTrue(os.path.exists(output_vcf))
@@ -83,19 +130,29 @@ def test_annotator_with_prefix(self):
 
         info_annotations = test_utils._get_info_fields(output_vcf)
         for a in EXPECTED_ANNOTATIONS_REPLICATES:
-            self.assertTrue("RNA_tumor_{}_1".format(a) in info_annotations,
-                            "Missing annotation RNA_tumor_{}_1".format(a))
-            self.assertTrue("RNA_normal_{}_1".format(a) in info_annotations,
-                            "Missing annotation RNA_normal_{}_1".format(a))
+            self.assertTrue(
+                "RNA_tumor_{}_1".format(a) in info_annotations,
+                "Missing annotation RNA_tumor_{}_1".format(a),
+            )
+            self.assertTrue(
+                "RNA_normal_{}_1".format(a) in info_annotations,
+                "Missing annotation RNA_normal_{}_1".format(a),
+            )
 
     def test_annotator_with_mnvs(self):
-        input_file = pkg_resources.resource_filename(__name__, "resources/test_tumor_normal.vcf")
-        output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_tumor_normal_output.vcf")
+        input_file = pkg_resources.resource_filename(
+            __name__, "resources/test_tumor_normal.vcf"
+        )
+        output_vcf = pkg_resources.resource_filename(
+            __name__, "resources/results/test_tumor_normal_output.vcf"
+        )
         bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam")
         bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam")
         annotator = Annotator(
-            input_vcf=input_file, output_vcf=output_vcf,
-            input_bams={"RNA_normal": [bam1, bam2], "RNA_tumor": [bam1, bam2]})
+            input_vcf=input_file,
+            output_vcf=output_vcf,
+            input_bams={"RNA_normal": [bam1, bam2], "RNA_tumor": [bam1, bam2]},
+        )
         annotator.run()
 
         self.assertTrue(os.path.exists(output_vcf))
@@ -105,10 +162,14 @@ def test_annotator_with_mnvs(self):
 
         info_annotations = test_utils._get_info_fields(output_vcf)
         for a in EXPECTED_ANNOTATIONS_REPLICATES:
-            self.assertTrue("RNA_tumor_{}_1".format(a) in info_annotations,
-                            "Missing annotation RNA_tumor_{}_1".format(a))
-            self.assertTrue("RNA_normal_{}_1".format(a) in info_annotations,
-                            "Missing annotation RNA_normal_{}_1".format(a))
+            self.assertTrue(
+                "RNA_tumor_{}_1".format(a) in info_annotations,
+                "Missing annotation RNA_tumor_{}_1".format(a),
+            )
+            self.assertTrue(
+                "RNA_normal_{}_1".format(a) in info_annotations,
+                "Missing annotation RNA_normal_{}_1".format(a),
+            )
 
     def _get_info_at(self, input_file, chromosome, position, annotation):
         vcf = VCF(input_file)
@@ -122,19 +183,27 @@ def _get_info_at(self, input_file, chromosome, position, annotation):
 
     def test_nist(self):
         input_file = pkg_resources.resource_filename(
-            __name__, "resources/project.NIST.hc.snps.indels.chr1_1000000_2000000.vcf")
+            __name__, "resources/project.NIST.hc.snps.indels.chr1_1000000_2000000.vcf"
+        )
         output_vcf = pkg_resources.resource_filename(
-            __name__, "resources/results/project.NIST.hc.snps.indels.chr1_1000000_2000000.vaf.vcf")
+            __name__,
+            "resources/results/project.NIST.hc.snps.indels.chr1_1000000_2000000.vaf.vcf",
+        )
         bam_file = pkg_resources.resource_filename(
             __name__,
-            "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam")
+            "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam",
+        )
         start = time.time()
-        annotator = Annotator(input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam_file]})
+        annotator = Annotator(
+            input_vcf=input_file,
+            output_vcf=output_vcf,
+            input_bams={"normal": [bam_file]},
+        )
         annotator.run()
         duration = time.time() - start
         logger.info("Duration {} seconds".format(round(duration, 3)))
 
-        self._assert_vafator_vcf(output_vcf, sample_name='normal')
+        self._assert_vafator_vcf(output_vcf, sample_name="normal")
 
         n_variants_input = test_utils._get_count_variants(input_file)
         n_variants_output = test_utils._get_count_variants(output_vcf)
@@ -145,105 +214,125 @@ def test_nist(self):
         self.assertTrue("normal_ac" in info_annotations)
         self.assertTrue("normal_dp" in info_annotations)
 
-        variant = test_utils._get_mutation_at_position(output_vcf, 'chr1', 1506035)
+        variant = test_utils._get_mutation_at_position(output_vcf, "chr1", 1506035)
         self.assertIsNotNone(variant)
-        self.assertEqual(variant.INFO['normal_ac'], 3)
-        self.assertEqual(variant.INFO['normal_dp'], 3)
-        self.assertEqual(variant.INFO['normal_af'], 1.0)
-        self.assertEqual(variant.INFO['normal_pu'], 1.0)
-        self.assertEqual(variant.INFO['normal_eaf'], 0.5)
-        self.assertEqual(variant.INFO['normal_mq'][0], 0)
-        self.assertEqual(variant.INFO['normal_mq'][1], 60.0)
-        self.assertEqual(variant.INFO['normal_bq'][0], 0)
-        self.assertEqual(variant.INFO['normal_bq'][1], 37.0)
-        self.assertEqual(variant.INFO['normal_pos'][0], 0)
-        self.assertEqual(variant.INFO['normal_pos'][1], 56.0)
-
-        variant = test_utils._get_mutation_at_position(output_vcf, 'chr1', 1509825)
+        self.assertEqual(variant.INFO["normal_ac"], 3)
+        self.assertEqual(variant.INFO["normal_dp"], 3)
+        self.assertEqual(variant.INFO["normal_af"], 1.0)
+        self.assertEqual(variant.INFO["normal_pu"], 1.0)
+        self.assertEqual(variant.INFO["normal_eaf"], 0.5)
+        self.assertEqual(variant.INFO["normal_mq"][0], 0)
+        self.assertEqual(variant.INFO["normal_mq"][1], 60.0)
+        self.assertEqual(variant.INFO["normal_bq"][0], 0)
+        self.assertEqual(variant.INFO["normal_bq"][1], 37.0)
+        self.assertEqual(variant.INFO["normal_pos"][0], 0)
+        self.assertEqual(variant.INFO["normal_pos"][1], 56.0)
+
+        variant = test_utils._get_mutation_at_position(output_vcf, "chr1", 1509825)
         self.assertIsNotNone(variant)
-        self.assertEqual(variant.INFO['normal_ac'], 13)
-        self.assertEqual(variant.INFO['normal_dp'], 20)
+        self.assertEqual(variant.INFO["normal_ac"], 13)
+        self.assertEqual(variant.INFO["normal_dp"], 20)
         # these values are rounded to six digits inside the VCF, not sure why when read the representation is
         # different...
-        self.assertEqual(round(variant.INFO['normal_af'], 5), 0.65)
-        self.assertEqual(round(variant.INFO['normal_pu'], 5), 0.94234)
-        self.assertEqual(variant.INFO['normal_eaf'], 0.5)
-        self.assertEqual(variant.INFO['normal_mq'][0], 60.0)
-        self.assertEqual(variant.INFO['normal_mq'][1], 60.0)
-        self.assertEqual(variant.INFO['normal_bq'][0], 37.0)
-        self.assertEqual(variant.INFO['normal_bq'][1], 35.0)
-        self.assertEqual(variant.INFO['normal_pos'][0], 31.0)
-        self.assertEqual(variant.INFO['normal_pos'][1], 41.0)
+        self.assertEqual(round(variant.INFO["normal_af"], 5), 0.65)
+        self.assertEqual(round(variant.INFO["normal_pu"], 5), 0.94234)
+        self.assertEqual(variant.INFO["normal_eaf"], 0.5)
+        self.assertEqual(variant.INFO["normal_mq"][0], 60.0)
+        self.assertEqual(variant.INFO["normal_mq"][1], 60.0)
+        self.assertEqual(variant.INFO["normal_bq"][0], 37.0)
+        self.assertEqual(variant.INFO["normal_bq"][1], 35.0)
+        self.assertEqual(variant.INFO["normal_pos"][0], 31.0)
+        self.assertEqual(variant.INFO["normal_pos"][1], 41.0)
 
         # this is a deletion
-        variant = test_utils._get_mutation_at_position(output_vcf, 'chr1', 1323143)
+        variant = test_utils._get_mutation_at_position(output_vcf, "chr1", 1323143)
         self.assertIsNotNone(variant)
-        self.assertEqual(variant.INFO['normal_ac'], 20)
-        self.assertEqual(variant.INFO['normal_dp'], 21)
+        self.assertEqual(variant.INFO["normal_ac"], 20)
+        self.assertEqual(variant.INFO["normal_dp"], 21)
         # these values are rounded to six digits inside the VCF, not sure why when read the representation is
         # different...
-        self.assertEqual(round(variant.INFO['normal_af'], 5), 0.95238)
-        self.assertEqual(round(variant.INFO['normal_pu'], 5), 1.0)
-        self.assertEqual(variant.INFO['normal_eaf'], 0.5)
-        self.assertEqual(variant.INFO['normal_mq'][0], 60.0)
-        self.assertEqual(variant.INFO['normal_mq'][1], 60.0)
-        self.assertEqual(variant.INFO['normal_bq'][0], 0.0)
-        self.assertEqual(variant.INFO['normal_bq'][1], 0.0)
-        self.assertEqual(variant.INFO['normal_pos'][0], 50.0)
-        self.assertEqual(variant.INFO['normal_pos'][1], 21.0)
+        self.assertEqual(round(variant.INFO["normal_af"], 5), 0.95238)
+        self.assertEqual(round(variant.INFO["normal_pu"], 5), 1.0)
+        self.assertEqual(variant.INFO["normal_eaf"], 0.5)
+        self.assertEqual(variant.INFO["normal_mq"][0], 60.0)
+        self.assertEqual(variant.INFO["normal_mq"][1], 60.0)
+        self.assertEqual(variant.INFO["normal_bq"][0], 0.0)
+        self.assertEqual(variant.INFO["normal_bq"][1], 0.0)
+        self.assertEqual(variant.INFO["normal_pos"][0], 50.0)
+        self.assertEqual(variant.INFO["normal_pos"][1], 21.0)
 
         # this is an insertion
-        variant = test_utils._get_mutation_at_position(output_vcf, 'chr1', 1935367)
+        variant = test_utils._get_mutation_at_position(output_vcf, "chr1", 1935367)
         self.assertIsNotNone(variant)
-        self.assertEqual(variant.INFO['normal_ac'], 1)
-        self.assertEqual(variant.INFO['normal_dp'], 2)
+        self.assertEqual(variant.INFO["normal_ac"], 1)
+        self.assertEqual(variant.INFO["normal_dp"], 2)
         # these values are rounded to six digits inside the VCF, not sure why when read the representation is
         # different...
-        self.assertEqual(round(variant.INFO['normal_af'], 5), 0.5)
-        self.assertEqual(round(variant.INFO['normal_pu'], 5), 0.75)
-        self.assertEqual(variant.INFO['normal_eaf'], 0.5)
-        self.assertEqual(variant.INFO['normal_mq'][0], 60.0)
-        self.assertEqual(variant.INFO['normal_mq'][1], 29.0)
-        self.assertEqual(variant.INFO['normal_bq'][0], 0.0)
-        self.assertEqual(variant.INFO['normal_bq'][1], 0.0)
-        self.assertEqual(variant.INFO['normal_pos'][0], 95.0)
-        self.assertEqual(variant.INFO['normal_pos'][1], 56.0)
+        self.assertEqual(round(variant.INFO["normal_af"], 5), 0.5)
+        self.assertEqual(round(variant.INFO["normal_pu"], 5), 0.75)
+        self.assertEqual(variant.INFO["normal_eaf"], 0.5)
+        self.assertEqual(variant.INFO["normal_mq"][0], 60.0)
+        self.assertEqual(variant.INFO["normal_mq"][1], 29.0)
+        self.assertEqual(variant.INFO["normal_bq"][0], 0.0)
+        self.assertEqual(variant.INFO["normal_bq"][1], 0.0)
+        self.assertEqual(variant.INFO["normal_pos"][0], 95.0)
+        self.assertEqual(variant.INFO["normal_pos"][1], 56.0)
 
     def test_nist_with_replicates(self):
         input_file = pkg_resources.resource_filename(
-            __name__, "resources/project.NIST.hc.snps.indels.chr1_1000000_2000000.vcf")
+            __name__, "resources/project.NIST.hc.snps.indels.chr1_1000000_2000000.vcf"
+        )
         output_vcf = pkg_resources.resource_filename(
-            __name__, "resources/results/project.NIST.hc.snps.indels.chr1_1000000_2000000.vaf_replicates.vcf")
+            __name__,
+            "resources/results/project.NIST.hc.snps.indels.chr1_1000000_2000000.vaf_replicates.vcf",
+        )
         bam_file = pkg_resources.resource_filename(
             __name__,
-            "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam")
+            "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam",
+        )
         start = time.time()
-        annotator = Annotator(input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam_file]})
+        annotator = Annotator(
+            input_vcf=input_file,
+            output_vcf=output_vcf,
+            input_bams={"normal": [bam_file]},
+        )
         annotator.run()
         duration = time.time() - start
         logger.info("Duration {} seconds".format(round(duration, 3)))
 
-        self._assert_vafator_vcf(output_vcf, sample_name='normal')
-        self._assert_vafator_vcf(output_vcf, sample_name='normal', replicate=1)
-        self._assert_vafator_vcf(output_vcf, sample_name='normal', replicate=2)
+        self._assert_vafator_vcf(output_vcf, sample_name="normal")
+        self._assert_vafator_vcf(output_vcf, sample_name="normal", replicate=1)
+        self._assert_vafator_vcf(output_vcf, sample_name="normal", replicate=2)
 
     def test_annotator_bams_order(self):
         input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf")
-        output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_annotator1_output.vcf")
-        output_vcf_2 = pkg_resources.resource_filename(__name__, "resources/results/test_annotator2_output.vcf")
+        output_vcf = pkg_resources.resource_filename(
+            __name__, "resources/results/test_annotator1_output.vcf"
+        )
+        output_vcf_2 = pkg_resources.resource_filename(
+            __name__, "resources/results/test_annotator2_output.vcf"
+        )
         bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam")
         bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam")
 
-        Annotator(input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam1], "tumor": [bam2]}).run()
-        Annotator(input_vcf=input_file, output_vcf=output_vcf_2, input_bams={"tumor": [bam2], "normal": [bam1]}).run()
+        Annotator(
+            input_vcf=input_file,
+            output_vcf=output_vcf,
+            input_bams={"normal": [bam1], "tumor": [bam2]},
+        ).run()
+        Annotator(
+            input_vcf=input_file,
+            output_vcf=output_vcf_2,
+            input_bams={"tumor": [bam2], "normal": [bam1]},
+        ).run()
 
         self.assertTrue(os.path.exists(output_vcf))
         self.assertTrue(os.path.exists(output_vcf_2))
 
-        self._assert_vafator_vcf(output_vcf, sample_name='normal')
-        self._assert_vafator_vcf(output_vcf, sample_name='tumor')
-        self._assert_vafator_vcf(output_vcf_2, sample_name='normal')
-        self._assert_vafator_vcf(output_vcf_2, sample_name='tumor')
+        self._assert_vafator_vcf(output_vcf, sample_name="normal")
+        self._assert_vafator_vcf(output_vcf, sample_name="tumor")
+        self._assert_vafator_vcf(output_vcf_2, sample_name="normal")
+        self._assert_vafator_vcf(output_vcf_2, sample_name="tumor")
 
         vcf = VCF(output_vcf)
         vcf_2 = VCF(output_vcf_2)
@@ -253,20 +342,31 @@ def test_annotator_bams_order(self):
                 self.assertEqual(
                     v.INFO.get("normal_{}".format(a), ""),
                     v2.INFO.get("normal_{}".format(a), ""),
-                    "Variant {}:{}:{}>{} is missing annotation normal_{}".format(v.CHROM, v.POS, v.REF, v.ALT[0], a))
+                    "Variant {}:{}:{}>{} is missing annotation normal_{}".format(
+                        v.CHROM, v.POS, v.REF, v.ALT[0], a
+                    ),
+                )
                 self.assertEqual(
                     v.INFO.get("tumor_{}".format(a), ""),
                     v2.INFO.get("tumor_{}".format(a), ""),
-                    "Variant {}:{}:{}>{} is missing annotation tumor_{}".format(v.CHROM, v.POS, v.REF, v.ALT[0], a))
+                    "Variant {}:{}:{}>{} is missing annotation tumor_{}".format(
+                        v.CHROM, v.POS, v.REF, v.ALT[0], a
+                    ),
+                )
 
     def test_annotator_with_purities(self):
         input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf")
-        output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_annotator1_output.vcf")
+        output_vcf = pkg_resources.resource_filename(
+            __name__, "resources/results/test_annotator1_output.vcf"
+        )
         bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam")
         bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam")
         annotator = Annotator(
-            input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam1], "tumor": [bam2]},
-            purities={"tumor": 0.8}, tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.8)}
+            input_vcf=input_file,
+            output_vcf=output_vcf,
+            input_bams={"normal": [bam1], "tumor": [bam2]},
+            purities={"tumor": 0.8},
+            tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.8)},
         )
         annotator.run()
 
@@ -281,8 +381,11 @@ def test_annotator_with_purities(self):
             self.assertTrue("normal_{}".format(a) in info_annotations)
 
         annotator = Annotator(
-            input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam1], "tumor": [bam2]},
-            purities={"tumor": 0.2}, tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=1.5)}
+            input_vcf=input_file,
+            output_vcf=output_vcf,
+            input_bams={"normal": [bam1], "tumor": [bam2]},
+            purities={"tumor": 0.2},
+            tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=1.5)},
         )
         annotator.run()
 
@@ -291,18 +394,69 @@ def _assert_vafator_vcf(self, vcf_filename, sample_name, replicate=None):
         vcf = VCF(vcf_filename)
         for v in vcf:
             # p-values or VAFs
-            self._assert_probability(v.INFO.get(self._get_annotation_name('rsbq_pv', sample_name, replicate=replicate), 0))
-            self._assert_probability(v.INFO.get(self._get_annotation_name('rsmq_pv', sample_name, replicate=replicate), 0))
-            self._assert_probability(v.INFO.get(self._get_annotation_name('rspos_pv', sample_name, replicate=replicate), 0))
-            self._assert_probability(v.INFO.get(self._get_annotation_name('eaf', sample_name, replicate=replicate), 0))
-            self._assert_probability(v.INFO.get(self._get_annotation_name('af', sample_name, replicate=replicate), 0))
+            self._assert_probability(
+                v.INFO.get(
+                    self._get_annotation_name(
+                        "rsbq_pv", sample_name, replicate=replicate
+                    ),
+                    0,
+                )
+            )
+            self._assert_probability(
+                v.INFO.get(
+                    self._get_annotation_name(
+                        "rsmq_pv", sample_name, replicate=replicate
+                    ),
+                    0,
+                )
+            )
+            self._assert_probability(
+                v.INFO.get(
+                    self._get_annotation_name(
+                        "rspos_pv", sample_name, replicate=replicate
+                    ),
+                    0,
+                )
+            )
+            self._assert_probability(
+                v.INFO.get(
+                    self._get_annotation_name("eaf", sample_name, replicate=replicate),
+                    0,
+                )
+            )
+            self._assert_probability(
+                v.INFO.get(
+                    self._get_annotation_name("af", sample_name, replicate=replicate), 0
+                )
+            )
 
             # positive integer annotations
-            self._assert_positive_integer(v.INFO.get(self._get_annotation_name('ac', sample_name, replicate=replicate), 0))
-            self._assert_positive_integer(v.INFO.get(self._get_annotation_name('dp', sample_name, replicate=replicate), 0))
-            self._assert_positive_integer(v.INFO.get(self._get_annotation_name('mq', sample_name, replicate=replicate), 0))
-            self._assert_positive_integer(v.INFO.get(self._get_annotation_name('bq', sample_name, replicate=replicate), 0))
-            self._assert_positive_integer(v.INFO.get(self._get_annotation_name('pos', sample_name, replicate=replicate), 0))
+            self._assert_positive_integer(
+                v.INFO.get(
+                    self._get_annotation_name("ac", sample_name, replicate=replicate), 0
+                )
+            )
+            self._assert_positive_integer(
+                v.INFO.get(
+                    self._get_annotation_name("dp", sample_name, replicate=replicate), 0
+                )
+            )
+            self._assert_positive_integer(
+                v.INFO.get(
+                    self._get_annotation_name("mq", sample_name, replicate=replicate), 0
+                )
+            )
+            self._assert_positive_integer(
+                v.INFO.get(
+                    self._get_annotation_name("bq", sample_name, replicate=replicate), 0
+                )
+            )
+            self._assert_positive_integer(
+                v.INFO.get(
+                    self._get_annotation_name("pos", sample_name, replicate=replicate),
+                    0,
+                )
+            )
         vcf.close()
 
     @staticmethod
@@ -315,13 +469,25 @@ def _get_annotation_name(annotation_name, sample_name, replicate=None):
     def _assert_probability(self, annotation):
         if isinstance(annotation, list) or isinstance(annotation, tuple):
             for a in annotation:
-                self.assertTrue(0.0 <= float(a) <= 1.0, "Expected probability has a value of {}".format(a))
+                self.assertTrue(
+                    0.0 <= float(a) <= 1.0,
+                    "Expected probability has a value of {}".format(a),
+                )
         else:
-            self.assertTrue(0.0 <= float(annotation) <= 1.0, "Expected probability has a value of {}".format(annotation))
+            self.assertTrue(
+                0.0 <= float(annotation) <= 1.0,
+                "Expected probability has a value of {}".format(annotation),
+            )
 
     def _assert_positive_integer(self, annotation):
         if isinstance(annotation, list) or isinstance(annotation, tuple):
             for a in annotation:
-                self.assertTrue(np.isnan(a) or 0.0 <= a, "Expected positive integer has a value of {}".format(a))
+                self.assertTrue(
+                    np.isnan(a) or 0.0 <= a,
+                    "Expected positive integer has a value of {}".format(a),
+                )
         else:
-            self.assertTrue(np.isnan(annotation) or 0.0 <= annotation, "Expected positive integer has a value of {}".format(annotation))
+            self.assertTrue(
+                np.isnan(annotation) or 0.0 <= annotation,
+                "Expected positive integer has a value of {}".format(annotation),
+            )
diff --git a/vafator/tests/test_hatchet2bed.py b/vafator/tests/test_hatchet2bed.py
index 814651f..9e5ee47 100644
--- a/vafator/tests/test_hatchet2bed.py
+++ b/vafator/tests/test_hatchet2bed.py
@@ -8,10 +8,24 @@ class Hatchet2bedTest(TestCase):
 
     def test_hatchet2bed(self):
         run_hatchet2bed(
-            input_file=pkg_resources.resource_filename(__name__, "resources/best.seg.minimal.ucn"),
-            output_prefix=pkg_resources.resource_filename(__name__, "resources/best.seg.minimal")
+            input_file=pkg_resources.resource_filename(
+                __name__, "resources/best.seg.minimal.ucn"
+            ),
+            output_prefix=pkg_resources.resource_filename(
+                __name__, "resources/best.seg.minimal"
+            ),
         )
         self.assertTrue(
-            os.path.exists(pkg_resources.resource_filename(__name__, "resources/best.seg.minimal.my_tumor.bed")))
+            os.path.exists(
+                pkg_resources.resource_filename(
+                    __name__, "resources/best.seg.minimal.my_tumor.bed"
+                )
+            )
+        )
         self.assertTrue(
-            os.path.exists(pkg_resources.resource_filename(__name__, "resources/best.seg.minimal.my_metastasis.bed")))
+            os.path.exists(
+                pkg_resources.resource_filename(
+                    __name__, "resources/best.seg.minimal.my_metastasis.bed"
+                )
+            )
+        )
diff --git a/vafator/tests/test_multiallelic_filter.py b/vafator/tests/test_multiallelic_filter.py
index bdc15a0..1bc8f15 100755
--- a/vafator/tests/test_multiallelic_filter.py
+++ b/vafator/tests/test_multiallelic_filter.py
@@ -14,8 +14,12 @@ def setUp(self):
 
     def test_no_variants_filtered(self):
         input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf")
-        output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test1_output.vcf")
-        multiallelic_filter = MultiallelicFilter(input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name='tumor')
+        output_vcf = pkg_resources.resource_filename(
+            __name__, "resources/results/test1_output.vcf"
+        )
+        multiallelic_filter = MultiallelicFilter(
+            input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name="tumor"
+        )
         multiallelic_filter.run()
 
         self.assertTrue(os.path.exists(output_vcf))
@@ -25,33 +29,53 @@ def test_no_variants_filtered(self):
 
     def test_two_variants_filtered(self):
         input_file = pkg_resources.resource_filename(__name__, "resources/test2.vcf")
-        output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test2_output.vcf")
-        multiallelic_filter = MultiallelicFilter(input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name='tumor')
+        output_vcf = pkg_resources.resource_filename(
+            __name__, "resources/results/test2_output.vcf"
+        )
+        multiallelic_filter = MultiallelicFilter(
+            input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name="tumor"
+        )
         multiallelic_filter.run()
 
         self.assertTrue(os.path.exists(output_vcf))
         n_variants_input = test_utils._get_count_variants(input_file)
         n_variants_output = test_utils._get_count_variants(output_vcf)
-        self.assertTrue(n_variants_input == n_variants_output + 3, "input:{}; output:{}".format(
-            n_variants_input, n_variants_output))
-
-        af1 = self._get_info_at(output_vcf, chromosome="chr4", position=1235, annotation='tumor_af')
+        self.assertTrue(
+            n_variants_input == n_variants_output + 3,
+            "input:{}; output:{}".format(n_variants_input, n_variants_output),
+        )
+
+        af1 = self._get_info_at(
+            output_vcf, chromosome="chr4", position=1235, annotation="tumor_af"
+        )
         self.assertTrue(af1, 0.2)
-        multiallelic1 = self._get_info_at(output_vcf, chromosome="chr4", position=1235, annotation='multiallelic')
+        multiallelic1 = self._get_info_at(
+            output_vcf, chromosome="chr4", position=1235, annotation="multiallelic"
+        )
         self.assertTrue(multiallelic1, "T,0.1")
 
-        af2 = self._get_info_at(output_vcf, chromosome="chr6", position=1235, annotation='tumor_af')
+        af2 = self._get_info_at(
+            output_vcf, chromosome="chr6", position=1235, annotation="tumor_af"
+        )
         self.assertTrue(af2, 0.2)
-        multiallelic2 = self._get_info_at(output_vcf, chromosome="chr6", position=1235, annotation='multiallelic')
+        multiallelic2 = self._get_info_at(
+            output_vcf, chromosome="chr6", position=1235, annotation="multiallelic"
+        )
         self.assertTrue(multiallelic2, "A,0.01")
 
-        af3 = self._get_info_at(output_vcf, chromosome="chr6", position=1234, annotation='tumor_af')
+        af3 = self._get_info_at(
+            output_vcf, chromosome="chr6", position=1234, annotation="tumor_af"
+        )
         self.assertTrue(af3, 0.5)
 
     def test_different_reference_is_kept(self):
         input_file = pkg_resources.resource_filename(__name__, "resources/test3.vcf")
-        output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test3_output.vcf")
-        multiallelic_filter = MultiallelicFilter(input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name='tumor')
+        output_vcf = pkg_resources.resource_filename(
+            __name__, "resources/results/test3_output.vcf"
+        )
+        multiallelic_filter = MultiallelicFilter(
+            input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name="tumor"
+        )
         multiallelic_filter.run()
 
         self.assertTrue(os.path.exists(output_vcf))
@@ -62,8 +86,12 @@ def test_different_reference_is_kept(self):
 
     def test_three_multiallelics(self):
         input_file = pkg_resources.resource_filename(__name__, "resources/test4.vcf")
-        output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test4_output.vcf")
-        multiallelic_filter = MultiallelicFilter(input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name='tumor')
+        output_vcf = pkg_resources.resource_filename(
+            __name__, "resources/results/test4_output.vcf"
+        )
+        multiallelic_filter = MultiallelicFilter(
+            input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name="tumor"
+        )
         multiallelic_filter.run()
 
         self.assertTrue(os.path.exists(output_vcf))
@@ -74,8 +102,12 @@ def test_three_multiallelics(self):
 
     def test_equal_af(self):
         input_file = pkg_resources.resource_filename(__name__, "resources/test5.vcf")
-        output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test5_output.vcf")
-        multiallelic_filter = MultiallelicFilter(input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name='tumor')
+        output_vcf = pkg_resources.resource_filename(
+            __name__, "resources/results/test5_output.vcf"
+        )
+        multiallelic_filter = MultiallelicFilter(
+            input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name="tumor"
+        )
         multiallelic_filter.run()
 
         self.assertTrue(os.path.exists(output_vcf))
@@ -92,4 +124,4 @@ def _get_info_at(self, input_file, chromosome, position, annotation):
                 vcf.close()
                 return v.INFO.get(annotation)
         vcf.close()
-        return {}
\ No newline at end of file
+        return {}
diff --git a/vafator/tests/test_pileups.py b/vafator/tests/test_pileups.py
index c0450e2..6ead2cd 100644
--- a/vafator/tests/test_pileups.py
+++ b/vafator/tests/test_pileups.py
@@ -5,7 +5,8 @@
 
 from vafator.tests.utils import VafatorVariant
 from vafator.pileups import (
-    get_variant_pileup, _get_metrics_from_column,
+    get_variant_pileup,
+    _get_metrics_from_column,
 )
 
 
@@ -15,66 +16,127 @@ class TestPileups(TestCase):
     min_mapping_quality = 0
     bam_file = pkg_resources.resource_filename(
         __name__,
-        "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam")
+        "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam",
+    )
     bam_reader = pysam.AlignmentFile(bam_file)
 
     def test_snv_metrics(self):
-        variant = VafatorVariant(chromosome="chr1", position=1017341, reference="G", alternative=["T"])
-        self._assert_metrics(variant=variant, expected_ac={'G': 5, 'T': 6}, expected_dp=11)
-        self._assert_metrics(variant=variant, expected_ac={'G': 1, 'T': 2}, expected_dp=3, min_base_quality=40)
+        variant = VafatorVariant(
+            chromosome="chr1", position=1017341, reference="G", alternative=["T"]
+        )
+        self._assert_metrics(
+            variant=variant, expected_ac={"G": 5, "T": 6}, expected_dp=11
+        )
+        self._assert_metrics(
+            variant=variant,
+            expected_ac={"G": 1, "T": 2},
+            expected_dp=3,
+            min_base_quality=40,
+        )
 
     def test_snv_metrics_2(self):
-        variant = VafatorVariant(chromosome="chr1", position=1018144, reference="T", alternative=["C"])
-        self._assert_metrics(variant=variant, expected_ac={'C': 9, 'T': 11}, expected_dp=20)
-        self._assert_metrics(variant=variant, expected_ac={'C': 3, 'T': 4}, expected_dp=7, min_base_quality=40)
-        self._assert_metrics(variant=variant, expected_ac=Counter(), expected_dp=0, min_mapping_quality=65)
+        variant = VafatorVariant(
+            chromosome="chr1", position=1018144, reference="T", alternative=["C"]
+        )
+        self._assert_metrics(
+            variant=variant, expected_ac={"C": 9, "T": 11}, expected_dp=20
+        )
+        self._assert_metrics(
+            variant=variant,
+            expected_ac={"C": 3, "T": 4},
+            expected_dp=7,
+            min_base_quality=40,
+        )
+        self._assert_metrics(
+            variant=variant,
+            expected_ac=Counter(),
+            expected_dp=0,
+            min_mapping_quality=65,
+        )
 
     def test_insertion_metrics(self):
         # variant called in the VCF shows no read support (!?)
-        variant = VafatorVariant(chromosome="chr1", position=1247578, reference="T", alternative=["TGG"])
-        self._assert_metrics(variant=variant, expected_ac={'TGG': 0}, expected_dp=3)
+        variant = VafatorVariant(
+            chromosome="chr1", position=1247578, reference="T", alternative=["TGG"]
+        )
+        self._assert_metrics(variant=variant, expected_ac={"TGG": 0}, expected_dp=3)
         # there is one read supporting this insertion of 3 Gs
-        variant = VafatorVariant(chromosome="chr1", position=1247578, reference="T", alternative=["TGGG"])
-        self._assert_metrics(variant=variant, expected_ac={'TGGG': 1}, expected_dp=3)
+        variant = VafatorVariant(
+            chromosome="chr1", position=1247578, reference="T", alternative=["TGGG"]
+        )
+        self._assert_metrics(variant=variant, expected_ac={"TGGG": 1}, expected_dp=3)
         # this ensures that the insertion sequence is checked not only the insertion length!
-        variant = VafatorVariant(chromosome="chr1", position=1247578, reference="T", alternative=["TGGA"])
-        self._assert_metrics(variant=variant, expected_ac={'TGGA': 0}, expected_dp=3)
+        variant = VafatorVariant(
+            chromosome="chr1", position=1247578, reference="T", alternative=["TGGA"]
+        )
+        self._assert_metrics(variant=variant, expected_ac={"TGGA": 0}, expected_dp=3)
         # there is one read supporting this insertion of 4 Gs
-        variant = VafatorVariant(chromosome="chr1", position=1247578, reference="T", alternative=["TGGGG"])
-        self._assert_metrics(variant=variant, expected_ac={'TGGGG': 1}, expected_dp=3)
+        variant = VafatorVariant(
+            chromosome="chr1", position=1247578, reference="T", alternative=["TGGGG"]
+        )
+        self._assert_metrics(variant=variant, expected_ac={"TGGGG": 1}, expected_dp=3)
         # there is no read supporting an insertion of 5 Gs
-        variant = VafatorVariant(chromosome="chr1", position=1247578, reference="T", alternative=["TGGGGG"])
-        self._assert_metrics(variant=variant, expected_ac={'TGGGGG': 0}, expected_dp=3)
+        variant = VafatorVariant(
+            chromosome="chr1", position=1247578, reference="T", alternative=["TGGGGG"]
+        )
+        self._assert_metrics(variant=variant, expected_ac={"TGGGGG": 0}, expected_dp=3)
 
     def test_insertion_metrics_2(self):
-        variant = VafatorVariant(chromosome="chr1", position=1594199, reference="C", alternative=["CT"])
-        self._assert_metrics(variant=variant, expected_ac={'CT': 9}, expected_dp=11)
-        self._assert_metrics(variant=variant, expected_ac={'CT': 5}, expected_dp=7, min_mapping_quality=40)
-        self._assert_metrics(variant=variant, expected_ac={'CT': 4}, expected_dp=4, min_base_quality=40)
+        variant = VafatorVariant(
+            chromosome="chr1", position=1594199, reference="C", alternative=["CT"]
+        )
+        self._assert_metrics(variant=variant, expected_ac={"CT": 9}, expected_dp=11)
+        self._assert_metrics(
+            variant=variant,
+            expected_ac={"CT": 5},
+            expected_dp=7,
+            min_mapping_quality=40,
+        )
+        self._assert_metrics(
+            variant=variant, expected_ac={"CT": 4}, expected_dp=4, min_base_quality=40
+        )
 
     def test_deletion_metrics(self):
-        variant = VafatorVariant(chromosome="chr1", position=1510035, reference="GGC", alternative=["G"])
-        self._assert_metrics(variant=variant, expected_ac={'G': 12}, expected_dp=13)
-        self._assert_metrics(variant=variant, expected_ac={'G': 0}, expected_dp=0, min_mapping_quality=61)
-        self._assert_metrics(variant=variant, expected_ac={'G': 3}, expected_dp=3, min_base_quality=40)
+        variant = VafatorVariant(
+            chromosome="chr1", position=1510035, reference="GGC", alternative=["G"]
+        )
+        self._assert_metrics(variant=variant, expected_ac={"G": 12}, expected_dp=13)
+        self._assert_metrics(
+            variant=variant, expected_ac={"G": 0}, expected_dp=0, min_mapping_quality=61
+        )
+        self._assert_metrics(
+            variant=variant, expected_ac={"G": 3}, expected_dp=3, min_base_quality=40
+        )
         # deletions with a reference sequence not matching the reference would be matched
         # vafator expects correct indel calls
-        variant = VafatorVariant(chromosome="chr1", position=1510035, reference="GCC", alternative=["G"])
-        self._assert_metrics(variant=variant, expected_ac={'G': 12}, expected_dp=13)
+        variant = VafatorVariant(
+            chromosome="chr1", position=1510035, reference="GCC", alternative=["G"]
+        )
+        self._assert_metrics(variant=variant, expected_ac={"G": 12}, expected_dp=13)
 
-    def _assert_metrics(self, variant, expected_ac, expected_dp,
-                        min_base_quality=0, min_mapping_quality=0):
+    def _assert_metrics(
+        self,
+        variant,
+        expected_ac,
+        expected_dp,
+        min_base_quality=0,
+        min_mapping_quality=0,
+    ):
         pileups = get_variant_pileup(
-            variant=variant, bam=self.bam_reader,
-            min_base_quality=min_base_quality, min_mapping_quality=min_mapping_quality)
+            variant=variant,
+            bam=self.bam_reader,
+            min_base_quality=min_base_quality,
+            min_mapping_quality=min_mapping_quality,
+        )
         pileup_col = next(iter(pileups), None)
         if pileup_col is None:
             coverage_metrics = None
         else:
             coverage_metrics = _get_metrics_from_column(
-                variant=variant, pileup_col=pileup_col, include_ambiguous_bases=True)
+                variant=variant, pileup_col=pileup_col, include_ambiguous_bases=True
+            )
         if expected_dp == 0:
             self.assertIsNone(coverage_metrics)
         else:
             self.assertEqual(expected_ac, coverage_metrics.ac)
-            self.assertEqual(expected_dp, coverage_metrics.dp)
\ No newline at end of file
+            self.assertEqual(expected_dp, coverage_metrics.dp)
diff --git a/vafator/tests/test_ploidy_manager.py b/vafator/tests/test_ploidy_manager.py
index f90746f..4906095 100644
--- a/vafator/tests/test_ploidy_manager.py
+++ b/vafator/tests/test_ploidy_manager.py
@@ -8,40 +8,106 @@
 class PloidyManagerTest(TestCase):
 
     def test_default_ploidy_manager(self):
-        self.assertEqual(PloidyManager().get_ploidy(
-            variant=VafatorVariant(chromosome="chr1", position=12345, reference="A", alternative="C")), 2.0)
-        self.assertEqual(default_ploidy_manager.get_ploidy(
-            variant=VafatorVariant(chromosome="chr1", position=12345, reference="A", alternative="C")), 2.0)
+        self.assertEqual(
+            PloidyManager().get_ploidy(
+                variant=VafatorVariant(
+                    chromosome="chr1", position=12345, reference="A", alternative="C"
+                )
+            ),
+            2.0,
+        )
+        self.assertEqual(
+            default_ploidy_manager.get_ploidy(
+                variant=VafatorVariant(
+                    chromosome="chr1", position=12345, reference="A", alternative="C"
+                )
+            ),
+            2.0,
+        )
         self.assertEqual(default_ploidy_manager.get_ploidy(variant=None), 2.0)
 
     def test_genome_wide_ploidy_manager(self):
-        self.assertEqual(PloidyManager(genome_wide_ploidy=3.2).get_ploidy(
-            variant=VafatorVariant(chromosome="chr1", position=12345, reference="A", alternative="C")), 3.2)
-        self.assertEqual(PloidyManager(genome_wide_ploidy=3.2).get_ploidy(variant=None), 3.2)
+        self.assertEqual(
+            PloidyManager(genome_wide_ploidy=3.2).get_ploidy(
+                variant=VafatorVariant(
+                    chromosome="chr1", position=12345, reference="A", alternative="C"
+                )
+            ),
+            3.2,
+        )
+        self.assertEqual(
+            PloidyManager(genome_wide_ploidy=3.2).get_ploidy(variant=None), 3.2
+        )
 
     def test_local_copy_numbers_ploidy_manager(self):
-        input_bed = pkg_resources.resource_filename(__name__, "resources/test_copy_numbers.bed")
-        self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy(
-            variant=VafatorVariant(chromosome="chr1", position=12345, reference="A", alternative="C")), 1.2)
-        self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy(
-            variant=VafatorVariant(chromosome="chr2", position=12345, reference="A", alternative="C")), 3.2)
+        input_bed = pkg_resources.resource_filename(
+            __name__, "resources/test_copy_numbers.bed"
+        )
+        self.assertEqual(
+            PloidyManager(local_copy_numbers=input_bed).get_ploidy(
+                variant=VafatorVariant(
+                    chromosome="chr1", position=12345, reference="A", alternative="C"
+                )
+            ),
+            1.2,
+        )
+        self.assertEqual(
+            PloidyManager(local_copy_numbers=input_bed).get_ploidy(
+                variant=VafatorVariant(
+                    chromosome="chr2", position=12345, reference="A", alternative="C"
+                )
+            ),
+            3.2,
+        )
         # test non existing interval
-        self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy(
-            variant=VafatorVariant(chromosome="chr3", position=12345, reference="A", alternative="C")), 2.0)
+        self.assertEqual(
+            PloidyManager(local_copy_numbers=input_bed).get_ploidy(
+                variant=VafatorVariant(
+                    chromosome="chr3", position=12345, reference="A", alternative="C"
+                )
+            ),
+            2.0,
+        )
 
     def test_interval_boundaries(self):
-        input_bed = pkg_resources.resource_filename(__name__, "resources/test_copy_numbers.bed")
+        input_bed = pkg_resources.resource_filename(
+            __name__, "resources/test_copy_numbers.bed"
+        )
         # lower boundary — POS 10000 is 0-based 9999, outside interval start 10000
-        self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy(
-            variant=VafatorVariant(chromosome="chr1", position=10000, reference="A", alternative="C")), 2.0)
+        self.assertEqual(
+            PloidyManager(local_copy_numbers=input_bed).get_ploidy(
+                variant=VafatorVariant(
+                    chromosome="chr1", position=10000, reference="A", alternative="C"
+                )
+            ),
+            2.0,
+        )
         # just inside lower boundary
-        self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy(
-            variant=VafatorVariant(chromosome="chr1", position=10001, reference="A", alternative="C")), 1.2)
+        self.assertEqual(
+            PloidyManager(local_copy_numbers=input_bed).get_ploidy(
+                variant=VafatorVariant(
+                    chromosome="chr1", position=10001, reference="A", alternative="C"
+                )
+            ),
+            1.2,
+        )
         # upper boundary
-        self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy(
-            variant=VafatorVariant(chromosome="chr1", position=20000, reference="A", alternative="C")), 1.2)
-        self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy(
-            variant=VafatorVariant(chromosome="chr1", position=20001, reference="A", alternative="C")), 2.1)
+        self.assertEqual(
+            PloidyManager(local_copy_numbers=input_bed).get_ploidy(
+                variant=VafatorVariant(
+                    chromosome="chr1", position=20000, reference="A", alternative="C"
+                )
+            ),
+            1.2,
+        )
+        self.assertEqual(
+            PloidyManager(local_copy_numbers=input_bed).get_ploidy(
+                variant=VafatorVariant(
+                    chromosome="chr1", position=20001, reference="A", alternative="C"
+                )
+            ),
+            2.1,
+        )
 
     def test_invalid_bed_raises(self):
         with self.assertRaises(ValueError):
diff --git a/vafator/tests/test_power_calculator.py b/vafator/tests/test_power_calculator.py
index 0c7612a..c5371f9 100644
--- a/vafator/tests/test_power_calculator.py
+++ b/vafator/tests/test_power_calculator.py
@@ -10,42 +10,100 @@ class PowerCalculatorTest(TestCase):
 
     def test_power_calculator(self):
         power = PowerCalculator(
-            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.5)}, purities={'tumor': 0.8})
-        self.assertAlmostEqual(power.calculate_power(dp=10, ac=0, sample='tumor', variant=None), 0.01734)
-        self.assertAlmostEqual(power.calculate_power(dp=10, ac=1, sample='tumor', variant=None), 0.10404917949499565, 5)
-        self.assertAlmostEqual(power.calculate_power(dp=10, ac=2, sample='tumor', variant=None), 0.29914139104811255, 5)
-        self.assertAlmostEqual(power.calculate_power(dp=10, ac=3, sample='tumor', variant=None), 0.5592643397856016, 5)
-        self.assertAlmostEqual(power.calculate_power(dp=10, ac=4, sample='tumor', variant=None), 0.7868719199309048, 5)
-        self.assertAlmostEqual(power.calculate_power(dp=10, ac=5, sample='tumor', variant=None), 0.9234364680180867, 5)
-        self.assertAlmostEqual(power.calculate_power(dp=10, ac=6, sample='tumor', variant=None), 0.9803383630544125, 5)
-        self.assertAlmostEqual(power.calculate_power(dp=10, ac=7, sample='tumor', variant=None), 0.9965960473505056, 5)
-        self.assertAlmostEqual(power.calculate_power(dp=10, ac=8, sample='tumor', variant=None), 0.999644363156023, 5)
-        self.assertAlmostEqual(power.calculate_power(dp=10, ac=9, sample='tumor', variant=None), 0.9999830649121916, 5)
-        self.assertAlmostEqual(power.calculate_power(dp=10, ac=10, sample='tumor', variant=None), 1.0)
-        self.assertAlmostEqual(power.calculate_power(dp=10, ac=11, sample='tumor', variant=None), 1.0)
+            tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.5)},
+            purities={"tumor": 0.8},
+        )
+        self.assertAlmostEqual(
+            power.calculate_power(dp=10, ac=0, sample="tumor", variant=None), 0.01734
+        )
+        self.assertAlmostEqual(
+            power.calculate_power(dp=10, ac=1, sample="tumor", variant=None),
+            0.10404917949499565,
+            5,
+        )
+        self.assertAlmostEqual(
+            power.calculate_power(dp=10, ac=2, sample="tumor", variant=None),
+            0.29914139104811255,
+            5,
+        )
+        self.assertAlmostEqual(
+            power.calculate_power(dp=10, ac=3, sample="tumor", variant=None),
+            0.5592643397856016,
+            5,
+        )
+        self.assertAlmostEqual(
+            power.calculate_power(dp=10, ac=4, sample="tumor", variant=None),
+            0.7868719199309048,
+            5,
+        )
+        self.assertAlmostEqual(
+            power.calculate_power(dp=10, ac=5, sample="tumor", variant=None),
+            0.9234364680180867,
+            5,
+        )
+        self.assertAlmostEqual(
+            power.calculate_power(dp=10, ac=6, sample="tumor", variant=None),
+            0.9803383630544125,
+            5,
+        )
+        self.assertAlmostEqual(
+            power.calculate_power(dp=10, ac=7, sample="tumor", variant=None),
+            0.9965960473505056,
+            5,
+        )
+        self.assertAlmostEqual(
+            power.calculate_power(dp=10, ac=8, sample="tumor", variant=None),
+            0.999644363156023,
+            5,
+        )
+        self.assertAlmostEqual(
+            power.calculate_power(dp=10, ac=9, sample="tumor", variant=None),
+            0.9999830649121916,
+            5,
+        )
+        self.assertAlmostEqual(
+            power.calculate_power(dp=10, ac=10, sample="tumor", variant=None), 1.0
+        )
+        self.assertAlmostEqual(
+            power.calculate_power(dp=10, ac=11, sample="tumor", variant=None), 1.0
+        )
 
     def test_zero_dp_returns_zero_power(self):
         power = PowerCalculator(
-            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8})
-        self.assertEqual(power.calculate_power(dp=0, ac=0, sample='tumor', variant=None), 1.0)
+            tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.0)},
+            purities={"tumor": 0.8},
+        )
+        self.assertEqual(
+            power.calculate_power(dp=0, ac=0, sample="tumor", variant=None), 1.0
+        )
 
     def test_eaf_copy_number_below_one(self):
         power = PowerCalculator(
-            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=0.5)}, purities={'tumor': 0.9})
-        self.assertLessEqual(power.calculate_expected_vaf(sample='tumor', variant=None), 1.0)
+            tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=0.5)},
+            purities={"tumor": 0.9},
+        )
+        self.assertLessEqual(
+            power.calculate_expected_vaf(sample="tumor", variant=None), 1.0
+        )
 
     def test_eaf_is_cached(self):
         power = PowerCalculator(
-            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8})
-        v1 = VafatorVariant(chromosome='chr1', position=100, reference='A', alternative='G')
-        eaf1 = power.calculate_expected_vaf(sample='tumor', variant=v1)
-        eaf2 = power.calculate_expected_vaf(sample='tumor', variant=v1)
+            tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.0)},
+            purities={"tumor": 0.8},
+        )
+        v1 = VafatorVariant(
+            chromosome="chr1", position=100, reference="A", alternative="G"
+        )
+        eaf1 = power.calculate_expected_vaf(sample="tumor", variant=v1)
+        eaf2 = power.calculate_expected_vaf(sample="tumor", variant=v1)
         self.assertEqual(eaf1, eaf2)
         self.assertEqual(len(power._eaf_cache), 1)
 
     def test_k_is_cached(self):
         power = PowerCalculator(
-            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8})
+            tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.0)},
+            purities={"tumor": 0.8},
+        )
         k1 = power._calculate_k(100)
         k2 = power._calculate_k(100)
         self.assertEqual(k1, k2)
@@ -53,108 +111,156 @@ def test_k_is_cached(self):
 
     def test_higher_coverage_higher_power(self):
         power = PowerCalculator(
-            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8})
-        p_low, _ = power.calculate_absolute_power(dp=10, sample='tumor', variant=None)
-        p_high, _ = power.calculate_absolute_power(dp=100, sample='tumor', variant=None)
+            tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.0)},
+            purities={"tumor": 0.8},
+        )
+        p_low, _ = power.calculate_absolute_power(dp=10, sample="tumor", variant=None)
+        p_high, _ = power.calculate_absolute_power(dp=100, sample="tumor", variant=None)
         self.assertLess(p_low, p_high)
 
     def test_varying_purity(self):
         power1 = PowerCalculator(
-            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.5)}, purities={'tumor': 0.8})
+            tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.5)},
+            purities={"tumor": 0.8},
+        )
         power2 = PowerCalculator(
-            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.5)}, purities={'tumor': 0.6})
+            tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.5)},
+            purities={"tumor": 0.6},
+        )
         self.assertLess(
-            power1.calculate_power(dp=10, ac=2, sample='tumor', variant=None),
-            power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None))
+            power1.calculate_power(dp=10, ac=2, sample="tumor", variant=None),
+            power2.calculate_power(dp=10, ac=2, sample="tumor", variant=None),
+        )
         power3 = PowerCalculator(
-            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.5)}, purities={'tumor': 0.4})
+            tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.5)},
+            purities={"tumor": 0.4},
+        )
         self.assertLess(
-            power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None),
-            power3.calculate_power(dp=10, ac=2, sample='tumor', variant=None))
+            power2.calculate_power(dp=10, ac=2, sample="tumor", variant=None),
+            power3.calculate_power(dp=10, ac=2, sample="tumor", variant=None),
+        )
 
     def test_varying_ploidy(self):
         power1 = PowerCalculator(
-            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8})
+            tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.0)},
+            purities={"tumor": 0.8},
+        )
         power2 = PowerCalculator(
-            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=4.0)}, purities={'tumor': 0.8})
+            tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=4.0)},
+            purities={"tumor": 0.8},
+        )
         self.assertLess(
-            power1.calculate_power(dp=10, ac=2, sample='tumor', variant=None),
-            power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None))
+            power1.calculate_power(dp=10, ac=2, sample="tumor", variant=None),
+            power2.calculate_power(dp=10, ac=2, sample="tumor", variant=None),
+        )
 
         power3 = PowerCalculator(
-            tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=6.0)}, purities={'tumor': 0.8})
+            tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=6.0)},
+            purities={"tumor": 0.8},
+        )
         self.assertLess(
-            power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None),
-            power3.calculate_power(dp=10, ac=2, sample='tumor', variant=None))
+            power2.calculate_power(dp=10, ac=2, sample="tumor", variant=None),
+            power3.calculate_power(dp=10, ac=2, sample="tumor", variant=None),
+        )
 
     def test_local_copy_numbers(self):
-        input_bed = pkg_resources.resource_filename(__name__, "resources/test_copy_numbers.bed")
+        input_bed = pkg_resources.resource_filename(
+            __name__, "resources/test_copy_numbers.bed"
+        )
         power = PowerCalculator(
-            tumor_ploidies={'tumor': PloidyManager(local_copy_numbers=input_bed)}, purities={'tumor': 0.8})
+            tumor_ploidies={"tumor": PloidyManager(local_copy_numbers=input_bed)},
+            purities={"tumor": 0.8},
+        )
 
         self.assertNotEqual(
-            power.calculate_power(dp=10, ac=2, sample='tumor',
-                                  variant=VafatorVariant(
-                                      chromosome='chr1', position=10001, reference='A', alternative='G')),
-            power.calculate_power(dp=10, ac=2, sample='tumor',
-                                  variant=VafatorVariant(
-                                      chromosome='chr1', position=20001, reference='A', alternative='G')))
+            power.calculate_power(
+                dp=10,
+                ac=2,
+                sample="tumor",
+                variant=VafatorVariant(
+                    chromosome="chr1", position=10001, reference="A", alternative="G"
+                ),
+            ),
+            power.calculate_power(
+                dp=10,
+                ac=2,
+                sample="tumor",
+                variant=VafatorVariant(
+                    chromosome="chr1", position=20001, reference="A", alternative="G"
+                ),
+            ),
+        )
         self.assertEqual(
-            power.calculate_power(dp=10, ac=2, sample='tumor',
-                                  variant=VafatorVariant(
-                                      chromosome='chr1', position=10001, reference='A', alternative='G')),
-            power.calculate_power(dp=10, ac=2, sample='tumor',
-                                  variant=VafatorVariant(
-                                      chromosome='chr1', position=10002, reference='A', alternative='G')))
+            power.calculate_power(
+                dp=10,
+                ac=2,
+                sample="tumor",
+                variant=VafatorVariant(
+                    chromosome="chr1", position=10001, reference="A", alternative="G"
+                ),
+            ),
+            power.calculate_power(
+                dp=10,
+                ac=2,
+                sample="tumor",
+                variant=VafatorVariant(
+                    chromosome="chr1", position=10002, reference="A", alternative="G"
+                ),
+            ),
+        )
 
     def test_absolute_power_calculator(self):
-        ploidy_manager = {'tumor': PloidyManager(genome_wide_ploidy=2)}
-        calculator = PowerCalculator(tumor_ploidies=ploidy_manager, purities={'tumor': 0.8})
-        p, k = calculator.calculate_absolute_power(dp=100, sample='tumor', variant=None)
+        ploidy_manager = {"tumor": PloidyManager(genome_wide_ploidy=2)}
+        calculator = PowerCalculator(
+            tumor_ploidies=ploidy_manager, purities={"tumor": 0.8}
+        )
+        p, k = calculator.calculate_absolute_power(dp=100, sample="tumor", variant=None)
         self.assertEqual(p, 1.0)
         self.assertEqual(k, 4)
-        p, k = calculator.calculate_absolute_power(dp=50, sample='tumor', variant=None)
+        p, k = calculator.calculate_absolute_power(dp=50, sample="tumor", variant=None)
         self.assertEqual(p, 1.0)
         self.assertEqual(k, 4)
-        p, k = calculator.calculate_absolute_power(dp=10, sample='tumor', variant=None)
+        p, k = calculator.calculate_absolute_power(dp=10, sample="tumor", variant=None)
         self.assertEqual(p, 0.85408)
         self.assertEqual(k, 3)
-        p, k = calculator.calculate_absolute_power(dp=2, sample='tumor', variant=None)
+        p, k = calculator.calculate_absolute_power(dp=2, sample="tumor", variant=None)
         self.assertEqual(p, 0.16009)
         self.assertEqual(k, 2)
 
-        calculator = PowerCalculator(tumor_ploidies=ploidy_manager, purities={'tumor': 0.6})
-        p, k = calculator.calculate_absolute_power(
-            dp=100, sample='tumor', variant=None)
+        calculator = PowerCalculator(
+            tumor_ploidies=ploidy_manager, purities={"tumor": 0.6}
+        )
+        p, k = calculator.calculate_absolute_power(dp=100, sample="tumor", variant=None)
         self.assertEqual(p, 1.0)
         self.assertEqual(k, 4)
-        p, k = calculator.calculate_absolute_power(dp=50, sample='tumor', variant=None)
+        p, k = calculator.calculate_absolute_power(dp=50, sample="tumor", variant=None)
         self.assertEqual(p, 1.00007)
         self.assertEqual(k, 4)
-        p, k = calculator.calculate_absolute_power(dp=10, sample='tumor', variant=None)
+        p, k = calculator.calculate_absolute_power(dp=10, sample="tumor", variant=None)
         self.assertEqual(p, 0.64373)
         self.assertEqual(k, 3)
-        p, k = calculator.calculate_absolute_power(dp=2, sample='tumor', variant=None)
+        p, k = calculator.calculate_absolute_power(dp=2, sample="tumor", variant=None)
         self.assertEqual(p, 0.09005)
         self.assertEqual(k, 2)
 
-        calculator = PowerCalculator(tumor_ploidies=ploidy_manager, purities={'tumor': 0.1})
-        p, k = calculator.calculate_absolute_power(
-            dp=100, sample='tumor', variant=None)
+        calculator = PowerCalculator(
+            tumor_ploidies=ploidy_manager, purities={"tumor": 0.1}
+        )
+        p, k = calculator.calculate_absolute_power(dp=100, sample="tumor", variant=None)
         self.assertEqual(p, 0.75607)
         self.assertEqual(k, 4)
-        p, k = calculator.calculate_absolute_power(dp=50, sample='tumor', variant=None)
+        p, k = calculator.calculate_absolute_power(dp=50, sample="tumor", variant=None)
         self.assertEqual(p, 0.33419)
         self.assertEqual(k, 4)
-        p, k = calculator.calculate_absolute_power(dp=10, sample='tumor', variant=None)
+        p, k = calculator.calculate_absolute_power(dp=10, sample="tumor", variant=None)
         self.assertEqual(p, 0.01254)
         self.assertEqual(k, 3)
-        p, k = calculator.calculate_absolute_power(dp=2, sample='tumor', variant=None)
+        p, k = calculator.calculate_absolute_power(dp=2, sample="tumor", variant=None)
         self.assertEqual(p, 0.0025)
         self.assertEqual(k, 2)
 
     def test_default_purity_is_one(self):
         power = PowerCalculator(tumor_ploidies={}, purities={})
-        eaf = power.calculate_expected_vaf(sample='tumor', variant=None)
+        eaf = power.calculate_expected_vaf(sample="tumor", variant=None)
         # purity=1, normal_ploidy=2, tumor_ploidy=2 => eaf = 1/(1*2 + 0*2) = 0.5
         self.assertAlmostEqual(eaf, 0.5)
diff --git a/vafator/tests/test_rank_sum_test.py b/vafator/tests/test_rank_sum_test.py
index 6c808a0..870cdc5 100644
--- a/vafator/tests/test_rank_sum_test.py
+++ b/vafator/tests/test_rank_sum_test.py
@@ -50,10 +50,10 @@ def test_both_empty_returns_nan(self):
         self.assertTrue(isnan(pvalue))
 
     def test_get_rank_sum_tests_snv(self):
-        variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['T'])
+        variant = VariantRecord(CHROM="chr1", POS=100, REF="A", ALT=["T"])
         distributions = {
-            'A': [20, 25, 30, 35, 40],  # ref is higher
-            'T': [1, 5, 10, 15, 20],    # alt is lower
+            "A": [20, 25, 30, 35, 40],  # ref is higher
+            "T": [1, 5, 10, 15, 20],  # alt is lower
         }
         pvalues, stats = get_rank_sum_tests(distributions, variant)
         self.assertEqual(len(stats), 1)
@@ -62,8 +62,8 @@ def test_get_rank_sum_tests_snv(self):
         self.assertLess(float(stats[0]), 0.0)
 
     def test_get_rank_sum_tests_no_alt_reads(self):
-        variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['T'])
-        distributions = {'A': [20, 25, 30]}  # no T reads
+        variant = VariantRecord(CHROM="chr1", POS=100, REF="A", ALT=["T"])
+        distributions = {"A": [20, 25, 30]}  # no T reads
         pvalues, stats = get_rank_sum_tests(distributions, variant)
         self.assertEqual(stats, [])
-        self.assertEqual(pvalues, [])
\ No newline at end of file
+        self.assertEqual(pvalues, [])
diff --git a/vafator/tests/utils.py b/vafator/tests/utils.py
index 0564cb6..2d490d6 100755
--- a/vafator/tests/utils.py
+++ b/vafator/tests/utils.py
@@ -14,7 +14,9 @@ def _get_count_variants(input_file):
 def _get_mutation_at_position(input_file, chromosome, position):
     variant = None
     vcf = VCF(input_file)
-    for v in vcf:  # we cannot query by specific positions as this requires a tabix index
+    for (
+        v
+    ) in vcf:  # we cannot query by specific positions as this requires a tabix index
         if v.CHROM == chromosome and v.POS == position:
             variant = v
             break
@@ -24,7 +26,7 @@ def _get_mutation_at_position(input_file, chromosome, position):
 
 def _get_info_fields(input_file):
     vcf = VCF(input_file)
-    return [h.info().get("ID") for h in vcf.header_iter() if h['HeaderType'] == 'INFO']
+    return [h.info().get("ID") for h in vcf.header_iter() if h["HeaderType"] == "INFO"]
 
 
 def _get_annotation_values(input_file, annotation):
diff --git a/vafator/vafator2decifer.py b/vafator/vafator2decifer.py
index 32ae4b8..39496d3 100644
--- a/vafator/vafator2decifer.py
+++ b/vafator/vafator2decifer.py
@@ -31,20 +31,29 @@ def filterByDepthAndVaf(variant: Variant, Filter, samples):
 
     for s in samples:
         # filter if genotype has low depth or is missing
-        if variant.INFO["{}_dp".format(s)] < Filter['MinDepth']:
+        if variant.INFO["{}_dp".format(s)] < Filter["MinDepth"]:
             missing += 1
 
     # filter if alt allele isn't greater than the specified threshold in at least one sample
-    if not any(np.greater_equal([variant.INFO["{}_ac".format(s)] for s in samples], Filter['MinDepthAltAllele'])):
+    if not any(
+        np.greater_equal(
+            [variant.INFO["{}_ac".format(s)] for s in samples],
+            Filter["MinDepthAltAllele"],
+        )
+    ):
         missing += 1
 
     # filter if VAF  isn't greater than the specified threshold in at least one sample
-    if not any(np.greater_equal([variant.INFO["{}_af".format(s)] for s in samples], Filter['MinVAF'])):
+    if not any(
+        np.greater_equal(
+            [variant.INFO["{}_af".format(s)] for s in samples], Filter["MinVAF"]
+        )
+    ):
         missing += 1
 
     if missing > 0:
         PASS = 0
-    return (PASS)
+    return PASS
 
 
 def compute_ref_var_depths(vcf, FilterDP, samples):
@@ -56,7 +65,9 @@ def compute_ref_var_depths(vcf, FilterDP, samples):
             PASS = filterByDepthAndVaf(variant, FilterDP, samples)
             # print(np.greater_equal(variant.gt_alt_depths,FilterDP['MinDepthAltAllele']))
             if PASS:
-                char_label = ".".join(map(str, [variant.CHROM, variant.POS, variant.REF, variant.ALT[0]]))
+                char_label = ".".join(
+                    map(str, [variant.CHROM, variant.POS, variant.REF, variant.ALT[0]])
+                )
                 for s in samples:
                     alt = variant.INFO["{}_ac".format(s)]
                     ref = variant.INFO["{}_dp".format(s)] - alt
@@ -69,14 +80,19 @@ def print_output(ref_var_depths, cna_overlaps, outdir, samples):
     chars = ref_var_depths.keys() & cna_overlaps.keys()
     header = [str(len(chars)) + " #characters"]
     header.append(str(len(samples)) + " #samples")
-    header.append("#sample_index\tsample_label\tcharacter_index\tcharacter_label\tref\tvar")
+    header.append(
+        "#sample_index\tsample_label\tcharacter_index\tcharacter_label\tref\tvar"
+    )
     # print(header)
-    with open(f"{outdir}/decifer.input.tsv", 'w') as out:
+    with open(f"{outdir}/decifer.input.tsv", "w") as out:
         print("\n".join(header), file=out)
         for char_label in ref_var_depths:
             if char_label in cna_overlaps:
                 for i in range(len(samples)):
-                    r, v = ref_var_depths[char_label][i][0], ref_var_depths[char_label][i][1]
+                    r, v = (
+                        ref_var_depths[char_label][i][0],
+                        ref_var_depths[char_label][i][1],
+                    )
                     to_print = [i, samples[i], char_index, char_label, r, v]
                     cnas = cna_overlaps[char_label][i]
                     to_print.extend(cnas)
@@ -88,14 +104,14 @@ def print_output(ref_var_depths, cna_overlaps, outdir, samples):
 def get_purities(cna_df, num_samples, min_purity):
     purities = {}
     for i, row in cna_df.head(num_samples + 1).iterrows():
-        purity = 1.0 - row['u_normal']
+        purity = 1.0 - row["u_normal"]
         if purity >= min_purity:
-            purities[row['SAMPLE']] = purity
+            purities[row["SAMPLE"]] = purity
     return purities
 
 
 def print_purities(purities, sample_index, num_samples, outdir):
-    with open(f"{outdir}/decifer.purity.tsv", 'w') as out:
+    with open(f"{outdir}/decifer.purity.tsv", "w") as out:
         for sample in sample_index:
             print(sample_index[sample], purities[sample], file=out, sep="\t")
 
@@ -113,21 +129,21 @@ def filter_high_CN_sites(cn_states_persite, max_CN):
 def print_unique_CN_states(cn_states, max_CN, outdir):
     # print unique copy number states for sites that are below the max_CN threshold
     cn_states = tuple(set(cn_states))
-    with open(f"{outdir}/cn_states.txt", 'w') as out:
+    with open(f"{outdir}/cn_states.txt", "w") as out:
         for value in cn_states:
             PASS = 1
             for i in value:
                 if int(i[0]) + int(i[1]) > max_CN:
                     PASS = 0
             if PASS:
-                out.write(';'.join([','.join(i) for i in value]) + '\n')
+                out.write(";".join([",".join(i) for i in value]) + "\n")
 
 
 def print_filtered_sites(filtered_sites, cna_overlaps, outdir):
-    with open(f"{outdir}/filtered_sites.txt", 'w') as out:
+    with open(f"{outdir}/filtered_sites.txt", "w") as out:
         out.write("\n".join(filtered_sites))
         print(file=out)
-    with open(f"{outdir}/filtered_stats.txt", 'w') as out:
+    with open(f"{outdir}/filtered_stats.txt", "w") as out:
         filtered = len(filtered_sites)
         total = len(cna_overlaps.keys())
         print("# sites that were filtered due to copy-number states > max_CN", file=out)
@@ -163,7 +179,10 @@ def overlap_cna_snp(vcf_samples, max_CN, snps, out_dir):
             if filter_high_CN_sites(cn_info.keys(), max_CN):
                 # store results, converting from dict to a list for later printing
                 cna_info = []
-                [cna_info.extend([c.split("|")[0], c.split("|")[1], cn_info[c]]) for c in cn_info]
+                [
+                    cna_info.extend([c.split("|")[0], c.split("|")[1], cn_info[c]])
+                    for c in cn_info
+                ]
                 cna_overlaps[char_label].append(cna_info)
             else:
                 filtered_sites.add(char_label)
@@ -180,22 +199,24 @@ def run_vafator2decifer(args):
     num_samples = len(args.samples.split(",")) if args.samples is not None else 0
 
     # Load in CNA information
-    cna_df = pd.read_csv(args.cna_file, sep='\t', index_col=False)
+    cna_df = pd.read_csv(args.cna_file, sep="\t", index_col=False)
 
     # get purities and filter by min_purity
     purities = get_purities(cna_df, num_samples, args.min_purity)
 
     # restrict samples considered in VCF and CNA file to those that have purity > min_purity
     restricted_samples = list(purities.keys())
-    cna_df = cna_df.loc[cna_df['SAMPLE'].isin(list(purities.keys()))]
+    cna_df = cna_df.loc[cna_df["SAMPLE"].isin(list(purities.keys()))]
     # print new CNA file, filtering out samples below min_purity
     cna_df.to_csv(f"{args.out_dir}/best.seg.ucn", sep="\t", index=False)
     if args.snp_file:
-        snp_df = pd.read_csv(args.snp_file, sep='\t', index_col=False, header=None)
+        snp_df = pd.read_csv(args.snp_file, sep="\t", index_col=False, header=None)
         snp_df = snp_df.loc[snp_df[2].isin(list(purities.keys()))]
         # rearrange columns for decifer
         snp_df = snp_df[[2, 0, 1, 3, 4]]
-        snp_df.to_csv(f"{args.out_dir}/snpfile.1bed", sep="\t", index=False, header=False)
+        snp_df.to_csv(
+            f"{args.out_dir}/snpfile.1bed", sep="\t", index=False, header=False
+        )
 
     num_samples = len(restricted_samples)
     # print purity information
@@ -204,21 +225,25 @@ def run_vafator2decifer(args):
 
     # Filtering criteria
     Filter = {}
-    Filter['MinDepth'] = args.min_depth
-    Filter['MinDepthAltAllele'] = args.min_alt_depth
-    Filter['MinVAF'] = args.min_vaf
+    Filter["MinDepth"] = args.min_depth
+    Filter["MinDepthAltAllele"] = args.min_alt_depth
+    Filter["MinVAF"] = args.min_vaf
 
     # ref_var_depths[char_label] = list of (ref,alt) tuples, one for each sample, in same order as vcf.samples
     ref_var_depths = compute_ref_var_depths(vcf, Filter, samples=restricted_samples)
 
     # print BED file for SNPs
-    with open(f"{args.out_dir}/snps.bed", 'w') as out:
+    with open(f"{args.out_dir}/snps.bed", "w") as out:
         print("chrom\tstart\tend\tREF\tALT", file=out)
         # sort ref_var_depths by the first two parts of chr_label
-        for chr_label in sorted(ref_var_depths, key=lambda x: (x.split('.')[0], int(x.split('.')[1]))):
+        for chr_label in sorted(
+            ref_var_depths, key=lambda x: (x.split(".")[0], int(x.split(".")[1]))
+        ):
             pos = chr_label.split(".")
             # subtract 1 from position to create interval in BED format
-            print(pos[0], int(pos[1]) - 1, int(pos[1]), pos[2], pos[3], sep="\t", file=out)
+            print(
+                pos[0], int(pos[1]) - 1, int(pos[1]), pos[2], pos[3], sep="\t", file=out
+            )
 
     snps = pbt.BedTool(f"{args.out_dir}/snps.bed")
     if args.exclude_list:
@@ -227,16 +252,18 @@ def run_vafator2decifer(args):
 
     # prepare BED files for CNA intervals for each sample, for overlapping with SNPs
     for sample in restricted_samples:
-        df = cna_df[cna_df['SAMPLE'] == sample]
+        df = cna_df[cna_df["SAMPLE"] == sample]
         # consider subtracting 1 from start of interval to be compatible with BED format, leave end interval alone
-        df.loc[:, 'START'] = df['START']
-        df = df.drop('SAMPLE', axis=1)
+        df.loc[:, "START"] = df["START"]
+        df = df.drop("SAMPLE", axis=1)
         df.to_csv(f"{args.out_dir}/{sample}_cna.bed", index=False, sep="\t")
 
     # overlap SNPs with CNA intervals for each sample
     # cna_overlaps[char_label] = list of tuples of CNA info (one tuple for each sample, in same order as vcf.samples)
     # this function also prints the observed CN state trees for the generatestatetrees function
-    cna_overlaps, cn_states_allsites, filtered_sites = overlap_cna_snp(restricted_samples, args.max_CN, snps, args.out_dir)
+    cna_overlaps, cn_states_allsites, filtered_sites = overlap_cna_snp(
+        restricted_samples, args.max_CN, snps, args.out_dir
+    )
 
     # sites may have unique CN states that are duplicate; set them to find unique CN states across sites
     print_unique_CN_states(cn_states_allsites, args.max_CN, args.out_dir)

From a3e5e20a46eaa79689bfae52c63a244131f34684 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Wed, 8 Apr 2026 15:24:53 +0200
Subject: [PATCH 28/32] fix unused variable bam

---
 vafator/annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vafator/annotator.py b/vafator/annotator.py
index 519b8eb..2a6c482 100755
--- a/vafator/annotator.py
+++ b/vafator/annotator.py
@@ -452,7 +452,7 @@ def _get_headers(input_bams: dict) -> list:
                     Annotator._make_header(suffix, description, typ, number, sample=s)
                 )
             if len(bams) > 1:
-                for i, bam in enumerate(bams, start=1):
+                for i in range(1, len(bams) + 1):
                     # n = os.path.basename(bam).split(".")[0]
                     for suffix, description, typ, number in _REPLICATE_HEADER_TEMPLATES:
                         headers.append(

From 2bcc602505e173391671bc7accb4e24b59ee4eb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Thu, 9 Apr 2026 11:37:31 +0200
Subject: [PATCH 29/32] fix Codacy errors

---
 setup.py                   |  2 +-
 vafator/annotator.py       |  9 +++++++--
 vafator/command_line.py    |  1 -
 vafator/hatchet2bed.py     | 10 +++-------
 vafator/vafator2decifer.py |  6 +-----
 5 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/setup.py b/setup.py
index 0610c88..c29369c 100755
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,4 @@
-from setuptools import find_packages, setup
+from setuptools import setup
 import vafator
 
 
diff --git a/vafator/annotator.py b/vafator/annotator.py
index 2a6c482..3fa3f50 100755
--- a/vafator/annotator.py
+++ b/vafator/annotator.py
@@ -72,10 +72,10 @@ def __init__(
         input_vcf: str,
         output_vcf: str,
         input_bams: dict,
-        purities: dict = {},
+        purities: dict = None,
         mapping_qual_thr: int = 0,
         base_call_qual_thr: int = 29,
-        tumor_ploidies: dict = {},
+        tumor_ploidies: dict = None,
         normal_ploidy: int = 2,
         fpr: float = DEFAULT_FPR,
         error_rate: float = DEFAULT_ERROR_RATE,
@@ -98,6 +98,11 @@ def __init__(
             num_processes: number of parallel processes for chromosome-level annotation (default: 1)
         """
 
+        if purities is None:
+            purities = {}
+        if tumor_ploidies is None:
+            tumor_ploidies = {}
+
         self.mapping_quality_threshold = mapping_qual_thr
         self.base_call_quality_threshold = base_call_qual_thr
         self.purities = purities
diff --git a/vafator/command_line.py b/vafator/command_line.py
index e09e9b0..ca74b0e 100755
--- a/vafator/command_line.py
+++ b/vafator/command_line.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python
 import argparse
-import sys
 import logging
 import vafator
 from vafator.power import DEFAULT_FPR, DEFAULT_ERROR_RATE
diff --git a/vafator/hatchet2bed.py b/vafator/hatchet2bed.py
index bef4ba3..cbc5989 100644
--- a/vafator/hatchet2bed.py
+++ b/vafator/hatchet2bed.py
@@ -3,12 +3,8 @@
 
 def run_hatchet2bed(input_file, output_prefix):
     input_df = pd.read_csv(input_file, sep="\t")
-    cn_columns = sorted(
-        list(filter(lambda c: c.startswith("cn_clone"), input_df.columns))
-    )
-    u_columns = sorted(
-        list(filter(lambda c: c.startswith("u_clone"), input_df.columns))
-    )
+    cn_columns = sorted([c for c in input_df.columns if c.startswith("cn_clone")])
+    u_columns = sorted([c for c in input_df.columns if c.startswith("u_clone")])
 
     for sample in input_df.SAMPLE.unique():
         data = []
@@ -18,7 +14,7 @@ def run_hatchet2bed(input_file, output_prefix):
             for cn_column, u_column in zip(cn_columns, u_columns):
                 u = float(row[u_column])
                 total_u += u
-                cn = sum(map(lambda c: float(c), row[cn_column].split("|")))
+                cn = sum(map(float, row[cn_column].split("|")))
                 numerator.append(u * cn)
             data.append(
                 [row["#CHR"], row["START"], row["END"], sum(numerator) / total_u]
diff --git a/vafator/vafator2decifer.py b/vafator/vafator2decifer.py
index 39496d3..6ebf447 100644
--- a/vafator/vafator2decifer.py
+++ b/vafator/vafator2decifer.py
@@ -13,7 +13,6 @@
 python vcf_2_decifer.py [OPTIONS]
 """
 
-import re
 import sys
 import pybedtools as pbt
 from cyvcf2 import VCF, Variant
@@ -21,7 +20,6 @@
 import pandas as pd
 import numpy as np
 from collections import defaultdict
-import argparse
 
 
 def filterByDepthAndVaf(variant: Variant, Filter, samples):
@@ -179,10 +177,8 @@ def overlap_cna_snp(vcf_samples, max_CN, snps, out_dir):
             if filter_high_CN_sites(cn_info.keys(), max_CN):
                 # store results, converting from dict to a list for later printing
                 cna_info = []
-                [
+                for c in cn_info:
                     cna_info.extend([c.split("|")[0], c.split("|")[1], cn_info[c]])
-                    for c in cn_info
-                ]
                 cna_overlaps[char_label].append(cna_info)
             else:
                 filtered_sites.add(char_label)

From c0254d00fd825b867d642403ddc8576512454616 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Fri, 10 Apr 2026 10:31:32 +0200
Subject: [PATCH 30/32] add seed for random class. fixes #54

---
 vafator/multiallelic_filter.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vafator/multiallelic_filter.py b/vafator/multiallelic_filter.py
index 00b0f26..6b998d6 100755
--- a/vafator/multiallelic_filter.py
+++ b/vafator/multiallelic_filter.py
@@ -39,6 +39,7 @@ def __init__(self, input_vcf, output_vcf, tumor_sample_name="tumor"):
             }
         )
         self.vcf_writer = Writer(output_vcf, self.vcf)
+        self.random = random.Random(42)
 
     def run(self):
         batch = []
@@ -67,7 +68,7 @@ def run(self):
                     # chooses a variant at random
                     alt1 = variant.ALT[0]
                     alt2 = prev_variant.ALT[0]
-                    prev_variant = random.sample([variant, prev_variant], k=1)[0]
+                    prev_variant = self.random.sample([variant, prev_variant], k=1)[0]
                     self.set_multiallelic_annotation(
                         prev_variant, alt1 if alt1 != prev_variant.ALT[0] else alt2, af1
                     )

From eb176144faa76785152a1f3473f0500bb0e4392b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Fri, 10 Apr 2026 10:32:15 +0200
Subject: [PATCH 31/32] change version to 3.0.0

---
 setup.cfg           | 2 +-
 vafator/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index bc22ccb..4f08792 100755
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = VAFator
-version = 3.1.0
+version = 3.0.0
 description = Annotate variants in a VCF file with technical annotations from one or more BAMs 
 description-file = README.md
 long_description = file: README.md
diff --git a/vafator/__init__.py b/vafator/__init__.py
index 273ffca..ea9d694 100755
--- a/vafator/__init__.py
+++ b/vafator/__init__.py
@@ -1 +1 @@
-VERSION = "3.1.0"
+VERSION = "3.0.0"

From f239072dba1482c3c2c8c90e9a51f982781e967e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96zlem=20Muslu?= <oezlem.muslu@tron-mainz.de>
Date: Fri, 10 Apr 2026 10:42:27 +0200
Subject: [PATCH 32/32] add support for CRAM files. closes #45

---
 vafator/annotator.py    |  4 ++--
 vafator/command_line.py | 16 ++++++++++++++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/vafator/annotator.py b/vafator/annotator.py
index 3fa3f50..5351e50 100755
--- a/vafator/annotator.py
+++ b/vafator/annotator.py
@@ -52,7 +52,7 @@ def _collect_metrics_worker(
     ]
     for sample, bam_files in bam_paths.items():
         for i, bam_path in enumerate(bam_files):
-            bam = pysam.AlignmentFile(bam_path, "rb")
+            bam = pysam.AlignmentFile(bam_path, "r")
             all_metrics[(sample, i)] = collect_metrics_for_chrom(
                 chrom=chrom,
                 variants=variants,
@@ -159,7 +159,7 @@ def __init__(
 
         self.bam_paths = input_bams
         self.bam_readers = {
-            s: [pysam.AlignmentFile(b, "rb") for b in bams]
+            s: [pysam.AlignmentFile(b, "r") for b in bams]
             for s, bams in input_bams.items()
         }
 
diff --git a/vafator/command_line.py b/vafator/command_line.py
index ca74b0e..b8f3ea1 100755
--- a/vafator/command_line.py
+++ b/vafator/command_line.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 import argparse
 import logging
+from pathlib import Path
 import vafator
 from vafator.power import DEFAULT_FPR, DEFAULT_ERROR_RATE
 from vafator.hatchet2bed import run_hatchet2bed
@@ -12,6 +13,16 @@
 epilog = "Copyright (c) 2019-2021 TRON gGmbH (See LICENSE for licensing details)"
 
 
+def _validate_alignment_file_extension(alignment_file):
+    suffixes = [suffix.lower() for suffix in Path(alignment_file).suffixes]
+    if ".sam" in suffixes:
+        raise ValueError(
+            "SAM input is not supported: {}. Please provide BAM or CRAM files.".format(
+                alignment_file
+            )
+        )
+
+
 def annotator():
 
     # set up logger
@@ -40,8 +51,8 @@ def annotator():
         nargs=2,
         metavar=("sample_name", "bam_file"),
         default=[],
-        help="A sample name and a BAM file. Can be used multiple times to input multiple samples and "
-        "multiple BAM files. The same sample name can be used multiple times with different BAMs, "
+        help="A sample name and a BAM/CRAM file. Can be used multiple times to input multiple samples and "
+        "multiple BAM/CRAM files. The same sample name can be used multiple times with different BAMs/CRAMs, "
         "this will treated as replicates.",
     )
     parser.add_argument(
@@ -127,6 +138,7 @@ def annotator():
 
     bams = {}
     for sample_name, bam in args.bam:
+        _validate_alignment_file_extension(bam)
         if sample_name in bams:
             bams[sample_name].append(bam)
         else: