From 34f41a80c18f0677b6d7a2048fadf493ae60a1fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Fri, 13 Mar 2026 10:07:14 +0100 Subject: [PATCH 01/32] Update gitignore --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index fc2bc44..2abcd34 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,6 @@ out vafator/tests/resources/results .cache .jupyter -.local \ No newline at end of file +.local +run.sh +VAFator.egg-info/* From 78c870f8ef545b6b308c6bf7618de1762af8c4a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Fri, 13 Mar 2026 15:13:32 +0100 Subject: [PATCH 02/32] Upgrade Python and the Python packages. * Reprodubility of the output confirmed using WES_EA_1 sample from SEQC2 - a more comprehensive test for INDELs may be necessary * Due ensure reprodubility, the methods get_variant_pileup and get_snv_metrics are updated. * pysam could only be upgraded till 0.21.0, as above this version (up to and including 0.23.3) there are inconsistencies in base qualities. Python was therefore set to 3.11 --- requirements.txt | 7 ------ setup.cfg | 58 ++++++++++++++++++++++++++++++++++++++++++++- setup.py | 43 +-------------------------------- vafator/__init__.py | 2 +- vafator/pileups.py | 52 ++++++++++++++++++++++++++++++---------- 5 files changed, 98 insertions(+), 64 deletions(-) delete mode 100755 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100755 index bdd5c16..0000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -pandas~=1.3.3 -pysam~=0.19.1 -cyvcf2~=0.30.14 -logzero~=1.7.0 -pybedtools~=0.9.0 -numpy>=1.20,<2.0 -scipy>=1.0.0,<2.0.0 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index a2f3748..b74831c 100755 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,58 @@ [metadata] -description-file = README.md \ No newline at end of file +name = VAFator +version = 3.0.0 +description = Annotate variants in a VCF file with technical annotations from one or more BAMs +description-file = README.md +long_description = file: README.md +long_description_content_type = text/markdown +license = MIT +url = https://github.com/TRON-Bioinformatics/vafator +author = Pablo Riesgo Ferreiro, Jonas Ibn-Salem, Luis Kress, Özlem Muslu +classifiers = + Development Status :: 4 - Beta + Intended Audience :: Healthcare Industry + Intended Audience :: Science/Research + Topic :: Scientific/Engineering :: Bio-Informatics + Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.12 + Programming Language :: Python :: 3.13 + Programming Language :: Python :: 3 :: Only + License :: OSI Approved :: MIT License + Operating System :: Unix +author_email = priesgoferreiro@gmail.com + +[options.entry_points] +console_scripts = + vafator=vafator.command_line:annotator + multiallelics-filter=vafator.command_line:multiallelics_filter + vafator2decifer=vafator.command_line:vafator2decifer + hatchet2bed=vafator.command_line:hatchet2bed + +[options] +packages = find: +include_package_data = True +zip_safe = False + +python_requires = ==3.11 + +install_requires = + pandas>=3.0.1,<4 + pysam==0.21.0 # above this version base qualities show up wrong in the presence of soft clipping/insertions/both (latest release 0.23.3) + cyvcf2>=0.32.1,<0.33 + logzero>=1.7.0,<2 + pybedtools>=0.12.0,<0.13 + numpy>=2.4.3,<3 + scipy>=1.17.1,<2 + +[options.packages.find] +exclude = + tests + tests.* + legacy + legacy.* + +[options.extras_require] +dev = + pytest + ruff + mypy \ No newline at end of file diff --git a/setup.py b/setup.py index 1025dd9..398663e 100755 --- a/setup.py +++ b/setup.py @@ -4,49 +4,8 @@ VERSION = vafator.VERSION - -# parses requirements from file -with open("requirements.txt") as f: - required = f.read().splitlines() - with open("README.md", "r", encoding="utf-8") as f: long_description = f.read() # Build the Python package -setup( - name='vafator', - version=VERSION, - packages=find_packages(exclude=["legacy"]), - entry_points={ - 'console_scripts': [ - 'vafator=vafator.command_line:annotator', - 'multiallelics-filter=vafator.command_line:multiallelics_filter', - 'vafator2decifer=vafator.command_line:vafator2decifer', - 'hatchet2bed=vafator.command_line:hatchet2bed' - ], - }, - author="TRON - Translational Oncology at the University Medical Center of the Johannes Gutenberg University Mainz" - "- Computational Medicine group", - author_email='pablo.riesgoferreiro@tron-mainz.de', - description='Annotate a VCF file with AF, AD and DP from tumor and normal BAMs', - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/tron-bioinformatics/vafator", - requires=[], - install_requires=required, - classifiers=[ - 'Development Status :: 4 - Beta', # Chose either "3 - Alpha", "4 - Beta" or "5 - Production/Stable" as the current state of your package - 'Intended Audience :: Healthcare Industry', - 'Intended Audience :: Science/Research', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3 :: Only', - "License :: OSI Approved :: MIT License", - "Operating System :: Unix" - ], - python_requires='>=3.7', - license='MIT' -) \ No newline at end of file +setup() \ No newline at end of file diff --git a/vafator/__init__.py b/vafator/__init__.py index 90a0094..b41f4d7 100755 --- a/vafator/__init__.py +++ b/vafator/__init__.py @@ -1,4 +1,4 @@ -VERSION='2.2.2' +VERSION='3.0.0' AMBIGUOUS_BASES = ['N', 'M', 'R', 'W', 'S', 'Y', 'K', 'V', 'H', 'D', 'B'] diff --git a/vafator/pileups.py b/vafator/pileups.py index 65160a4..6c54eac 100755 --- a/vafator/pileups.py +++ b/vafator/pileups.py @@ -28,9 +28,11 @@ def get_variant_pileup( # approximately +- read size bp return bam.pileup(contig=variant.CHROM, start=position - 1, stop=position, truncate=True, # returns only this column - max_depth=0, # disables maximum depth + max_depth=1000000, min_base_quality=min_base_quality, - min_mapping_quality=min_mapping_quality) + min_mapping_quality=min_mapping_quality, + stepper='samtools', + ) @dataclass @@ -171,21 +173,45 @@ def get_deletion_metrics(variant: Variant, pileups: IteratorColumnRegion) -> Cov def get_snv_metrics(pileups: IteratorColumnRegion, include_ambiguous_bases=False) -> CoverageMetrics: try: pileup = next(pileups) - bases = [s.upper() for s in pileup.get_query_sequences()] - bqs = aggregate_median_per_base(bases, pileup.get_query_qualities()) - mqs = aggregate_median_per_base(bases, pileup.get_mapping_qualities()) - positions = aggregate_median_per_base(bases, pileup.get_query_positions()) - all_bqs = aggregate_list_per_base(bases, pileup.get_query_qualities()) - all_mqs = aggregate_list_per_base(bases, pileup.get_mapping_qualities()) - all_positions = aggregate_list_per_base(bases, pileup.get_query_positions()) + bases = [] + qualities = [] + mapping_qualities = [] + query_positions = [] + + # to reproduce the older versions of Vafator, include deletions at query position when computing stats + for read in pileup.pileups: + if read.is_refskip: + continue + if read.is_del: + bases.append("") + qualities.append(0) # no base quality for deletions + mapping_qualities.append(read.alignment.mapping_quality) + query_positions.append(read.query_position_or_next) + else: + base = read.alignment.query_sequence[read.query_position].upper() + bases.append(base) + qualities.append(read.alignment.query_qualities[read.query_position]) + mapping_qualities.append(read.alignment.mapping_quality) + query_positions.append(read.query_position) + + + all_bqs = aggregate_list_per_base(bases, qualities) + all_mqs = aggregate_list_per_base(bases, mapping_qualities) + all_positions = aggregate_list_per_base(bases, query_positions) + + bqs = Counter({b: np.median(l) for b, l in all_bqs.items()}) + mqs = Counter({b: np.median(l) for b, l in all_mqs.items()}) + positions = Counter({b: np.median(l) for b, l in all_positions.items()}) + + # print('Summary', bases, qualities, mapping_qualities, '\n\n\n') + + ac = Counter(b for b in bases if b != "") # deletions don't count as alleles if include_ambiguous_bases: - dp = len([b for b in bases if b!= ""]) + dp = len(bases) else: - # remove ambiguous bases and reads where the position is spliced out - dp = len([b for b in bases if b not in AMBIGUOUS_BASES and b != ""]) - ac = Counter(bases) + dp = sum(1 for b in bases if b == "" or b not in AMBIGUOUS_BASES) except StopIteration: # no reads dp = 0 From 2a69e4d46f5b8f17be041698ec37ce985b7ccb9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Fri, 13 Mar 2026 15:17:38 +0100 Subject: [PATCH 03/32] Introduce null check before rank sum test --- vafator/rank_sum_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vafator/rank_sum_test.py b/vafator/rank_sum_test.py index 384bd37..b9a3b4d 100644 --- a/vafator/rank_sum_test.py +++ b/vafator/rank_sum_test.py @@ -4,6 +4,8 @@ def calculate_rank_sum_test(alternate_dist: List[int], reference_dist: List[int]) -> Tuple[float, float]: + if not alternate_dist or not reference_dist: # skip empty distributions + return np.nan, np.nan stat, pvalue = scipy.stats.ranksums(x=alternate_dist, y=reference_dist) return round(stat, 3), round(pvalue, 5) From 055dc23bac88cf35b1900486edd3d10b0600f5e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Fri, 13 Mar 2026 17:01:51 +0100 Subject: [PATCH 04/32] perf: major performance optimizations reducing runtime ~4-5x replace per-variant pileup() calls with a single streaming pileup iterator per BAM per chromosome. Variants are buffered by chromosome and metrics are computed immediately as each pileup column is visited (avoids segfault from storing invalidated PileupColumn C objects). Reduces pysam pileup __init__/__dealloc__ overhead from ~80% of total runtime to negligible. --- vafator/annotator.py | 69 ++++++--- vafator/pileups.py | 346 ++++++++++++++++++++++++------------------- 2 files changed, 242 insertions(+), 173 deletions(-) diff --git a/vafator/annotator.py b/vafator/annotator.py index 00f8293..62f311c 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -13,7 +13,9 @@ from vafator.ploidies import DEFAULT_PLOIDY from vafator.rank_sum_test import calculate_rank_sum_test, get_rank_sum_tests from vafator.power import PowerCalculator, DEFAULT_ERROR_RATE, DEFAULT_FPR -from vafator.pileups import get_variant_pileup, get_metrics +from vafator.pileups import ( + collect_metrics_for_chrom, stream_variants_by_chrom, EMPTY_METRICS +) BATCH_SIZE = 10000 @@ -288,7 +290,11 @@ def _write_batch(self, batch): for v in batch: self.vcf_writer.write_record(v) - def _add_stats(self, variant: Variant): + def _add_stats(self, variant: Variant, metrics_by_bam: dict): + """ + Annotate a single variant using pre-computed metrics. + metrics_by_bam: {(sample, bam_index): CoverageMetrics} + """ for sample, bams in self.bam_readers.items(): global_dp = 0 global_ac = Counter() @@ -298,13 +304,10 @@ def _add_stats(self, variant: Variant): global_all_mqs = {} global_all_bqs = {} global_all_positions = {} + for i, bam in enumerate(bams): - pileups = get_variant_pileup( - variant=variant, bam=bam, - min_base_quality=self.base_call_quality_threshold, - min_mapping_quality=self.mapping_quality_threshold) - coverage_metrics = get_metrics(variant=variant, pileups=pileups, - include_ambiguous_bases=self.include_ambiguous_bases) + coverage_metrics = metrics_by_bam.get((sample, i), EMPTY_METRICS) + if coverage_metrics is not None: if len(bams) > 1: variant.INFO["{}_af_{}".format(sample, i + 1)] = \ @@ -377,8 +380,7 @@ def _add_stats(self, variant: Variant): variant.INFO["{}_pos".format(sample)] = ",".join( [str(global_pos[variant.REF])] + [str(global_pos[alt]) for alt in variant.ALT]) - # for these rank sum tests it is required at least one value for the alternate and one value for the - # reference otherwise it cannot be calculated + # rank sum tests require at least one ref and one alt value pvalues, stats = get_rank_sum_tests(global_all_mqs, variant) if stats: variant.INFO["{}_rsmq".format(sample)] = ",".join(stats) @@ -399,16 +401,41 @@ def _calculate_af(self, ac, dp): def run(self): batch = [] - variant: Variant - for variant in self.vcf: - # gets the counts of all bases across all BAMs - self._add_stats(variant) - - batch.append(variant) - if len(batch) >= BATCH_SIZE: - self._write_batch(batch) - batch = [] - if len(batch) > 0: + + for chrom, chrom_variants in stream_variants_by_chrom(self.vcf): + # For each BAM, open ONE pileup iterator per chromosome and compute + # metrics immediately for each column — avoids per-variant pileup overhead. + # Key: (sample, bam_index, variant_pos, REF, ALT[0]) -> CoverageMetrics + all_metrics = {} # {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}} + + for sample, bams in self.bam_readers.items(): + for i, bam in enumerate(bams): + all_metrics[(sample, i)] = collect_metrics_for_chrom( + chrom=chrom, + variants=chrom_variants, + bam=bam, + min_base_quality=self.base_call_quality_threshold, + min_mapping_quality=self.mapping_quality_threshold, + include_ambiguous_bases=self.include_ambiguous_bases, + ) + + for variant in chrom_variants: + # build per-BAM metrics lookup for this specific variant + metrics_by_bam = { + (sample, i): all_metrics[(sample, i)].get( + (variant.POS, variant.REF, variant.ALT[0]), EMPTY_METRICS + ) + for sample, bams in self.bam_readers.items() + for i in range(len(bams)) + } + self._add_stats(variant, metrics_by_bam) + + batch.append(variant) + if len(batch) >= BATCH_SIZE: + self._write_batch(batch) + batch = [] + + if batch: self._write_batch(batch) time.sleep(2) @@ -417,4 +444,4 @@ def run(self): self.vcf.close() for _, bams in self.bam_readers.items(): for bam in bams: - bam.close() + bam.close() \ No newline at end of file diff --git a/vafator/pileups.py b/vafator/pileups.py index 6c54eac..c9efd9e 100755 --- a/vafator/pileups.py +++ b/vafator/pileups.py @@ -1,6 +1,6 @@ -from collections import Counter +from collections import Counter, defaultdict from dataclasses import dataclass -from typing import Union +from typing import Union, List, Dict, Iterator, Tuple from cyvcf2 import Variant from pysam.libcalignmentfile import IteratorColumnRegion, AlignmentFile @@ -22,17 +22,97 @@ def is_deletion(variant: Variant): def get_variant_pileup( - variant: Union[Variant, VafatorVariant], bam: AlignmentFile, min_base_quality, min_mapping_quality) -> IteratorColumnRegion: + variant: Union[Variant, VafatorVariant], bam: AlignmentFile, + min_base_quality, min_mapping_quality) -> IteratorColumnRegion: + """Single-variant pileup, kept for backwards compatibility and tests.""" position = variant.POS - # this function returns the pileups at all positions covered by reads covered the queried position - # approximately +- read size bp return bam.pileup(contig=variant.CHROM, start=position - 1, stop=position, - truncate=True, # returns only this column + truncate=True, max_depth=1000000, min_base_quality=min_base_quality, min_mapping_quality=min_mapping_quality, stepper='samtools', - ) + ) + + +def get_region_pileup(chrom: str, start: int, end: int, bam: AlignmentFile, + min_base_quality, min_mapping_quality): + """ + Opens a single pileup iterator spanning a whole region (e.g. one chromosome). + start is 0-based inclusive, end is 1-based exclusive (last variant POS). + """ + return bam.pileup(contig=chrom, start=start, stop=end, + truncate=True, + max_depth=1000000, + min_base_quality=min_base_quality, + min_mapping_quality=min_mapping_quality, + stepper='samtools', + ) + + +def stream_variants_by_chrom(vcf) -> Iterator[Tuple[str, List[Variant]]]: + """ + Yields (chrom, [variants]) one chromosome at a time. + Only one chromosome's variants are held in memory at once. + """ + current_chrom = None + current_variants = [] + for variant in vcf: + if variant.CHROM != current_chrom: + if current_variants: + yield current_chrom, current_variants + current_chrom = variant.CHROM + current_variants = [variant] + else: + current_variants.append(variant) + if current_variants: + yield current_chrom, current_variants + + +def collect_metrics_for_chrom( + chrom: str, + variants: List[Variant], + bam: AlignmentFile, + min_base_quality: int, + min_mapping_quality: int, + include_ambiguous_bases: bool = False) -> Dict[Tuple, 'CoverageMetrics']: + """ + Opens ONE pileup iterator over the entire chromosome region covered by variants. + Metrics are computed IMMEDIATELY for each pileup column while it is still valid — + avoids segfaults from storing PileupColumn objects after the iterator advances. + + Returns {(pos, REF, ALT[0]): CoverageMetrics}. + """ + if not variants: + return {} + + # index variants by 1-based position for O(1) lookup during streaming + variants_by_pos: Dict[int, List[Variant]] = defaultdict(list) + for v in variants: + variants_by_pos[v.POS].append(v) + + start = variants[0].POS - 1 # 0-based inclusive + end = variants[-1].POS # exclusive end for pysam + + results: Dict[Tuple, CoverageMetrics] = {} + + for pileup_col in get_region_pileup( + chrom=chrom, start=start, end=end, + bam=bam, + min_base_quality=min_base_quality, + min_mapping_quality=min_mapping_quality, + ): + ref_pos = pileup_col.reference_pos + 1 # convert to 1-based + if ref_pos not in variants_by_pos: + continue + + # compute metrics NOW while pileup_col is still valid in C memory + for variant in variants_by_pos[ref_pos]: + metrics = _get_metrics_from_column(variant, pileup_col, include_ambiguous_bases) + if metrics is not None: + results[(ref_pos, variant.REF, variant.ALT[0])] = metrics + + return results @dataclass @@ -55,57 +135,103 @@ class CoverageMetrics: all_positions: dict = None -def get_metrics(variant: Variant, pileups: IteratorColumnRegion, include_ambiguous_bases=False) -> CoverageMetrics: +EMPTY_METRICS = CoverageMetrics( + ac=Counter(), dp=0, bqs=Counter(), mqs=Counter(), positions=Counter(), + all_bqs={}, all_mqs={}, all_positions={} +) + + +def _get_metrics_from_column(variant: Variant, pileup_col, + include_ambiguous_bases=False) -> 'CoverageMetrics': + """Dispatch to the right metrics function based on variant type.""" if is_snp(variant): - return get_snv_metrics(pileups, include_ambiguous_bases) + return _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases) elif is_insertion(variant): - return get_insertion_metrics(variant, pileups) + return _get_insertion_metrics_from_column(variant, pileup_col) elif is_deletion(variant): - return get_deletion_metrics(variant, pileups) + return _get_deletion_metrics_from_column(variant, pileup_col) return None -def get_insertion_metrics(variant: Variant, pileups: IteratorColumnRegion) -> CoverageMetrics: +def _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases=False) -> CoverageMetrics: + bases = [] + qualities = [] + mapping_qualities = [] + query_positions = [] + + for read in pileup_col.pileups: + if read.is_refskip: + continue + if read.is_del: + bases.append("") + qualities.append(0) + mapping_qualities.append(read.alignment.mapping_quality) + query_positions.append(read.query_position_or_next) + else: + base = read.alignment.query_sequence[read.query_position].upper() + bases.append(base) + qualities.append(read.alignment.query_qualities[read.query_position]) + mapping_qualities.append(read.alignment.mapping_quality) + query_positions.append(read.query_position) + + all_bqs = aggregate_list_per_base(bases, qualities) + all_mqs = aggregate_list_per_base(bases, mapping_qualities) + all_positions = aggregate_list_per_base(bases, query_positions) + + bqs = Counter({b: np.median(l) for b, l in all_bqs.items()}) + mqs = Counter({b: np.median(l) for b, l in all_mqs.items()}) + positions = Counter({b: np.median(l) for b, l in all_positions.items()}) + + ac = Counter(b for b in bases if b != "") + + if include_ambiguous_bases: + dp = len(bases) + else: + dp = sum(1 for b in bases if b == "" or b not in AMBIGUOUS_BASES) + + return CoverageMetrics( + ac=ac, dp=dp, bqs=bqs, mqs=mqs, positions=positions, + all_bqs=all_bqs, all_mqs=all_mqs, all_positions=all_positions + ) + + +def _get_insertion_metrics_from_column(variant: Variant, pileup_col) -> CoverageMetrics: ac = {alt.upper(): 0 for alt in variant.ALT} mq = {alt.upper(): [] for alt in variant.ALT} mq[variant.REF] = [] pos = {alt.upper(): [] for alt in variant.ALT} pos[variant.REF] = [] - dp = 0 variant_position = variant.POS insertion_length = len(variant.ALT[0]) - len(variant.REF) insertion = variant.ALT[0][1:] alt_upper = variant.ALT[0].upper() - try: - pileups = next(pileups).pileups - dp += len(pileups) - for pileup_read in pileups: - if pileup_read.indel > 0: - # read with an insertion - index = pileup_read.alignment.reference_start - relative_position = 0 - for cigar_type, cigar_length in pileup_read.alignment.cigartuples: - if cigar_type in [0, 2, 3, 7, 8]: # consumes reference M, D, N, =, X - index += cigar_length - if index > variant_position: - break - if cigar_type in [0, 1, 4, 7, 8]: # consumes query M, I, S, =, X - relative_position += cigar_length - if cigar_type == 1: # does not count I - insertion_in_query = pileup_read.alignment.query[relative_position : relative_position + insertion_length] - if index == variant_position and cigar_length == insertion_length and insertion == insertion_in_query: - # the read contains the insertion - ac[alt_upper] = ac[alt_upper] + 1 - mq[alt_upper].append(pileup_read.alignment.mapping_quality) - pos[alt_upper].append(pileup_read.query_position_or_next) - elif pileup_read.indel == 0: - # NOTE: considers all reads without indels to be the reference! - mq[variant.REF].append(pileup_read.alignment.mapping_quality) - pos[variant.REF].append(pileup_read.query_position_or_next) - - except StopIteration: - # no reads - pass + + pileup_reads = pileup_col.pileups + dp = len(pileup_reads) + + for pileup_read in pileup_reads: + if pileup_read.indel > 0: + index = pileup_read.alignment.reference_start + relative_position = 0 + for cigar_type, cigar_length in pileup_read.alignment.cigartuples: + if cigar_type in [0, 2, 3, 7, 8]: + index += cigar_length + if index > variant_position: + break + if cigar_type in [0, 1, 4, 7, 8]: + relative_position += cigar_length + if cigar_type == 1: + insertion_in_query = pileup_read.alignment.query[ + relative_position:relative_position + insertion_length] + if index == variant_position and cigar_length == insertion_length \ + and insertion == insertion_in_query: + ac[alt_upper] += 1 + mq[alt_upper].append(pileup_read.alignment.mapping_quality) + pos[alt_upper].append(pileup_read.query_position_or_next) + elif pileup_read.indel == 0: + mq[variant.REF].append(pileup_read.alignment.mapping_quality) + pos[variant.REF].append(pileup_read.query_position_or_next) + return CoverageMetrics( ac=Counter(ac), dp=dp, mqs=Counter({k: np.median(l) for k, l in mq.items()}), @@ -117,48 +243,39 @@ def get_insertion_metrics(variant: Variant, pileups: IteratorColumnRegion) -> Co ) -def get_deletion_metrics(variant: Variant, pileups: IteratorColumnRegion) -> CoverageMetrics: +def _get_deletion_metrics_from_column(variant: Variant, pileup_col) -> CoverageMetrics: ac = {alt.upper(): 0 for alt in variant.ALT} mq = {alt.upper(): [] for alt in variant.ALT} mq[variant.REF] = [] pos = {alt.upper(): [] for alt in variant.ALT} pos[variant.REF] = [] - dp = 0 variant_position = variant.POS deletion_length = len(variant.REF) - len(variant.ALT[0]) alt_upper = variant.ALT[0].upper() - try: - pileups = next(pileups).pileups - dp += len(pileups) - for pileup_read in pileups: - if pileup_read.indel < 0: - # read with a deletion - start = pileup_read.alignment.reference_start - match = False - for cigar_type, cigar_length in pileup_read.alignment.cigartuples: - if cigar_type in [0, 3, 7, 8]: # consumes reference M, N, =, X - start += cigar_length - elif cigar_type == 2: # D - if start == variant_position and cigar_length == deletion_length: - ac[alt_upper] = ac[alt_upper] + 1 - mq[alt_upper].append(pileup_read.alignment.mapping_quality) - pos[alt_upper].append(pileup_read.query_position_or_next) - match = True - break - else: - start += cigar_length - if start > variant_position: + + pileup_reads = pileup_col.pileups + dp = len(pileup_reads) + + for pileup_read in pileup_reads: + if pileup_read.indel < 0: + start = pileup_read.alignment.reference_start + for cigar_type, cigar_length in pileup_read.alignment.cigartuples: + if cigar_type in [0, 3, 7, 8]: + start += cigar_length + elif cigar_type == 2: + if start == variant_position and cigar_length == deletion_length: + ac[alt_upper] += 1 + mq[alt_upper].append(pileup_read.alignment.mapping_quality) + pos[alt_upper].append(pileup_read.query_position_or_next) break - if not match: - # TODO: when finds a read with an indel not matching our particular indel it counts it - pass - elif pileup_read.indel == 0: - # NOTE: considers all reads without indels to be the reference! - mq[variant.REF].append(pileup_read.alignment.mapping_quality) - pos[variant.REF].append(pileup_read.query_position_or_next) - except StopIteration: - # no reads - pass + else: + start += cigar_length + if start > variant_position: + break + elif pileup_read.indel == 0: + mq[variant.REF].append(pileup_read.alignment.mapping_quality) + pos[variant.REF].append(pileup_read.query_position_or_next) + return CoverageMetrics( ac=Counter(ac), dp=dp, mqs=Counter({k: np.median(l) for k, l in mq.items()}), @@ -169,85 +286,10 @@ def get_deletion_metrics(variant: Variant, pileups: IteratorColumnRegion) -> Cov all_bqs=Counter() ) - -def get_snv_metrics(pileups: IteratorColumnRegion, include_ambiguous_bases=False) -> CoverageMetrics: - try: - pileup = next(pileups) - - bases = [] - qualities = [] - mapping_qualities = [] - query_positions = [] - - # to reproduce the older versions of Vafator, include deletions at query position when computing stats - for read in pileup.pileups: - if read.is_refskip: - continue - if read.is_del: - bases.append("") - qualities.append(0) # no base quality for deletions - mapping_qualities.append(read.alignment.mapping_quality) - query_positions.append(read.query_position_or_next) - else: - base = read.alignment.query_sequence[read.query_position].upper() - bases.append(base) - qualities.append(read.alignment.query_qualities[read.query_position]) - mapping_qualities.append(read.alignment.mapping_quality) - query_positions.append(read.query_position) - - - all_bqs = aggregate_list_per_base(bases, qualities) - all_mqs = aggregate_list_per_base(bases, mapping_qualities) - all_positions = aggregate_list_per_base(bases, query_positions) - - bqs = Counter({b: np.median(l) for b, l in all_bqs.items()}) - mqs = Counter({b: np.median(l) for b, l in all_mqs.items()}) - positions = Counter({b: np.median(l) for b, l in all_positions.items()}) - - # print('Summary', bases, qualities, mapping_qualities, '\n\n\n') - - ac = Counter(b for b in bases if b != "") # deletions don't count as alleles - - if include_ambiguous_bases: - dp = len(bases) - else: - dp = sum(1 for b in bases if b == "" or b not in AMBIGUOUS_BASES) - except StopIteration: - # no reads - dp = 0 - ac = Counter() - bqs = Counter() - mqs = Counter() - positions = Counter() - all_bqs = {} - all_mqs = {} - all_positions = {} - - return CoverageMetrics( - ac=ac, - dp=dp, - bqs=bqs, - mqs=mqs, - positions=positions, - all_bqs=all_bqs, - all_mqs=all_mqs, - all_positions=all_positions - ) - - -def aggregate_median_per_base(bases, values) -> Counter: - aggregated_values = {} - for b, v in zip(bases, values): - if b not in aggregated_values: - aggregated_values[b] = [] - aggregated_values[b].append(v) - return Counter({b: np.median(bq_list) for b, bq_list in aggregated_values.items()}) - - def aggregate_list_per_base(bases, values) -> dict: aggregated_values = {} for b, v in zip(bases, values): if b not in aggregated_values: aggregated_values[b] = [] aggregated_values[b].append(v) - return aggregated_values + return aggregated_values \ No newline at end of file From 547667b7210fa3f9d89f8c5c970a91e3c8971a38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Fri, 13 Mar 2026 17:02:46 +0100 Subject: [PATCH 05/32] perf: cache in power.py to avoid computing the same values repeatedly. cache _calculate_k() results by dp value (same depth repeats across thousands of variants). Cache calculate_expected_vaf() by (sample, chrom, pos). Replace frozen binom(n, f).pmf(k) object instantiation with direct binom.pmf(k, n, f) call to avoid scipy distribution object overhead on every variant. --- vafator/power.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/vafator/power.py b/vafator/power.py index 2430e31..f465fe9 100644 --- a/vafator/power.py +++ b/vafator/power.py @@ -1,3 +1,4 @@ +from functools import lru_cache from typing import Optional from cyvcf2 import Variant @@ -27,6 +28,14 @@ def __init__( self.fpr = fpr self.error_rate = error_rate + # cache for _calculate_k: dp -> k + # dp values repeat heavily across variants so this avoids repeated binom.cdf loops + self._k_cache: dict = {} + + # cache for calculate_expected_vaf: (sample, pos) -> vaf + # when using genome-wide ploidy (most common case) this is the same value for all variants + self._eaf_cache: dict = {} + def calculate_power(self, dp: int, ac: int, sample: str, variant: Optional[Variant]) -> float: """ Return the binomial probability of observing ac or less supporting reads, given a total coverage dp and a @@ -45,10 +54,18 @@ def calculate_expected_vaf(self, sample: str, variant: Optional[Variant]) -> flo In a scenario with purity = 1, tumor CN = 2 and normal CN = 2 => expected VAF = 0.5 """ + # cache key: use variant position for local copy number lookups, + # or just sample name when genome-wide ploidy is used (most common case) + cache_key = (sample, variant.CHROM if variant else None, variant.POS if variant else None) + if cache_key in self._eaf_cache: + return self._eaf_cache[cache_key] + purity = self.purities.get(sample, DEFAULT_PURITY) tumor_ploidy = max(1, self.tumor_ploidies.get(sample, default_ploidy_manager).get_ploidy(variant=variant)) corrected_tumor_ploidy = purity * tumor_ploidy + ((1 - purity) * self.normal_ploidy) expected_vaf = purity / corrected_tumor_ploidy + + self._eaf_cache[cache_key] = expected_vaf return expected_vaf def _calculate_p(self, m: int, n: int) -> float: @@ -74,10 +91,13 @@ def _calculate_d(self, k: int, n: int) -> float: return (self.fpr - p) / (p_1 - p) def _calculate_k(self, dp: int) -> int: + # cache: same dp value appears across thousands of variants + if dp in self._k_cache: + return self._k_cache[dp] k = 1 while self._calculate_p(m=k, n=dp) > self.fpr: k += 1 - + self._k_cache[dp] = k return k def calculate_absolute_power(self, sample, variant, dp: int) -> float: @@ -89,7 +109,6 @@ def calculate_absolute_power(self, sample, variant, dp: int) -> float: k = self._calculate_k(dp=dp) n = dp f = self.calculate_expected_vaf(sample, variant) - power = 1 - binom.cdf(k=k - 1, n=n, p=f) + self._calculate_d(k=k, n=n) * binom(n, f).pmf(k) - return round(power, 5), k - - + # avoid instantiating a frozen binom distribution object — use module-level functions directly + power = 1 - binom.cdf(k=k - 1, n=n, p=f) + self._calculate_d(k=k, n=n) * binom.pmf(k=k, n=n, p=f) + return round(power, 5), k \ No newline at end of file From 449314030411fba57678bc66eeb2a102956cc8b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Tue, 17 Mar 2026 16:24:53 +0100 Subject: [PATCH 06/32] feat: chromosome-level parallelization via ProcessPoolExecutor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - annotator.py: add --num-processes parameter (default: 1, serial behaviour unchanged). When >1, _run_parallel() submits one future per chromosome to a ProcessPoolExecutor. Workers receive only picklable data: BAM file paths and variant tuples (POS, REF, ALT[0]) — cyvcf2.Variant objects and pysam AlignmentFile handles are not picklable and stay in the main process. - annotator.py: add module-level _collect_metrics_worker() — must be module-level to be picklable by ProcessPoolExecutor. Workers open their own pysam.AlignmentFile instances and reconstruct VariantRecord objects from the serialized tuples before calling collect_metrics_for_chrom(). - annotator.py: refactor run() into _run_serial(), _run_parallel(), _collect_chrom_metrics(), and _annotate_and_batch() for clarity. VCF output order is preserved by collecting futures in submission order before annotating. - pileups.py: replace test utility import (vafator.tests.utils.VafatorVariant) with a proper VariantRecord dataclass defined in pileups.py. Picklable by default, mirrors the .CHROM/.POS/.REF/.ALT interface of cyvcf2.Variant. - pileups.py: add safe_median() helper to guard np.median() calls against empty lists, eliminating RuntimeWarning: Mean of empty slice on positions with no reads supporting a particular allele. - command_line.py: wire --num-processes argument through to Annotator. remove unnecessary try/catch blocks as they would disable the traceback --- vafator/annotator.py | 149 +++++++++++++++++++++++++++------------- vafator/command_line.py | 55 +++++++-------- vafator/pileups.py | 36 +++++++--- 3 files changed, 152 insertions(+), 88 deletions(-) diff --git a/vafator/annotator.py b/vafator/annotator.py index 62f311c..14c1d27 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -1,31 +1,50 @@ from collections import Counter +import os +from concurrent.futures import ProcessPoolExecutor import numpy as np import pysam from cyvcf2 import VCF, Writer, Variant -import os import vafator import datetime import json -import asyncio -import time from vafator.ploidies import DEFAULT_PLOIDY from vafator.rank_sum_test import calculate_rank_sum_test, get_rank_sum_tests from vafator.power import PowerCalculator, DEFAULT_ERROR_RATE, DEFAULT_FPR from vafator.pileups import ( - collect_metrics_for_chrom, stream_variants_by_chrom, EMPTY_METRICS + collect_metrics_for_chrom, stream_variants_by_chrom, EMPTY_METRICS, VariantRecord ) BATCH_SIZE = 10000 -def background(f): - def wrapped(*args, **kwargs): - return asyncio.get_event_loop().run_in_executor(None, f, *args, **kwargs) - - return wrapped +def _collect_metrics_worker(chrom, variant_tuples, bam_paths, min_base_quality, + min_mapping_quality, include_ambiguous_bases): + """ + Top-level worker function for ProcessPoolExecutor — must be module-level to be picklable. + Opens its own BAM readers (AlignmentFile objects cannot be shared across processes). + Receives variant data as plain tuples (cyvcf2.Variant objects are not picklable). + + Returns {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}} + """ + all_metrics = {} + variants = [VariantRecord(CHROM=chrom, POS=pos, REF=ref, ALT=[alt]) + for pos, ref, alt in variant_tuples] + for sample, bam_files in bam_paths.items(): + for i, bam_path in enumerate(bam_files): + bam = pysam.AlignmentFile(bam_path, "rb") + all_metrics[(sample, i)] = collect_metrics_for_chrom( + chrom=chrom, + variants=variants, + bam=bam, + min_base_quality=min_base_quality, + min_mapping_quality=min_mapping_quality, + include_ambiguous_bases=include_ambiguous_bases, + ) + bam.close() + return all_metrics class Annotator(object): @@ -46,7 +65,8 @@ def __init__(self, input_vcf: str, output_vcf: str, normal_ploidy=2, fpr=DEFAULT_FPR, error_rate=DEFAULT_ERROR_RATE, - include_ambiguous_bases=False): + include_ambiguous_bases=False, + num_processes: int = 1): self.mapping_quality_threshold = mapping_qual_thr self.base_call_quality_threshold = base_call_qual_thr @@ -54,6 +74,7 @@ def __init__(self, input_vcf: str, output_vcf: str, self.tumor_ploidies = tumor_ploidies self.normal_ploidy = normal_ploidy self.include_ambiguous_bases = include_ambiguous_bases + self.num_processes = num_processes self.power = PowerCalculator( normal_ploidy=normal_ploidy, tumor_ploidies=tumor_ploidies, purities=purities, error_rate=error_rate, fpr=fpr) @@ -77,7 +98,8 @@ def __init__(self, input_vcf: str, output_vcf: str, for a in Annotator._get_headers(input_bams): self.vcf.add_info_to_header(a) self.vcf_writer = Writer(output_vcf, self.vcf) - self.bam_readers = {s : [pysam.AlignmentFile(b, "rb") for b in bams] for s, bams in input_bams.items()} + self.bam_paths = input_bams # {sample: [path, ...]} — picklable, passed to workers + self.bam_readers = {s: [pysam.AlignmentFile(b, "rb") for b in bams] for s, bams in input_bams.items()} @staticmethod def _get_headers(input_bams: dict): @@ -285,7 +307,6 @@ def _get_headers(input_bams: dict): ] return headers - @background def _write_batch(self, batch): for v in batch: self.vcf_writer.write_record(v) @@ -401,47 +422,79 @@ def _calculate_af(self, ac, dp): def run(self): batch = [] - - for chrom, chrom_variants in stream_variants_by_chrom(self.vcf): - # For each BAM, open ONE pileup iterator per chromosome and compute - # metrics immediately for each column — avoids per-variant pileup overhead. - # Key: (sample, bam_index, variant_pos, REF, ALT[0]) -> CoverageMetrics - all_metrics = {} # {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}} - - for sample, bams in self.bam_readers.items(): - for i, bam in enumerate(bams): - all_metrics[(sample, i)] = collect_metrics_for_chrom( - chrom=chrom, - variants=chrom_variants, - bam=bam, - min_base_quality=self.base_call_quality_threshold, - min_mapping_quality=self.mapping_quality_threshold, - include_ambiguous_bases=self.include_ambiguous_bases, - ) - - for variant in chrom_variants: - # build per-BAM metrics lookup for this specific variant - metrics_by_bam = { - (sample, i): all_metrics[(sample, i)].get( - (variant.POS, variant.REF, variant.ALT[0]), EMPTY_METRICS - ) - for sample, bams in self.bam_readers.items() - for i in range(len(bams)) - } - self._add_stats(variant, metrics_by_bam) - - batch.append(variant) - if len(batch) >= BATCH_SIZE: - self._write_batch(batch) - batch = [] + if self.num_processes > 1: + self._run_parallel(batch) + else: + self._run_serial(batch) if batch: self._write_batch(batch) - time.sleep(2) - self.vcf_writer.close() self.vcf.close() for _, bams in self.bam_readers.items(): for bam in bams: - bam.close() \ No newline at end of file + bam.close() + + def _run_serial(self, batch): + for chrom, chrom_variants in stream_variants_by_chrom(self.vcf): + all_metrics = self._collect_chrom_metrics(chrom, chrom_variants) + self._annotate_and_batch(chrom_variants, all_metrics, batch) + + def _run_parallel(self, batch): + # variant objects are not picklable — pass only (POS, REF, ALT) tuples to workers, + # keep the actual Variant objects in the main process for annotation and writing + chrom_variants_map = {} + futures = {} + + with ProcessPoolExecutor(max_workers=self.num_processes) as executor: + for chrom, chrom_variants in stream_variants_by_chrom(self.vcf): + chrom_variants_map[chrom] = chrom_variants + variant_tuples = [(v.POS, v.REF, v.ALT[0]) for v in chrom_variants] + futures[executor.submit( + _collect_metrics_worker, + chrom=chrom, + variant_tuples=variant_tuples, + bam_paths=self.bam_paths, + min_base_quality=self.base_call_quality_threshold, + min_mapping_quality=self.mapping_quality_threshold, + include_ambiguous_bases=self.include_ambiguous_bases, + )] = chrom + + # collect in submission order to preserve VCF chromosome order + chrom_results = {futures[f]: f.result() for f in futures} + + for chrom, chrom_variants in chrom_variants_map.items(): + self._annotate_and_batch(chrom_variants, chrom_results[chrom], batch) + + + def _collect_chrom_metrics(self, chrom, chrom_variants): + """Collect metrics for all BAMs for one chromosome in the main process.""" + all_metrics = {} + for sample, bams in self.bam_readers.items(): + for i, bam in enumerate(bams): + all_metrics[(sample, i)] = collect_metrics_for_chrom( + chrom=chrom, + variants=chrom_variants, + bam=bam, + min_base_quality=self.base_call_quality_threshold, + min_mapping_quality=self.mapping_quality_threshold, + include_ambiguous_bases=self.include_ambiguous_bases, + ) + return all_metrics + + def _annotate_and_batch(self, chrom_variants, all_metrics, batch): + """Annotate variants using pre-computed metrics and append to write batch.""" + for variant in chrom_variants: + metrics_by_bam = { + (sample, i): all_metrics[(sample, i)].get( + (variant.POS, variant.REF, variant.ALT[0]), EMPTY_METRICS + ) + for sample, bams in self.bam_readers.items() + for i in range(len(bams)) + } + self._add_stats(variant, metrics_by_bam) + batch.append(variant) + if len(batch) >= BATCH_SIZE: + self._write_batch(batch) + batch.clear() \ No newline at end of file diff --git a/vafator/command_line.py b/vafator/command_line.py index 06cdd85..66e617c 100755 --- a/vafator/command_line.py +++ b/vafator/command_line.py @@ -50,6 +50,8 @@ def annotator(): help="Error rate to use in the power calculation") parser.add_argument("--include-ambiguous-bases", dest="include_ambiguous_bases", action='store_true', help="Flag indicating to include ambiguous bases from the DP calculation") + parser.add_argument("--num-processes", dest="num_processes", required=False, default=1, type=int, + help="Number of processes for parallel chromosome-level annotation (default: 1)") args = parser.parse_args() @@ -87,24 +89,22 @@ def annotator(): if len(bams) == 0: raise ValueError("Please, provide at least one bam file with '--bam sample_name /path/to/file.bam'") - try: - annotator = Annotator( - input_vcf=args.input_vcf, - output_vcf=args.output_vcf, - input_bams=bams, - mapping_qual_thr=args.mapping_quality, - base_call_qual_thr=args.base_call_quality, - purities=purities, - tumor_ploidies=tumor_ploidies, - normal_ploidy=int(args.normal_ploidy), - fpr=args.fpr, - error_rate=args.error_rate, - include_ambiguous_bases=args.include_ambiguous_bases - ) - annotator.run() - except Exception as e: - logging.error(str(e)) - sys.exit(-1) + annotator = Annotator( + input_vcf=args.input_vcf, + output_vcf=args.output_vcf, + input_bams=bams, + mapping_qual_thr=args.mapping_quality, + base_call_qual_thr=args.base_call_quality, + purities=purities, + tumor_ploidies=tumor_ploidies, + normal_ploidy=int(args.normal_ploidy), + fpr=args.fpr, + error_rate=args.error_rate, + include_ambiguous_bases=args.include_ambiguous_bases, + num_processes=args.num_processes, + ) + annotator.run() + logging.info("Vafator finished!") @@ -120,16 +120,13 @@ def multiallelics_filter(): args = parser.parse_args() logging.info("Vafator multiallelic filter starting...") - try: - filter = MultiallelicFilter( - input_vcf=args.input_vcf, - output_vcf=args.output_vcf, - tumor_sample_name=args.tumor_sample_name - ) - filter.run() - except Exception as e: - logging.error(str(e)) - sys.exit(-1) + filter = MultiallelicFilter( + input_vcf=args.input_vcf, + output_vcf=args.output_vcf, + tumor_sample_name=args.tumor_sample_name + ) + filter.run() + logging.info("Vafator multiallelic filter finished!") @@ -167,4 +164,4 @@ def hatchet2bed(): help="output BED file prefix, one file will be created per sample in the input with the " "average tumor copy number in each segment") args = parser.parse_args() - run_hatchet2bed(input_file=args.input_file, output_prefix=args.output_prefix) + run_hatchet2bed(input_file=args.input_file, output_prefix=args.output_prefix) \ No newline at end of file diff --git a/vafator/pileups.py b/vafator/pileups.py index c9efd9e..a3ad36c 100755 --- a/vafator/pileups.py +++ b/vafator/pileups.py @@ -5,10 +5,19 @@ from pysam.libcalignmentfile import IteratorColumnRegion, AlignmentFile from vafator import AMBIGUOUS_BASES -from vafator.tests.utils import VafatorVariant import numpy as np +@dataclass +class VariantRecord: + """Lightweight, picklable variant representation used by pileup workers. + Mirrors the cyvcf2.Variant fields accessed by pileup and metrics functions.""" + CHROM: str + POS: int + REF: str + ALT: List[str] + + def is_snp(variant: Variant): return len(variant.REF) == 1 and len(variant.ALT[0]) == 1 @@ -22,7 +31,7 @@ def is_deletion(variant: Variant): def get_variant_pileup( - variant: Union[Variant, VafatorVariant], bam: AlignmentFile, + variant: Union[Variant, VariantRecord], bam: AlignmentFile, min_base_quality, min_mapping_quality) -> IteratorColumnRegion: """Single-variant pileup, kept for backwards compatibility and tests.""" position = variant.POS @@ -178,9 +187,9 @@ def _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases=False) -> C all_mqs = aggregate_list_per_base(bases, mapping_qualities) all_positions = aggregate_list_per_base(bases, query_positions) - bqs = Counter({b: np.median(l) for b, l in all_bqs.items()}) - mqs = Counter({b: np.median(l) for b, l in all_mqs.items()}) - positions = Counter({b: np.median(l) for b, l in all_positions.items()}) + bqs = Counter({b: safe_median(l) for b, l in all_bqs.items()}) + mqs = Counter({b: safe_median(l) for b, l in all_mqs.items()}) + positions = Counter({b: safe_median(l) for b, l in all_positions.items()}) ac = Counter(b for b in bases if b != "") @@ -234,8 +243,8 @@ def _get_insertion_metrics_from_column(variant: Variant, pileup_col) -> Coverage return CoverageMetrics( ac=Counter(ac), dp=dp, - mqs=Counter({k: np.median(l) for k, l in mq.items()}), - positions=Counter({k: np.median(l) for k, l in pos.items()}), + mqs=Counter({k: safe_median(l) for k, l in mq.items()}), + positions=Counter({k: safe_median(l) for k, l in pos.items()}), bqs=Counter(), all_mqs={k: l for k, l in mq.items()}, all_positions={k: l for k, l in pos.items()}, @@ -278,18 +287,23 @@ def _get_deletion_metrics_from_column(variant: Variant, pileup_col) -> CoverageM return CoverageMetrics( ac=Counter(ac), dp=dp, - mqs=Counter({k: np.median(l) for k, l in mq.items()}), - positions=Counter({k: np.median(l) for k, l in pos.items()}), + mqs=Counter({k: safe_median(l) for k, l in mq.items()}), + positions=Counter({k: safe_median(l) for k, l in pos.items()}), bqs=Counter(), all_mqs={k: l for k, l in mq.items()}, all_positions={k: l for k, l in pos.items()}, all_bqs=Counter() ) - + def aggregate_list_per_base(bases, values) -> dict: aggregated_values = {} for b, v in zip(bases, values): if b not in aggregated_values: aggregated_values[b] = [] aggregated_values[b].append(v) - return aggregated_values \ No newline at end of file + return aggregated_values + + +def safe_median(values) -> float: + """Return median of values, or 0.0 for empty lists (avoids numpy RuntimeWarning).""" + return float(np.median(values)) if values else 0.0 \ No newline at end of file From 9d3259e5ae8328d9babf3cfe60eff5f700b50150 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Tue, 17 Mar 2026 16:25:24 +0100 Subject: [PATCH 07/32] return nan in safe_median to repeat previous results --- vafator/pileups.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vafator/pileups.py b/vafator/pileups.py index a3ad36c..b6afb9f 100755 --- a/vafator/pileups.py +++ b/vafator/pileups.py @@ -1,12 +1,11 @@ +import numpy as np from collections import Counter, defaultdict -from dataclasses import dataclass -from typing import Union, List, Dict, Iterator, Tuple from cyvcf2 import Variant +from dataclasses import dataclass +from math import nan from pysam.libcalignmentfile import IteratorColumnRegion, AlignmentFile - +from typing import Union, List, Dict, Iterator, Tuple from vafator import AMBIGUOUS_BASES -import numpy as np - @dataclass class VariantRecord: @@ -306,4 +305,4 @@ def aggregate_list_per_base(bases, values) -> dict: def safe_median(values) -> float: """Return median of values, or 0.0 for empty lists (avoids numpy RuntimeWarning).""" - return float(np.median(values)) if values else 0.0 \ No newline at end of file + return float(np.median(values)) if values else nan From a4f228835490c9275217aefab0423417cb53d759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Wed, 18 Mar 2026 13:08:53 +0100 Subject: [PATCH 08/32] breaking: change cli argument include-ambiguous-bases to exclude-ambiguous-bases and make inclusion of ambiguous the default behaviour for accurate depth calculation. This replicates the results of version 2.2.0, but makes it more explicit --- vafator/command_line.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vafator/command_line.py b/vafator/command_line.py index 66e617c..f2d639d 100755 --- a/vafator/command_line.py +++ b/vafator/command_line.py @@ -48,7 +48,7 @@ def annotator(): help="False Positive Rate (FPR) to use in the power calculation") parser.add_argument("--error-rate", dest="error_rate", required=False, default=DEFAULT_ERROR_RATE, type=float, help="Error rate to use in the power calculation") - parser.add_argument("--include-ambiguous-bases", dest="include_ambiguous_bases", action='store_true', + parser.add_argument("--exclude-ambiguous-bases", dest="exclude_ambiguous_bases", action='store_true', help="Flag indicating to include ambiguous bases from the DP calculation") parser.add_argument("--num-processes", dest="num_processes", required=False, default=1, type=int, help="Number of processes for parallel chromosome-level annotation (default: 1)") @@ -100,7 +100,7 @@ def annotator(): normal_ploidy=int(args.normal_ploidy), fpr=args.fpr, error_rate=args.error_rate, - include_ambiguous_bases=args.include_ambiguous_bases, + include_ambiguous_bases=(not args.exclude_ambiguous_bases), num_processes=args.num_processes, ) annotator.run() From 8f62e75fcc44a7f8de5de9d3244f2e81b3a559ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Wed, 18 Mar 2026 14:30:46 +0100 Subject: [PATCH 09/32] Setup --- requirements.txt | 11 +++++++++++ setup.cfg | 7 ++++++- 2 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..3484f8c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +pandas>=3.0.1,<4 +# pysam pinned: above 0.21.0 base qualities show up wrong in the presence of soft clipping/insertions/overlapping read pairs or a combination of these factors +pysam==0.21.0 +cyvcf2>=0.32.1,<0.33 +logzero>=1.7.0,<2 +pybedtools>=0.12.0,<0.13 +numpy>=2.4.3,<3 +scipy>=1.17.1,<2 +setuptools +pytest +pytest-cov \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index b74831c..9c95fe7 100755 --- a/setup.cfg +++ b/setup.cfg @@ -43,6 +43,7 @@ install_requires = pybedtools>=0.12.0,<0.13 numpy>=2.4.3,<3 scipy>=1.17.1,<2 + setuptools [options.packages.find] exclude = @@ -55,4 +56,8 @@ exclude = dev = pytest ruff - mypy \ No newline at end of file + mypy +test = + pytest + pytest-cov + setuptools \ No newline at end of file From 8f8f1c3c27b2f5bc1c7e70e935d097fecafca454 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Wed, 18 Mar 2026 14:40:24 +0100 Subject: [PATCH 10/32] Update unit tests --- vafator/ploidies.py | 4 +- vafator/tests/test_pileups.py | 210 ++++++++++++++++++++++++- vafator/tests/test_ploidy_manager.py | 13 +- vafator/tests/test_power_calculator.py | 43 ++++- vafator/tests/test_rank_sum_test.py | 38 ++++- vafator/tests/utils.py | 83 +++++----- 6 files changed, 331 insertions(+), 60 deletions(-) diff --git a/vafator/ploidies.py b/vafator/ploidies.py index b9f9099..00748db 100755 --- a/vafator/ploidies.py +++ b/vafator/ploidies.py @@ -3,7 +3,7 @@ import pandas as pd from cyvcf2 import Variant -from vafator.tests.utils import VafatorVariant +from vafator.pileups import VariantRecord DEFAULT_PLOIDY = 2.0 @@ -20,7 +20,7 @@ def __init__(self, local_copy_numbers: str = None, genome_wide_ploidy: float = D if local_copy_numbers is not None else None self.ploidy = genome_wide_ploidy - def get_ploidy(self, variant: Union[Variant, VafatorVariant]) -> float: + def get_ploidy(self, variant: Union[Variant, VariantRecord]) -> float: result = self.ploidy if self.bed is not None: diff --git a/vafator/tests/test_pileups.py b/vafator/tests/test_pileups.py index d7b3c76..21546ee 100644 --- a/vafator/tests/test_pileups.py +++ b/vafator/tests/test_pileups.py @@ -1,10 +1,216 @@ from collections import Counter from unittest import TestCase +from unittest.mock import MagicMock import pkg_resources import pysam from vafator.tests.utils import VafatorVariant -from vafator.pileups import get_variant_pileup, get_metrics +from vafator.pileups import ( + get_variant_pileup, get_metrics, + _get_insertion_metrics_from_column, _get_deletion_metrics_from_column, +) +from vafator.pileups import VariantRecord + + +def _make_pileup_col(reads): + col = MagicMock() + col.pileups = reads + return col + + +def _make_insertion_read(indel, reference_start, cigartuples, query, mapping_quality=60, + query_position_or_next=0): + """Build a mock pileup read with an insertion (indel > 0).""" + read = MagicMock() + read.indel = indel + read.alignment.reference_start = reference_start + read.alignment.cigartuples = cigartuples + read.alignment.query = query + read.alignment.mapping_quality = mapping_quality + read.query_position_or_next = query_position_or_next + return read + + +def _make_ref_read(mapping_quality=60, query_position_or_next=0): + """Build a mock pileup read with no indel (indel == 0), counted as reference.""" + read = MagicMock() + read.indel = 0 + read.alignment.mapping_quality = mapping_quality + read.query_position_or_next = query_position_or_next + return read + + +def _make_deletion_read(indel, reference_start, cigartuples, mapping_quality=60, + query_position_or_next=0): + """Build a mock pileup read with a deletion (indel < 0).""" + read = MagicMock() + read.indel = indel + read.alignment.reference_start = reference_start + read.alignment.cigartuples = cigartuples + read.alignment.mapping_quality = mapping_quality + read.query_position_or_next = query_position_or_next + return read + + +class TestInsertionMetricsFromColumn(TestCase): + # CIGAR op codes: 0=M, 1=I, 2=D, 3=N, 4=S, 7==, 8=X + + def test_no_reads(self): + variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT']) + col = _make_pileup_col([]) + metrics = _get_insertion_metrics_from_column(variant, col) + self.assertEqual(metrics.dp, 0) + self.assertEqual(metrics.ac['ATT'], 0) + + def test_one_matching_insertion(self): + # variant: A -> ATT at pos 100 (insertion of "TT", length 2) + # read: starts at 98, M2 then I2 of "TT" landing at pos 100 + # CIGAR: [(0,2), (1,2)] — M2 advances ref to 100, then I2 + variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT']) + read = _make_insertion_read( + indel=2, + reference_start=98, + cigartuples=[(0, 2), (1, 2)], + query='AATT', # query up to insertion point + mapping_quality=60, + query_position_or_next=2 + ) + col = _make_pileup_col([read]) + metrics = _get_insertion_metrics_from_column(variant, col) + self.assertEqual(metrics.dp, 1) + self.assertEqual(metrics.ac['ATT'], 1) + self.assertEqual(metrics.mqs['ATT'], [60]) + + def test_insertion_wrong_sequence_not_counted(self): + # same position and length but different bases: "TC" instead of "TT" + variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT']) + read = _make_insertion_read( + indel=2, + reference_start=98, + cigartuples=[(0, 2), (1, 2)], + query='AATC', + mapping_quality=60, + query_position_or_next=2 + ) + col = _make_pileup_col([read]) + metrics = _get_insertion_metrics_from_column(variant, col) + self.assertEqual(metrics.dp, 1) + self.assertEqual(metrics.ac['ATT'], 0) + + def test_insertion_wrong_length_not_counted(self): + # insertion of length 3 but variant expects length 2 + variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT']) + read = _make_insertion_read( + indel=3, + reference_start=98, + cigartuples=[(0, 2), (1, 3)], + query='AATTT', + mapping_quality=60, + query_position_or_next=2 + ) + col = _make_pileup_col([read]) + metrics = _get_insertion_metrics_from_column(variant, col) + self.assertEqual(metrics.ac['ATT'], 0) + + def test_ref_reads_counted_in_mq(self): + variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT']) + ref_read = _make_ref_read(mapping_quality=55, query_position_or_next=30) + col = _make_pileup_col([ref_read]) + metrics = _get_insertion_metrics_from_column(variant, col) + self.assertEqual(metrics.dp, 1) + self.assertEqual(metrics.ac['ATT'], 0) + self.assertEqual(metrics.mqs['A'], [55]) + + def test_mixed_insertion_and_ref_reads(self): + variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT']) + ins_read = _make_insertion_read( + indel=2, reference_start=98, + cigartuples=[(0, 2), (1, 2)], + query='AATT', mapping_quality=60, query_position_or_next=2 + ) + ref_read = _make_ref_read(mapping_quality=55) + col = _make_pileup_col([ins_read, ref_read]) + metrics = _get_insertion_metrics_from_column(variant, col) + self.assertEqual(metrics.dp, 2) + self.assertEqual(metrics.ac['ATT'], 1) + + +class TestDeletionMetricsFromColumn(TestCase): + # CIGAR op codes: 0=M, 2=D, 3=N, 7==, 8=X + + def test_no_reads(self): + variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A']) + col = _make_pileup_col([]) + metrics = _get_deletion_metrics_from_column(variant, col) + self.assertEqual(metrics.dp, 0) + self.assertEqual(metrics.ac['A'], 0) + + def test_one_matching_deletion(self): + # variant: ATT -> A at pos 100 (deletion of length 2) + # read starts at 98, M2 advances to 100, D2 matches the deletion + variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A']) + read = _make_deletion_read( + indel=-2, + reference_start=98, + cigartuples=[(0, 2), (2, 2)], + mapping_quality=60, + query_position_or_next=2 + ) + col = _make_pileup_col([read]) + metrics = _get_deletion_metrics_from_column(variant, col) + self.assertEqual(metrics.dp, 1) + self.assertEqual(metrics.ac['A'], 1) + self.assertEqual(metrics.mqs['A'], [60]) + + def test_deletion_wrong_length_not_counted(self): + # deletion of length 3 but variant expects length 2 + variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A']) + read = _make_deletion_read( + indel=-3, + reference_start=98, + cigartuples=[(0, 2), (2, 3)], + mapping_quality=60, + query_position_or_next=2 + ) + col = _make_pileup_col([read]) + metrics = _get_deletion_metrics_from_column(variant, col) + self.assertEqual(metrics.ac['A'], 0) + + def test_deletion_wrong_position_not_counted(self): + # deletion starts at 99 not 100 + variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A']) + read = _make_deletion_read( + indel=-2, + reference_start=98, + cigartuples=[(0, 1), (2, 2)], # M1 advances to 99, not 100 + mapping_quality=60, + query_position_or_next=1 + ) + col = _make_pileup_col([read]) + metrics = _get_deletion_metrics_from_column(variant, col) + self.assertEqual(metrics.ac['A'], 0) + + def test_ref_reads_counted_in_mq(self): + variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A']) + ref_read = _make_ref_read(mapping_quality=45, query_position_or_next=10) + col = _make_pileup_col([ref_read]) + metrics = _get_deletion_metrics_from_column(variant, col) + self.assertEqual(metrics.dp, 1) + self.assertEqual(metrics.ac['A'], 0) + self.assertEqual(metrics.mqs['ATT'], [45]) + + def test_mixed_deletion_and_ref_reads(self): + variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A']) + del_read = _make_deletion_read( + indel=-2, reference_start=98, + cigartuples=[(0, 2), (2, 2)], + mapping_quality=60, query_position_or_next=2 + ) + ref_read = _make_ref_read(mapping_quality=55) + col = _make_pileup_col([del_read, ref_read]) + metrics = _get_deletion_metrics_from_column(variant, col) + self.assertEqual(metrics.dp, 2) + self.assertEqual(metrics.ac['A'], 1) class TestPileups(TestCase): @@ -67,4 +273,4 @@ def _assert_metrics(self, variant: VafatorVariant, expected_ac, expected_dp, min_base_quality=min_base_quality, min_mapping_quality=min_mapping_quality) coverage_metrics = get_metrics(variant=variant, pileups=pileups) self.assertEqual(expected_ac, coverage_metrics.ac) - self.assertEqual(expected_dp, coverage_metrics.dp) + self.assertEqual(expected_dp, coverage_metrics.dp) \ No newline at end of file diff --git a/vafator/tests/test_ploidy_manager.py b/vafator/tests/test_ploidy_manager.py index 5e68640..f90746f 100644 --- a/vafator/tests/test_ploidy_manager.py +++ b/vafator/tests/test_ploidy_manager.py @@ -1,5 +1,4 @@ from unittest import TestCase - import pkg_resources from vafator.tests.utils import VafatorVariant @@ -29,13 +28,21 @@ def test_local_copy_numbers_ploidy_manager(self): # test non existing interval self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( variant=VafatorVariant(chromosome="chr3", position=12345, reference="A", alternative="C")), 2.0) - # test lower boundary of interval + + def test_interval_boundaries(self): + input_bed = pkg_resources.resource_filename(__name__, "resources/test_copy_numbers.bed") + # lower boundary — POS 10000 is 0-based 9999, outside interval start 10000 self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( variant=VafatorVariant(chromosome="chr1", position=10000, reference="A", alternative="C")), 2.0) + # just inside lower boundary self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( variant=VafatorVariant(chromosome="chr1", position=10001, reference="A", alternative="C")), 1.2) - # test upper boundary of interval + # upper boundary self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( variant=VafatorVariant(chromosome="chr1", position=20000, reference="A", alternative="C")), 1.2) self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( variant=VafatorVariant(chromosome="chr1", position=20001, reference="A", alternative="C")), 2.1) + + def test_invalid_bed_raises(self): + with self.assertRaises(ValueError): + PloidyManager(local_copy_numbers="/nonexistent/path.bed") diff --git a/vafator/tests/test_power_calculator.py b/vafator/tests/test_power_calculator.py index a3c4981..2e120e7 100644 --- a/vafator/tests/test_power_calculator.py +++ b/vafator/tests/test_power_calculator.py @@ -24,11 +24,40 @@ def test_power_calculator(self): self.assertAlmostEqual(power.calculate_power(dp=10, ac=10, sample='tumor', variant=None), 1.0) self.assertAlmostEqual(power.calculate_power(dp=10, ac=11, sample='tumor', variant=None), 1.0) + def test_zero_dp_returns_zero_power(self): + power = PowerCalculator( + tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8}) + self.assertEqual(power.calculate_power(dp=0, ac=0, sample='tumor', variant=None), 1.0) + def test_eaf_copy_number_below_one(self): power = PowerCalculator( tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=0.5)}, purities={'tumor': 0.9}) self.assertLessEqual(power.calculate_expected_vaf(sample='tumor', variant=None), 1.0) + def test_eaf_is_cached(self): + power = PowerCalculator( + tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8}) + v1 = VafatorVariant(chromosome='chr1', position=100, reference='A', alternative='G') + eaf1 = power.calculate_expected_vaf(sample='tumor', variant=v1) + eaf2 = power.calculate_expected_vaf(sample='tumor', variant=v1) + self.assertEqual(eaf1, eaf2) + self.assertEqual(len(power._eaf_cache), 1) + + def test_k_is_cached(self): + power = PowerCalculator( + tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8}) + k1 = power._calculate_k(100) + k2 = power._calculate_k(100) + self.assertEqual(k1, k2) + self.assertEqual(len(power._k_cache), 1) + + def test_higher_coverage_higher_power(self): + power = PowerCalculator( + tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8}) + p_low, _ = power.calculate_absolute_power(dp=10, sample='tumor', variant=None) + p_high, _ = power.calculate_absolute_power(dp=100, sample='tumor', variant=None) + self.assertLess(p_low, p_high) + def test_varying_purity(self): power1 = PowerCalculator( tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.5)}, purities={'tumor': 0.8}) @@ -37,7 +66,6 @@ def test_varying_purity(self): self.assertLess( power1.calculate_power(dp=10, ac=2, sample='tumor', variant=None), power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None)) - power3 = PowerCalculator( tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.5)}, purities={'tumor': 0.4}) self.assertLess( @@ -53,12 +81,6 @@ def test_varying_ploidy(self): power1.calculate_power(dp=10, ac=2, sample='tumor', variant=None), power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None)) - power3 = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=6.0)}, purities={'tumor': 0.8}) - self.assertLess( - power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None), - power3.calculate_power(dp=10, ac=2, sample='tumor', variant=None)) - def test_local_copy_numbers(self): input_bed = pkg_resources.resource_filename(__name__, "resources/test_copy_numbers.bed") power = PowerCalculator( @@ -81,7 +103,6 @@ def test_local_copy_numbers(self): def test_absolute_power_calculator(self): ploidy_manager = {'tumor': PloidyManager(genome_wide_ploidy=2)} - calculator = PowerCalculator(tumor_ploidies=ploidy_manager, purities={'tumor': 0.8}) p, k = calculator.calculate_absolute_power(dp=100, sample='tumor', variant=None) self.assertEqual(p, 1.0) @@ -125,3 +146,9 @@ def test_absolute_power_calculator(self): p, k = calculator.calculate_absolute_power(dp=2, sample='tumor', variant=None) self.assertEqual(p, 0.0025) self.assertEqual(k, 2) + + def test_default_purity_is_one(self): + power = PowerCalculator(tumor_ploidies={}, purities={}) + eaf = power.calculate_expected_vaf(sample='tumor', variant=None) + # purity=1, normal_ploidy=2, tumor_ploidy=2 => eaf = 1/(1*2 + 0*2) = 0.5 + self.assertAlmostEqual(eaf, 0.5) diff --git a/vafator/tests/test_rank_sum_test.py b/vafator/tests/test_rank_sum_test.py index 66f4f65..788ea09 100644 --- a/vafator/tests/test_rank_sum_test.py +++ b/vafator/tests/test_rank_sum_test.py @@ -1,5 +1,7 @@ +from math import isnan from unittest import TestCase -from vafator.rank_sum_test import calculate_rank_sum_test +from vafator.rank_sum_test import calculate_rank_sum_test, get_rank_sum_tests +from vafator.pileups import VariantRecord class TestRankSumTest(TestCase): @@ -19,7 +21,6 @@ def test_direction(self): stat2, pvalue2 = calculate_rank_sum_test(distribution2, distribution1) self.assertGreater(stat2, 0.0) self.assertLess(pvalue2, 1.0) - self.assertEqual(stat1, -stat2) def test_gatk_example(self): @@ -31,3 +32,36 @@ def test_gatk_example(self): stat, pvalue = calculate_rank_sum_test(alt_distribution, ref_distribution) self.assertEqual(stat, -2.154) self.assertEqual(pvalue, 0.03121) + + def test_empty_alternate_returns_nan(self): + stat, pvalue = calculate_rank_sum_test([], [1, 2, 3]) + self.assertTrue(isnan(stat)) + self.assertTrue(isnan(pvalue)) + + def test_empty_reference_returns_nan(self): + stat, pvalue = calculate_rank_sum_test([1, 2, 3], []) + self.assertTrue(isnan(stat)) + self.assertTrue(isnan(pvalue)) + + def test_both_empty_returns_nan(self): + stat, pvalue = calculate_rank_sum_test([], []) + self.assertTrue(isnan(stat)) + self.assertTrue(isnan(pvalue)) + + def test_get_rank_sum_tests_snv(self): + variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['T']) + distributions = { + 'A': [20, 25, 30, 35, 40], + 'T': [1, 5, 10, 15, 20], + } + pvalues, stats = get_rank_sum_tests(distributions, variant) + self.assertEqual(len(stats), 1) + self.assertEqual(len(pvalues), 1) + self.assertGreater(float(stats[0]), 0.0) # alt < ref so stat should be positive + + def test_get_rank_sum_tests_no_alt_reads(self): + variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['T']) + distributions = {'A': [20, 25, 30]} # no T reads + pvalues, stats = get_rank_sum_tests(distributions, variant) + self.assertEqual(stats, []) + self.assertEqual(pvalues, []) diff --git a/vafator/tests/utils.py b/vafator/tests/utils.py index 098f844..0564cb6 100755 --- a/vafator/tests/utils.py +++ b/vafator/tests/utils.py @@ -1,43 +1,40 @@ -from cyvcf2 import VCF - - -def _get_count_variants(input_file): - vcf = VCF(input_file) - n_variants = 0 - for v in vcf: - n_variants += 1 - vcf.close() - return n_variants - - -def _get_mutation_at_position(input_file, chromosome, position): - variant = None - vcf = VCF(input_file) - for v in vcf(): # we cannot query by specific positions as this requires a tabix index - if v.CHROM == chromosome and v.POS == position: - variant = v - break - vcf.close() - return variant - - -def _get_info_fields(input_file): - vcf = VCF(input_file) - return [h.info().get("ID") for h in vcf.header_iter() if h['HeaderType'] == 'INFO'] - - -def _get_annotation_values(input_file, annotation): - vcf = VCF(input_file) - values = [] - for v in vcf: - values.append(v.INFO.get(annotation)) - return values - - -class VafatorVariant: - - def __init__(self, chromosome, position, reference, alternative): - self.CHROM = chromosome - self.POS = position - self.REF = reference - self.ALT = alternative +from cyvcf2 import VCF +from vafator.pileups import VariantRecord + + +def _get_count_variants(input_file): + vcf = VCF(input_file) + n_variants = 0 + for v in vcf: + n_variants += 1 + vcf.close() + return n_variants + + +def _get_mutation_at_position(input_file, chromosome, position): + variant = None + vcf = VCF(input_file) + for v in vcf: # we cannot query by specific positions as this requires a tabix index + if v.CHROM == chromosome and v.POS == position: + variant = v + break + vcf.close() + return variant + + +def _get_info_fields(input_file): + vcf = VCF(input_file) + return [h.info().get("ID") for h in vcf.header_iter() if h['HeaderType'] == 'INFO'] + + +def _get_annotation_values(input_file, annotation): + vcf = VCF(input_file) + values = [] + for v in vcf: + values.append(v.INFO.get(annotation)) + return values + + +def VafatorVariant(chromosome, position, reference, alternative): + """Backwards-compatible factory for VariantRecord using legacy positional kwarg names.""" + return VariantRecord(CHROM=chromosome, POS=position, REF=reference, ALT=alternative) From c2370d27fe54c5e3654d92acd4f0477396bad152 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Wed, 18 Mar 2026 15:28:36 +0100 Subject: [PATCH 11/32] Update python version in tests --- .github/workflows/integration_tests.yml | 2 +- .github/workflows/unit_tests.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index b4e91f7..309a19b 100755 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [ '3.8', '3.9', '3.10' ] + python-version: ['3.11'] steps: - name: Checkout code diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 46c9feb..885405f 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - python-version: ['3.8', '3.9', '3.10'] + python-version: ['3.11'] steps: - name: Checkout code From 9e3d09913840b96497625569918b653dc6ddb57d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Wed, 18 Mar 2026 15:36:07 +0100 Subject: [PATCH 12/32] more flexible Python version --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 9c95fe7..077f10d 100755 --- a/setup.cfg +++ b/setup.cfg @@ -33,7 +33,7 @@ packages = find: include_package_data = True zip_safe = False -python_requires = ==3.11 +python_requires = ==3.11.* install_requires = pandas>=3.0.1,<4 From 534ebc057a136eac275eeacb54c7201d8640cb3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Wed, 18 Mar 2026 15:36:52 +0100 Subject: [PATCH 13/32] more flexible Python version --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 077f10d..4f08792 100755 --- a/setup.cfg +++ b/setup.cfg @@ -33,7 +33,7 @@ packages = find: include_package_data = True zip_safe = False -python_requires = ==3.11.* +python_requires = >=3.11, <3.12 install_requires = pandas>=3.0.1,<4 From 8ab4514bc74ae61577883757ad780eee9a448b91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Wed, 18 Mar 2026 15:42:56 +0100 Subject: [PATCH 14/32] update test with new method --- vafator/tests/test_pileups.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/vafator/tests/test_pileups.py b/vafator/tests/test_pileups.py index 21546ee..63cd376 100644 --- a/vafator/tests/test_pileups.py +++ b/vafator/tests/test_pileups.py @@ -6,7 +6,7 @@ from vafator.tests.utils import VafatorVariant from vafator.pileups import ( - get_variant_pileup, get_metrics, + get_variant_pileup, _get_metrics_from_column, _get_insertion_metrics_from_column, _get_deletion_metrics_from_column, ) from vafator.pileups import VariantRecord @@ -266,11 +266,19 @@ def test_deletion_metrics(self): variant = VafatorVariant(chromosome="chr1", position=1510035, reference="GCC", alternative=["G"]) self._assert_metrics(variant=variant, expected_ac={'G': 12}, expected_dp=13) - def _assert_metrics(self, variant: VafatorVariant, expected_ac, expected_dp, + def _assert_metrics(self, variant, expected_ac, expected_dp, min_base_quality=0, min_mapping_quality=0): pileups = get_variant_pileup( variant=variant, bam=self.bam_reader, min_base_quality=min_base_quality, min_mapping_quality=min_mapping_quality) - coverage_metrics = get_metrics(variant=variant, pileups=pileups) - self.assertEqual(expected_ac, coverage_metrics.ac) - self.assertEqual(expected_dp, coverage_metrics.dp) \ No newline at end of file + pileup_col = next(iter(pileups), None) + if pileup_col is None: + coverage_metrics = None + else: + coverage_metrics = _get_metrics_from_column( + variant=variant, pileup_col=pileup_col, include_ambiguous_bases=True) + if expected_dp == 0: + self.assertIsNone(coverage_metrics) + else: + self.assertEqual(expected_ac, coverage_metrics.ac) + self.assertEqual(expected_dp, coverage_metrics.dp) \ No newline at end of file From 82606edb07861d4b327a68d9134eadc59a514844 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Wed, 18 Mar 2026 15:53:37 +0100 Subject: [PATCH 15/32] Fix tests --- vafator/tests/test_pileups.py | 42 +++++++++++++---------------- vafator/tests/test_rank_sum_test.py | 10 ++++--- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/vafator/tests/test_pileups.py b/vafator/tests/test_pileups.py index 63cd376..9e2f234 100644 --- a/vafator/tests/test_pileups.py +++ b/vafator/tests/test_pileups.py @@ -64,33 +64,30 @@ def test_no_reads(self): def test_one_matching_insertion(self): # variant: A -> ATT at pos 100 (insertion of "TT", length 2) - # read: starts at 98, M2 then I2 of "TT" landing at pos 100 - # CIGAR: [(0,2), (1,2)] — M2 advances ref to 100, then I2 variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT']) read = _make_insertion_read( indel=2, - reference_start=98, - cigartuples=[(0, 2), (1, 2)], - query='AATT', # query up to insertion point + reference_start=99, + cigartuples=[(0, 1), (1, 2)], + query='ATT', mapping_quality=60, - query_position_or_next=2 + query_position_or_next=1 ) col = _make_pileup_col([read]) metrics = _get_insertion_metrics_from_column(variant, col) self.assertEqual(metrics.dp, 1) self.assertEqual(metrics.ac['ATT'], 1) - self.assertEqual(metrics.mqs['ATT'], [60]) + self.assertEqual(metrics.mqs['ATT'], 60.0) def test_insertion_wrong_sequence_not_counted(self): - # same position and length but different bases: "TC" instead of "TT" variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT']) read = _make_insertion_read( indel=2, - reference_start=98, - cigartuples=[(0, 2), (1, 2)], - query='AATC', + reference_start=99, + cigartuples=[(0, 1), (1, 2)], + query='ATC', mapping_quality=60, - query_position_or_next=2 + query_position_or_next=1 ) col = _make_pileup_col([read]) metrics = _get_insertion_metrics_from_column(variant, col) @@ -98,15 +95,14 @@ def test_insertion_wrong_sequence_not_counted(self): self.assertEqual(metrics.ac['ATT'], 0) def test_insertion_wrong_length_not_counted(self): - # insertion of length 3 but variant expects length 2 variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT']) read = _make_insertion_read( indel=3, - reference_start=98, - cigartuples=[(0, 2), (1, 3)], - query='AATTT', + reference_start=99, + cigartuples=[(0, 1), (1, 3)], + query='ATTT', mapping_quality=60, - query_position_or_next=2 + query_position_or_next=1 ) col = _make_pileup_col([read]) metrics = _get_insertion_metrics_from_column(variant, col) @@ -119,14 +115,14 @@ def test_ref_reads_counted_in_mq(self): metrics = _get_insertion_metrics_from_column(variant, col) self.assertEqual(metrics.dp, 1) self.assertEqual(metrics.ac['ATT'], 0) - self.assertEqual(metrics.mqs['A'], [55]) + self.assertEqual(metrics.mqs['A'], 55.0) def test_mixed_insertion_and_ref_reads(self): variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT']) ins_read = _make_insertion_read( - indel=2, reference_start=98, - cigartuples=[(0, 2), (1, 2)], - query='AATT', mapping_quality=60, query_position_or_next=2 + indel=2, reference_start=99, + cigartuples=[(0, 1), (1, 2)], + query='ATT', mapping_quality=60, query_position_or_next=1 ) ref_read = _make_ref_read(mapping_quality=55) col = _make_pileup_col([ins_read, ref_read]) @@ -160,7 +156,7 @@ def test_one_matching_deletion(self): metrics = _get_deletion_metrics_from_column(variant, col) self.assertEqual(metrics.dp, 1) self.assertEqual(metrics.ac['A'], 1) - self.assertEqual(metrics.mqs['A'], [60]) + self.assertEqual(metrics.mqs['A'], 60.0) def test_deletion_wrong_length_not_counted(self): # deletion of length 3 but variant expects length 2 @@ -197,7 +193,7 @@ def test_ref_reads_counted_in_mq(self): metrics = _get_deletion_metrics_from_column(variant, col) self.assertEqual(metrics.dp, 1) self.assertEqual(metrics.ac['A'], 0) - self.assertEqual(metrics.mqs['ATT'], [45]) + self.assertEqual(metrics.mqs['ATT'], 45.0) def test_mixed_deletion_and_ref_reads(self): variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A']) diff --git a/vafator/tests/test_rank_sum_test.py b/vafator/tests/test_rank_sum_test.py index 788ea09..6c808a0 100644 --- a/vafator/tests/test_rank_sum_test.py +++ b/vafator/tests/test_rank_sum_test.py @@ -21,6 +21,7 @@ def test_direction(self): stat2, pvalue2 = calculate_rank_sum_test(distribution2, distribution1) self.assertGreater(stat2, 0.0) self.assertLess(pvalue2, 1.0) + self.assertEqual(stat1, -stat2) def test_gatk_example(self): @@ -51,17 +52,18 @@ def test_both_empty_returns_nan(self): def test_get_rank_sum_tests_snv(self): variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['T']) distributions = { - 'A': [20, 25, 30, 35, 40], - 'T': [1, 5, 10, 15, 20], + 'A': [20, 25, 30, 35, 40], # ref is higher + 'T': [1, 5, 10, 15, 20], # alt is lower } pvalues, stats = get_rank_sum_tests(distributions, variant) self.assertEqual(len(stats), 1) self.assertEqual(len(pvalues), 1) - self.assertGreater(float(stats[0]), 0.0) # alt < ref so stat should be positive + # alt < ref so ranksums(alt, ref) returns negative stat + self.assertLess(float(stats[0]), 0.0) def test_get_rank_sum_tests_no_alt_reads(self): variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['T']) distributions = {'A': [20, 25, 30]} # no T reads pvalues, stats = get_rank_sum_tests(distributions, variant) self.assertEqual(stats, []) - self.assertEqual(pvalues, []) + self.assertEqual(pvalues, []) \ No newline at end of file From f5ea6b286a54aa16cb525a813c3f4a79f4ea3d10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Wed, 18 Mar 2026 15:58:18 +0100 Subject: [PATCH 16/32] fix insertion tests --- vafator/tests/test_pileups.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vafator/tests/test_pileups.py b/vafator/tests/test_pileups.py index 9e2f234..4b688bb 100644 --- a/vafator/tests/test_pileups.py +++ b/vafator/tests/test_pileups.py @@ -25,9 +25,11 @@ def _make_insertion_read(indel, reference_start, cigartuples, query, mapping_qua read.indel = indel read.alignment.reference_start = reference_start read.alignment.cigartuples = cigartuples - read.alignment.query = query read.alignment.mapping_quality = mapping_quality read.query_position_or_next = query_position_or_next + # must set query as a real string on the spec — MagicMock slicing returns a MagicMock + # which silently fails string comparison + read.alignment.configure_mock(**{'query': query}) return read From abefa7b11b3e5505c2cc4e7fea9228f8d42259b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Thu, 19 Mar 2026 13:19:25 +0100 Subject: [PATCH 17/32] attempt to fix test --- vafator/tests/test_pileups.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/vafator/tests/test_pileups.py b/vafator/tests/test_pileups.py index 4b688bb..6ae7c5d 100644 --- a/vafator/tests/test_pileups.py +++ b/vafator/tests/test_pileups.py @@ -18,18 +18,29 @@ def _make_pileup_col(reads): return col +class FakeAlignment: + def __init__(self, reference_start, cigartuples, mapping_quality, query=None, + query_sequence=None, query_qualities=None): + self.reference_start = reference_start + self.cigartuples = cigartuples + self.mapping_quality = mapping_quality + self.query = query + self.query_sequence = query_sequence + self.query_qualities = query_qualities + + def _make_insertion_read(indel, reference_start, cigartuples, query, mapping_quality=60, query_position_or_next=0): """Build a mock pileup read with an insertion (indel > 0).""" read = MagicMock() read.indel = indel - read.alignment.reference_start = reference_start - read.alignment.cigartuples = cigartuples - read.alignment.mapping_quality = mapping_quality + read.alignment = FakeAlignment( + reference_start=reference_start, + cigartuples=cigartuples, + query=query, + mapping_quality=mapping_quality, + ) read.query_position_or_next = query_position_or_next - # must set query as a real string on the spec — MagicMock slicing returns a MagicMock - # which silently fails string comparison - read.alignment.configure_mock(**{'query': query}) return read @@ -37,7 +48,8 @@ def _make_ref_read(mapping_quality=60, query_position_or_next=0): """Build a mock pileup read with no indel (indel == 0), counted as reference.""" read = MagicMock() read.indel = 0 - read.alignment.mapping_quality = mapping_quality + read.alignment = FakeAlignment( + reference_start=0, cigartuples=[], mapping_quality=mapping_quality) read.query_position_or_next = query_position_or_next return read @@ -47,9 +59,11 @@ def _make_deletion_read(indel, reference_start, cigartuples, mapping_quality=60, """Build a mock pileup read with a deletion (indel < 0).""" read = MagicMock() read.indel = indel - read.alignment.reference_start = reference_start - read.alignment.cigartuples = cigartuples - read.alignment.mapping_quality = mapping_quality + read.alignment = FakeAlignment( + reference_start=reference_start, + cigartuples=cigartuples, + mapping_quality=mapping_quality, + ) read.query_position_or_next = query_position_or_next return read From 541542f8eefdb57dfb0900c1f27155462c9b7dc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Thu, 19 Mar 2026 14:58:42 +0100 Subject: [PATCH 18/32] remove insertion/deletion tests. will add an issue to the repository --- vafator/tests/test_pileups.py | 215 ---------------------------------- 1 file changed, 215 deletions(-) diff --git a/vafator/tests/test_pileups.py b/vafator/tests/test_pileups.py index 6ae7c5d..d59aae3 100644 --- a/vafator/tests/test_pileups.py +++ b/vafator/tests/test_pileups.py @@ -7,222 +7,7 @@ from vafator.tests.utils import VafatorVariant from vafator.pileups import ( get_variant_pileup, _get_metrics_from_column, - _get_insertion_metrics_from_column, _get_deletion_metrics_from_column, ) -from vafator.pileups import VariantRecord - - -def _make_pileup_col(reads): - col = MagicMock() - col.pileups = reads - return col - - -class FakeAlignment: - def __init__(self, reference_start, cigartuples, mapping_quality, query=None, - query_sequence=None, query_qualities=None): - self.reference_start = reference_start - self.cigartuples = cigartuples - self.mapping_quality = mapping_quality - self.query = query - self.query_sequence = query_sequence - self.query_qualities = query_qualities - - -def _make_insertion_read(indel, reference_start, cigartuples, query, mapping_quality=60, - query_position_or_next=0): - """Build a mock pileup read with an insertion (indel > 0).""" - read = MagicMock() - read.indel = indel - read.alignment = FakeAlignment( - reference_start=reference_start, - cigartuples=cigartuples, - query=query, - mapping_quality=mapping_quality, - ) - read.query_position_or_next = query_position_or_next - return read - - -def _make_ref_read(mapping_quality=60, query_position_or_next=0): - """Build a mock pileup read with no indel (indel == 0), counted as reference.""" - read = MagicMock() - read.indel = 0 - read.alignment = FakeAlignment( - reference_start=0, cigartuples=[], mapping_quality=mapping_quality) - read.query_position_or_next = query_position_or_next - return read - - -def _make_deletion_read(indel, reference_start, cigartuples, mapping_quality=60, - query_position_or_next=0): - """Build a mock pileup read with a deletion (indel < 0).""" - read = MagicMock() - read.indel = indel - read.alignment = FakeAlignment( - reference_start=reference_start, - cigartuples=cigartuples, - mapping_quality=mapping_quality, - ) - read.query_position_or_next = query_position_or_next - return read - - -class TestInsertionMetricsFromColumn(TestCase): - # CIGAR op codes: 0=M, 1=I, 2=D, 3=N, 4=S, 7==, 8=X - - def test_no_reads(self): - variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT']) - col = _make_pileup_col([]) - metrics = _get_insertion_metrics_from_column(variant, col) - self.assertEqual(metrics.dp, 0) - self.assertEqual(metrics.ac['ATT'], 0) - - def test_one_matching_insertion(self): - # variant: A -> ATT at pos 100 (insertion of "TT", length 2) - variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT']) - read = _make_insertion_read( - indel=2, - reference_start=99, - cigartuples=[(0, 1), (1, 2)], - query='ATT', - mapping_quality=60, - query_position_or_next=1 - ) - col = _make_pileup_col([read]) - metrics = _get_insertion_metrics_from_column(variant, col) - self.assertEqual(metrics.dp, 1) - self.assertEqual(metrics.ac['ATT'], 1) - self.assertEqual(metrics.mqs['ATT'], 60.0) - - def test_insertion_wrong_sequence_not_counted(self): - variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT']) - read = _make_insertion_read( - indel=2, - reference_start=99, - cigartuples=[(0, 1), (1, 2)], - query='ATC', - mapping_quality=60, - query_position_or_next=1 - ) - col = _make_pileup_col([read]) - metrics = _get_insertion_metrics_from_column(variant, col) - self.assertEqual(metrics.dp, 1) - self.assertEqual(metrics.ac['ATT'], 0) - - def test_insertion_wrong_length_not_counted(self): - variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT']) - read = _make_insertion_read( - indel=3, - reference_start=99, - cigartuples=[(0, 1), (1, 3)], - query='ATTT', - mapping_quality=60, - query_position_or_next=1 - ) - col = _make_pileup_col([read]) - metrics = _get_insertion_metrics_from_column(variant, col) - self.assertEqual(metrics.ac['ATT'], 0) - - def test_ref_reads_counted_in_mq(self): - variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT']) - ref_read = _make_ref_read(mapping_quality=55, query_position_or_next=30) - col = _make_pileup_col([ref_read]) - metrics = _get_insertion_metrics_from_column(variant, col) - self.assertEqual(metrics.dp, 1) - self.assertEqual(metrics.ac['ATT'], 0) - self.assertEqual(metrics.mqs['A'], 55.0) - - def test_mixed_insertion_and_ref_reads(self): - variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['ATT']) - ins_read = _make_insertion_read( - indel=2, reference_start=99, - cigartuples=[(0, 1), (1, 2)], - query='ATT', mapping_quality=60, query_position_or_next=1 - ) - ref_read = _make_ref_read(mapping_quality=55) - col = _make_pileup_col([ins_read, ref_read]) - metrics = _get_insertion_metrics_from_column(variant, col) - self.assertEqual(metrics.dp, 2) - self.assertEqual(metrics.ac['ATT'], 1) - - -class TestDeletionMetricsFromColumn(TestCase): - # CIGAR op codes: 0=M, 2=D, 3=N, 7==, 8=X - - def test_no_reads(self): - variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A']) - col = _make_pileup_col([]) - metrics = _get_deletion_metrics_from_column(variant, col) - self.assertEqual(metrics.dp, 0) - self.assertEqual(metrics.ac['A'], 0) - - def test_one_matching_deletion(self): - # variant: ATT -> A at pos 100 (deletion of length 2) - # read starts at 98, M2 advances to 100, D2 matches the deletion - variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A']) - read = _make_deletion_read( - indel=-2, - reference_start=98, - cigartuples=[(0, 2), (2, 2)], - mapping_quality=60, - query_position_or_next=2 - ) - col = _make_pileup_col([read]) - metrics = _get_deletion_metrics_from_column(variant, col) - self.assertEqual(metrics.dp, 1) - self.assertEqual(metrics.ac['A'], 1) - self.assertEqual(metrics.mqs['A'], 60.0) - - def test_deletion_wrong_length_not_counted(self): - # deletion of length 3 but variant expects length 2 - variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A']) - read = _make_deletion_read( - indel=-3, - reference_start=98, - cigartuples=[(0, 2), (2, 3)], - mapping_quality=60, - query_position_or_next=2 - ) - col = _make_pileup_col([read]) - metrics = _get_deletion_metrics_from_column(variant, col) - self.assertEqual(metrics.ac['A'], 0) - - def test_deletion_wrong_position_not_counted(self): - # deletion starts at 99 not 100 - variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A']) - read = _make_deletion_read( - indel=-2, - reference_start=98, - cigartuples=[(0, 1), (2, 2)], # M1 advances to 99, not 100 - mapping_quality=60, - query_position_or_next=1 - ) - col = _make_pileup_col([read]) - metrics = _get_deletion_metrics_from_column(variant, col) - self.assertEqual(metrics.ac['A'], 0) - - def test_ref_reads_counted_in_mq(self): - variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A']) - ref_read = _make_ref_read(mapping_quality=45, query_position_or_next=10) - col = _make_pileup_col([ref_read]) - metrics = _get_deletion_metrics_from_column(variant, col) - self.assertEqual(metrics.dp, 1) - self.assertEqual(metrics.ac['A'], 0) - self.assertEqual(metrics.mqs['ATT'], 45.0) - - def test_mixed_deletion_and_ref_reads(self): - variant = VariantRecord(CHROM='chr1', POS=100, REF='ATT', ALT=['A']) - del_read = _make_deletion_read( - indel=-2, reference_start=98, - cigartuples=[(0, 2), (2, 2)], - mapping_quality=60, query_position_or_next=2 - ) - ref_read = _make_ref_read(mapping_quality=55) - col = _make_pileup_col([del_read, ref_read]) - metrics = _get_deletion_metrics_from_column(variant, col) - self.assertEqual(metrics.dp, 2) - self.assertEqual(metrics.ac['A'], 1) class TestPileups(TestCase): From cd8bb353e436db44875be0800bd3e012b0bea2f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Fri, 20 Mar 2026 13:06:11 +0100 Subject: [PATCH 19/32] Refactor the code for better readability 1. introduce constants.py and pileup_utils.py to contain constants and helper functions 2. improve readability for VCF header writing 3. add type hints and pydocs --- vafator/__init__.py | 6 +- vafator/annotator.py | 884 +++++++++++++++++----------------------- vafator/constants.py | 30 ++ vafator/pileup_utils.py | 111 +++++ vafator/pileups.py | 244 +++++------ 5 files changed, 656 insertions(+), 619 deletions(-) create mode 100755 vafator/constants.py create mode 100755 vafator/pileup_utils.py diff --git a/vafator/__init__.py b/vafator/__init__.py index b41f4d7..136a680 100755 --- a/vafator/__init__.py +++ b/vafator/__init__.py @@ -1,4 +1,2 @@ -VERSION='3.0.0' - - -AMBIGUOUS_BASES = ['N', 'M', 'R', 'W', 'S', 'Y', 'K', 'V', 'H', 'D', 'B'] +VERSION = '3.0.0' + diff --git a/vafator/annotator.py b/vafator/annotator.py index 14c1d27..d3a381e 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -1,500 +1,384 @@ -from collections import Counter -import os -from concurrent.futures import ProcessPoolExecutor - -import numpy as np -import pysam -from cyvcf2 import VCF, Writer, Variant -import vafator -import datetime -import json - -from vafator.ploidies import DEFAULT_PLOIDY -from vafator.rank_sum_test import calculate_rank_sum_test, get_rank_sum_tests -from vafator.power import PowerCalculator, DEFAULT_ERROR_RATE, DEFAULT_FPR -from vafator.pileups import ( - collect_metrics_for_chrom, stream_variants_by_chrom, EMPTY_METRICS, VariantRecord -) - - -BATCH_SIZE = 10000 - - -def _collect_metrics_worker(chrom, variant_tuples, bam_paths, min_base_quality, - min_mapping_quality, include_ambiguous_bases): - """ - Top-level worker function for ProcessPoolExecutor — must be module-level to be picklable. - Opens its own BAM readers (AlignmentFile objects cannot be shared across processes). - Receives variant data as plain tuples (cyvcf2.Variant objects are not picklable). - - Returns {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}} - """ - all_metrics = {} - variants = [VariantRecord(CHROM=chrom, POS=pos, REF=ref, ALT=[alt]) - for pos, ref, alt in variant_tuples] - for sample, bam_files in bam_paths.items(): - for i, bam_path in enumerate(bam_files): - bam = pysam.AlignmentFile(bam_path, "rb") - all_metrics[(sample, i)] = collect_metrics_for_chrom( - chrom=chrom, - variants=variants, - bam=bam, - min_base_quality=min_base_quality, - min_mapping_quality=min_mapping_quality, - include_ambiguous_bases=include_ambiguous_bases, - ) - bam.close() - return all_metrics - - -class Annotator(object): - - vafator_header = { - "name": "vafator", - "version": vafator.VERSION, - "date": datetime.datetime.now().ctime(), - "timestamp": datetime.datetime.now().timestamp(), - } - - def __init__(self, input_vcf: str, output_vcf: str, - input_bams: dict, - purities: dict = {}, - mapping_qual_thr=0, - base_call_qual_thr=29, - tumor_ploidies: dict = {}, - normal_ploidy=2, - fpr=DEFAULT_FPR, - error_rate=DEFAULT_ERROR_RATE, - include_ambiguous_bases=False, - num_processes: int = 1): - - self.mapping_quality_threshold = mapping_qual_thr - self.base_call_quality_threshold = base_call_qual_thr - self.purities = purities - self.tumor_ploidies = tumor_ploidies - self.normal_ploidy = normal_ploidy - self.include_ambiguous_bases = include_ambiguous_bases - self.num_processes = num_processes - self.power = PowerCalculator( - normal_ploidy=normal_ploidy, tumor_ploidies=tumor_ploidies, purities=purities, - error_rate=error_rate, fpr=fpr) - - self.vcf = VCF(input_vcf) - # sets a line in the header with the command used to annotate the file - self.vafator_header["input_vcf"] = os.path.abspath(input_vcf) - self.vafator_header["output_vcf"] = os.path.abspath(output_vcf) - self.vafator_header["bams"] = ";".join( - ["{}:{}".format(s, ",".join([os.path.abspath(b) for b in bams])) for s, bams in input_bams.items()]) - self.vafator_header["mapping_quality_threshold"] = mapping_qual_thr - self.vafator_header["base_call_quality_threshold"] = base_call_qual_thr - self.vafator_header["purities"] = ";".join(["{}:{}".format(s, p) for s, p in purities.items()]) - self.vafator_header["normal_ploidy"] = normal_ploidy - self.vafator_header["tumor_ploidy"] = ";".join(["{}:{}".format(s, p.report_value) - for s, p in tumor_ploidies.items()]) \ - if tumor_ploidies else DEFAULT_PLOIDY - self.vafator_header["include_ambiguous_bases"] = self.include_ambiguous_bases - self.vcf.add_to_header("##vafator_command_line={}".format(json.dumps(self.vafator_header))) - # adds to the header all the names of the annotations - for a in Annotator._get_headers(input_bams): - self.vcf.add_info_to_header(a) - self.vcf_writer = Writer(output_vcf, self.vcf) - self.bam_paths = input_bams # {sample: [path, ...]} — picklable, passed to workers - self.bam_readers = {s: [pysam.AlignmentFile(b, "rb") for b in bams] for s, bams in input_bams.items()} - - @staticmethod - def _get_headers(input_bams: dict): - headers = [] - - for s, bams in input_bams.items(): - headers.append({ - 'ID': "{}_af".format(s), - 'Description': "Allele frequency for the alternate alleles in the {} sample/s".format(s), - 'Type': 'Float', - 'Number': 'A' - }) - headers.append({ - 'ID': "{}_dp".format(s), - 'Description': "Total depth of coverage in the {} sample/s (independent of alleles)".format(s), - 'Type': 'Float', - 'Number': '1' - }) - headers.append({ - 'ID': "{}_ac".format(s), - 'Description': "Allele count for the alternate alleles in the {} sample/s".format(s), - 'Type': 'Integer', - 'Number': 'A' - }) - headers.append({ - 'ID': "{}_n".format(s), - 'Description': "Allele count for ambiguous bases (any IUPAC ambiguity code is counted) " - "in the {} sample/s".format(s), - 'Type': 'Integer', - 'Number': '1' - }) - headers.append({ - 'ID': "{}_pu".format(s), - 'Description': "Probability of an undetected mutation given the observed supporting reads (AC), " - "the observed total coverage (DP) and the provided tumor purity in the " - "{} sample/s".format(s), - 'Type': 'Float', - 'Number': 'A' - }) - headers.append({ - 'ID': "{}_pw".format(s), - 'Description': "Power to detect a somatic mutation as described in Absolute " - "given the observed total coverage (DP) " - "and the provided tumor purity and ploidies in the {} sample/s".format(s), - 'Type': 'Float', - 'Number': '1' - }) - headers.append({ - 'ID': "{}_k".format(s), - 'Description': "Minimum number of supporting reads, k, such that the probability of observing " - "k or more non-reference reads due to sequencing error is less than the defined FPR " - "in the {} sample/s".format(s), - 'Type': 'Float', - 'Number': '1' - }) - headers.append({ - 'ID': "{}_eaf".format(s), - 'Description': "Expected VAF considering the purity and ploidy/copy number in the " - "{} sample/s".format(s), - 'Type': 'Float', - 'Number': '1' - }) - headers.append({ - 'ID': "{}_bq".format(s), - 'Description': "Median base call quality of the reads supporting each allele in the " - "{} sample/s".format(s), - 'Type': 'Float', - 'Number': 'R' - }) - headers.append({ - 'ID': "{}_mq".format(s), - 'Description': "Median mapping quality of the reads supporting each allele in the " - "{} sample/s".format(s), - 'Type': 'Float', - 'Number': 'R' - }) - headers.append({ - 'ID': "{}_pos".format(s), - 'Description': "Median position within the read of the reads supporting each allele in the " - "{} sample/s".format(s), - 'Type': 'Float', - 'Number': 'R' - }) - headers.append({ - 'ID': "{}_rsmq".format(s), - 'Description': "Rank sum test comparing the MQ distributions supporting the reference and the " - "alternate in the {} sample/s. Identical distributions will have a value of 0, larger " - "values away from 0 indicate different distributions.".format(s), - 'Type': 'Float', - 'Number': 'A' - }) - headers.append({ - 'ID': "{}_rsmq_pv".format(s), - 'Description': "Rank sum test comparing the mapping quality distributions between alternate " - "and reference p-value in the {} sample/s. , The null hypothesis is that there is no " - "difference between the distributions".format(s), - 'Type': 'Float', - 'Number': 'A' - }) - headers.append({ - 'ID': "{}_rsbq".format(s), - 'Description': "Rank sum test comparing the base call qualities distributions supporting the reference " - "and the alternate in the {} sample/s. Identical distributions will have a value of 0, " - "larger values away from 0 indicate different distributions.".format(s), - 'Type': 'Float', - 'Number': 'A' - }) - headers.append({ - 'ID': "{}_rsbq_pv".format(s), - 'Description': "Rank sum test comparing the base call qualities distributions between alternate " - "and reference p-value in the {} sample/s. , The null hypothesis is that there is no " - "difference between the distributions".format(s), - 'Type': 'Float', - 'Number': 'A' - }) - headers.append({ - 'ID': "{}_rspos".format(s), - 'Description': "Rank sum test comparing the relative position distributions supporting the reference " - "and the alternate in the {} sample/s. Identical distributions will have a value of 0, " - "larger values away from 0 indicate different distributions.".format(s), - 'Type': 'Float', - 'Number': 'A' - }) - headers.append({ - 'ID': "{}_rspos_pv".format(s), - 'Description': "Rank sum test comparing the relative position distributions between alternate " - "and reference p-value in the {} sample/s. , The null hypothesis is that there is no " - "difference between the distributions".format(s), - 'Type': 'Float', - 'Number': 'A' - }) - - if len(bams) > 1: - for i, bam in enumerate(bams, start=1): - n = os.path.basename(bam).split(".")[0] - headers = headers + [ - {'ID': "{}_af_{}".format(s, i), - 'Description': "Allele frequency for the alternate alleles in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'A'}, - {'ID': "{}_dp_{}".format(s, i), - 'Description': "Depth of coverage in the {} sample {} (independent of alleles)".format(s, n), - 'Type': 'Float', 'Number': '1'}, - {'ID': "{}_ac_{}".format(s, i), - 'Description': "Allele count for the alternate alleles in the {} sample {}".format(s, n), - 'Type': 'Integer', 'Number': 'A'}, - {'ID': "{}_n_{}".format(s, i), - 'Description': "Allele count for ambiguous bases (any IUPAC ambiguity code is counted) " - "in the {} sample {}".format(s, n), - 'Type': 'Integer', 'Number': '1'}, - {'ID': "{}_pu_{}".format(s, i), - 'Description': "Probability of an undetected mutation given the observed supporting " - "reads (AC), the observed total coverage (DP) and the provided tumor " - "purity in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'A'}, - {'ID': "{}_pw_{}".format(s, i), - 'Description': "Power to detect a somatic mutation as described in Absolute " - "given the observed total coverage (DP) " - "and the provided tumor purity and ploidies in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': '1'}, - {'ID': "{}_k_{}".format(s, i), - 'Description': "Minimum number of supporting reads, k, such that the probability of observing " - "k or more non-reference reads due to sequencing error is less than the " - "defined FPR in the {} sample {}".format(s, n), - 'Type': 'Float', - 'Number': '1'}, - {'ID': "{}_bq_{}".format(s, i), - 'Description': "Median base call quality of the reads supporting each allele in " - "the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'R'}, - {'ID': "{}_rsbq_{}".format(s, i), - 'Description': "Rank sum test comparing the base call qualities distributions supporting the " - "reference and the alternate in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'A'}, - {'ID': "{}_rsbq_pv_{}".format(s, i), - 'Description': "Significance for the rank sum test comparing the base call qualities " - "distributions supporting the reference and the alternate " - "in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'A'}, - {'ID': "{}_mq_{}".format(s, i), - 'Description': "Median mapping quality of the reads supporting each allele in " - "the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'R'}, - {'ID': "{}_rsmq_{}".format(s, i), - 'Description': "Rank sum test comparing the mapping qualities distributions supporting the " - "reference and the alternate in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'A'}, - {'ID': "{}_rsmq_pv_{}".format(s, i), - 'Description': "Significance for the rank sum test comparing the mapping qualities " - "distributions supporting the reference and the alternate " - "in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'A'}, - {'ID': "{}_pos_{}".format(s, i), - 'Description': "Median position within the read of the reads supporting each allele in " - "the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'R'}, - {'ID': "{}_rspos_{}".format(s, i), - 'Description': "Rank sum test comparing the position distributions supporting the " - "reference and the alternate in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'A'}, - {'ID': "{}_rspos_pv_{}".format(s, i), - 'Description': "Significance for the rank sum test comparing the position " - "distributions supporting the reference and the alternate " - "in the {} sample {}".format(s, n), - 'Type': 'Float', 'Number': 'A'}, - ] - return headers - - def _write_batch(self, batch): - for v in batch: - self.vcf_writer.write_record(v) - - def _add_stats(self, variant: Variant, metrics_by_bam: dict): - """ - Annotate a single variant using pre-computed metrics. - metrics_by_bam: {(sample, bam_index): CoverageMetrics} - """ - for sample, bams in self.bam_readers.items(): - global_dp = 0 - global_ac = Counter() - global_bq = Counter() - global_mq = Counter() - global_pos = Counter() - global_all_mqs = {} - global_all_bqs = {} - global_all_positions = {} - - for i, bam in enumerate(bams): - coverage_metrics = metrics_by_bam.get((sample, i), EMPTY_METRICS) - - if coverage_metrics is not None: - if len(bams) > 1: - variant.INFO["{}_af_{}".format(sample, i + 1)] = \ - ",".join([str(self._calculate_af(coverage_metrics.ac[alt], coverage_metrics.dp)) - for alt in variant.ALT]) - variant.INFO["{}_ac_{}".format(sample, i + 1)] = \ - ",".join([str(coverage_metrics.ac[alt]) for alt in variant.ALT]) - variant.INFO["{}_n_{}".format(sample, i + 1)] = \ - str(sum([coverage_metrics.ac.get(n, 0) for n in vafator.AMBIGUOUS_BASES])) - variant.INFO["{}_dp_{}".format(sample, i + 1)] = coverage_metrics.dp - variant.INFO["{}_pu_{}".format(sample, i + 1)] = ",".join( - [str(self.power.calculate_power( - ac=coverage_metrics.ac[alt], dp=coverage_metrics.dp, sample=sample, variant=variant - )) for alt in variant.ALT]) - power, k = self.power.calculate_absolute_power( - dp=coverage_metrics.dp, sample=sample, variant=variant) - variant.INFO["{}_pw_{}".format(sample, i + 1)] = str(power) - variant.INFO["{}_k_{}".format(sample, i + 1)] = str(k) - variant.INFO["{}_bq_{}".format(sample, i + 1)] = ",".join( - [str(coverage_metrics.bqs[variant.REF])] + - [str(coverage_metrics.bqs[alt]) for alt in variant.ALT]) - variant.INFO["{}_mq_{}".format(sample, i + 1)] = ",".join( - [str(coverage_metrics.mqs[variant.REF])] + - [str(coverage_metrics.mqs[alt]) for alt in variant.ALT]) - variant.INFO["{}_pos_{}".format(sample, i + 1)] = ",".join( - [str(coverage_metrics.positions[variant.REF])] + - [str(coverage_metrics.positions[alt]) for alt in variant.ALT]) - - pvalues, stats = get_rank_sum_tests(coverage_metrics.all_mqs, variant) - if stats: - variant.INFO["{}_rsmq_{}".format(sample, i + 1)] = ",".join(stats) - variant.INFO["{}_rsmq_pv_{}".format(sample, i + 1)] = ",".join(pvalues) - - pvalues, stats = get_rank_sum_tests(coverage_metrics.all_bqs, variant) - if stats: - variant.INFO["{}_rsbq_{}".format(sample, i + 1)] = ",".join(stats) - variant.INFO["{}_rsbq_pv_{}".format(sample, i + 1)] = ",".join(pvalues) - - pvalues, stats = get_rank_sum_tests(coverage_metrics.all_positions, variant) - if stats: - variant.INFO["{}_rspos_{}".format(sample, i + 1)] = ",".join(stats) - variant.INFO["{}_rspos_pv_{}".format(sample, i + 1)] = ",".join(pvalues) - - global_ac.update(coverage_metrics.ac) - global_bq.update(coverage_metrics.bqs) - global_mq.update(coverage_metrics.mqs) - global_pos.update(coverage_metrics.positions) - global_all_mqs.update(coverage_metrics.all_mqs) - global_all_bqs.update(coverage_metrics.all_bqs) - global_all_positions.update(coverage_metrics.all_positions) - global_dp += coverage_metrics.dp - - variant.INFO["{}_af".format(sample)] = ",".join([str(self._calculate_af(global_ac[alt], global_dp)) for alt in variant.ALT]) - variant.INFO["{}_ac".format(sample)] = ",".join([str(global_ac[alt]) for alt in variant.ALT]) - variant.INFO["{}_n".format(sample)] = str(sum([global_ac.get(n, 0) for n in vafator.AMBIGUOUS_BASES])) - variant.INFO["{}_dp".format(sample)] = global_dp - variant.INFO["{}_eaf".format(sample)] = str(self.power.calculate_expected_vaf( - sample=sample, variant=variant)) - variant.INFO["{}_pu".format(sample)] = ",".join( - [str(self.power.calculate_power(ac=global_ac[alt], dp=global_dp, sample=sample, variant=variant)) - for alt in variant.ALT]) - power, k = self.power.calculate_absolute_power( - dp=global_dp, sample=sample, variant=variant) - variant.INFO["{}_pw".format(sample)] = str(power) - variant.INFO["{}_k".format(sample)] = str(k) - variant.INFO["{}_bq".format(sample)] = ",".join( - [str(global_bq[variant.REF])] + [str(global_bq[alt]) for alt in variant.ALT]) - variant.INFO["{}_mq".format(sample)] = ",".join( - [str(global_mq[variant.REF])] + [str(global_mq[alt]) for alt in variant.ALT]) - variant.INFO["{}_pos".format(sample)] = ",".join( - [str(global_pos[variant.REF])] + [str(global_pos[alt]) for alt in variant.ALT]) - - # rank sum tests require at least one ref and one alt value - pvalues, stats = get_rank_sum_tests(global_all_mqs, variant) - if stats: - variant.INFO["{}_rsmq".format(sample)] = ",".join(stats) - variant.INFO["{}_rsmq_pv".format(sample)] = ",".join(pvalues) - - pvalues, stats = get_rank_sum_tests(global_all_bqs, variant) - if stats: - variant.INFO["{}_rsbq".format(sample)] = ",".join(stats) - variant.INFO["{}_rsbq_pv".format(sample)] = ",".join(pvalues) - - pvalues, stats = get_rank_sum_tests(global_all_positions, variant) - if stats: - variant.INFO["{}_rspos".format(sample)] = ",".join(stats) - variant.INFO["{}_rspos_pv".format(sample)] = ",".join(pvalues) - - def _calculate_af(self, ac, dp): - return round(float(ac) / dp, 5) if dp > 0 else 0.0 - - def run(self): - batch = [] - if self.num_processes > 1: - self._run_parallel(batch) - else: - self._run_serial(batch) - - if batch: - self._write_batch(batch) - - self.vcf_writer.close() - self.vcf.close() - for _, bams in self.bam_readers.items(): - for bam in bams: - bam.close() - - def _run_serial(self, batch): - for chrom, chrom_variants in stream_variants_by_chrom(self.vcf): - all_metrics = self._collect_chrom_metrics(chrom, chrom_variants) - self._annotate_and_batch(chrom_variants, all_metrics, batch) - - def _run_parallel(self, batch): - # variant objects are not picklable — pass only (POS, REF, ALT) tuples to workers, - # keep the actual Variant objects in the main process for annotation and writing - chrom_variants_map = {} - futures = {} - - with ProcessPoolExecutor(max_workers=self.num_processes) as executor: - for chrom, chrom_variants in stream_variants_by_chrom(self.vcf): - chrom_variants_map[chrom] = chrom_variants - variant_tuples = [(v.POS, v.REF, v.ALT[0]) for v in chrom_variants] - futures[executor.submit( - _collect_metrics_worker, - chrom=chrom, - variant_tuples=variant_tuples, - bam_paths=self.bam_paths, - min_base_quality=self.base_call_quality_threshold, - min_mapping_quality=self.mapping_quality_threshold, - include_ambiguous_bases=self.include_ambiguous_bases, - )] = chrom - - # collect in submission order to preserve VCF chromosome order - chrom_results = {futures[f]: f.result() for f in futures} - - for chrom, chrom_variants in chrom_variants_map.items(): - self._annotate_and_batch(chrom_variants, chrom_results[chrom], batch) - - - def _collect_chrom_metrics(self, chrom, chrom_variants): - """Collect metrics for all BAMs for one chromosome in the main process.""" - all_metrics = {} - for sample, bams in self.bam_readers.items(): - for i, bam in enumerate(bams): - all_metrics[(sample, i)] = collect_metrics_for_chrom( - chrom=chrom, - variants=chrom_variants, - bam=bam, - min_base_quality=self.base_call_quality_threshold, - min_mapping_quality=self.mapping_quality_threshold, - include_ambiguous_bases=self.include_ambiguous_bases, - ) - return all_metrics - - def _annotate_and_batch(self, chrom_variants, all_metrics, batch): - """Annotate variants using pre-computed metrics and append to write batch.""" - for variant in chrom_variants: - metrics_by_bam = { - (sample, i): all_metrics[(sample, i)].get( - (variant.POS, variant.REF, variant.ALT[0]), EMPTY_METRICS - ) - for sample, bams in self.bam_readers.items() - for i in range(len(bams)) - } - self._add_stats(variant, metrics_by_bam) - batch.append(variant) - if len(batch) >= BATCH_SIZE: - self._write_batch(batch) - batch.clear() \ No newline at end of file +from collections import Counter +import os +from concurrent.futures import ProcessPoolExecutor + +import pysam +from cyvcf2 import VCF, Writer, Variant +import vafator +import datetime +import json + +from vafator.constants import AMBIGUOUS_BASES, BATCH_SIZE, _HEADER_TEMPLATES, _REPLICATE_HEADER_TEMPLATES +from vafator.ploidies import DEFAULT_PLOIDY +from vafator.rank_sum_test import get_rank_sum_tests +from vafator.power import PowerCalculator, DEFAULT_ERROR_RATE, DEFAULT_FPR +from vafator.pileups import collect_metrics_for_chrom, stream_variants_by_chrom +from vafator.pileup_utils import VariantRecord, EMPTY_METRICS + + +def _collect_metrics_worker( + chrom: str, + variant_tuples: list, + bam_paths: dict, + min_base_quality: int, + min_mapping_quality: int, + include_ambiguous_bases: bool + ) -> dict: + """ + Top-level worker function for ProcessPoolExecutor — must be module-level to be picklable. + Opens its own BAM readers (AlignmentFile objects cannot be shared across processes). + Receives variant data as plain tuples (cyvcf2.Variant objects are not picklable). + + Args: + chrom: chromosome name + variant_tuples: list of (POS, REF, ALT) tuples for variants on this chromosome + bam_paths: {sample: [bam_path, ...]} — picklable BAM file paths + min_base_quality: minimum base call quality threshold + min_mapping_quality: minimum mapping quality threshold + include_ambiguous_bases: whether to include ambiguous bases in depth calculation + + Returns: + {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}} + """ + all_metrics = {} + variants = [VariantRecord(CHROM=chrom, POS=pos, REF=ref, ALT=[alt]) + for pos, ref, alt in variant_tuples] + for sample, bam_files in bam_paths.items(): + for i, bam_path in enumerate(bam_files): + bam = pysam.AlignmentFile(bam_path, "rb") + all_metrics[(sample, i)] = collect_metrics_for_chrom( + chrom=chrom, + variants=variants, + bam=bam, + min_base_quality=min_base_quality, + min_mapping_quality=min_mapping_quality, + include_ambiguous_bases=include_ambiguous_bases, + ) + bam.close() + return all_metrics + + +class Annotator(object): + + def __init__(self, input_vcf: str, output_vcf: str, + input_bams: dict, + purities: dict = {}, + mapping_qual_thr: int = 0, + base_call_qual_thr: int = 29, + tumor_ploidies: dict = {}, + normal_ploidy: int = 2, + fpr: float = DEFAULT_FPR, + error_rate: float = DEFAULT_ERROR_RATE, + include_ambiguous_bases: bool = True, + num_processes: int = 1): + """ + Args: + input_vcf: path to the input VCF file to annotate + output_vcf: path for the annotated output VCF + input_bams: {sample_name: [bam_path, ...]} — one or more BAMs per sample + purities: {sample_name: purity} — tumor purity per sample (default: 1.0) + mapping_qual_thr: minimum mapping quality; reads below this are excluded + base_call_qual_thr: minimum base call quality; bases below this are excluded + tumor_ploidies: {sample_name: PloidyManager} — tumor ploidy per sample (default: 2) + normal_ploidy: normal ploidy for power calculation (default: 2) + fpr: false positive rate for power calculation + error_rate: sequencing error rate for power calculation + include_ambiguous_bases: if True, ambiguous bases (N and IUPAC codes) are counted in DP + num_processes: number of parallel processes for chromosome-level annotation (default: 1) + """ + + self.mapping_quality_threshold = mapping_qual_thr + self.base_call_quality_threshold = base_call_qual_thr + self.purities = purities + self.tumor_ploidies = tumor_ploidies + self.normal_ploidy = normal_ploidy + self.include_ambiguous_bases = include_ambiguous_bases + self.num_processes = num_processes + self.power = PowerCalculator( + normal_ploidy=normal_ploidy, + tumor_ploidies=tumor_ploidies, + purities=purities, + error_rate=error_rate, + fpr=fpr + ) + + self.vcf = VCF(input_vcf) + + self.header = { + "name": "vafator", + "version": vafator.VERSION, + "date": datetime.datetime.now().ctime(), + "timestamp": datetime.datetime.now().timestamp(), + "input_vcf": os.path.abspath(input_vcf), + "output_vcf": os.path.abspath(output_vcf), + "bams": ";".join( + ["{}:{}".format(s, ",".join([os.path.abspath(b) for b in bams])) + for s, bams in input_bams.items()] + ), + "mapping_quality_threshold": mapping_qual_thr, + "base_call_quality_threshold": base_call_qual_thr, + "purities": ";".join(["{}:{}".format(s, p) for s, p in purities.items()]), + "normal_ploidy": normal_ploidy, + "tumor_ploidy": ";".join( + ["{}:{}".format(s, p.report_value) for s, p in tumor_ploidies.items()] + ) if tumor_ploidies else DEFAULT_PLOIDY, + "include_ambiguous_bases": include_ambiguous_bases, + } + self.vcf.add_to_header("##vafator_command_line={}".format(json.dumps(self.header))) + + for a in Annotator._get_headers(input_bams): + self.vcf.add_info_to_header(a) + self.vcf_writer = Writer(output_vcf, self.vcf) + + self.bam_paths = input_bams + self.bam_readers = {s: [pysam.AlignmentFile(b, "rb") for b in bams] for s, bams in input_bams.items()} + + def run(self) -> None: + """Run the annotation pipeline over all variants in the input VCF, + writing annotated records to the output VCF.""" + batch = [] + if self.num_processes > 1: + self._run_parallel(batch) + else: + self._run_serial(batch) + if batch: + self._write_batch(batch) + self.vcf_writer.close() + self.vcf.close() + for _, bams in self.bam_readers.items(): + for bam in bams: + bam.close() + + def _run_serial(self, batch: list) -> None: + """Annotate variants chromosome by chromosome in the main process.""" + for chrom, chrom_variants in stream_variants_by_chrom(self.vcf): + all_metrics = self._collect_chrom_metrics(chrom, chrom_variants) + self._annotate_and_batch(chrom_variants, all_metrics, batch) + + def _run_parallel(self, batch: list) -> None: + """Annotate variants using a process pool — one worker per chromosome. + Results are collected and written in original VCF chromosome order.""" + chrom_variants_map = {} + futures = {} + with ProcessPoolExecutor(max_workers=self.num_processes) as executor: + for chrom, chrom_variants in stream_variants_by_chrom(self.vcf): + chrom_variants_map[chrom] = chrom_variants + variant_tuples = [(v.POS, v.REF, v.ALT[0]) for v in chrom_variants] + futures[executor.submit( + _collect_metrics_worker, + chrom=chrom, variant_tuples=variant_tuples, bam_paths=self.bam_paths, + min_base_quality=self.base_call_quality_threshold, + min_mapping_quality=self.mapping_quality_threshold, + include_ambiguous_bases=self.include_ambiguous_bases, + )] = chrom + chrom_results = {futures[f]: f.result() for f in futures} + for chrom, chrom_variants in chrom_variants_map.items(): + self._annotate_and_batch(chrom_variants, chrom_results[chrom], batch) + + def _collect_chrom_metrics(self, chrom: str, chrom_variants: list) -> dict: + """Collect metrics for all BAMs for one chromosome in the main process. + + Args: + chrom: chromosome name + chrom_variants: list of cyvcf2 Variant objects on this chromosome + + Returns: + {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}} + """ + all_metrics = {} + for sample, bams in self.bam_readers.items(): + for i, bam in enumerate(bams): + all_metrics[(sample, i)] = collect_metrics_for_chrom( + chrom=chrom, variants=chrom_variants, bam=bam, + min_base_quality=self.base_call_quality_threshold, + min_mapping_quality=self.mapping_quality_threshold, + include_ambiguous_bases=self.include_ambiguous_bases, + ) + return all_metrics + + def _annotate_and_batch(self, chrom_variants: list, all_metrics: dict, batch: list) -> None: + """Annotate variants using pre-computed metrics and append to write batch. + Flushes the batch to disk when it reaches BATCH_SIZE. + + Args: + chrom_variants: list of cyvcf2 Variant objects to annotate + all_metrics: {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}} + batch: accumulator list for annotated variants pending write + """ + for variant in chrom_variants: + metrics_by_bam = { + (sample, i): all_metrics[(sample, i)].get( + (variant.POS, variant.REF, variant.ALT[0]), EMPTY_METRICS) + for sample, bams in self.bam_readers.items() + for i in range(len(bams)) + } + self._add_stats(variant, metrics_by_bam) + batch.append(variant) + if len(batch) >= BATCH_SIZE: + self._write_batch(batch) + batch.clear() + + + def _add_stats(self, variant: Variant, metrics_by_bam: dict) -> None: + """Annotate a single variant using pre-computed metrics. + + Args: + variant: the cyvcf2 Variant to annotate in place + metrics_by_bam: {(sample, bam_idx): CoverageMetrics} + """ + for sample, bams in self.bam_readers.items(): + global_dp = 0 + global_ac = Counter() + global_bq = Counter() + global_mq = Counter() + global_pos = Counter() + global_all_mqs = {} + global_all_bqs = {} + global_all_positions = {} + + for i in range(len(bams)): + metrics = metrics_by_bam.get((sample, i), EMPTY_METRICS) + if metrics is not None: + if len(bams) > 1: + self._annotate_replicate(variant, sample, i, metrics) + global_ac.update(metrics.ac) + global_bq.update(metrics.bqs) + global_mq.update(metrics.mqs) + global_pos.update(metrics.positions) + global_all_mqs.update(metrics.all_mqs) + global_all_bqs.update(metrics.all_bqs) + global_all_positions.update(metrics.all_positions) + global_dp += metrics.dp + + self._annotate_sample(variant, sample, global_ac, global_dp, global_bq, + global_mq, global_pos, global_all_mqs, global_all_bqs, + global_all_positions) + + def _annotate_replicate(self, v: Variant, s: str, i: int, m) -> None: + """Write per-replicate annotations — only called when multiple BAMs are provided for a sample. + + Args: + v: the cyvcf2 Variant being annotated + s: sample name (e.g. 'tumor', 'normal') + i: 0-based index of the BAM within the sample's BAM list + m: pre-computed CoverageMetrics for this variant in this BAM + """ + p = "{}_{}".format(s, i + 1) + v.INFO["{}_af".format(p)] = ",".join([str(self._calculate_af(m.ac[alt], m.dp)) for alt in v.ALT]) + v.INFO["{}_ac".format(p)] = ",".join([str(m.ac[alt]) for alt in v.ALT]) + v.INFO["{}_n".format(p)] = str(sum(m.ac.get(b, 0) for b in AMBIGUOUS_BASES)) + v.INFO["{}_dp".format(p)] = m.dp + v.INFO["{}_pu".format(p)] = ",".join([str(self.power.calculate_power(ac=m.ac[alt], dp=m.dp, sample=s, variant=v)) for alt in v.ALT]) + + power, k = self.power.calculate_absolute_power(dp=m.dp, sample=s, variant=v) + v.INFO["{}_pw".format(p)] = str(power) + v.INFO["{}_k".format(p)] = str(k) + v.INFO["{}_bq".format(p)] = ",".join([str(m.bqs[v.REF])] + [str(m.bqs[alt]) for alt in v.ALT]) + v.INFO["{}_mq".format(p)] = ",".join([str(m.mqs[v.REF])] + [str(m.mqs[alt]) for alt in v.ALT]) + v.INFO["{}_pos".format(p)] = ",".join([str(m.positions[v.REF])] + [str(m.positions[alt]) for alt in v.ALT]) + for key, tag in [(m.all_mqs, "rsmq"), (m.all_bqs, "rsbq"), (m.all_positions, "rspos")]: + pvalues, stats = get_rank_sum_tests(key, v) + if stats: + v.INFO["{}_{}_{}".format(s, tag, i + 1)] = ",".join(stats) + v.INFO["{}_{}_pv_{}".format(s, tag, i + 1)] = ",".join(pvalues) + + def _annotate_sample(self, v: Variant, s: str, gac: Counter, gdp: int, + gbq: Counter, gmq: Counter, gpos: Counter, + gallmq: dict, gallbq: dict, gallpos: dict) -> None: + """Write aggregate annotations for a sample, combining metrics across all replicates. + + Args: + v: the cyvcf2 Variant being annotated + s: sample name (e.g. 'tumor', 'normal') + gac: allele counts summed across all BAMs + gdp: total depth summed across all BAMs + gbq: median BQ per allele, summed across all BAMs + gmq: median MQ per allele, summed across all BAMs + gpos: median read position per allele, summed across all BAMs + gallmq: MQ distributions per allele across all BAMs + gallbq: BQ distributions per allele across all BAMs + gallpos: read position distributions per allele across all BAMs + """ + v.INFO["{}_af".format(s)] = ",".join([str(self._calculate_af(gac[alt], gdp)) for alt in v.ALT]) + v.INFO["{}_ac".format(s)] = ",".join([str(gac[alt]) for alt in v.ALT]) + v.INFO["{}_n".format(s)] = str(sum(gac.get(b, 0) for b in AMBIGUOUS_BASES)) + v.INFO["{}_dp".format(s)] = gdp + v.INFO["{}_eaf".format(s)] = str(self.power.calculate_expected_vaf(sample=s, variant=v)) + v.INFO["{}_pu".format(s)] = ",".join([str(self.power.calculate_power(ac=gac[alt], dp=gdp, sample=s, variant=v)) for alt in v.ALT]) + + power, k = self.power.calculate_absolute_power(dp=gdp, sample=s, variant=v) + v.INFO["{}_pw".format(s)] = str(power) + v.INFO["{}_k".format(s)] = str(k) + v.INFO["{}_bq".format(s)] = ",".join([str(gbq[v.REF])] + [str(gbq[alt]) for alt in v.ALT]) + v.INFO["{}_mq".format(s)] = ",".join([str(gmq[v.REF])] + [str(gmq[alt]) for alt in v.ALT]) + v.INFO["{}_pos".format(s)] = ",".join([str(gpos[v.REF])] + [str(gpos[alt]) for alt in v.ALT]) + for distributions, tag in [(gallmq, "rsmq"), (gallbq, "rsbq"), (gallpos, "rspos")]: + pvalues, stats = get_rank_sum_tests(distributions, v) + if stats: + v.INFO["{}_{}".format(s, tag)] = ",".join(stats) + v.INFO["{}_{}_pv".format(s, tag)] = ",".join(pvalues) + + def _calculate_af(self, ac: int, dp: int) -> float: + """Return allele frequency, or 0.0 if depth is zero. + + Args: + ac: allele count for this alternate allele + dp: total depth of coverage + + Returns: + allele frequency rounded to 5 decimal places, or 0.0 if dp is zero + """ + return round(float(ac) / dp, 5) if dp > 0 else 0.0 + + def _write_batch(self, batch: list) -> None: + """Write a batch of annotated variants to the output VCF. + + Args: + batch: list of cyvcf2 Variant objects to write + """ + for v in batch: + self.vcf_writer.write_record(v) + + @staticmethod + def _get_headers(input_bams: dict) -> list: + """Build the list of INFO header entries for all samples and replicates. + + Args: + input_bams: {sample_name: [bam_path, ...]} + + Returns: + list of header dicts suitable for cyvcf2's add_info_to_header + """ + headers = [] + for s, bams in input_bams.items(): + for suffix, description, type, number in _HEADER_TEMPLATES: + headers.append(Annotator._make_header(suffix, description, type, number, sample=s)) + if len(bams) > 1: + for i, bam in enumerate(bams, start=1): + n = os.path.basename(bam).split(".")[0] + sample_label = "{} {}".format(s, n) + for suffix, description, type, number in _REPLICATE_HEADER_TEMPLATES: + headers.append(Annotator._make_header(suffix, description, type, number, + sample=sample_label, index=i)) + return headers + + @staticmethod + def _make_header(suffix: str, description: str, type: str, number: str, + sample: str, index: int = None) -> dict: + """Build a single INFO header dict for cyvcf2's add_info_to_header. + + Args: + suffix: annotation suffix (e.g. 'af', 'dp') + description: description template with {sample} placeholder + type: VCF type string ('Float', 'Integer', 'String') + number: VCF number string ('A', 'R', '1', etc.) + sample: sample name to substitute into ID and description + index: optional replicate index appended to the ID + + Returns: + dict with keys ID, Description, Type, Number suitable for cyvcf2's add_info_to_header + """ + id = "{}_{}".format(sample, suffix) + if index is not None: + id = "{}_{}".format(id, index) + return {'ID': id, 'Description': description.format(sample=sample), 'Type': type, 'Number': number} \ No newline at end of file diff --git a/vafator/constants.py b/vafator/constants.py new file mode 100755 index 0000000..bfad7aa --- /dev/null +++ b/vafator/constants.py @@ -0,0 +1,30 @@ +# Annotation batch size for VCF writing +BATCH_SIZE = 10000 + +# IUPAC ambiguity codes treated as ambiguous bases in pileup metrics +AMBIGUOUS_BASES = ['N', 'M', 'R', 'W', 'S', 'Y', 'K', 'V', 'H', 'D', 'B'] + +# Header templates: (suffix, description, type, number) +# {sample} is substituted at generation time +_HEADER_TEMPLATES = [ + ("af", "Allele frequency for the alternate alleles in the {sample} sample/s", "Float", "A"), + ("dp", "Total depth of coverage in the {sample} sample/s (independent of alleles)", "Float", "1"), + ("ac", "Allele count for the alternate alleles in the {sample} sample/s", "Integer", "A"), + ("n", "Allele count for ambiguous bases (any IUPAC ambiguity code is counted) in the {sample} sample/s", "Integer", "1"), + ("pu", "Probability of an undetected mutation given the observed supporting reads (AC), the observed total coverage (DP) and the provided tumor purity in the {sample} sample/s", "Float", "A"), + ("pw", "Power to detect a somatic mutation as described in Absolute given the observed total coverage (DP) and the provided tumor purity and ploidies in the {sample} sample/s", "Float", "1"), + ("k", "Minimum number of supporting reads, k, such that the probability of observing k or more non-reference reads due to sequencing error is less than the defined FPR in the {sample} sample/s", "Float", "1"), + ("eaf", "Expected VAF considering the purity and ploidy/copy number in the {sample} sample/s", "Float", "1"), + ("bq", "Median base call quality of the reads supporting each allele in the {sample} sample/s", "Float", "R"), + ("mq", "Median mapping quality of the reads supporting each allele in the {sample} sample/s", "Float", "R"), + ("pos", "Median position within the read of the reads supporting each allele in the {sample} sample/s", "Float", "R"), + ("rsmq", "Rank sum test comparing the MQ distributions supporting the reference and the alternate in the {sample} sample/s", "Float", "A"), + ("rsmq_pv", "Rank sum test p-value for MQ distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", "Float", "A"), + ("rsbq", "Rank sum test comparing the BQ distributions supporting the reference and the alternate in the {sample} sample/s", "Float", "A"), + ("rsbq_pv", "Rank sum test p-value for BQ distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", "Float", "A"), + ("rspos", "Rank sum test comparing the position distributions supporting the reference and the alternate in the {sample} sample/s", "Float", "A"), + ("rspos_pv", "Rank sum test p-value for position distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", "Float", "A"), +] + +# eaf is not produced per-replicate +_REPLICATE_HEADER_TEMPLATES = [t for t in _HEADER_TEMPLATES if t[0] != "eaf"] \ No newline at end of file diff --git a/vafator/pileup_utils.py b/vafator/pileup_utils.py new file mode 100755 index 0000000..1048e7c --- /dev/null +++ b/vafator/pileup_utils.py @@ -0,0 +1,111 @@ +from collections import Counter +from dataclasses import dataclass +from math import nan +from typing import List, Dict + +import numpy as np + + +@dataclass +class VariantRecord: + """Lightweight, picklable variant representation used by pileup workers. + Mirrors the cyvcf2.Variant fields accessed by pileup and metrics functions. + + Attributes: + CHROM: chromosome name + POS: 1-based variant position + REF: reference allele + ALT: list of alternate alleles + """ + CHROM: str + POS: int + REF: str + ALT: List[str] + + +@dataclass +class CoverageMetrics: + """Pileup metrics computed for a single variant in a single BAM. + + Attributes: + ac: allele counts per base, including the reference + dp: total depth of coverage + bqs: median base call quality per allele, including the reference + mqs: median mapping quality per allele, including the reference + positions: median read position per allele, including the reference + all_bqs: full base call quality distribution per allele + all_mqs: full mapping quality distribution per allele + all_positions: full read position distribution per allele + """ + ac: dict + dp: int + bqs: dict = None + mqs: dict = None + positions: dict = None + all_bqs: dict = None + all_mqs: dict = None + all_positions: dict = None + + +EMPTY_METRICS = CoverageMetrics( + ac=Counter(), dp=0, bqs=Counter(), mqs=Counter(), positions=Counter(), + all_bqs={}, all_mqs={}, all_positions={} +) + + +def aggregate_list_per_base(bases: List[str], values: list) -> Dict[str, list]: + """Group a list of values by their corresponding base. + + Args: + bases: list of base characters (e.g. ['A', 'T', 'A', '']) + values: list of numeric values parallel to bases + + Returns: + dict mapping each base to a list of its associated values + """ + aggregated_values = {} + for b, v in zip(bases, values): + if b not in aggregated_values: + aggregated_values[b] = [] + aggregated_values[b].append(v) + return aggregated_values + + +def safe_median(values: list) -> float: + """Return the median of a list, or nan if the list is empty. + Avoids numpy RuntimeWarning raised by np.median on empty arrays. + + Args: + values: list of numeric values + + Returns: + median as float, or nan if values is empty + """ + return float(np.median(values)) if values else nan + + +def is_snp(variant) -> bool: + """Return True if the variant is a single nucleotide polymorphism. + + Args: + variant: any object with REF and ALT attributes (Variant or VariantRecord) + """ + return len(variant.REF) == 1 and len(variant.ALT[0]) == 1 + + +def is_insertion(variant) -> bool: + """Return True if the variant is an insertion. + + Args: + variant: any object with REF and ALT attributes (Variant or VariantRecord) + """ + return len(variant.REF) == 1 and len(variant.ALT[0]) > 1 + + +def is_deletion(variant) -> bool: + """Return True if the variant is a deletion. + + Args: + variant: any object with REF and ALT attributes (Variant or VariantRecord) + """ + return len(variant.ALT[0]) == 1 and len(variant.REF) > 1 \ No newline at end of file diff --git a/vafator/pileups.py b/vafator/pileups.py index b6afb9f..c23041f 100755 --- a/vafator/pileups.py +++ b/vafator/pileups.py @@ -1,67 +1,75 @@ -import numpy as np from collections import Counter, defaultdict -from cyvcf2 import Variant -from dataclasses import dataclass -from math import nan -from pysam.libcalignmentfile import IteratorColumnRegion, AlignmentFile from typing import Union, List, Dict, Iterator, Tuple -from vafator import AMBIGUOUS_BASES - -@dataclass -class VariantRecord: - """Lightweight, picklable variant representation used by pileup workers. - Mirrors the cyvcf2.Variant fields accessed by pileup and metrics functions.""" - CHROM: str - POS: int - REF: str - ALT: List[str] - - -def is_snp(variant: Variant): - return len(variant.REF) == 1 and len(variant.ALT[0]) == 1 - - -def is_insertion(variant: Variant): - return len(variant.REF) == 1 and len(variant.ALT[0]) > 1 - -def is_deletion(variant: Variant): - return len(variant.ALT[0]) == 1 and len(variant.REF) > 1 +from cyvcf2 import Variant +from pysam.libcalignmentfile import IteratorColumnRegion, AlignmentFile +from vafator.constants import AMBIGUOUS_BASES +from vafator.pileup_utils import * def get_variant_pileup( variant: Union[Variant, VariantRecord], bam: AlignmentFile, - min_base_quality, min_mapping_quality) -> IteratorColumnRegion: - """Single-variant pileup, kept for backwards compatibility and tests.""" + min_base_quality: int, min_mapping_quality: int) -> IteratorColumnRegion: + """Open a pileup iterator at a single variant position. + Kept for backwards compatibility and use in tests. + + Args: + variant: variant to query (cyvcf2 Variant or VariantRecord) + bam: open pysam AlignmentFile + min_base_quality: minimum base call quality; bases below this are excluded + min_mapping_quality: minimum mapping quality; reads below this are excluded + + Returns: + pysam IteratorColumnRegion over the variant position + """ position = variant.POS - return bam.pileup(contig=variant.CHROM, start=position - 1, stop=position, - truncate=True, - max_depth=1000000, - min_base_quality=min_base_quality, - min_mapping_quality=min_mapping_quality, - stepper='samtools', - ) + return bam.pileup( + contig=variant.CHROM, + start=position - 1, + stop=position, + truncate=True, + max_depth=1000000, + min_base_quality=min_base_quality, + min_mapping_quality=min_mapping_quality, + stepper='samtools' + ) def get_region_pileup(chrom: str, start: int, end: int, bam: AlignmentFile, - min_base_quality, min_mapping_quality): - """ - Opens a single pileup iterator spanning a whole region (e.g. one chromosome). - start is 0-based inclusive, end is 1-based exclusive (last variant POS). + min_base_quality: int, min_mapping_quality: int): + """Open a single pileup iterator spanning a genomic region. + + Args: + chrom: chromosome name + start: 0-based inclusive start position + end: 1-based exclusive end position (last variant POS) + bam: open pysam AlignmentFile + min_base_quality: minimum base call quality; bases below this are excluded + min_mapping_quality: minimum mapping quality; reads below this are excluded + + Returns: + pysam pileup iterator over the region """ - return bam.pileup(contig=chrom, start=start, stop=end, - truncate=True, - max_depth=1000000, - min_base_quality=min_base_quality, - min_mapping_quality=min_mapping_quality, - stepper='samtools', - ) + return bam.pileup( + contig=chrom, + start=start, + stop=end, + truncate=True, + max_depth=1000000, + min_base_quality=min_base_quality, + min_mapping_quality=min_mapping_quality, + stepper='samtools' + ) def stream_variants_by_chrom(vcf) -> Iterator[Tuple[str, List[Variant]]]: - """ - Yields (chrom, [variants]) one chromosome at a time. - Only one chromosome's variants are held in memory at once. + """Yield variants grouped by chromosome, one chromosome at a time. + + Args: + vcf: open cyvcf2 VCF iterator + + Returns: + iterator of (chrom, [variants]) tuples """ current_chrom = None current_variants = [] @@ -83,38 +91,45 @@ def collect_metrics_for_chrom( bam: AlignmentFile, min_base_quality: int, min_mapping_quality: int, - include_ambiguous_bases: bool = False) -> Dict[Tuple, 'CoverageMetrics']: - """ - Opens ONE pileup iterator over the entire chromosome region covered by variants. - Metrics are computed IMMEDIATELY for each pileup column while it is still valid — - avoids segfaults from storing PileupColumn objects after the iterator advances. - - Returns {(pos, REF, ALT[0]): CoverageMetrics}. + include_ambiguous_bases: bool = False) -> Dict[Tuple, CoverageMetrics]: + """Compute pileup metrics for all variants on a chromosome using a single pileup iterator. + + Metrics are computed immediately for each pileup column while it is still valid — + avoids segfaults from storing PileupColumn C objects after the iterator advances. + + Args: + chrom: chromosome name + variants: list of variants on this chromosome (must be sorted by position) + bam: open pysam AlignmentFile + min_base_quality: minimum base call quality threshold + min_mapping_quality: minimum mapping quality threshold + include_ambiguous_bases: if True, ambiguous bases are counted in depth + + Returns: + {(pos, REF, ALT[0]): CoverageMetrics} for each variant with read support """ if not variants: return {} - # index variants by 1-based position for O(1) lookup during streaming - variants_by_pos: Dict[int, List[Variant]] = defaultdict(list) + variants_by_pos: Dict[int, List] = defaultdict(list) for v in variants: variants_by_pos[v.POS].append(v) start = variants[0].POS - 1 # 0-based inclusive end = variants[-1].POS # exclusive end for pysam - results: Dict[Tuple, CoverageMetrics] = {} for pileup_col in get_region_pileup( - chrom=chrom, start=start, end=end, - bam=bam, - min_base_quality=min_base_quality, - min_mapping_quality=min_mapping_quality, + chrom=chrom, + start=start, + end=end, + bam=bam, + min_base_quality=min_base_quality, + min_mapping_quality=min_mapping_quality ): ref_pos = pileup_col.reference_pos + 1 # convert to 1-based if ref_pos not in variants_by_pos: continue - - # compute metrics NOW while pileup_col is still valid in C memory for variant in variants_by_pos[ref_pos]: metrics = _get_metrics_from_column(variant, pileup_col, include_ambiguous_bases) if metrics is not None: @@ -123,35 +138,18 @@ def collect_metrics_for_chrom( return results -@dataclass -class CoverageMetrics: - # number supporting reads of each base, including the reference - ac: dict - # total depth of coverage - dp: int - # median base call quality of each base, including the reference - bqs: dict = None - # median mapping quality of each alternate base, including the reference - mqs: dict = None - # median position within the read of each alternate base, including the reference - positions: dict = None - # base call quality distribution of each base, including the reference - all_bqs: dict = None - # mapping quality distribution of each base, including the reference - all_mqs: dict = None - # position within the read distribution of each base, including the reference - all_positions: dict = None - - -EMPTY_METRICS = CoverageMetrics( - ac=Counter(), dp=0, bqs=Counter(), mqs=Counter(), positions=Counter(), - all_bqs={}, all_mqs={}, all_positions={} -) - - -def _get_metrics_from_column(variant: Variant, pileup_col, - include_ambiguous_bases=False) -> 'CoverageMetrics': - """Dispatch to the right metrics function based on variant type.""" +def _get_metrics_from_column(variant, pileup_col, + include_ambiguous_bases: bool = True) -> CoverageMetrics: + """Dispatch pileup metrics computation based on variant type. + + Args: + variant: Variant or VariantRecord being evaluated + pileup_col: pysam PileupColumn at the variant position + include_ambiguous_bases: if True, ambiguous bases are counted in depth + + Returns: + CoverageMetrics for this variant, or None if the variant type is not supported + """ if is_snp(variant): return _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases) elif is_insertion(variant): @@ -161,11 +159,18 @@ def _get_metrics_from_column(variant: Variant, pileup_col, return None -def _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases=False) -> CoverageMetrics: - bases = [] - qualities = [] - mapping_qualities = [] - query_positions = [] +def _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases: bool = True) -> CoverageMetrics: + """Compute SNV metrics from a pileup column. + Deletions at the position are represented as empty string bases and included in depth. + + Args: + pileup_col: pysam PileupColumn at the SNV position + include_ambiguous_bases: if True, IUPAC ambiguous bases are counted in depth + + Returns: + CoverageMetrics with ac, dp, bqs, mqs, positions and their distributions + """ + bases, qualities, mapping_qualities, query_positions = [], [], [], [] for read in pileup_col.pileups: if read.is_refskip: @@ -203,7 +208,18 @@ def _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases=False) -> C ) -def _get_insertion_metrics_from_column(variant: Variant, pileup_col) -> CoverageMetrics: +def _get_insertion_metrics_from_column(variant, pileup_col) -> CoverageMetrics: + """Compute insertion metrics from a pileup column. + Checks both insertion length and sequence to count supporting reads. + Reads with no indel at this position are counted as reference support. + + Args: + variant: Variant or VariantRecord with REF and ALT defining the insertion + pileup_col: pysam PileupColumn at the insertion position + + Returns: + CoverageMetrics with ac, dp, mqs, positions (bqs is empty for indels) + """ ac = {alt.upper(): 0 for alt in variant.ALT} mq = {alt.upper(): [] for alt in variant.ALT} mq[variant.REF] = [] @@ -251,7 +267,18 @@ def _get_insertion_metrics_from_column(variant: Variant, pileup_col) -> Coverage ) -def _get_deletion_metrics_from_column(variant: Variant, pileup_col) -> CoverageMetrics: +def _get_deletion_metrics_from_column(variant, pileup_col) -> CoverageMetrics: + """Compute deletion metrics from a pileup column. + Checks both deletion length and position via CIGAR traversal to count supporting reads. + Reads with no indel at this position are counted as reference support. + + Args: + variant: Variant or VariantRecord with REF and ALT defining the deletion + pileup_col: pysam PileupColumn at the deletion position + + Returns: + CoverageMetrics with ac, dp, mqs, positions (bqs is empty for indels) + """ ac = {alt.upper(): 0 for alt in variant.ALT} mq = {alt.upper(): [] for alt in variant.ALT} mq[variant.REF] = [] @@ -292,17 +319,4 @@ def _get_deletion_metrics_from_column(variant: Variant, pileup_col) -> CoverageM all_mqs={k: l for k, l in mq.items()}, all_positions={k: l for k, l in pos.items()}, all_bqs=Counter() - ) - -def aggregate_list_per_base(bases, values) -> dict: - aggregated_values = {} - for b, v in zip(bases, values): - if b not in aggregated_values: - aggregated_values[b] = [] - aggregated_values[b].append(v) - return aggregated_values - - -def safe_median(values) -> float: - """Return median of values, or 0.0 for empty lists (avoids numpy RuntimeWarning).""" - return float(np.median(values)) if values else nan + ) \ No newline at end of file From e198540523fd3e5b624fa337a17be3a544a08ad5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Fri, 20 Mar 2026 13:24:24 +0100 Subject: [PATCH 20/32] Fix VCF INFO field error --- vafator/annotator.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/vafator/annotator.py b/vafator/annotator.py index d3a381e..f0cc2fd 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -263,24 +263,24 @@ def _annotate_replicate(self, v: Variant, s: str, i: int, m) -> None: i: 0-based index of the BAM within the sample's BAM list m: pre-computed CoverageMetrics for this variant in this BAM """ - p = "{}_{}".format(s, i + 1) - v.INFO["{}_af".format(p)] = ",".join([str(self._calculate_af(m.ac[alt], m.dp)) for alt in v.ALT]) - v.INFO["{}_ac".format(p)] = ",".join([str(m.ac[alt]) for alt in v.ALT]) - v.INFO["{}_n".format(p)] = str(sum(m.ac.get(b, 0) for b in AMBIGUOUS_BASES)) - v.INFO["{}_dp".format(p)] = m.dp - v.INFO["{}_pu".format(p)] = ",".join([str(self.power.calculate_power(ac=m.ac[alt], dp=m.dp, sample=s, variant=v)) for alt in v.ALT]) - + n = i + 1 + v.INFO["{}_af_{}".format(s, n)] = ",".join([str(self._calculate_af(m.ac[alt], m.dp)) for alt in v.ALT]) + v.INFO["{}_ac_{}".format(s, n)] = ",".join([str(m.ac[alt]) for alt in v.ALT]) + v.INFO["{}_n_{}".format(s, n)] = str(sum(m.ac.get(b, 0) for b in AMBIGUOUS_BASES)) + v.INFO["{}_dp_{}".format(s, n)] = m.dp + v.INFO["{}_pu_{}".format(s, n)] = ",".join([str(self.power.calculate_power(ac=m.ac[alt], dp=m.dp, sample=s, variant=v)) for alt in v.ALT]) + power, k = self.power.calculate_absolute_power(dp=m.dp, sample=s, variant=v) - v.INFO["{}_pw".format(p)] = str(power) - v.INFO["{}_k".format(p)] = str(k) - v.INFO["{}_bq".format(p)] = ",".join([str(m.bqs[v.REF])] + [str(m.bqs[alt]) for alt in v.ALT]) - v.INFO["{}_mq".format(p)] = ",".join([str(m.mqs[v.REF])] + [str(m.mqs[alt]) for alt in v.ALT]) - v.INFO["{}_pos".format(p)] = ",".join([str(m.positions[v.REF])] + [str(m.positions[alt]) for alt in v.ALT]) + v.INFO["{}_pw_{}".format(s, n)] = str(power) + v.INFO["{}_k_{}".format(s, n)] = str(k) + v.INFO["{}_bq_{}".format(s, n)] = ",".join([str(m.bqs[v.REF])] + [str(m.bqs[alt]) for alt in v.ALT]) + v.INFO["{}_mq_{}".format(s, n)] = ",".join([str(m.mqs[v.REF])] + [str(m.mqs[alt]) for alt in v.ALT]) + v.INFO["{}_pos_{}".format(s, n)] = ",".join([str(m.positions[v.REF])] + [str(m.positions[alt]) for alt in v.ALT]) for key, tag in [(m.all_mqs, "rsmq"), (m.all_bqs, "rsbq"), (m.all_positions, "rspos")]: pvalues, stats = get_rank_sum_tests(key, v) if stats: - v.INFO["{}_{}_{}".format(s, tag, i + 1)] = ",".join(stats) - v.INFO["{}_{}_pv_{}".format(s, tag, i + 1)] = ",".join(pvalues) + v.INFO["{}_{}_{}".format(s, tag, n)] = ",".join(stats) + v.INFO["{}_{}_pv_{}".format(s, tag, n)] = ",".join(pvalues) def _annotate_sample(self, v: Variant, s: str, gac: Counter, gdp: int, gbq: Counter, gmq: Counter, gpos: Counter, From 5e3344deacd557bf42ddc2aa5f2b1e03103414df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Fri, 20 Mar 2026 13:36:22 +0100 Subject: [PATCH 21/32] Fix wrong tag --- vafator/annotator.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vafator/annotator.py b/vafator/annotator.py index f0cc2fd..781c355 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -356,10 +356,12 @@ def _get_headers(input_bams: dict) -> list: if len(bams) > 1: for i, bam in enumerate(bams, start=1): n = os.path.basename(bam).split(".")[0] - sample_label = "{} {}".format(s, n) for suffix, description, type, number in _REPLICATE_HEADER_TEMPLATES: - headers.append(Annotator._make_header(suffix, description, type, number, - sample=sample_label, index=i)) + headers.append( + Annotator._make_header( + suffix, description, type, number, sample=s, index=i + ) + ) return headers @staticmethod From 6db73d6b8c6f9b82c331d909d96d2da2d21bc106 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Fri, 20 Mar 2026 14:34:01 +0100 Subject: [PATCH 22/32] bump version --- setup.cfg | 2 +- vafator/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 4f08792..bc22ccb 100755 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = VAFator -version = 3.0.0 +version = 3.1.0 description = Annotate variants in a VCF file with technical annotations from one or more BAMs description-file = README.md long_description = file: README.md diff --git a/vafator/__init__.py b/vafator/__init__.py index 136a680..f52972e 100755 --- a/vafator/__init__.py +++ b/vafator/__init__.py @@ -1,2 +1,2 @@ -VERSION = '3.0.0' +VERSION = '3.1.0' From 12f017af36983292714a4644809224b7d97dcb25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Fri, 20 Mar 2026 14:46:36 +0100 Subject: [PATCH 23/32] Readd test that was accidentally deleted --- vafator/tests/test_power_calculator.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vafator/tests/test_power_calculator.py b/vafator/tests/test_power_calculator.py index 2e120e7..c079a83 100644 --- a/vafator/tests/test_power_calculator.py +++ b/vafator/tests/test_power_calculator.py @@ -80,6 +80,12 @@ def test_varying_ploidy(self): self.assertLess( power1.calculate_power(dp=10, ac=2, sample='tumor', variant=None), power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None)) + + power3 = PowerCalculator( + tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=6.0)}, purities={'tumor': 0.8}) + self.assertLess( + power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None), + power3.calculate_power(dp=10, ac=2, sample='tumor', variant=None)) def test_local_copy_numbers(self): input_bed = pkg_resources.resource_filename(__name__, "resources/test_copy_numbers.bed") From 4b6a272e3c4314539aaaa98cacdb2c866817930e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Wed, 1 Apr 2026 17:20:51 +0200 Subject: [PATCH 24/32] improve code quality based on Codacy output --- vafator/annotator.py | 26 +++++++++++++------------- vafator/command_line.py | 2 +- vafator/pileups.py | 20 ++++++++++---------- vafator/power.py | 1 - vafator/tests/test_pileups.py | 1 - vafator/tests/test_power_calculator.py | 2 +- 6 files changed, 25 insertions(+), 27 deletions(-) diff --git a/vafator/annotator.py b/vafator/annotator.py index 781c355..48008e4 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -95,10 +95,10 @@ def __init__(self, input_vcf: str, output_vcf: str, self.include_ambiguous_bases = include_ambiguous_bases self.num_processes = num_processes self.power = PowerCalculator( - normal_ploidy=normal_ploidy, - tumor_ploidies=tumor_ploidies, + normal_ploidy=normal_ploidy, + tumor_ploidies=tumor_ploidies, purities=purities, - error_rate=error_rate, + error_rate=error_rate, fpr=fpr ) @@ -129,7 +129,7 @@ def __init__(self, input_vcf: str, output_vcf: str, for a in Annotator._get_headers(input_bams): self.vcf.add_info_to_header(a) self.vcf_writer = Writer(output_vcf, self.vcf) - + self.bam_paths = input_bams self.bam_readers = {s: [pysam.AlignmentFile(b, "rb") for b in bams] for s, bams in input_bams.items()} @@ -351,28 +351,28 @@ def _get_headers(input_bams: dict) -> list: """ headers = [] for s, bams in input_bams.items(): - for suffix, description, type, number in _HEADER_TEMPLATES: - headers.append(Annotator._make_header(suffix, description, type, number, sample=s)) + for suffix, description, typ, number in _HEADER_TEMPLATES: + headers.append(Annotator._make_header(suffix, description, typ, number, sample=s)) if len(bams) > 1: for i, bam in enumerate(bams, start=1): n = os.path.basename(bam).split(".")[0] - for suffix, description, type, number in _REPLICATE_HEADER_TEMPLATES: + for suffix, description, typ, number in _REPLICATE_HEADER_TEMPLATES: headers.append( Annotator._make_header( - suffix, description, type, number, sample=s, index=i + suffix, description, typ, number, sample=s, index=i ) ) return headers @staticmethod - def _make_header(suffix: str, description: str, type: str, number: str, + def _make_header(suffix: str, description: str, typ: str, number: str, sample: str, index: int = None) -> dict: """Build a single INFO header dict for cyvcf2's add_info_to_header. Args: suffix: annotation suffix (e.g. 'af', 'dp') description: description template with {sample} placeholder - type: VCF type string ('Float', 'Integer', 'String') + typ: VCF type string ('Float', 'Integer', 'String') number: VCF number string ('A', 'R', '1', etc.) sample: sample name to substitute into ID and description index: optional replicate index appended to the ID @@ -380,7 +380,7 @@ def _make_header(suffix: str, description: str, type: str, number: str, Returns: dict with keys ID, Description, Type, Number suitable for cyvcf2's add_info_to_header """ - id = "{}_{}".format(sample, suffix) + header_id = "{}_{}".format(sample, suffix) if index is not None: - id = "{}_{}".format(id, index) - return {'ID': id, 'Description': description.format(sample=sample), 'Type': type, 'Number': number} \ No newline at end of file + header_id = "{}_{}".format(header_id, index) + return {'ID': header_id, 'Description': description.format(sample=sample), 'Type': typ, 'Number': number} \ No newline at end of file diff --git a/vafator/command_line.py b/vafator/command_line.py index f2d639d..6a9a58a 100755 --- a/vafator/command_line.py +++ b/vafator/command_line.py @@ -104,7 +104,7 @@ def annotator(): num_processes=args.num_processes, ) annotator.run() - + logging.info("Vafator finished!") diff --git a/vafator/pileups.py b/vafator/pileups.py index c23041f..72b70e8 100755 --- a/vafator/pileups.py +++ b/vafator/pileups.py @@ -5,7 +5,7 @@ from pysam.libcalignmentfile import IteratorColumnRegion, AlignmentFile from vafator.constants import AMBIGUOUS_BASES -from vafator.pileup_utils import * +from vafator.pileup_utils import VariantRecord, CoverageMetrics, safe_median, aggregate_list_per_base, is_snp, is_insertion, is_deletion def get_variant_pileup( variant: Union[Variant, VariantRecord], bam: AlignmentFile, @@ -24,10 +24,10 @@ def get_variant_pileup( """ position = variant.POS return bam.pileup( - contig=variant.CHROM, - start=position - 1, + contig=variant.CHROM, + start=position - 1, stop=position, - truncate=True, + truncate=True, max_depth=1000000, min_base_quality=min_base_quality, min_mapping_quality=min_mapping_quality, @@ -51,10 +51,10 @@ def get_region_pileup(chrom: str, start: int, end: int, bam: AlignmentFile, pysam pileup iterator over the region """ return bam.pileup( - contig=chrom, - start=start, + contig=chrom, + start=start, stop=end, - truncate=True, + truncate=True, max_depth=1000000, min_base_quality=min_base_quality, min_mapping_quality=min_mapping_quality, @@ -120,9 +120,9 @@ def collect_metrics_for_chrom( results: Dict[Tuple, CoverageMetrics] = {} for pileup_col in get_region_pileup( - chrom=chrom, - start=start, - end=end, + chrom=chrom, + start=start, + end=end, bam=bam, min_base_quality=min_base_quality, min_mapping_quality=min_mapping_quality diff --git a/vafator/power.py b/vafator/power.py index f465fe9..93ee5a3 100644 --- a/vafator/power.py +++ b/vafator/power.py @@ -1,4 +1,3 @@ -from functools import lru_cache from typing import Optional from cyvcf2 import Variant diff --git a/vafator/tests/test_pileups.py b/vafator/tests/test_pileups.py index d59aae3..c0450e2 100644 --- a/vafator/tests/test_pileups.py +++ b/vafator/tests/test_pileups.py @@ -1,6 +1,5 @@ from collections import Counter from unittest import TestCase -from unittest.mock import MagicMock import pkg_resources import pysam diff --git a/vafator/tests/test_power_calculator.py b/vafator/tests/test_power_calculator.py index c079a83..0c7612a 100644 --- a/vafator/tests/test_power_calculator.py +++ b/vafator/tests/test_power_calculator.py @@ -80,7 +80,7 @@ def test_varying_ploidy(self): self.assertLess( power1.calculate_power(dp=10, ac=2, sample='tumor', variant=None), power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None)) - + power3 = PowerCalculator( tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=6.0)}, purities={'tumor': 0.8}) self.assertLess( From 8b808376f29ed5ee29e558a66cc72a1af8badeb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Thu, 2 Apr 2026 12:56:37 +0200 Subject: [PATCH 25/32] more codacy improvements --- vafator/annotator.py | 4 ++-- vafator/command_line.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vafator/annotator.py b/vafator/annotator.py index 48008e4..ec983d4 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -253,7 +253,7 @@ def _add_stats(self, variant: Variant, metrics_by_bam: dict) -> None: self._annotate_sample(variant, sample, global_ac, global_dp, global_bq, global_mq, global_pos, global_all_mqs, global_all_bqs, global_all_positions) - + def _annotate_replicate(self, v: Variant, s: str, i: int, m) -> None: """Write per-replicate annotations — only called when multiple BAMs are provided for a sample. @@ -355,7 +355,7 @@ def _get_headers(input_bams: dict) -> list: headers.append(Annotator._make_header(suffix, description, typ, number, sample=s)) if len(bams) > 1: for i, bam in enumerate(bams, start=1): - n = os.path.basename(bam).split(".")[0] + # n = os.path.basename(bam).split(".")[0] for suffix, description, typ, number in _REPLICATE_HEADER_TEMPLATES: headers.append( Annotator._make_header( diff --git a/vafator/command_line.py b/vafator/command_line.py index 6a9a58a..e565c01 100755 --- a/vafator/command_line.py +++ b/vafator/command_line.py @@ -126,7 +126,7 @@ def multiallelics_filter(): tumor_sample_name=args.tumor_sample_name ) filter.run() - + logging.info("Vafator multiallelic filter finished!") From cf31f1e4543c7bd5071e935af14c6e12e8b37b2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Thu, 2 Apr 2026 14:21:24 +0200 Subject: [PATCH 26/32] Re-add comments --- vafator/pileups.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vafator/pileups.py b/vafator/pileups.py index 72b70e8..e85aa53 100755 --- a/vafator/pileups.py +++ b/vafator/pileups.py @@ -235,24 +235,27 @@ def _get_insertion_metrics_from_column(variant, pileup_col) -> CoverageMetrics: for pileup_read in pileup_reads: if pileup_read.indel > 0: + # read with an insertion index = pileup_read.alignment.reference_start relative_position = 0 for cigar_type, cigar_length in pileup_read.alignment.cigartuples: - if cigar_type in [0, 2, 3, 7, 8]: + if cigar_type in [0, 2, 3, 7, 8]: # consumes reference M, D, N, =, X index += cigar_length if index > variant_position: break - if cigar_type in [0, 1, 4, 7, 8]: + if cigar_type in [0, 1, 4, 7, 8]: # consumes query M, I, S, =, X relative_position += cigar_length - if cigar_type == 1: + if cigar_type == 1: # does not count I insertion_in_query = pileup_read.alignment.query[ relative_position:relative_position + insertion_length] if index == variant_position and cigar_length == insertion_length \ and insertion == insertion_in_query: + # the read contains the insertion ac[alt_upper] += 1 mq[alt_upper].append(pileup_read.alignment.mapping_quality) pos[alt_upper].append(pileup_read.query_position_or_next) elif pileup_read.indel == 0: + # NOTE: considers all reads without indels to be the reference! mq[variant.REF].append(pileup_read.alignment.mapping_quality) pos[variant.REF].append(pileup_read.query_position_or_next) @@ -295,7 +298,7 @@ def _get_deletion_metrics_from_column(variant, pileup_col) -> CoverageMetrics: if pileup_read.indel < 0: start = pileup_read.alignment.reference_start for cigar_type, cigar_length in pileup_read.alignment.cigartuples: - if cigar_type in [0, 3, 7, 8]: + if cigar_type in [0, 3, 7, 8]: # consumes reference M, N, =, X start += cigar_length elif cigar_type == 2: if start == variant_position and cigar_length == deletion_length: @@ -308,6 +311,7 @@ def _get_deletion_metrics_from_column(variant, pileup_col) -> CoverageMetrics: if start > variant_position: break elif pileup_read.indel == 0: + # NOTE: considers all reads without indels to be the reference! mq[variant.REF].append(pileup_read.alignment.mapping_quality) pos[variant.REF].append(pileup_read.query_position_or_next) From 8803b484594bc10f2eb4c50bb8dc95f3f4a75673 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Wed, 8 Apr 2026 15:20:56 +0200 Subject: [PATCH 27/32] black code formatting --- setup.py | 2 +- vafator/__init__.py | 3 +- vafator/annotator.py | 251 ++++++++++---- vafator/command_line.py | 329 +++++++++++++----- vafator/constants.py | 123 ++++++- vafator/hatchet2bed.py | 27 +- vafator/multiallelic_filter.py | 34 +- vafator/pileup_utils.py | 14 +- vafator/pileups.py | 108 ++++-- vafator/ploidies.py | 33 +- vafator/power.py | 44 ++- vafator/rank_sum_test.py | 9 +- vafator/tests/test_annotator.py | 400 +++++++++++++++------- vafator/tests/test_hatchet2bed.py | 22 +- vafator/tests/test_multiallelic_filter.py | 70 +++- vafator/tests/test_pileups.py | 132 +++++-- vafator/tests/test_ploidy_manager.py | 112 ++++-- vafator/tests/test_power_calculator.py | 250 ++++++++++---- vafator/tests/test_rank_sum_test.py | 12 +- vafator/tests/utils.py | 6 +- vafator/vafator2decifer.py | 87 +++-- 21 files changed, 1509 insertions(+), 559 deletions(-) diff --git a/setup.py b/setup.py index 398663e..0610c88 100755 --- a/setup.py +++ b/setup.py @@ -8,4 +8,4 @@ long_description = f.read() # Build the Python package -setup() \ No newline at end of file +setup() diff --git a/vafator/__init__.py b/vafator/__init__.py index f52972e..273ffca 100755 --- a/vafator/__init__.py +++ b/vafator/__init__.py @@ -1,2 +1 @@ -VERSION = '3.1.0' - +VERSION = "3.1.0" diff --git a/vafator/annotator.py b/vafator/annotator.py index ec983d4..519b8eb 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -8,7 +8,12 @@ import datetime import json -from vafator.constants import AMBIGUOUS_BASES, BATCH_SIZE, _HEADER_TEMPLATES, _REPLICATE_HEADER_TEMPLATES +from vafator.constants import ( + AMBIGUOUS_BASES, + BATCH_SIZE, + _HEADER_TEMPLATES, + _REPLICATE_HEADER_TEMPLATES, +) from vafator.ploidies import DEFAULT_PLOIDY from vafator.rank_sum_test import get_rank_sum_tests from vafator.power import PowerCalculator, DEFAULT_ERROR_RATE, DEFAULT_FPR @@ -17,13 +22,13 @@ def _collect_metrics_worker( - chrom: str, - variant_tuples: list, - bam_paths: dict, - min_base_quality: int, - min_mapping_quality: int, - include_ambiguous_bases: bool - ) -> dict: + chrom: str, + variant_tuples: list, + bam_paths: dict, + min_base_quality: int, + min_mapping_quality: int, + include_ambiguous_bases: bool, +) -> dict: """ Top-level worker function for ProcessPoolExecutor — must be module-level to be picklable. Opens its own BAM readers (AlignmentFile objects cannot be shared across processes). @@ -41,8 +46,10 @@ def _collect_metrics_worker( {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}} """ all_metrics = {} - variants = [VariantRecord(CHROM=chrom, POS=pos, REF=ref, ALT=[alt]) - for pos, ref, alt in variant_tuples] + variants = [ + VariantRecord(CHROM=chrom, POS=pos, REF=ref, ALT=[alt]) + for pos, ref, alt in variant_tuples + ] for sample, bam_files in bam_paths.items(): for i, bam_path in enumerate(bam_files): bam = pysam.AlignmentFile(bam_path, "rb") @@ -60,17 +67,21 @@ def _collect_metrics_worker( class Annotator(object): - def __init__(self, input_vcf: str, output_vcf: str, - input_bams: dict, - purities: dict = {}, - mapping_qual_thr: int = 0, - base_call_qual_thr: int = 29, - tumor_ploidies: dict = {}, - normal_ploidy: int = 2, - fpr: float = DEFAULT_FPR, - error_rate: float = DEFAULT_ERROR_RATE, - include_ambiguous_bases: bool = True, - num_processes: int = 1): + def __init__( + self, + input_vcf: str, + output_vcf: str, + input_bams: dict, + purities: dict = {}, + mapping_qual_thr: int = 0, + base_call_qual_thr: int = 29, + tumor_ploidies: dict = {}, + normal_ploidy: int = 2, + fpr: float = DEFAULT_FPR, + error_rate: float = DEFAULT_ERROR_RATE, + include_ambiguous_bases: bool = True, + num_processes: int = 1, + ): """ Args: input_vcf: path to the input VCF file to annotate @@ -99,7 +110,7 @@ def __init__(self, input_vcf: str, output_vcf: str, tumor_ploidies=tumor_ploidies, purities=purities, error_rate=error_rate, - fpr=fpr + fpr=fpr, ) self.vcf = VCF(input_vcf) @@ -112,26 +123,40 @@ def __init__(self, input_vcf: str, output_vcf: str, "input_vcf": os.path.abspath(input_vcf), "output_vcf": os.path.abspath(output_vcf), "bams": ";".join( - ["{}:{}".format(s, ",".join([os.path.abspath(b) for b in bams])) - for s, bams in input_bams.items()] - ), + [ + "{}:{}".format(s, ",".join([os.path.abspath(b) for b in bams])) + for s, bams in input_bams.items() + ] + ), "mapping_quality_threshold": mapping_qual_thr, "base_call_quality_threshold": base_call_qual_thr, "purities": ";".join(["{}:{}".format(s, p) for s, p in purities.items()]), "normal_ploidy": normal_ploidy, - "tumor_ploidy": ";".join( - ["{}:{}".format(s, p.report_value) for s, p in tumor_ploidies.items()] - ) if tumor_ploidies else DEFAULT_PLOIDY, + "tumor_ploidy": ( + ";".join( + [ + "{}:{}".format(s, p.report_value) + for s, p in tumor_ploidies.items() + ] + ) + if tumor_ploidies + else DEFAULT_PLOIDY + ), "include_ambiguous_bases": include_ambiguous_bases, } - self.vcf.add_to_header("##vafator_command_line={}".format(json.dumps(self.header))) + self.vcf.add_to_header( + "##vafator_command_line={}".format(json.dumps(self.header)) + ) for a in Annotator._get_headers(input_bams): self.vcf.add_info_to_header(a) self.vcf_writer = Writer(output_vcf, self.vcf) self.bam_paths = input_bams - self.bam_readers = {s: [pysam.AlignmentFile(b, "rb") for b in bams] for s, bams in input_bams.items()} + self.bam_readers = { + s: [pysam.AlignmentFile(b, "rb") for b in bams] + for s, bams in input_bams.items() + } def run(self) -> None: """Run the annotation pipeline over all variants in the input VCF, @@ -164,13 +189,17 @@ def _run_parallel(self, batch: list) -> None: for chrom, chrom_variants in stream_variants_by_chrom(self.vcf): chrom_variants_map[chrom] = chrom_variants variant_tuples = [(v.POS, v.REF, v.ALT[0]) for v in chrom_variants] - futures[executor.submit( - _collect_metrics_worker, - chrom=chrom, variant_tuples=variant_tuples, bam_paths=self.bam_paths, - min_base_quality=self.base_call_quality_threshold, - min_mapping_quality=self.mapping_quality_threshold, - include_ambiguous_bases=self.include_ambiguous_bases, - )] = chrom + futures[ + executor.submit( + _collect_metrics_worker, + chrom=chrom, + variant_tuples=variant_tuples, + bam_paths=self.bam_paths, + min_base_quality=self.base_call_quality_threshold, + min_mapping_quality=self.mapping_quality_threshold, + include_ambiguous_bases=self.include_ambiguous_bases, + ) + ] = chrom chrom_results = {futures[f]: f.result() for f in futures} for chrom, chrom_variants in chrom_variants_map.items(): self._annotate_and_batch(chrom_variants, chrom_results[chrom], batch) @@ -189,17 +218,21 @@ def _collect_chrom_metrics(self, chrom: str, chrom_variants: list) -> dict: for sample, bams in self.bam_readers.items(): for i, bam in enumerate(bams): all_metrics[(sample, i)] = collect_metrics_for_chrom( - chrom=chrom, variants=chrom_variants, bam=bam, + chrom=chrom, + variants=chrom_variants, + bam=bam, min_base_quality=self.base_call_quality_threshold, min_mapping_quality=self.mapping_quality_threshold, include_ambiguous_bases=self.include_ambiguous_bases, ) return all_metrics - def _annotate_and_batch(self, chrom_variants: list, all_metrics: dict, batch: list) -> None: + def _annotate_and_batch( + self, chrom_variants: list, all_metrics: dict, batch: list + ) -> None: """Annotate variants using pre-computed metrics and append to write batch. Flushes the batch to disk when it reaches BATCH_SIZE. - + Args: chrom_variants: list of cyvcf2 Variant objects to annotate all_metrics: {(sample, bam_idx): {(pos, REF, ALT): CoverageMetrics}} @@ -208,7 +241,8 @@ def _annotate_and_batch(self, chrom_variants: list, all_metrics: dict, batch: li for variant in chrom_variants: metrics_by_bam = { (sample, i): all_metrics[(sample, i)].get( - (variant.POS, variant.REF, variant.ALT[0]), EMPTY_METRICS) + (variant.POS, variant.REF, variant.ALT[0]), EMPTY_METRICS + ) for sample, bams in self.bam_readers.items() for i in range(len(bams)) } @@ -218,7 +252,6 @@ def _annotate_and_batch(self, chrom_variants: list, all_metrics: dict, batch: li self._write_batch(batch) batch.clear() - def _add_stats(self, variant: Variant, metrics_by_bam: dict) -> None: """Annotate a single variant using pre-computed metrics. @@ -250,9 +283,18 @@ def _add_stats(self, variant: Variant, metrics_by_bam: dict) -> None: global_all_positions.update(metrics.all_positions) global_dp += metrics.dp - self._annotate_sample(variant, sample, global_ac, global_dp, global_bq, - global_mq, global_pos, global_all_mqs, global_all_bqs, - global_all_positions) + self._annotate_sample( + variant, + sample, + global_ac, + global_dp, + global_bq, + global_mq, + global_pos, + global_all_mqs, + global_all_bqs, + global_all_positions, + ) def _annotate_replicate(self, v: Variant, s: str, i: int, m) -> None: """Write per-replicate annotations — only called when multiple BAMs are provided for a sample. @@ -264,27 +306,60 @@ def _annotate_replicate(self, v: Variant, s: str, i: int, m) -> None: m: pre-computed CoverageMetrics for this variant in this BAM """ n = i + 1 - v.INFO["{}_af_{}".format(s, n)] = ",".join([str(self._calculate_af(m.ac[alt], m.dp)) for alt in v.ALT]) + v.INFO["{}_af_{}".format(s, n)] = ",".join( + [str(self._calculate_af(m.ac[alt], m.dp)) for alt in v.ALT] + ) v.INFO["{}_ac_{}".format(s, n)] = ",".join([str(m.ac[alt]) for alt in v.ALT]) - v.INFO["{}_n_{}".format(s, n)] = str(sum(m.ac.get(b, 0) for b in AMBIGUOUS_BASES)) + v.INFO["{}_n_{}".format(s, n)] = str( + sum(m.ac.get(b, 0) for b in AMBIGUOUS_BASES) + ) v.INFO["{}_dp_{}".format(s, n)] = m.dp - v.INFO["{}_pu_{}".format(s, n)] = ",".join([str(self.power.calculate_power(ac=m.ac[alt], dp=m.dp, sample=s, variant=v)) for alt in v.ALT]) + v.INFO["{}_pu_{}".format(s, n)] = ",".join( + [ + str( + self.power.calculate_power( + ac=m.ac[alt], dp=m.dp, sample=s, variant=v + ) + ) + for alt in v.ALT + ] + ) power, k = self.power.calculate_absolute_power(dp=m.dp, sample=s, variant=v) v.INFO["{}_pw_{}".format(s, n)] = str(power) v.INFO["{}_k_{}".format(s, n)] = str(k) - v.INFO["{}_bq_{}".format(s, n)] = ",".join([str(m.bqs[v.REF])] + [str(m.bqs[alt]) for alt in v.ALT]) - v.INFO["{}_mq_{}".format(s, n)] = ",".join([str(m.mqs[v.REF])] + [str(m.mqs[alt]) for alt in v.ALT]) - v.INFO["{}_pos_{}".format(s, n)] = ",".join([str(m.positions[v.REF])] + [str(m.positions[alt]) for alt in v.ALT]) - for key, tag in [(m.all_mqs, "rsmq"), (m.all_bqs, "rsbq"), (m.all_positions, "rspos")]: + v.INFO["{}_bq_{}".format(s, n)] = ",".join( + [str(m.bqs[v.REF])] + [str(m.bqs[alt]) for alt in v.ALT] + ) + v.INFO["{}_mq_{}".format(s, n)] = ",".join( + [str(m.mqs[v.REF])] + [str(m.mqs[alt]) for alt in v.ALT] + ) + v.INFO["{}_pos_{}".format(s, n)] = ",".join( + [str(m.positions[v.REF])] + [str(m.positions[alt]) for alt in v.ALT] + ) + for key, tag in [ + (m.all_mqs, "rsmq"), + (m.all_bqs, "rsbq"), + (m.all_positions, "rspos"), + ]: pvalues, stats = get_rank_sum_tests(key, v) if stats: v.INFO["{}_{}_{}".format(s, tag, n)] = ",".join(stats) v.INFO["{}_{}_pv_{}".format(s, tag, n)] = ",".join(pvalues) - def _annotate_sample(self, v: Variant, s: str, gac: Counter, gdp: int, - gbq: Counter, gmq: Counter, gpos: Counter, - gallmq: dict, gallbq: dict, gallpos: dict) -> None: + def _annotate_sample( + self, + v: Variant, + s: str, + gac: Counter, + gdp: int, + gbq: Counter, + gmq: Counter, + gpos: Counter, + gallmq: dict, + gallbq: dict, + gallpos: dict, + ) -> None: """Write aggregate annotations for a sample, combining metrics across all replicates. Args: @@ -299,20 +374,41 @@ def _annotate_sample(self, v: Variant, s: str, gac: Counter, gdp: int, gallbq: BQ distributions per allele across all BAMs gallpos: read position distributions per allele across all BAMs """ - v.INFO["{}_af".format(s)] = ",".join([str(self._calculate_af(gac[alt], gdp)) for alt in v.ALT]) + v.INFO["{}_af".format(s)] = ",".join( + [str(self._calculate_af(gac[alt], gdp)) for alt in v.ALT] + ) v.INFO["{}_ac".format(s)] = ",".join([str(gac[alt]) for alt in v.ALT]) v.INFO["{}_n".format(s)] = str(sum(gac.get(b, 0) for b in AMBIGUOUS_BASES)) v.INFO["{}_dp".format(s)] = gdp - v.INFO["{}_eaf".format(s)] = str(self.power.calculate_expected_vaf(sample=s, variant=v)) - v.INFO["{}_pu".format(s)] = ",".join([str(self.power.calculate_power(ac=gac[alt], dp=gdp, sample=s, variant=v)) for alt in v.ALT]) - + v.INFO["{}_eaf".format(s)] = str( + self.power.calculate_expected_vaf(sample=s, variant=v) + ) + v.INFO["{}_pu".format(s)] = ",".join( + [ + str( + self.power.calculate_power(ac=gac[alt], dp=gdp, sample=s, variant=v) + ) + for alt in v.ALT + ] + ) + power, k = self.power.calculate_absolute_power(dp=gdp, sample=s, variant=v) v.INFO["{}_pw".format(s)] = str(power) v.INFO["{}_k".format(s)] = str(k) - v.INFO["{}_bq".format(s)] = ",".join([str(gbq[v.REF])] + [str(gbq[alt]) for alt in v.ALT]) - v.INFO["{}_mq".format(s)] = ",".join([str(gmq[v.REF])] + [str(gmq[alt]) for alt in v.ALT]) - v.INFO["{}_pos".format(s)] = ",".join([str(gpos[v.REF])] + [str(gpos[alt]) for alt in v.ALT]) - for distributions, tag in [(gallmq, "rsmq"), (gallbq, "rsbq"), (gallpos, "rspos")]: + v.INFO["{}_bq".format(s)] = ",".join( + [str(gbq[v.REF])] + [str(gbq[alt]) for alt in v.ALT] + ) + v.INFO["{}_mq".format(s)] = ",".join( + [str(gmq[v.REF])] + [str(gmq[alt]) for alt in v.ALT] + ) + v.INFO["{}_pos".format(s)] = ",".join( + [str(gpos[v.REF])] + [str(gpos[alt]) for alt in v.ALT] + ) + for distributions, tag in [ + (gallmq, "rsmq"), + (gallbq, "rsbq"), + (gallpos, "rspos"), + ]: pvalues, stats = get_rank_sum_tests(distributions, v) if stats: v.INFO["{}_{}".format(s, tag)] = ",".join(stats) @@ -320,11 +416,11 @@ def _annotate_sample(self, v: Variant, s: str, gac: Counter, gdp: int, def _calculate_af(self, ac: int, dp: int) -> float: """Return allele frequency, or 0.0 if depth is zero. - + Args: ac: allele count for this alternate allele dp: total depth of coverage - + Returns: allele frequency rounded to 5 decimal places, or 0.0 if dp is zero """ @@ -332,7 +428,7 @@ def _calculate_af(self, ac: int, dp: int) -> float: def _write_batch(self, batch: list) -> None: """Write a batch of annotated variants to the output VCF. - + Args: batch: list of cyvcf2 Variant objects to write """ @@ -352,21 +448,29 @@ def _get_headers(input_bams: dict) -> list: headers = [] for s, bams in input_bams.items(): for suffix, description, typ, number in _HEADER_TEMPLATES: - headers.append(Annotator._make_header(suffix, description, typ, number, sample=s)) + headers.append( + Annotator._make_header(suffix, description, typ, number, sample=s) + ) if len(bams) > 1: for i, bam in enumerate(bams, start=1): # n = os.path.basename(bam).split(".")[0] for suffix, description, typ, number in _REPLICATE_HEADER_TEMPLATES: headers.append( Annotator._make_header( - suffix, description, typ, number, sample=s, index=i + suffix, description, typ, number, sample=s, index=i ) ) return headers @staticmethod - def _make_header(suffix: str, description: str, typ: str, number: str, - sample: str, index: int = None) -> dict: + def _make_header( + suffix: str, + description: str, + typ: str, + number: str, + sample: str, + index: int = None, + ) -> dict: """Build a single INFO header dict for cyvcf2's add_info_to_header. Args: @@ -383,4 +487,9 @@ def _make_header(suffix: str, description: str, typ: str, number: str, header_id = "{}_{}".format(sample, suffix) if index is not None: header_id = "{}_{}".format(header_id, index) - return {'ID': header_id, 'Description': description.format(sample=sample), 'Type': typ, 'Number': number} \ No newline at end of file + return { + "ID": header_id, + "Description": description.format(sample=sample), + "Type": typ, + "Number": number, + } diff --git a/vafator/command_line.py b/vafator/command_line.py index e565c01..e09e9b0 100755 --- a/vafator/command_line.py +++ b/vafator/command_line.py @@ -16,42 +16,111 @@ def annotator(): # set up logger - parser = argparse.ArgumentParser(description="vafator v{}".format(vafator.VERSION), - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - epilog=epilog) - parser.add_argument("--input-vcf", dest="input_vcf", action="store", help="The VCF to annotate", required=True) - parser.add_argument("--output-vcf", dest="output_vcf", action="store", help="The annotated VCF", required=True) - parser.add_argument('--bam', action='append', nargs=2, - metavar=('sample_name', 'bam_file'), default=[], - help='A sample name and a BAM file. Can be used multiple times to input multiple samples and ' - 'multiple BAM files. The same sample name can be used multiple times with different BAMs, ' - 'this will treated as replicates.') - parser.add_argument("--mapping-quality", dest="mapping_quality", action="store", type=int, default=1, - help="All reads with a mapping quality below this threshold will be filtered out") - parser.add_argument("--base-call-quality", dest="base_call_quality", action="store", type=int, default=30, - help="All bases with a base call quality below this threshold will be filtered out") - parser.add_argument('--purity', action='append', nargs=2, - metavar=('sample_name', 'purity'), default=[], - help='A sample name and a tumor purity value. Can be used multiple times to input multiple ' - 'samples in combination with --bam. If no purity is provided for a given sample the ' - 'default value is 1.0') - parser.add_argument("--tumor-ploidy", action='append', nargs=2, - metavar=('sample_name', 'tumor_ploidy'), default=[], - help='A sample name and a tumor ploidy. Can be used multiple times to input multiple ' - 'samples in combination with --bam. The tumor ploidy can be provided as a genome-wide ' - 'value (eg: --tumor-ploidy primary 2) or as local copy numbers in a BED file ' - '(eg: --tumor-ploidy primary /path/to/copy_numbers.bed), see the documentation for ' - 'expected BED format (default: 2)') - parser.add_argument("--normal-ploidy", dest="normal_ploidy", required=False, default=2, type=int, - help="Normal ploidy for the power calculation (default: 2)") - parser.add_argument("--fpr", dest="fpr", required=False, default=DEFAULT_FPR, type=float, - help="False Positive Rate (FPR) to use in the power calculation") - parser.add_argument("--error-rate", dest="error_rate", required=False, default=DEFAULT_ERROR_RATE, type=float, - help="Error rate to use in the power calculation") - parser.add_argument("--exclude-ambiguous-bases", dest="exclude_ambiguous_bases", action='store_true', - help="Flag indicating to include ambiguous bases from the DP calculation") - parser.add_argument("--num-processes", dest="num_processes", required=False, default=1, type=int, - help="Number of processes for parallel chromosome-level annotation (default: 1)") + parser = argparse.ArgumentParser( + description="vafator v{}".format(vafator.VERSION), + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + epilog=epilog, + ) + parser.add_argument( + "--input-vcf", + dest="input_vcf", + action="store", + help="The VCF to annotate", + required=True, + ) + parser.add_argument( + "--output-vcf", + dest="output_vcf", + action="store", + help="The annotated VCF", + required=True, + ) + parser.add_argument( + "--bam", + action="append", + nargs=2, + metavar=("sample_name", "bam_file"), + default=[], + help="A sample name and a BAM file. Can be used multiple times to input multiple samples and " + "multiple BAM files. The same sample name can be used multiple times with different BAMs, " + "this will treated as replicates.", + ) + parser.add_argument( + "--mapping-quality", + dest="mapping_quality", + action="store", + type=int, + default=1, + help="All reads with a mapping quality below this threshold will be filtered out", + ) + parser.add_argument( + "--base-call-quality", + dest="base_call_quality", + action="store", + type=int, + default=30, + help="All bases with a base call quality below this threshold will be filtered out", + ) + parser.add_argument( + "--purity", + action="append", + nargs=2, + metavar=("sample_name", "purity"), + default=[], + help="A sample name and a tumor purity value. Can be used multiple times to input multiple " + "samples in combination with --bam. If no purity is provided for a given sample the " + "default value is 1.0", + ) + parser.add_argument( + "--tumor-ploidy", + action="append", + nargs=2, + metavar=("sample_name", "tumor_ploidy"), + default=[], + help="A sample name and a tumor ploidy. Can be used multiple times to input multiple " + "samples in combination with --bam. The tumor ploidy can be provided as a genome-wide " + "value (eg: --tumor-ploidy primary 2) or as local copy numbers in a BED file " + "(eg: --tumor-ploidy primary /path/to/copy_numbers.bed), see the documentation for " + "expected BED format (default: 2)", + ) + parser.add_argument( + "--normal-ploidy", + dest="normal_ploidy", + required=False, + default=2, + type=int, + help="Normal ploidy for the power calculation (default: 2)", + ) + parser.add_argument( + "--fpr", + dest="fpr", + required=False, + default=DEFAULT_FPR, + type=float, + help="False Positive Rate (FPR) to use in the power calculation", + ) + parser.add_argument( + "--error-rate", + dest="error_rate", + required=False, + default=DEFAULT_ERROR_RATE, + type=float, + help="Error rate to use in the power calculation", + ) + parser.add_argument( + "--exclude-ambiguous-bases", + dest="exclude_ambiguous_bases", + action="store_true", + help="Flag indicating to include ambiguous bases from the DP calculation", + ) + parser.add_argument( + "--num-processes", + dest="num_processes", + required=False, + default=1, + type=int, + help="Number of processes for parallel chromosome-level annotation (default: 1)", + ) args = parser.parse_args() @@ -67,27 +136,44 @@ def annotator(): purities = {} for sample_name, purity in args.purity: if sample_name in purities: - raise ValueError('Multiple purity values provided for sample: {}'.format(sample_name)) + raise ValueError( + "Multiple purity values provided for sample: {}".format(sample_name) + ) if sample_name not in bams: - raise ValueError('Provided a purity value for a sample for which no BAM is provided: {}'.format(sample_name)) + raise ValueError( + "Provided a purity value for a sample for which no BAM is provided: {}".format( + sample_name + ) + ) purities[sample_name] = float(purity) tumor_ploidies = {} for sample_name, tumor_ploidy in args.tumor_ploidy: if sample_name in tumor_ploidies: - raise ValueError('Multiple tumor ploidy values provided for sample: {}'.format(sample_name)) + raise ValueError( + "Multiple tumor ploidy values provided for sample: {}".format( + sample_name + ) + ) if sample_name not in bams: raise ValueError( - 'Provided a tumor ploidy value for a sample for which no BAM is provided: {}'.format(sample_name)) + "Provided a tumor ploidy value for a sample for which no BAM is provided: {}".format( + sample_name + ) + ) try: # checks if a genome-wide purity value was passed - tumor_ploidies[sample_name] = PloidyManager(genome_wide_ploidy=float(tumor_ploidy)) + tumor_ploidies[sample_name] = PloidyManager( + genome_wide_ploidy=float(tumor_ploidy) + ) except ValueError: # checks if the non float-like value is a path to an existing file tumor_ploidies[sample_name] = PloidyManager(local_copy_numbers=tumor_ploidy) if len(bams) == 0: - raise ValueError("Please, provide at least one bam file with '--bam sample_name /path/to/file.bam'") + raise ValueError( + "Please, provide at least one bam file with '--bam sample_name /path/to/file.bam'" + ) annotator = Annotator( input_vcf=args.input_vcf, @@ -111,19 +197,39 @@ def annotator(): def multiallelics_filter(): # set up logger - parser = argparse.ArgumentParser(description="vafator v{}".format(vafator.VERSION), - formatter_class=argparse.ArgumentDefaultsHelpFormatter, epilog=epilog) - parser.add_argument("--input-vcf", dest="input_vcf", action="store", help="The VCF to annotate", required=True) - parser.add_argument("--output-vcf", dest="output_vcf", action="store", help="The annotated VCF", required=True) - parser.add_argument("--tumor-sample-name", dest="tumor_sample_name", action="store", - help='The tumor sample name (will look for annotation ${SAMPLE_NAME}_af)', default='tumor') + parser = argparse.ArgumentParser( + description="vafator v{}".format(vafator.VERSION), + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + epilog=epilog, + ) + parser.add_argument( + "--input-vcf", + dest="input_vcf", + action="store", + help="The VCF to annotate", + required=True, + ) + parser.add_argument( + "--output-vcf", + dest="output_vcf", + action="store", + help="The annotated VCF", + required=True, + ) + parser.add_argument( + "--tumor-sample-name", + dest="tumor_sample_name", + action="store", + help="The tumor sample name (will look for annotation ${SAMPLE_NAME}_af)", + default="tumor", + ) args = parser.parse_args() logging.info("Vafator multiallelic filter starting...") filter = MultiallelicFilter( input_vcf=args.input_vcf, output_vcf=args.output_vcf, - tumor_sample_name=args.tumor_sample_name + tumor_sample_name=args.tumor_sample_name, ) filter.run() @@ -131,37 +237,106 @@ def multiallelics_filter(): def vafator2decifer(): - parser = argparse.ArgumentParser(description='Generate input for Decifer using VCF file and HATCHet CNA file') - parser.add_argument("-V", "--vcf_file", required=True, type=str, help="single or multi-sample VCF file") - parser.add_argument("-S", "--samples", required=True, type=str, - help="comma separated list of sample name prefixes to use for VAFator annotations, " - "eg: primary_tumor,metastasis_tumor; the annotations primary_tumor_ac, primary_tumor_dp, " - "etc. will be expected to exist") - parser.add_argument("-C", "--cna_file", required=True, type=str, help="HATCHet CNA file: best.seg.ucn ") - parser.add_argument("-O", "--out_dir", required=True, default="./", type=str, - help="directory for printing files; please make unique for each patient!") - parser.add_argument("-M", "--min_depth", required=True, type=int, help="minimum depth PER sample") - parser.add_argument("-A", "--min_alt_depth", required=True, type=int, - help="minimum depth of ALT allele in at least one sample") - parser.add_argument("-F", "--min_vaf", required=True, type=float, - help="minimum VAF of ALT allele in at least one sample") - parser.add_argument("-N", "--max_CN", required=False, default=6, type=int, - help="maximum total copy number for each observed clone") - parser.add_argument("-B", "--exclude_list", required=False, default=None, type=str, - help="BED file of genomic regions to exclude") - parser.add_argument("-p", "--min_purity", required=False, default=0.0, type=float, - help="minimum purity to consider samples") - parser.add_argument("--snp_file", required=False, default=None, type=str, - help="HATCHet file containing germline SNP counts in tumor samples, baf/tumor.1bed") + parser = argparse.ArgumentParser( + description="Generate input for Decifer using VCF file and HATCHet CNA file" + ) + parser.add_argument( + "-V", + "--vcf_file", + required=True, + type=str, + help="single or multi-sample VCF file", + ) + parser.add_argument( + "-S", + "--samples", + required=True, + type=str, + help="comma separated list of sample name prefixes to use for VAFator annotations, " + "eg: primary_tumor,metastasis_tumor; the annotations primary_tumor_ac, primary_tumor_dp, " + "etc. will be expected to exist", + ) + parser.add_argument( + "-C", + "--cna_file", + required=True, + type=str, + help="HATCHet CNA file: best.seg.ucn ", + ) + parser.add_argument( + "-O", + "--out_dir", + required=True, + default="./", + type=str, + help="directory for printing files; please make unique for each patient!", + ) + parser.add_argument( + "-M", "--min_depth", required=True, type=int, help="minimum depth PER sample" + ) + parser.add_argument( + "-A", + "--min_alt_depth", + required=True, + type=int, + help="minimum depth of ALT allele in at least one sample", + ) + parser.add_argument( + "-F", + "--min_vaf", + required=True, + type=float, + help="minimum VAF of ALT allele in at least one sample", + ) + parser.add_argument( + "-N", + "--max_CN", + required=False, + default=6, + type=int, + help="maximum total copy number for each observed clone", + ) + parser.add_argument( + "-B", + "--exclude_list", + required=False, + default=None, + type=str, + help="BED file of genomic regions to exclude", + ) + parser.add_argument( + "-p", + "--min_purity", + required=False, + default=0.0, + type=float, + help="minimum purity to consider samples", + ) + parser.add_argument( + "--snp_file", + required=False, + default=None, + type=str, + help="HATCHet file containing germline SNP counts in tumor samples, baf/tumor.1bed", + ) args = parser.parse_args() run_vafator2decifer(args) def hatchet2bed(): - parser = argparse.ArgumentParser(description='Generate input for Decifer using VCF file and HATCHet CNA file') - parser.add_argument("-i", "--input-file", required=True, type=str, help="input *.ucn hatchet file") - parser.add_argument("-o", "--output-prefix", required=True, type=str, - help="output BED file prefix, one file will be created per sample in the input with the " - "average tumor copy number in each segment") + parser = argparse.ArgumentParser( + description="Generate input for Decifer using VCF file and HATCHet CNA file" + ) + parser.add_argument( + "-i", "--input-file", required=True, type=str, help="input *.ucn hatchet file" + ) + parser.add_argument( + "-o", + "--output-prefix", + required=True, + type=str, + help="output BED file prefix, one file will be created per sample in the input with the " + "average tumor copy number in each segment", + ) args = parser.parse_args() - run_hatchet2bed(input_file=args.input_file, output_prefix=args.output_prefix) \ No newline at end of file + run_hatchet2bed(input_file=args.input_file, output_prefix=args.output_prefix) diff --git a/vafator/constants.py b/vafator/constants.py index bfad7aa..8496c0f 100755 --- a/vafator/constants.py +++ b/vafator/constants.py @@ -2,29 +2,114 @@ BATCH_SIZE = 10000 # IUPAC ambiguity codes treated as ambiguous bases in pileup metrics -AMBIGUOUS_BASES = ['N', 'M', 'R', 'W', 'S', 'Y', 'K', 'V', 'H', 'D', 'B'] +AMBIGUOUS_BASES = ["N", "M", "R", "W", "S", "Y", "K", "V", "H", "D", "B"] # Header templates: (suffix, description, type, number) # {sample} is substituted at generation time _HEADER_TEMPLATES = [ - ("af", "Allele frequency for the alternate alleles in the {sample} sample/s", "Float", "A"), - ("dp", "Total depth of coverage in the {sample} sample/s (independent of alleles)", "Float", "1"), - ("ac", "Allele count for the alternate alleles in the {sample} sample/s", "Integer", "A"), - ("n", "Allele count for ambiguous bases (any IUPAC ambiguity code is counted) in the {sample} sample/s", "Integer", "1"), - ("pu", "Probability of an undetected mutation given the observed supporting reads (AC), the observed total coverage (DP) and the provided tumor purity in the {sample} sample/s", "Float", "A"), - ("pw", "Power to detect a somatic mutation as described in Absolute given the observed total coverage (DP) and the provided tumor purity and ploidies in the {sample} sample/s", "Float", "1"), - ("k", "Minimum number of supporting reads, k, such that the probability of observing k or more non-reference reads due to sequencing error is less than the defined FPR in the {sample} sample/s", "Float", "1"), - ("eaf", "Expected VAF considering the purity and ploidy/copy number in the {sample} sample/s", "Float", "1"), - ("bq", "Median base call quality of the reads supporting each allele in the {sample} sample/s", "Float", "R"), - ("mq", "Median mapping quality of the reads supporting each allele in the {sample} sample/s", "Float", "R"), - ("pos", "Median position within the read of the reads supporting each allele in the {sample} sample/s", "Float", "R"), - ("rsmq", "Rank sum test comparing the MQ distributions supporting the reference and the alternate in the {sample} sample/s", "Float", "A"), - ("rsmq_pv", "Rank sum test p-value for MQ distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", "Float", "A"), - ("rsbq", "Rank sum test comparing the BQ distributions supporting the reference and the alternate in the {sample} sample/s", "Float", "A"), - ("rsbq_pv", "Rank sum test p-value for BQ distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", "Float", "A"), - ("rspos", "Rank sum test comparing the position distributions supporting the reference and the alternate in the {sample} sample/s", "Float", "A"), - ("rspos_pv", "Rank sum test p-value for position distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", "Float", "A"), + ( + "af", + "Allele frequency for the alternate alleles in the {sample} sample/s", + "Float", + "A", + ), + ( + "dp", + "Total depth of coverage in the {sample} sample/s (independent of alleles)", + "Float", + "1", + ), + ( + "ac", + "Allele count for the alternate alleles in the {sample} sample/s", + "Integer", + "A", + ), + ( + "n", + "Allele count for ambiguous bases (any IUPAC ambiguity code is counted) in the {sample} sample/s", + "Integer", + "1", + ), + ( + "pu", + "Probability of an undetected mutation given the observed supporting reads (AC), the observed total coverage (DP) and the provided tumor purity in the {sample} sample/s", + "Float", + "A", + ), + ( + "pw", + "Power to detect a somatic mutation as described in Absolute given the observed total coverage (DP) and the provided tumor purity and ploidies in the {sample} sample/s", + "Float", + "1", + ), + ( + "k", + "Minimum number of supporting reads, k, such that the probability of observing k or more non-reference reads due to sequencing error is less than the defined FPR in the {sample} sample/s", + "Float", + "1", + ), + ( + "eaf", + "Expected VAF considering the purity and ploidy/copy number in the {sample} sample/s", + "Float", + "1", + ), + ( + "bq", + "Median base call quality of the reads supporting each allele in the {sample} sample/s", + "Float", + "R", + ), + ( + "mq", + "Median mapping quality of the reads supporting each allele in the {sample} sample/s", + "Float", + "R", + ), + ( + "pos", + "Median position within the read of the reads supporting each allele in the {sample} sample/s", + "Float", + "R", + ), + ( + "rsmq", + "Rank sum test comparing the MQ distributions supporting the reference and the alternate in the {sample} sample/s", + "Float", + "A", + ), + ( + "rsmq_pv", + "Rank sum test p-value for MQ distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", + "Float", + "A", + ), + ( + "rsbq", + "Rank sum test comparing the BQ distributions supporting the reference and the alternate in the {sample} sample/s", + "Float", + "A", + ), + ( + "rsbq_pv", + "Rank sum test p-value for BQ distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", + "Float", + "A", + ), + ( + "rspos", + "Rank sum test comparing the position distributions supporting the reference and the alternate in the {sample} sample/s", + "Float", + "A", + ), + ( + "rspos_pv", + "Rank sum test p-value for position distributions in the {sample} sample/s. The null hypothesis is that there is no difference between the distributions", + "Float", + "A", + ), ] # eaf is not produced per-replicate -_REPLICATE_HEADER_TEMPLATES = [t for t in _HEADER_TEMPLATES if t[0] != "eaf"] \ No newline at end of file +_REPLICATE_HEADER_TEMPLATES = [t for t in _HEADER_TEMPLATES if t[0] != "eaf"] diff --git a/vafator/hatchet2bed.py b/vafator/hatchet2bed.py index d0b45bb..bef4ba3 100644 --- a/vafator/hatchet2bed.py +++ b/vafator/hatchet2bed.py @@ -2,9 +2,13 @@ def run_hatchet2bed(input_file, output_prefix): - input_df = pd.read_csv(input_file, sep='\t') - cn_columns = sorted(list(filter(lambda c: c.startswith('cn_clone'), input_df.columns))) - u_columns = sorted(list(filter(lambda c: c.startswith('u_clone'), input_df.columns))) + input_df = pd.read_csv(input_file, sep="\t") + cn_columns = sorted( + list(filter(lambda c: c.startswith("cn_clone"), input_df.columns)) + ) + u_columns = sorted( + list(filter(lambda c: c.startswith("u_clone"), input_df.columns)) + ) for sample in input_df.SAMPLE.unique(): data = [] @@ -14,9 +18,18 @@ def run_hatchet2bed(input_file, output_prefix): for cn_column, u_column in zip(cn_columns, u_columns): u = float(row[u_column]) total_u += u - cn = sum(map(lambda c: float(c), row[cn_column].split('|'))) + cn = sum(map(lambda c: float(c), row[cn_column].split("|"))) numerator.append(u * cn) - data.append([row['#CHR'], row['START'], row['END'], sum(numerator)/total_u]) + data.append( + [row["#CHR"], row["START"], row["END"], sum(numerator) / total_u] + ) - output_df = pd.DataFrame(data=data, columns=['chromosome', 'start', 'end', 'value']) - output_df.to_csv('{}.{}.bed'.format(output_prefix, sample), sep='\t', header=False, index=False) + output_df = pd.DataFrame( + data=data, columns=["chromosome", "start", "end", "value"] + ) + output_df.to_csv( + "{}.{}.bed".format(output_prefix, sample), + sep="\t", + header=False, + index=False, + ) diff --git a/vafator/multiallelic_filter.py b/vafator/multiallelic_filter.py index 6f08d7e..00b0f26 100755 --- a/vafator/multiallelic_filter.py +++ b/vafator/multiallelic_filter.py @@ -16,7 +16,7 @@ class MultiallelicFilter(object): multiallelic_annotation_name = "multiallelic" - def __init__(self, input_vcf, output_vcf, tumor_sample_name='tumor'): + def __init__(self, input_vcf, output_vcf, tumor_sample_name="tumor"): """ :param input_vcf: the input VCF file :param output_vcf: the file path of the output VCF @@ -26,10 +26,18 @@ def __init__(self, input_vcf, output_vcf, tumor_sample_name='tumor'): # sets a line in the header with the command used to annotate the file self.vafator_header["input_vcf"] = input_vcf self.vafator_header["output_vcf"] = output_vcf - self.vcf.add_to_header("##vafator_command_line={}".format(json.dumps(self.vafator_header))) + self.vcf.add_to_header( + "##vafator_command_line={}".format(json.dumps(self.vafator_header)) + ) # adds to the header all the names of the annotations - self.vcf.add_info_to_header({'ID': self.multiallelic_annotation_name, 'Type': 'String', 'Number': '.', - 'Description': "Indicates multiallelic variants filtered and their frequencies if any (e.g.: T,0.12)"}) + self.vcf.add_info_to_header( + { + "ID": self.multiallelic_annotation_name, + "Type": "String", + "Number": ".", + "Description": "Indicates multiallelic variants filtered and their frequencies if any (e.g.: T,0.12)", + } + ) self.vcf_writer = Writer(output_vcf, self.vcf) def run(self): @@ -41,8 +49,12 @@ def run(self): prev_variant = variant continue # considers only SNVs with same chromosome, position and reference - if variant.is_snp and variant.CHROM == prev_variant.CHROM and variant.POS == prev_variant.POS and \ - variant.REF == prev_variant.REF: + if ( + variant.is_snp + and variant.CHROM == prev_variant.CHROM + and variant.POS == prev_variant.POS + and variant.REF == prev_variant.REF + ): af1 = self.get_tumor_af(prev_variant) af2 = self.get_tumor_af(variant) # keeps the variant with the highest AF @@ -56,7 +68,9 @@ def run(self): alt1 = variant.ALT[0] alt2 = prev_variant.ALT[0] prev_variant = random.sample([variant, prev_variant], k=1)[0] - self.set_multiallelic_annotation(prev_variant, alt1 if alt1 != prev_variant.ALT[0] else alt2, af1) + self.set_multiallelic_annotation( + prev_variant, alt1 if alt1 != prev_variant.ALT[0] else alt2, af1 + ) continue # write previous variant and stores current for next iteration @@ -74,8 +88,10 @@ def run(self): self.vcf.close() def set_multiallelic_annotation(self, variant, alt, af): - variant.INFO[self.multiallelic_annotation_name] = \ - ",".join(variant.INFO.get(self.multiallelic_annotation_name, "").split(",") + [alt, str(af)]) + variant.INFO[self.multiallelic_annotation_name] = ",".join( + variant.INFO.get(self.multiallelic_annotation_name, "").split(",") + + [alt, str(af)] + ) def get_tumor_af(self, prev_variant): return prev_variant.INFO.get("{}_af".format(self.tumor_sample_name), 0.0) diff --git a/vafator/pileup_utils.py b/vafator/pileup_utils.py index 1048e7c..22f72a6 100755 --- a/vafator/pileup_utils.py +++ b/vafator/pileup_utils.py @@ -17,6 +17,7 @@ class VariantRecord: REF: reference allele ALT: list of alternate alleles """ + CHROM: str POS: int REF: str @@ -37,6 +38,7 @@ class CoverageMetrics: all_mqs: full mapping quality distribution per allele all_positions: full read position distribution per allele """ + ac: dict dp: int bqs: dict = None @@ -48,8 +50,14 @@ class CoverageMetrics: EMPTY_METRICS = CoverageMetrics( - ac=Counter(), dp=0, bqs=Counter(), mqs=Counter(), positions=Counter(), - all_bqs={}, all_mqs={}, all_positions={} + ac=Counter(), + dp=0, + bqs=Counter(), + mqs=Counter(), + positions=Counter(), + all_bqs={}, + all_mqs={}, + all_positions={}, ) @@ -108,4 +116,4 @@ def is_deletion(variant) -> bool: Args: variant: any object with REF and ALT attributes (Variant or VariantRecord) """ - return len(variant.ALT[0]) == 1 and len(variant.REF) > 1 \ No newline at end of file + return len(variant.ALT[0]) == 1 and len(variant.REF) > 1 diff --git a/vafator/pileups.py b/vafator/pileups.py index e85aa53..47ec377 100755 --- a/vafator/pileups.py +++ b/vafator/pileups.py @@ -5,11 +5,23 @@ from pysam.libcalignmentfile import IteratorColumnRegion, AlignmentFile from vafator.constants import AMBIGUOUS_BASES -from vafator.pileup_utils import VariantRecord, CoverageMetrics, safe_median, aggregate_list_per_base, is_snp, is_insertion, is_deletion +from vafator.pileup_utils import ( + VariantRecord, + CoverageMetrics, + safe_median, + aggregate_list_per_base, + is_snp, + is_insertion, + is_deletion, +) + def get_variant_pileup( - variant: Union[Variant, VariantRecord], bam: AlignmentFile, - min_base_quality: int, min_mapping_quality: int) -> IteratorColumnRegion: + variant: Union[Variant, VariantRecord], + bam: AlignmentFile, + min_base_quality: int, + min_mapping_quality: int, +) -> IteratorColumnRegion: """Open a pileup iterator at a single variant position. Kept for backwards compatibility and use in tests. @@ -31,12 +43,18 @@ def get_variant_pileup( max_depth=1000000, min_base_quality=min_base_quality, min_mapping_quality=min_mapping_quality, - stepper='samtools' - ) + stepper="samtools", + ) -def get_region_pileup(chrom: str, start: int, end: int, bam: AlignmentFile, - min_base_quality: int, min_mapping_quality: int): +def get_region_pileup( + chrom: str, + start: int, + end: int, + bam: AlignmentFile, + min_base_quality: int, + min_mapping_quality: int, +): """Open a single pileup iterator spanning a genomic region. Args: @@ -58,8 +76,8 @@ def get_region_pileup(chrom: str, start: int, end: int, bam: AlignmentFile, max_depth=1000000, min_base_quality=min_base_quality, min_mapping_quality=min_mapping_quality, - stepper='samtools' - ) + stepper="samtools", + ) def stream_variants_by_chrom(vcf) -> Iterator[Tuple[str, List[Variant]]]: @@ -86,12 +104,13 @@ def stream_variants_by_chrom(vcf) -> Iterator[Tuple[str, List[Variant]]]: def collect_metrics_for_chrom( - chrom: str, - variants: List[Variant], - bam: AlignmentFile, - min_base_quality: int, - min_mapping_quality: int, - include_ambiguous_bases: bool = False) -> Dict[Tuple, CoverageMetrics]: + chrom: str, + variants: List[Variant], + bam: AlignmentFile, + min_base_quality: int, + min_mapping_quality: int, + include_ambiguous_bases: bool = False, +) -> Dict[Tuple, CoverageMetrics]: """Compute pileup metrics for all variants on a chromosome using a single pileup iterator. Metrics are computed immediately for each pileup column while it is still valid — @@ -115,8 +134,8 @@ def collect_metrics_for_chrom( for v in variants: variants_by_pos[v.POS].append(v) - start = variants[0].POS - 1 # 0-based inclusive - end = variants[-1].POS # exclusive end for pysam + start = variants[0].POS - 1 # 0-based inclusive + end = variants[-1].POS # exclusive end for pysam results: Dict[Tuple, CoverageMetrics] = {} for pileup_col in get_region_pileup( @@ -125,21 +144,24 @@ def collect_metrics_for_chrom( end=end, bam=bam, min_base_quality=min_base_quality, - min_mapping_quality=min_mapping_quality + min_mapping_quality=min_mapping_quality, ): ref_pos = pileup_col.reference_pos + 1 # convert to 1-based if ref_pos not in variants_by_pos: continue for variant in variants_by_pos[ref_pos]: - metrics = _get_metrics_from_column(variant, pileup_col, include_ambiguous_bases) + metrics = _get_metrics_from_column( + variant, pileup_col, include_ambiguous_bases + ) if metrics is not None: results[(ref_pos, variant.REF, variant.ALT[0])] = metrics return results -def _get_metrics_from_column(variant, pileup_col, - include_ambiguous_bases: bool = True) -> CoverageMetrics: +def _get_metrics_from_column( + variant, pileup_col, include_ambiguous_bases: bool = True +) -> CoverageMetrics: """Dispatch pileup metrics computation based on variant type. Args: @@ -159,7 +181,9 @@ def _get_metrics_from_column(variant, pileup_col, return None -def _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases: bool = True) -> CoverageMetrics: +def _get_snv_metrics_from_column( + pileup_col, include_ambiguous_bases: bool = True +) -> CoverageMetrics: """Compute SNV metrics from a pileup column. Deletions at the position are represented as empty string bases and included in depth. @@ -203,8 +227,14 @@ def _get_snv_metrics_from_column(pileup_col, include_ambiguous_bases: bool = Tru dp = sum(1 for b in bases if b == "" or b not in AMBIGUOUS_BASES) return CoverageMetrics( - ac=ac, dp=dp, bqs=bqs, mqs=mqs, positions=positions, - all_bqs=all_bqs, all_mqs=all_mqs, all_positions=all_positions + ac=ac, + dp=dp, + bqs=bqs, + mqs=mqs, + positions=positions, + all_bqs=all_bqs, + all_mqs=all_mqs, + all_positions=all_positions, ) @@ -239,17 +269,21 @@ def _get_insertion_metrics_from_column(variant, pileup_col) -> CoverageMetrics: index = pileup_read.alignment.reference_start relative_position = 0 for cigar_type, cigar_length in pileup_read.alignment.cigartuples: - if cigar_type in [0, 2, 3, 7, 8]: # consumes reference M, D, N, =, X + if cigar_type in [0, 2, 3, 7, 8]: # consumes reference M, D, N, =, X index += cigar_length if index > variant_position: break - if cigar_type in [0, 1, 4, 7, 8]: # consumes query M, I, S, =, X + if cigar_type in [0, 1, 4, 7, 8]: # consumes query M, I, S, =, X relative_position += cigar_length - if cigar_type == 1: # does not count I + if cigar_type == 1: # does not count I insertion_in_query = pileup_read.alignment.query[ - relative_position:relative_position + insertion_length] - if index == variant_position and cigar_length == insertion_length \ - and insertion == insertion_in_query: + relative_position : relative_position + insertion_length + ] + if ( + index == variant_position + and cigar_length == insertion_length + and insertion == insertion_in_query + ): # the read contains the insertion ac[alt_upper] += 1 mq[alt_upper].append(pileup_read.alignment.mapping_quality) @@ -260,13 +294,14 @@ def _get_insertion_metrics_from_column(variant, pileup_col) -> CoverageMetrics: pos[variant.REF].append(pileup_read.query_position_or_next) return CoverageMetrics( - ac=Counter(ac), dp=dp, + ac=Counter(ac), + dp=dp, mqs=Counter({k: safe_median(l) for k, l in mq.items()}), positions=Counter({k: safe_median(l) for k, l in pos.items()}), bqs=Counter(), all_mqs={k: l for k, l in mq.items()}, all_positions={k: l for k, l in pos.items()}, - all_bqs=Counter() + all_bqs=Counter(), ) @@ -298,7 +333,7 @@ def _get_deletion_metrics_from_column(variant, pileup_col) -> CoverageMetrics: if pileup_read.indel < 0: start = pileup_read.alignment.reference_start for cigar_type, cigar_length in pileup_read.alignment.cigartuples: - if cigar_type in [0, 3, 7, 8]: # consumes reference M, N, =, X + if cigar_type in [0, 3, 7, 8]: # consumes reference M, N, =, X start += cigar_length elif cigar_type == 2: if start == variant_position and cigar_length == deletion_length: @@ -316,11 +351,12 @@ def _get_deletion_metrics_from_column(variant, pileup_col) -> CoverageMetrics: pos[variant.REF].append(pileup_read.query_position_or_next) return CoverageMetrics( - ac=Counter(ac), dp=dp, + ac=Counter(ac), + dp=dp, mqs=Counter({k: safe_median(l) for k, l in mq.items()}), positions=Counter({k: safe_median(l) for k, l in pos.items()}), bqs=Counter(), all_mqs={k: l for k, l in mq.items()}, all_positions={k: l for k, l in pos.items()}, - all_bqs=Counter() - ) \ No newline at end of file + all_bqs=Counter(), + ) diff --git a/vafator/ploidies.py b/vafator/ploidies.py index 00748db..31afc6f 100755 --- a/vafator/ploidies.py +++ b/vafator/ploidies.py @@ -10,14 +10,27 @@ class PloidyManager: - def __init__(self, local_copy_numbers: str = None, genome_wide_ploidy: float = DEFAULT_PLOIDY): + def __init__( + self, local_copy_numbers: str = None, genome_wide_ploidy: float = DEFAULT_PLOIDY + ): if local_copy_numbers is not None and not os.path.exists(local_copy_numbers): - raise ValueError('The provided tumor ploidy is neither a copy number value or a BED file with copy ' - 'numbers') - self.report_value = local_copy_numbers if local_copy_numbers else genome_wide_ploidy - self.bed = pd.read_csv(local_copy_numbers, sep='\t', names=['chromosome', 'start', 'end', 'copy_number']) \ - if local_copy_numbers is not None else None + raise ValueError( + "The provided tumor ploidy is neither a copy number value or a BED file with copy " + "numbers" + ) + self.report_value = ( + local_copy_numbers if local_copy_numbers else genome_wide_ploidy + ) + self.bed = ( + pd.read_csv( + local_copy_numbers, + sep="\t", + names=["chromosome", "start", "end", "copy_number"], + ) + if local_copy_numbers is not None + else None + ) self.ploidy = genome_wide_ploidy def get_ploidy(self, variant: Union[Variant, VariantRecord]) -> float: @@ -26,9 +39,11 @@ def get_ploidy(self, variant: Union[Variant, VariantRecord]) -> float: if self.bed is not None: # read from the BED file # NOTE: converts variant position from 1-based into 0-based and considers intervals as half-closed - hits = self.bed[(self.bed.chromosome == variant.CHROM) & - (self.bed.start <= variant.POS - 1) & - (self.bed.end > variant.POS - 1)] + hits = self.bed[ + (self.bed.chromosome == variant.CHROM) + & (self.bed.start <= variant.POS - 1) + & (self.bed.end > variant.POS - 1) + ] if hits.shape[0] > 0: result = float(hits.copy_number.iloc[0]) diff --git a/vafator/power.py b/vafator/power.py index 93ee5a3..e051e24 100644 --- a/vafator/power.py +++ b/vafator/power.py @@ -7,19 +7,20 @@ DEFAULT_PURITY = 1.0 DEFAULT_NORMAL_PLOIDY = 2 -DEFAULT_FPR = 5*(10**-7) +DEFAULT_FPR = 5 * (10**-7) DEFAULT_ERROR_RATE = 10**-3 class PowerCalculator: def __init__( - self, - tumor_ploidies: dict, - purities: dict, - normal_ploidy: int = DEFAULT_NORMAL_PLOIDY, - fpr: float = DEFAULT_FPR, - error_rate: float = DEFAULT_ERROR_RATE): + self, + tumor_ploidies: dict, + purities: dict, + normal_ploidy: int = DEFAULT_NORMAL_PLOIDY, + fpr: float = DEFAULT_FPR, + error_rate: float = DEFAULT_ERROR_RATE, + ): self.normal_ploidy = normal_ploidy self.purities = purities @@ -35,7 +36,9 @@ def __init__( # when using genome-wide ploidy (most common case) this is the same value for all variants self._eaf_cache: dict = {} - def calculate_power(self, dp: int, ac: int, sample: str, variant: Optional[Variant]) -> float: + def calculate_power( + self, dp: int, ac: int, sample: str, variant: Optional[Variant] + ) -> float: """ Return the binomial probability of observing ac or less supporting reads, given a total coverage dp and a expected VAF tumor purity / 2. @@ -55,13 +58,24 @@ def calculate_expected_vaf(self, sample: str, variant: Optional[Variant]) -> flo """ # cache key: use variant position for local copy number lookups, # or just sample name when genome-wide ploidy is used (most common case) - cache_key = (sample, variant.CHROM if variant else None, variant.POS if variant else None) + cache_key = ( + sample, + variant.CHROM if variant else None, + variant.POS if variant else None, + ) if cache_key in self._eaf_cache: return self._eaf_cache[cache_key] purity = self.purities.get(sample, DEFAULT_PURITY) - tumor_ploidy = max(1, self.tumor_ploidies.get(sample, default_ploidy_manager).get_ploidy(variant=variant)) - corrected_tumor_ploidy = purity * tumor_ploidy + ((1 - purity) * self.normal_ploidy) + tumor_ploidy = max( + 1, + self.tumor_ploidies.get(sample, default_ploidy_manager).get_ploidy( + variant=variant + ), + ) + corrected_tumor_ploidy = purity * tumor_ploidy + ( + (1 - purity) * self.normal_ploidy + ) expected_vaf = purity / corrected_tumor_ploidy self._eaf_cache[cache_key] = expected_vaf @@ -109,5 +123,9 @@ def calculate_absolute_power(self, sample, variant, dp: int) -> float: n = dp f = self.calculate_expected_vaf(sample, variant) # avoid instantiating a frozen binom distribution object — use module-level functions directly - power = 1 - binom.cdf(k=k - 1, n=n, p=f) + self._calculate_d(k=k, n=n) * binom.pmf(k=k, n=n, p=f) - return round(power, 5), k \ No newline at end of file + power = ( + 1 + - binom.cdf(k=k - 1, n=n, p=f) + + self._calculate_d(k=k, n=n) * binom.pmf(k=k, n=n, p=f) + ) + return round(power, 5), k diff --git a/vafator/rank_sum_test.py b/vafator/rank_sum_test.py index b9a3b4d..2314572 100644 --- a/vafator/rank_sum_test.py +++ b/vafator/rank_sum_test.py @@ -3,7 +3,9 @@ import numpy as np -def calculate_rank_sum_test(alternate_dist: List[int], reference_dist: List[int]) -> Tuple[float, float]: +def calculate_rank_sum_test( + alternate_dist: List[int], reference_dist: List[int] +) -> Tuple[float, float]: if not alternate_dist or not reference_dist: # skip empty distributions return np.nan, np.nan stat, pvalue = scipy.stats.ranksums(x=alternate_dist, y=reference_dist) @@ -16,10 +18,9 @@ def get_rank_sum_tests(distributions: dict, variant): for alt in variant.ALT: stat, pvalue = calculate_rank_sum_test( alternate_dist=distributions.get(alt, []), - reference_dist=distributions.get(variant.REF, [])) + reference_dist=distributions.get(variant.REF, []), + ) if not np.isnan(stat) and not np.isnan(pvalue): stats.append(str(stat)) pvalues.append(str(pvalue)) return pvalues, stats - - diff --git a/vafator/tests/test_annotator.py b/vafator/tests/test_annotator.py index 7bcb716..8172870 100755 --- a/vafator/tests/test_annotator.py +++ b/vafator/tests/test_annotator.py @@ -13,14 +13,43 @@ EXPECTED_ANNOTATIONS = [ - 'af', 'dp', 'ac', 'n', 'pu', 'pw', 'k', 'eaf', 'bq', 'mq', 'pos', 'rsmq', 'rsmq_pv', 'rsbq', 'rsbq_pv', 'rspos', - 'rspos_pv' + "af", + "dp", + "ac", + "n", + "pu", + "pw", + "k", + "eaf", + "bq", + "mq", + "pos", + "rsmq", + "rsmq_pv", + "rsbq", + "rsbq_pv", + "rspos", + "rspos_pv", ] # replicates do not have EAF annotation EXPECTED_ANNOTATIONS_REPLICATES = [ - 'af', 'dp', 'ac', 'n', 'pu', 'pw', 'k', 'bq', 'mq', 'pos', 'rsmq', 'rsmq_pv', 'rsbq', 'rsbq_pv', 'rspos', - 'rspos_pv' + "af", + "dp", + "ac", + "n", + "pu", + "pw", + "k", + "bq", + "mq", + "pos", + "rsmq", + "rsmq_pv", + "rsbq", + "rsbq_pv", + "rspos", + "rspos_pv", ] @@ -28,11 +57,16 @@ class TestAnnotator(TestCase): def test_annotator(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_annotator1_output.vcf") + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test_annotator1_output.vcf" + ) bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam") bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam") annotator = Annotator( - input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam1], "tumor": [bam2]}) + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"normal": [bam1], "tumor": [bam2]}, + ) annotator.run() self.assertTrue(os.path.exists(output_vcf)) @@ -47,11 +81,16 @@ def test_annotator(self): def test_annotator_with_multiple_bams(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_annotator1_output.vcf") + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test_annotator1_output.vcf" + ) bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam") bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam") annotator = Annotator( - input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam1, bam2], "tumor": [bam1, bam2]}) + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"normal": [bam1, bam2], "tumor": [bam1, bam2]}, + ) annotator.run() self.assertTrue(os.path.exists(output_vcf)) @@ -61,19 +100,27 @@ def test_annotator_with_multiple_bams(self): info_annotations = test_utils._get_info_fields(output_vcf) for a in EXPECTED_ANNOTATIONS_REPLICATES: - self.assertTrue("tumor_{}_1".format(a) in info_annotations, - "Missing annotation tumor_{}_1".format(a)) - self.assertTrue("normal_{}_1".format(a) in info_annotations, - "Missing annotation normal_{}_1".format(a)) + self.assertTrue( + "tumor_{}_1".format(a) in info_annotations, + "Missing annotation tumor_{}_1".format(a), + ) + self.assertTrue( + "normal_{}_1".format(a) in info_annotations, + "Missing annotation normal_{}_1".format(a), + ) def test_annotator_with_prefix(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_annotator1_output.vcf") + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test_annotator1_output.vcf" + ) bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam") bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam") annotator = Annotator( - input_vcf=input_file, output_vcf=output_vcf, - input_bams={"RNA_normal": [bam1, bam2], "RNA_tumor": [bam1, bam2]}) + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"RNA_normal": [bam1, bam2], "RNA_tumor": [bam1, bam2]}, + ) annotator.run() self.assertTrue(os.path.exists(output_vcf)) @@ -83,19 +130,29 @@ def test_annotator_with_prefix(self): info_annotations = test_utils._get_info_fields(output_vcf) for a in EXPECTED_ANNOTATIONS_REPLICATES: - self.assertTrue("RNA_tumor_{}_1".format(a) in info_annotations, - "Missing annotation RNA_tumor_{}_1".format(a)) - self.assertTrue("RNA_normal_{}_1".format(a) in info_annotations, - "Missing annotation RNA_normal_{}_1".format(a)) + self.assertTrue( + "RNA_tumor_{}_1".format(a) in info_annotations, + "Missing annotation RNA_tumor_{}_1".format(a), + ) + self.assertTrue( + "RNA_normal_{}_1".format(a) in info_annotations, + "Missing annotation RNA_normal_{}_1".format(a), + ) def test_annotator_with_mnvs(self): - input_file = pkg_resources.resource_filename(__name__, "resources/test_tumor_normal.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_tumor_normal_output.vcf") + input_file = pkg_resources.resource_filename( + __name__, "resources/test_tumor_normal.vcf" + ) + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test_tumor_normal_output.vcf" + ) bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam") bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam") annotator = Annotator( - input_vcf=input_file, output_vcf=output_vcf, - input_bams={"RNA_normal": [bam1, bam2], "RNA_tumor": [bam1, bam2]}) + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"RNA_normal": [bam1, bam2], "RNA_tumor": [bam1, bam2]}, + ) annotator.run() self.assertTrue(os.path.exists(output_vcf)) @@ -105,10 +162,14 @@ def test_annotator_with_mnvs(self): info_annotations = test_utils._get_info_fields(output_vcf) for a in EXPECTED_ANNOTATIONS_REPLICATES: - self.assertTrue("RNA_tumor_{}_1".format(a) in info_annotations, - "Missing annotation RNA_tumor_{}_1".format(a)) - self.assertTrue("RNA_normal_{}_1".format(a) in info_annotations, - "Missing annotation RNA_normal_{}_1".format(a)) + self.assertTrue( + "RNA_tumor_{}_1".format(a) in info_annotations, + "Missing annotation RNA_tumor_{}_1".format(a), + ) + self.assertTrue( + "RNA_normal_{}_1".format(a) in info_annotations, + "Missing annotation RNA_normal_{}_1".format(a), + ) def _get_info_at(self, input_file, chromosome, position, annotation): vcf = VCF(input_file) @@ -122,19 +183,27 @@ def _get_info_at(self, input_file, chromosome, position, annotation): def test_nist(self): input_file = pkg_resources.resource_filename( - __name__, "resources/project.NIST.hc.snps.indels.chr1_1000000_2000000.vcf") + __name__, "resources/project.NIST.hc.snps.indels.chr1_1000000_2000000.vcf" + ) output_vcf = pkg_resources.resource_filename( - __name__, "resources/results/project.NIST.hc.snps.indels.chr1_1000000_2000000.vaf.vcf") + __name__, + "resources/results/project.NIST.hc.snps.indels.chr1_1000000_2000000.vaf.vcf", + ) bam_file = pkg_resources.resource_filename( __name__, - "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam") + "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam", + ) start = time.time() - annotator = Annotator(input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam_file]}) + annotator = Annotator( + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"normal": [bam_file]}, + ) annotator.run() duration = time.time() - start logger.info("Duration {} seconds".format(round(duration, 3))) - self._assert_vafator_vcf(output_vcf, sample_name='normal') + self._assert_vafator_vcf(output_vcf, sample_name="normal") n_variants_input = test_utils._get_count_variants(input_file) n_variants_output = test_utils._get_count_variants(output_vcf) @@ -145,105 +214,125 @@ def test_nist(self): self.assertTrue("normal_ac" in info_annotations) self.assertTrue("normal_dp" in info_annotations) - variant = test_utils._get_mutation_at_position(output_vcf, 'chr1', 1506035) + variant = test_utils._get_mutation_at_position(output_vcf, "chr1", 1506035) self.assertIsNotNone(variant) - self.assertEqual(variant.INFO['normal_ac'], 3) - self.assertEqual(variant.INFO['normal_dp'], 3) - self.assertEqual(variant.INFO['normal_af'], 1.0) - self.assertEqual(variant.INFO['normal_pu'], 1.0) - self.assertEqual(variant.INFO['normal_eaf'], 0.5) - self.assertEqual(variant.INFO['normal_mq'][0], 0) - self.assertEqual(variant.INFO['normal_mq'][1], 60.0) - self.assertEqual(variant.INFO['normal_bq'][0], 0) - self.assertEqual(variant.INFO['normal_bq'][1], 37.0) - self.assertEqual(variant.INFO['normal_pos'][0], 0) - self.assertEqual(variant.INFO['normal_pos'][1], 56.0) - - variant = test_utils._get_mutation_at_position(output_vcf, 'chr1', 1509825) + self.assertEqual(variant.INFO["normal_ac"], 3) + self.assertEqual(variant.INFO["normal_dp"], 3) + self.assertEqual(variant.INFO["normal_af"], 1.0) + self.assertEqual(variant.INFO["normal_pu"], 1.0) + self.assertEqual(variant.INFO["normal_eaf"], 0.5) + self.assertEqual(variant.INFO["normal_mq"][0], 0) + self.assertEqual(variant.INFO["normal_mq"][1], 60.0) + self.assertEqual(variant.INFO["normal_bq"][0], 0) + self.assertEqual(variant.INFO["normal_bq"][1], 37.0) + self.assertEqual(variant.INFO["normal_pos"][0], 0) + self.assertEqual(variant.INFO["normal_pos"][1], 56.0) + + variant = test_utils._get_mutation_at_position(output_vcf, "chr1", 1509825) self.assertIsNotNone(variant) - self.assertEqual(variant.INFO['normal_ac'], 13) - self.assertEqual(variant.INFO['normal_dp'], 20) + self.assertEqual(variant.INFO["normal_ac"], 13) + self.assertEqual(variant.INFO["normal_dp"], 20) # these values are rounded to six digits inside the VCF, not sure why when read the representation is # different... - self.assertEqual(round(variant.INFO['normal_af'], 5), 0.65) - self.assertEqual(round(variant.INFO['normal_pu'], 5), 0.94234) - self.assertEqual(variant.INFO['normal_eaf'], 0.5) - self.assertEqual(variant.INFO['normal_mq'][0], 60.0) - self.assertEqual(variant.INFO['normal_mq'][1], 60.0) - self.assertEqual(variant.INFO['normal_bq'][0], 37.0) - self.assertEqual(variant.INFO['normal_bq'][1], 35.0) - self.assertEqual(variant.INFO['normal_pos'][0], 31.0) - self.assertEqual(variant.INFO['normal_pos'][1], 41.0) + self.assertEqual(round(variant.INFO["normal_af"], 5), 0.65) + self.assertEqual(round(variant.INFO["normal_pu"], 5), 0.94234) + self.assertEqual(variant.INFO["normal_eaf"], 0.5) + self.assertEqual(variant.INFO["normal_mq"][0], 60.0) + self.assertEqual(variant.INFO["normal_mq"][1], 60.0) + self.assertEqual(variant.INFO["normal_bq"][0], 37.0) + self.assertEqual(variant.INFO["normal_bq"][1], 35.0) + self.assertEqual(variant.INFO["normal_pos"][0], 31.0) + self.assertEqual(variant.INFO["normal_pos"][1], 41.0) # this is a deletion - variant = test_utils._get_mutation_at_position(output_vcf, 'chr1', 1323143) + variant = test_utils._get_mutation_at_position(output_vcf, "chr1", 1323143) self.assertIsNotNone(variant) - self.assertEqual(variant.INFO['normal_ac'], 20) - self.assertEqual(variant.INFO['normal_dp'], 21) + self.assertEqual(variant.INFO["normal_ac"], 20) + self.assertEqual(variant.INFO["normal_dp"], 21) # these values are rounded to six digits inside the VCF, not sure why when read the representation is # different... - self.assertEqual(round(variant.INFO['normal_af'], 5), 0.95238) - self.assertEqual(round(variant.INFO['normal_pu'], 5), 1.0) - self.assertEqual(variant.INFO['normal_eaf'], 0.5) - self.assertEqual(variant.INFO['normal_mq'][0], 60.0) - self.assertEqual(variant.INFO['normal_mq'][1], 60.0) - self.assertEqual(variant.INFO['normal_bq'][0], 0.0) - self.assertEqual(variant.INFO['normal_bq'][1], 0.0) - self.assertEqual(variant.INFO['normal_pos'][0], 50.0) - self.assertEqual(variant.INFO['normal_pos'][1], 21.0) + self.assertEqual(round(variant.INFO["normal_af"], 5), 0.95238) + self.assertEqual(round(variant.INFO["normal_pu"], 5), 1.0) + self.assertEqual(variant.INFO["normal_eaf"], 0.5) + self.assertEqual(variant.INFO["normal_mq"][0], 60.0) + self.assertEqual(variant.INFO["normal_mq"][1], 60.0) + self.assertEqual(variant.INFO["normal_bq"][0], 0.0) + self.assertEqual(variant.INFO["normal_bq"][1], 0.0) + self.assertEqual(variant.INFO["normal_pos"][0], 50.0) + self.assertEqual(variant.INFO["normal_pos"][1], 21.0) # this is an insertion - variant = test_utils._get_mutation_at_position(output_vcf, 'chr1', 1935367) + variant = test_utils._get_mutation_at_position(output_vcf, "chr1", 1935367) self.assertIsNotNone(variant) - self.assertEqual(variant.INFO['normal_ac'], 1) - self.assertEqual(variant.INFO['normal_dp'], 2) + self.assertEqual(variant.INFO["normal_ac"], 1) + self.assertEqual(variant.INFO["normal_dp"], 2) # these values are rounded to six digits inside the VCF, not sure why when read the representation is # different... - self.assertEqual(round(variant.INFO['normal_af'], 5), 0.5) - self.assertEqual(round(variant.INFO['normal_pu'], 5), 0.75) - self.assertEqual(variant.INFO['normal_eaf'], 0.5) - self.assertEqual(variant.INFO['normal_mq'][0], 60.0) - self.assertEqual(variant.INFO['normal_mq'][1], 29.0) - self.assertEqual(variant.INFO['normal_bq'][0], 0.0) - self.assertEqual(variant.INFO['normal_bq'][1], 0.0) - self.assertEqual(variant.INFO['normal_pos'][0], 95.0) - self.assertEqual(variant.INFO['normal_pos'][1], 56.0) + self.assertEqual(round(variant.INFO["normal_af"], 5), 0.5) + self.assertEqual(round(variant.INFO["normal_pu"], 5), 0.75) + self.assertEqual(variant.INFO["normal_eaf"], 0.5) + self.assertEqual(variant.INFO["normal_mq"][0], 60.0) + self.assertEqual(variant.INFO["normal_mq"][1], 29.0) + self.assertEqual(variant.INFO["normal_bq"][0], 0.0) + self.assertEqual(variant.INFO["normal_bq"][1], 0.0) + self.assertEqual(variant.INFO["normal_pos"][0], 95.0) + self.assertEqual(variant.INFO["normal_pos"][1], 56.0) def test_nist_with_replicates(self): input_file = pkg_resources.resource_filename( - __name__, "resources/project.NIST.hc.snps.indels.chr1_1000000_2000000.vcf") + __name__, "resources/project.NIST.hc.snps.indels.chr1_1000000_2000000.vcf" + ) output_vcf = pkg_resources.resource_filename( - __name__, "resources/results/project.NIST.hc.snps.indels.chr1_1000000_2000000.vaf_replicates.vcf") + __name__, + "resources/results/project.NIST.hc.snps.indels.chr1_1000000_2000000.vaf_replicates.vcf", + ) bam_file = pkg_resources.resource_filename( __name__, - "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam") + "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam", + ) start = time.time() - annotator = Annotator(input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam_file]}) + annotator = Annotator( + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"normal": [bam_file]}, + ) annotator.run() duration = time.time() - start logger.info("Duration {} seconds".format(round(duration, 3))) - self._assert_vafator_vcf(output_vcf, sample_name='normal') - self._assert_vafator_vcf(output_vcf, sample_name='normal', replicate=1) - self._assert_vafator_vcf(output_vcf, sample_name='normal', replicate=2) + self._assert_vafator_vcf(output_vcf, sample_name="normal") + self._assert_vafator_vcf(output_vcf, sample_name="normal", replicate=1) + self._assert_vafator_vcf(output_vcf, sample_name="normal", replicate=2) def test_annotator_bams_order(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_annotator1_output.vcf") - output_vcf_2 = pkg_resources.resource_filename(__name__, "resources/results/test_annotator2_output.vcf") + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test_annotator1_output.vcf" + ) + output_vcf_2 = pkg_resources.resource_filename( + __name__, "resources/results/test_annotator2_output.vcf" + ) bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam") bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam") - Annotator(input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam1], "tumor": [bam2]}).run() - Annotator(input_vcf=input_file, output_vcf=output_vcf_2, input_bams={"tumor": [bam2], "normal": [bam1]}).run() + Annotator( + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"normal": [bam1], "tumor": [bam2]}, + ).run() + Annotator( + input_vcf=input_file, + output_vcf=output_vcf_2, + input_bams={"tumor": [bam2], "normal": [bam1]}, + ).run() self.assertTrue(os.path.exists(output_vcf)) self.assertTrue(os.path.exists(output_vcf_2)) - self._assert_vafator_vcf(output_vcf, sample_name='normal') - self._assert_vafator_vcf(output_vcf, sample_name='tumor') - self._assert_vafator_vcf(output_vcf_2, sample_name='normal') - self._assert_vafator_vcf(output_vcf_2, sample_name='tumor') + self._assert_vafator_vcf(output_vcf, sample_name="normal") + self._assert_vafator_vcf(output_vcf, sample_name="tumor") + self._assert_vafator_vcf(output_vcf_2, sample_name="normal") + self._assert_vafator_vcf(output_vcf_2, sample_name="tumor") vcf = VCF(output_vcf) vcf_2 = VCF(output_vcf_2) @@ -253,20 +342,31 @@ def test_annotator_bams_order(self): self.assertEqual( v.INFO.get("normal_{}".format(a), ""), v2.INFO.get("normal_{}".format(a), ""), - "Variant {}:{}:{}>{} is missing annotation normal_{}".format(v.CHROM, v.POS, v.REF, v.ALT[0], a)) + "Variant {}:{}:{}>{} is missing annotation normal_{}".format( + v.CHROM, v.POS, v.REF, v.ALT[0], a + ), + ) self.assertEqual( v.INFO.get("tumor_{}".format(a), ""), v2.INFO.get("tumor_{}".format(a), ""), - "Variant {}:{}:{}>{} is missing annotation tumor_{}".format(v.CHROM, v.POS, v.REF, v.ALT[0], a)) + "Variant {}:{}:{}>{} is missing annotation tumor_{}".format( + v.CHROM, v.POS, v.REF, v.ALT[0], a + ), + ) def test_annotator_with_purities(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test_annotator1_output.vcf") + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test_annotator1_output.vcf" + ) bam1 = pkg_resources.resource_filename(__name__, "resources/COLO_829_n1.bam") bam2 = pkg_resources.resource_filename(__name__, "resources/COLO_829_t1.bam") annotator = Annotator( - input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam1], "tumor": [bam2]}, - purities={"tumor": 0.8}, tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.8)} + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"normal": [bam1], "tumor": [bam2]}, + purities={"tumor": 0.8}, + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.8)}, ) annotator.run() @@ -281,8 +381,11 @@ def test_annotator_with_purities(self): self.assertTrue("normal_{}".format(a) in info_annotations) annotator = Annotator( - input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam1], "tumor": [bam2]}, - purities={"tumor": 0.2}, tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=1.5)} + input_vcf=input_file, + output_vcf=output_vcf, + input_bams={"normal": [bam1], "tumor": [bam2]}, + purities={"tumor": 0.2}, + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=1.5)}, ) annotator.run() @@ -291,18 +394,69 @@ def _assert_vafator_vcf(self, vcf_filename, sample_name, replicate=None): vcf = VCF(vcf_filename) for v in vcf: # p-values or VAFs - self._assert_probability(v.INFO.get(self._get_annotation_name('rsbq_pv', sample_name, replicate=replicate), 0)) - self._assert_probability(v.INFO.get(self._get_annotation_name('rsmq_pv', sample_name, replicate=replicate), 0)) - self._assert_probability(v.INFO.get(self._get_annotation_name('rspos_pv', sample_name, replicate=replicate), 0)) - self._assert_probability(v.INFO.get(self._get_annotation_name('eaf', sample_name, replicate=replicate), 0)) - self._assert_probability(v.INFO.get(self._get_annotation_name('af', sample_name, replicate=replicate), 0)) + self._assert_probability( + v.INFO.get( + self._get_annotation_name( + "rsbq_pv", sample_name, replicate=replicate + ), + 0, + ) + ) + self._assert_probability( + v.INFO.get( + self._get_annotation_name( + "rsmq_pv", sample_name, replicate=replicate + ), + 0, + ) + ) + self._assert_probability( + v.INFO.get( + self._get_annotation_name( + "rspos_pv", sample_name, replicate=replicate + ), + 0, + ) + ) + self._assert_probability( + v.INFO.get( + self._get_annotation_name("eaf", sample_name, replicate=replicate), + 0, + ) + ) + self._assert_probability( + v.INFO.get( + self._get_annotation_name("af", sample_name, replicate=replicate), 0 + ) + ) # positive integer annotations - self._assert_positive_integer(v.INFO.get(self._get_annotation_name('ac', sample_name, replicate=replicate), 0)) - self._assert_positive_integer(v.INFO.get(self._get_annotation_name('dp', sample_name, replicate=replicate), 0)) - self._assert_positive_integer(v.INFO.get(self._get_annotation_name('mq', sample_name, replicate=replicate), 0)) - self._assert_positive_integer(v.INFO.get(self._get_annotation_name('bq', sample_name, replicate=replicate), 0)) - self._assert_positive_integer(v.INFO.get(self._get_annotation_name('pos', sample_name, replicate=replicate), 0)) + self._assert_positive_integer( + v.INFO.get( + self._get_annotation_name("ac", sample_name, replicate=replicate), 0 + ) + ) + self._assert_positive_integer( + v.INFO.get( + self._get_annotation_name("dp", sample_name, replicate=replicate), 0 + ) + ) + self._assert_positive_integer( + v.INFO.get( + self._get_annotation_name("mq", sample_name, replicate=replicate), 0 + ) + ) + self._assert_positive_integer( + v.INFO.get( + self._get_annotation_name("bq", sample_name, replicate=replicate), 0 + ) + ) + self._assert_positive_integer( + v.INFO.get( + self._get_annotation_name("pos", sample_name, replicate=replicate), + 0, + ) + ) vcf.close() @staticmethod @@ -315,13 +469,25 @@ def _get_annotation_name(annotation_name, sample_name, replicate=None): def _assert_probability(self, annotation): if isinstance(annotation, list) or isinstance(annotation, tuple): for a in annotation: - self.assertTrue(0.0 <= float(a) <= 1.0, "Expected probability has a value of {}".format(a)) + self.assertTrue( + 0.0 <= float(a) <= 1.0, + "Expected probability has a value of {}".format(a), + ) else: - self.assertTrue(0.0 <= float(annotation) <= 1.0, "Expected probability has a value of {}".format(annotation)) + self.assertTrue( + 0.0 <= float(annotation) <= 1.0, + "Expected probability has a value of {}".format(annotation), + ) def _assert_positive_integer(self, annotation): if isinstance(annotation, list) or isinstance(annotation, tuple): for a in annotation: - self.assertTrue(np.isnan(a) or 0.0 <= a, "Expected positive integer has a value of {}".format(a)) + self.assertTrue( + np.isnan(a) or 0.0 <= a, + "Expected positive integer has a value of {}".format(a), + ) else: - self.assertTrue(np.isnan(annotation) or 0.0 <= annotation, "Expected positive integer has a value of {}".format(annotation)) + self.assertTrue( + np.isnan(annotation) or 0.0 <= annotation, + "Expected positive integer has a value of {}".format(annotation), + ) diff --git a/vafator/tests/test_hatchet2bed.py b/vafator/tests/test_hatchet2bed.py index 814651f..9e5ee47 100644 --- a/vafator/tests/test_hatchet2bed.py +++ b/vafator/tests/test_hatchet2bed.py @@ -8,10 +8,24 @@ class Hatchet2bedTest(TestCase): def test_hatchet2bed(self): run_hatchet2bed( - input_file=pkg_resources.resource_filename(__name__, "resources/best.seg.minimal.ucn"), - output_prefix=pkg_resources.resource_filename(__name__, "resources/best.seg.minimal") + input_file=pkg_resources.resource_filename( + __name__, "resources/best.seg.minimal.ucn" + ), + output_prefix=pkg_resources.resource_filename( + __name__, "resources/best.seg.minimal" + ), ) self.assertTrue( - os.path.exists(pkg_resources.resource_filename(__name__, "resources/best.seg.minimal.my_tumor.bed"))) + os.path.exists( + pkg_resources.resource_filename( + __name__, "resources/best.seg.minimal.my_tumor.bed" + ) + ) + ) self.assertTrue( - os.path.exists(pkg_resources.resource_filename(__name__, "resources/best.seg.minimal.my_metastasis.bed"))) + os.path.exists( + pkg_resources.resource_filename( + __name__, "resources/best.seg.minimal.my_metastasis.bed" + ) + ) + ) diff --git a/vafator/tests/test_multiallelic_filter.py b/vafator/tests/test_multiallelic_filter.py index bdc15a0..1bc8f15 100755 --- a/vafator/tests/test_multiallelic_filter.py +++ b/vafator/tests/test_multiallelic_filter.py @@ -14,8 +14,12 @@ def setUp(self): def test_no_variants_filtered(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test1_output.vcf") - multiallelic_filter = MultiallelicFilter(input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name='tumor') + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test1_output.vcf" + ) + multiallelic_filter = MultiallelicFilter( + input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name="tumor" + ) multiallelic_filter.run() self.assertTrue(os.path.exists(output_vcf)) @@ -25,33 +29,53 @@ def test_no_variants_filtered(self): def test_two_variants_filtered(self): input_file = pkg_resources.resource_filename(__name__, "resources/test2.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test2_output.vcf") - multiallelic_filter = MultiallelicFilter(input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name='tumor') + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test2_output.vcf" + ) + multiallelic_filter = MultiallelicFilter( + input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name="tumor" + ) multiallelic_filter.run() self.assertTrue(os.path.exists(output_vcf)) n_variants_input = test_utils._get_count_variants(input_file) n_variants_output = test_utils._get_count_variants(output_vcf) - self.assertTrue(n_variants_input == n_variants_output + 3, "input:{}; output:{}".format( - n_variants_input, n_variants_output)) - - af1 = self._get_info_at(output_vcf, chromosome="chr4", position=1235, annotation='tumor_af') + self.assertTrue( + n_variants_input == n_variants_output + 3, + "input:{}; output:{}".format(n_variants_input, n_variants_output), + ) + + af1 = self._get_info_at( + output_vcf, chromosome="chr4", position=1235, annotation="tumor_af" + ) self.assertTrue(af1, 0.2) - multiallelic1 = self._get_info_at(output_vcf, chromosome="chr4", position=1235, annotation='multiallelic') + multiallelic1 = self._get_info_at( + output_vcf, chromosome="chr4", position=1235, annotation="multiallelic" + ) self.assertTrue(multiallelic1, "T,0.1") - af2 = self._get_info_at(output_vcf, chromosome="chr6", position=1235, annotation='tumor_af') + af2 = self._get_info_at( + output_vcf, chromosome="chr6", position=1235, annotation="tumor_af" + ) self.assertTrue(af2, 0.2) - multiallelic2 = self._get_info_at(output_vcf, chromosome="chr6", position=1235, annotation='multiallelic') + multiallelic2 = self._get_info_at( + output_vcf, chromosome="chr6", position=1235, annotation="multiallelic" + ) self.assertTrue(multiallelic2, "A,0.01") - af3 = self._get_info_at(output_vcf, chromosome="chr6", position=1234, annotation='tumor_af') + af3 = self._get_info_at( + output_vcf, chromosome="chr6", position=1234, annotation="tumor_af" + ) self.assertTrue(af3, 0.5) def test_different_reference_is_kept(self): input_file = pkg_resources.resource_filename(__name__, "resources/test3.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test3_output.vcf") - multiallelic_filter = MultiallelicFilter(input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name='tumor') + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test3_output.vcf" + ) + multiallelic_filter = MultiallelicFilter( + input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name="tumor" + ) multiallelic_filter.run() self.assertTrue(os.path.exists(output_vcf)) @@ -62,8 +86,12 @@ def test_different_reference_is_kept(self): def test_three_multiallelics(self): input_file = pkg_resources.resource_filename(__name__, "resources/test4.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test4_output.vcf") - multiallelic_filter = MultiallelicFilter(input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name='tumor') + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test4_output.vcf" + ) + multiallelic_filter = MultiallelicFilter( + input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name="tumor" + ) multiallelic_filter.run() self.assertTrue(os.path.exists(output_vcf)) @@ -74,8 +102,12 @@ def test_three_multiallelics(self): def test_equal_af(self): input_file = pkg_resources.resource_filename(__name__, "resources/test5.vcf") - output_vcf = pkg_resources.resource_filename(__name__, "resources/results/test5_output.vcf") - multiallelic_filter = MultiallelicFilter(input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name='tumor') + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/test5_output.vcf" + ) + multiallelic_filter = MultiallelicFilter( + input_vcf=input_file, output_vcf=output_vcf, tumor_sample_name="tumor" + ) multiallelic_filter.run() self.assertTrue(os.path.exists(output_vcf)) @@ -92,4 +124,4 @@ def _get_info_at(self, input_file, chromosome, position, annotation): vcf.close() return v.INFO.get(annotation) vcf.close() - return {} \ No newline at end of file + return {} diff --git a/vafator/tests/test_pileups.py b/vafator/tests/test_pileups.py index c0450e2..6ead2cd 100644 --- a/vafator/tests/test_pileups.py +++ b/vafator/tests/test_pileups.py @@ -5,7 +5,8 @@ from vafator.tests.utils import VafatorVariant from vafator.pileups import ( - get_variant_pileup, _get_metrics_from_column, + get_variant_pileup, + _get_metrics_from_column, ) @@ -15,66 +16,127 @@ class TestPileups(TestCase): min_mapping_quality = 0 bam_file = pkg_resources.resource_filename( __name__, - "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam") + "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam", + ) bam_reader = pysam.AlignmentFile(bam_file) def test_snv_metrics(self): - variant = VafatorVariant(chromosome="chr1", position=1017341, reference="G", alternative=["T"]) - self._assert_metrics(variant=variant, expected_ac={'G': 5, 'T': 6}, expected_dp=11) - self._assert_metrics(variant=variant, expected_ac={'G': 1, 'T': 2}, expected_dp=3, min_base_quality=40) + variant = VafatorVariant( + chromosome="chr1", position=1017341, reference="G", alternative=["T"] + ) + self._assert_metrics( + variant=variant, expected_ac={"G": 5, "T": 6}, expected_dp=11 + ) + self._assert_metrics( + variant=variant, + expected_ac={"G": 1, "T": 2}, + expected_dp=3, + min_base_quality=40, + ) def test_snv_metrics_2(self): - variant = VafatorVariant(chromosome="chr1", position=1018144, reference="T", alternative=["C"]) - self._assert_metrics(variant=variant, expected_ac={'C': 9, 'T': 11}, expected_dp=20) - self._assert_metrics(variant=variant, expected_ac={'C': 3, 'T': 4}, expected_dp=7, min_base_quality=40) - self._assert_metrics(variant=variant, expected_ac=Counter(), expected_dp=0, min_mapping_quality=65) + variant = VafatorVariant( + chromosome="chr1", position=1018144, reference="T", alternative=["C"] + ) + self._assert_metrics( + variant=variant, expected_ac={"C": 9, "T": 11}, expected_dp=20 + ) + self._assert_metrics( + variant=variant, + expected_ac={"C": 3, "T": 4}, + expected_dp=7, + min_base_quality=40, + ) + self._assert_metrics( + variant=variant, + expected_ac=Counter(), + expected_dp=0, + min_mapping_quality=65, + ) def test_insertion_metrics(self): # variant called in the VCF shows no read support (!?) - variant = VafatorVariant(chromosome="chr1", position=1247578, reference="T", alternative=["TGG"]) - self._assert_metrics(variant=variant, expected_ac={'TGG': 0}, expected_dp=3) + variant = VafatorVariant( + chromosome="chr1", position=1247578, reference="T", alternative=["TGG"] + ) + self._assert_metrics(variant=variant, expected_ac={"TGG": 0}, expected_dp=3) # there is one read supporting this insertion of 3 Gs - variant = VafatorVariant(chromosome="chr1", position=1247578, reference="T", alternative=["TGGG"]) - self._assert_metrics(variant=variant, expected_ac={'TGGG': 1}, expected_dp=3) + variant = VafatorVariant( + chromosome="chr1", position=1247578, reference="T", alternative=["TGGG"] + ) + self._assert_metrics(variant=variant, expected_ac={"TGGG": 1}, expected_dp=3) # this ensures that the insertion sequence is checked not only the insertion length! - variant = VafatorVariant(chromosome="chr1", position=1247578, reference="T", alternative=["TGGA"]) - self._assert_metrics(variant=variant, expected_ac={'TGGA': 0}, expected_dp=3) + variant = VafatorVariant( + chromosome="chr1", position=1247578, reference="T", alternative=["TGGA"] + ) + self._assert_metrics(variant=variant, expected_ac={"TGGA": 0}, expected_dp=3) # there is one read supporting this insertion of 4 Gs - variant = VafatorVariant(chromosome="chr1", position=1247578, reference="T", alternative=["TGGGG"]) - self._assert_metrics(variant=variant, expected_ac={'TGGGG': 1}, expected_dp=3) + variant = VafatorVariant( + chromosome="chr1", position=1247578, reference="T", alternative=["TGGGG"] + ) + self._assert_metrics(variant=variant, expected_ac={"TGGGG": 1}, expected_dp=3) # there is no read supporting an insertion of 5 Gs - variant = VafatorVariant(chromosome="chr1", position=1247578, reference="T", alternative=["TGGGGG"]) - self._assert_metrics(variant=variant, expected_ac={'TGGGGG': 0}, expected_dp=3) + variant = VafatorVariant( + chromosome="chr1", position=1247578, reference="T", alternative=["TGGGGG"] + ) + self._assert_metrics(variant=variant, expected_ac={"TGGGGG": 0}, expected_dp=3) def test_insertion_metrics_2(self): - variant = VafatorVariant(chromosome="chr1", position=1594199, reference="C", alternative=["CT"]) - self._assert_metrics(variant=variant, expected_ac={'CT': 9}, expected_dp=11) - self._assert_metrics(variant=variant, expected_ac={'CT': 5}, expected_dp=7, min_mapping_quality=40) - self._assert_metrics(variant=variant, expected_ac={'CT': 4}, expected_dp=4, min_base_quality=40) + variant = VafatorVariant( + chromosome="chr1", position=1594199, reference="C", alternative=["CT"] + ) + self._assert_metrics(variant=variant, expected_ac={"CT": 9}, expected_dp=11) + self._assert_metrics( + variant=variant, + expected_ac={"CT": 5}, + expected_dp=7, + min_mapping_quality=40, + ) + self._assert_metrics( + variant=variant, expected_ac={"CT": 4}, expected_dp=4, min_base_quality=40 + ) def test_deletion_metrics(self): - variant = VafatorVariant(chromosome="chr1", position=1510035, reference="GGC", alternative=["G"]) - self._assert_metrics(variant=variant, expected_ac={'G': 12}, expected_dp=13) - self._assert_metrics(variant=variant, expected_ac={'G': 0}, expected_dp=0, min_mapping_quality=61) - self._assert_metrics(variant=variant, expected_ac={'G': 3}, expected_dp=3, min_base_quality=40) + variant = VafatorVariant( + chromosome="chr1", position=1510035, reference="GGC", alternative=["G"] + ) + self._assert_metrics(variant=variant, expected_ac={"G": 12}, expected_dp=13) + self._assert_metrics( + variant=variant, expected_ac={"G": 0}, expected_dp=0, min_mapping_quality=61 + ) + self._assert_metrics( + variant=variant, expected_ac={"G": 3}, expected_dp=3, min_base_quality=40 + ) # deletions with a reference sequence not matching the reference would be matched # vafator expects correct indel calls - variant = VafatorVariant(chromosome="chr1", position=1510035, reference="GCC", alternative=["G"]) - self._assert_metrics(variant=variant, expected_ac={'G': 12}, expected_dp=13) + variant = VafatorVariant( + chromosome="chr1", position=1510035, reference="GCC", alternative=["G"] + ) + self._assert_metrics(variant=variant, expected_ac={"G": 12}, expected_dp=13) - def _assert_metrics(self, variant, expected_ac, expected_dp, - min_base_quality=0, min_mapping_quality=0): + def _assert_metrics( + self, + variant, + expected_ac, + expected_dp, + min_base_quality=0, + min_mapping_quality=0, + ): pileups = get_variant_pileup( - variant=variant, bam=self.bam_reader, - min_base_quality=min_base_quality, min_mapping_quality=min_mapping_quality) + variant=variant, + bam=self.bam_reader, + min_base_quality=min_base_quality, + min_mapping_quality=min_mapping_quality, + ) pileup_col = next(iter(pileups), None) if pileup_col is None: coverage_metrics = None else: coverage_metrics = _get_metrics_from_column( - variant=variant, pileup_col=pileup_col, include_ambiguous_bases=True) + variant=variant, pileup_col=pileup_col, include_ambiguous_bases=True + ) if expected_dp == 0: self.assertIsNone(coverage_metrics) else: self.assertEqual(expected_ac, coverage_metrics.ac) - self.assertEqual(expected_dp, coverage_metrics.dp) \ No newline at end of file + self.assertEqual(expected_dp, coverage_metrics.dp) diff --git a/vafator/tests/test_ploidy_manager.py b/vafator/tests/test_ploidy_manager.py index f90746f..4906095 100644 --- a/vafator/tests/test_ploidy_manager.py +++ b/vafator/tests/test_ploidy_manager.py @@ -8,40 +8,106 @@ class PloidyManagerTest(TestCase): def test_default_ploidy_manager(self): - self.assertEqual(PloidyManager().get_ploidy( - variant=VafatorVariant(chromosome="chr1", position=12345, reference="A", alternative="C")), 2.0) - self.assertEqual(default_ploidy_manager.get_ploidy( - variant=VafatorVariant(chromosome="chr1", position=12345, reference="A", alternative="C")), 2.0) + self.assertEqual( + PloidyManager().get_ploidy( + variant=VafatorVariant( + chromosome="chr1", position=12345, reference="A", alternative="C" + ) + ), + 2.0, + ) + self.assertEqual( + default_ploidy_manager.get_ploidy( + variant=VafatorVariant( + chromosome="chr1", position=12345, reference="A", alternative="C" + ) + ), + 2.0, + ) self.assertEqual(default_ploidy_manager.get_ploidy(variant=None), 2.0) def test_genome_wide_ploidy_manager(self): - self.assertEqual(PloidyManager(genome_wide_ploidy=3.2).get_ploidy( - variant=VafatorVariant(chromosome="chr1", position=12345, reference="A", alternative="C")), 3.2) - self.assertEqual(PloidyManager(genome_wide_ploidy=3.2).get_ploidy(variant=None), 3.2) + self.assertEqual( + PloidyManager(genome_wide_ploidy=3.2).get_ploidy( + variant=VafatorVariant( + chromosome="chr1", position=12345, reference="A", alternative="C" + ) + ), + 3.2, + ) + self.assertEqual( + PloidyManager(genome_wide_ploidy=3.2).get_ploidy(variant=None), 3.2 + ) def test_local_copy_numbers_ploidy_manager(self): - input_bed = pkg_resources.resource_filename(__name__, "resources/test_copy_numbers.bed") - self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( - variant=VafatorVariant(chromosome="chr1", position=12345, reference="A", alternative="C")), 1.2) - self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( - variant=VafatorVariant(chromosome="chr2", position=12345, reference="A", alternative="C")), 3.2) + input_bed = pkg_resources.resource_filename( + __name__, "resources/test_copy_numbers.bed" + ) + self.assertEqual( + PloidyManager(local_copy_numbers=input_bed).get_ploidy( + variant=VafatorVariant( + chromosome="chr1", position=12345, reference="A", alternative="C" + ) + ), + 1.2, + ) + self.assertEqual( + PloidyManager(local_copy_numbers=input_bed).get_ploidy( + variant=VafatorVariant( + chromosome="chr2", position=12345, reference="A", alternative="C" + ) + ), + 3.2, + ) # test non existing interval - self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( - variant=VafatorVariant(chromosome="chr3", position=12345, reference="A", alternative="C")), 2.0) + self.assertEqual( + PloidyManager(local_copy_numbers=input_bed).get_ploidy( + variant=VafatorVariant( + chromosome="chr3", position=12345, reference="A", alternative="C" + ) + ), + 2.0, + ) def test_interval_boundaries(self): - input_bed = pkg_resources.resource_filename(__name__, "resources/test_copy_numbers.bed") + input_bed = pkg_resources.resource_filename( + __name__, "resources/test_copy_numbers.bed" + ) # lower boundary — POS 10000 is 0-based 9999, outside interval start 10000 - self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( - variant=VafatorVariant(chromosome="chr1", position=10000, reference="A", alternative="C")), 2.0) + self.assertEqual( + PloidyManager(local_copy_numbers=input_bed).get_ploidy( + variant=VafatorVariant( + chromosome="chr1", position=10000, reference="A", alternative="C" + ) + ), + 2.0, + ) # just inside lower boundary - self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( - variant=VafatorVariant(chromosome="chr1", position=10001, reference="A", alternative="C")), 1.2) + self.assertEqual( + PloidyManager(local_copy_numbers=input_bed).get_ploidy( + variant=VafatorVariant( + chromosome="chr1", position=10001, reference="A", alternative="C" + ) + ), + 1.2, + ) # upper boundary - self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( - variant=VafatorVariant(chromosome="chr1", position=20000, reference="A", alternative="C")), 1.2) - self.assertEqual(PloidyManager(local_copy_numbers=input_bed).get_ploidy( - variant=VafatorVariant(chromosome="chr1", position=20001, reference="A", alternative="C")), 2.1) + self.assertEqual( + PloidyManager(local_copy_numbers=input_bed).get_ploidy( + variant=VafatorVariant( + chromosome="chr1", position=20000, reference="A", alternative="C" + ) + ), + 1.2, + ) + self.assertEqual( + PloidyManager(local_copy_numbers=input_bed).get_ploidy( + variant=VafatorVariant( + chromosome="chr1", position=20001, reference="A", alternative="C" + ) + ), + 2.1, + ) def test_invalid_bed_raises(self): with self.assertRaises(ValueError): diff --git a/vafator/tests/test_power_calculator.py b/vafator/tests/test_power_calculator.py index 0c7612a..c5371f9 100644 --- a/vafator/tests/test_power_calculator.py +++ b/vafator/tests/test_power_calculator.py @@ -10,42 +10,100 @@ class PowerCalculatorTest(TestCase): def test_power_calculator(self): power = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.5)}, purities={'tumor': 0.8}) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=0, sample='tumor', variant=None), 0.01734) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=1, sample='tumor', variant=None), 0.10404917949499565, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=2, sample='tumor', variant=None), 0.29914139104811255, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=3, sample='tumor', variant=None), 0.5592643397856016, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=4, sample='tumor', variant=None), 0.7868719199309048, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=5, sample='tumor', variant=None), 0.9234364680180867, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=6, sample='tumor', variant=None), 0.9803383630544125, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=7, sample='tumor', variant=None), 0.9965960473505056, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=8, sample='tumor', variant=None), 0.999644363156023, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=9, sample='tumor', variant=None), 0.9999830649121916, 5) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=10, sample='tumor', variant=None), 1.0) - self.assertAlmostEqual(power.calculate_power(dp=10, ac=11, sample='tumor', variant=None), 1.0) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.5)}, + purities={"tumor": 0.8}, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=0, sample="tumor", variant=None), 0.01734 + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=1, sample="tumor", variant=None), + 0.10404917949499565, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + 0.29914139104811255, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=3, sample="tumor", variant=None), + 0.5592643397856016, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=4, sample="tumor", variant=None), + 0.7868719199309048, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=5, sample="tumor", variant=None), + 0.9234364680180867, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=6, sample="tumor", variant=None), + 0.9803383630544125, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=7, sample="tumor", variant=None), + 0.9965960473505056, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=8, sample="tumor", variant=None), + 0.999644363156023, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=9, sample="tumor", variant=None), + 0.9999830649121916, + 5, + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=10, sample="tumor", variant=None), 1.0 + ) + self.assertAlmostEqual( + power.calculate_power(dp=10, ac=11, sample="tumor", variant=None), 1.0 + ) def test_zero_dp_returns_zero_power(self): power = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8}) - self.assertEqual(power.calculate_power(dp=0, ac=0, sample='tumor', variant=None), 1.0) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.0)}, + purities={"tumor": 0.8}, + ) + self.assertEqual( + power.calculate_power(dp=0, ac=0, sample="tumor", variant=None), 1.0 + ) def test_eaf_copy_number_below_one(self): power = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=0.5)}, purities={'tumor': 0.9}) - self.assertLessEqual(power.calculate_expected_vaf(sample='tumor', variant=None), 1.0) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=0.5)}, + purities={"tumor": 0.9}, + ) + self.assertLessEqual( + power.calculate_expected_vaf(sample="tumor", variant=None), 1.0 + ) def test_eaf_is_cached(self): power = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8}) - v1 = VafatorVariant(chromosome='chr1', position=100, reference='A', alternative='G') - eaf1 = power.calculate_expected_vaf(sample='tumor', variant=v1) - eaf2 = power.calculate_expected_vaf(sample='tumor', variant=v1) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.0)}, + purities={"tumor": 0.8}, + ) + v1 = VafatorVariant( + chromosome="chr1", position=100, reference="A", alternative="G" + ) + eaf1 = power.calculate_expected_vaf(sample="tumor", variant=v1) + eaf2 = power.calculate_expected_vaf(sample="tumor", variant=v1) self.assertEqual(eaf1, eaf2) self.assertEqual(len(power._eaf_cache), 1) def test_k_is_cached(self): power = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8}) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.0)}, + purities={"tumor": 0.8}, + ) k1 = power._calculate_k(100) k2 = power._calculate_k(100) self.assertEqual(k1, k2) @@ -53,108 +111,156 @@ def test_k_is_cached(self): def test_higher_coverage_higher_power(self): power = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8}) - p_low, _ = power.calculate_absolute_power(dp=10, sample='tumor', variant=None) - p_high, _ = power.calculate_absolute_power(dp=100, sample='tumor', variant=None) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.0)}, + purities={"tumor": 0.8}, + ) + p_low, _ = power.calculate_absolute_power(dp=10, sample="tumor", variant=None) + p_high, _ = power.calculate_absolute_power(dp=100, sample="tumor", variant=None) self.assertLess(p_low, p_high) def test_varying_purity(self): power1 = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.5)}, purities={'tumor': 0.8}) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.5)}, + purities={"tumor": 0.8}, + ) power2 = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.5)}, purities={'tumor': 0.6}) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.5)}, + purities={"tumor": 0.6}, + ) self.assertLess( - power1.calculate_power(dp=10, ac=2, sample='tumor', variant=None), - power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None)) + power1.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + power2.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + ) power3 = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.5)}, purities={'tumor': 0.4}) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.5)}, + purities={"tumor": 0.4}, + ) self.assertLess( - power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None), - power3.calculate_power(dp=10, ac=2, sample='tumor', variant=None)) + power2.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + power3.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + ) def test_varying_ploidy(self): power1 = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=2.0)}, purities={'tumor': 0.8}) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=2.0)}, + purities={"tumor": 0.8}, + ) power2 = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=4.0)}, purities={'tumor': 0.8}) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=4.0)}, + purities={"tumor": 0.8}, + ) self.assertLess( - power1.calculate_power(dp=10, ac=2, sample='tumor', variant=None), - power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None)) + power1.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + power2.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + ) power3 = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(genome_wide_ploidy=6.0)}, purities={'tumor': 0.8}) + tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=6.0)}, + purities={"tumor": 0.8}, + ) self.assertLess( - power2.calculate_power(dp=10, ac=2, sample='tumor', variant=None), - power3.calculate_power(dp=10, ac=2, sample='tumor', variant=None)) + power2.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + power3.calculate_power(dp=10, ac=2, sample="tumor", variant=None), + ) def test_local_copy_numbers(self): - input_bed = pkg_resources.resource_filename(__name__, "resources/test_copy_numbers.bed") + input_bed = pkg_resources.resource_filename( + __name__, "resources/test_copy_numbers.bed" + ) power = PowerCalculator( - tumor_ploidies={'tumor': PloidyManager(local_copy_numbers=input_bed)}, purities={'tumor': 0.8}) + tumor_ploidies={"tumor": PloidyManager(local_copy_numbers=input_bed)}, + purities={"tumor": 0.8}, + ) self.assertNotEqual( - power.calculate_power(dp=10, ac=2, sample='tumor', - variant=VafatorVariant( - chromosome='chr1', position=10001, reference='A', alternative='G')), - power.calculate_power(dp=10, ac=2, sample='tumor', - variant=VafatorVariant( - chromosome='chr1', position=20001, reference='A', alternative='G'))) + power.calculate_power( + dp=10, + ac=2, + sample="tumor", + variant=VafatorVariant( + chromosome="chr1", position=10001, reference="A", alternative="G" + ), + ), + power.calculate_power( + dp=10, + ac=2, + sample="tumor", + variant=VafatorVariant( + chromosome="chr1", position=20001, reference="A", alternative="G" + ), + ), + ) self.assertEqual( - power.calculate_power(dp=10, ac=2, sample='tumor', - variant=VafatorVariant( - chromosome='chr1', position=10001, reference='A', alternative='G')), - power.calculate_power(dp=10, ac=2, sample='tumor', - variant=VafatorVariant( - chromosome='chr1', position=10002, reference='A', alternative='G'))) + power.calculate_power( + dp=10, + ac=2, + sample="tumor", + variant=VafatorVariant( + chromosome="chr1", position=10001, reference="A", alternative="G" + ), + ), + power.calculate_power( + dp=10, + ac=2, + sample="tumor", + variant=VafatorVariant( + chromosome="chr1", position=10002, reference="A", alternative="G" + ), + ), + ) def test_absolute_power_calculator(self): - ploidy_manager = {'tumor': PloidyManager(genome_wide_ploidy=2)} - calculator = PowerCalculator(tumor_ploidies=ploidy_manager, purities={'tumor': 0.8}) - p, k = calculator.calculate_absolute_power(dp=100, sample='tumor', variant=None) + ploidy_manager = {"tumor": PloidyManager(genome_wide_ploidy=2)} + calculator = PowerCalculator( + tumor_ploidies=ploidy_manager, purities={"tumor": 0.8} + ) + p, k = calculator.calculate_absolute_power(dp=100, sample="tumor", variant=None) self.assertEqual(p, 1.0) self.assertEqual(k, 4) - p, k = calculator.calculate_absolute_power(dp=50, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=50, sample="tumor", variant=None) self.assertEqual(p, 1.0) self.assertEqual(k, 4) - p, k = calculator.calculate_absolute_power(dp=10, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=10, sample="tumor", variant=None) self.assertEqual(p, 0.85408) self.assertEqual(k, 3) - p, k = calculator.calculate_absolute_power(dp=2, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=2, sample="tumor", variant=None) self.assertEqual(p, 0.16009) self.assertEqual(k, 2) - calculator = PowerCalculator(tumor_ploidies=ploidy_manager, purities={'tumor': 0.6}) - p, k = calculator.calculate_absolute_power( - dp=100, sample='tumor', variant=None) + calculator = PowerCalculator( + tumor_ploidies=ploidy_manager, purities={"tumor": 0.6} + ) + p, k = calculator.calculate_absolute_power(dp=100, sample="tumor", variant=None) self.assertEqual(p, 1.0) self.assertEqual(k, 4) - p, k = calculator.calculate_absolute_power(dp=50, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=50, sample="tumor", variant=None) self.assertEqual(p, 1.00007) self.assertEqual(k, 4) - p, k = calculator.calculate_absolute_power(dp=10, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=10, sample="tumor", variant=None) self.assertEqual(p, 0.64373) self.assertEqual(k, 3) - p, k = calculator.calculate_absolute_power(dp=2, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=2, sample="tumor", variant=None) self.assertEqual(p, 0.09005) self.assertEqual(k, 2) - calculator = PowerCalculator(tumor_ploidies=ploidy_manager, purities={'tumor': 0.1}) - p, k = calculator.calculate_absolute_power( - dp=100, sample='tumor', variant=None) + calculator = PowerCalculator( + tumor_ploidies=ploidy_manager, purities={"tumor": 0.1} + ) + p, k = calculator.calculate_absolute_power(dp=100, sample="tumor", variant=None) self.assertEqual(p, 0.75607) self.assertEqual(k, 4) - p, k = calculator.calculate_absolute_power(dp=50, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=50, sample="tumor", variant=None) self.assertEqual(p, 0.33419) self.assertEqual(k, 4) - p, k = calculator.calculate_absolute_power(dp=10, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=10, sample="tumor", variant=None) self.assertEqual(p, 0.01254) self.assertEqual(k, 3) - p, k = calculator.calculate_absolute_power(dp=2, sample='tumor', variant=None) + p, k = calculator.calculate_absolute_power(dp=2, sample="tumor", variant=None) self.assertEqual(p, 0.0025) self.assertEqual(k, 2) def test_default_purity_is_one(self): power = PowerCalculator(tumor_ploidies={}, purities={}) - eaf = power.calculate_expected_vaf(sample='tumor', variant=None) + eaf = power.calculate_expected_vaf(sample="tumor", variant=None) # purity=1, normal_ploidy=2, tumor_ploidy=2 => eaf = 1/(1*2 + 0*2) = 0.5 self.assertAlmostEqual(eaf, 0.5) diff --git a/vafator/tests/test_rank_sum_test.py b/vafator/tests/test_rank_sum_test.py index 6c808a0..870cdc5 100644 --- a/vafator/tests/test_rank_sum_test.py +++ b/vafator/tests/test_rank_sum_test.py @@ -50,10 +50,10 @@ def test_both_empty_returns_nan(self): self.assertTrue(isnan(pvalue)) def test_get_rank_sum_tests_snv(self): - variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['T']) + variant = VariantRecord(CHROM="chr1", POS=100, REF="A", ALT=["T"]) distributions = { - 'A': [20, 25, 30, 35, 40], # ref is higher - 'T': [1, 5, 10, 15, 20], # alt is lower + "A": [20, 25, 30, 35, 40], # ref is higher + "T": [1, 5, 10, 15, 20], # alt is lower } pvalues, stats = get_rank_sum_tests(distributions, variant) self.assertEqual(len(stats), 1) @@ -62,8 +62,8 @@ def test_get_rank_sum_tests_snv(self): self.assertLess(float(stats[0]), 0.0) def test_get_rank_sum_tests_no_alt_reads(self): - variant = VariantRecord(CHROM='chr1', POS=100, REF='A', ALT=['T']) - distributions = {'A': [20, 25, 30]} # no T reads + variant = VariantRecord(CHROM="chr1", POS=100, REF="A", ALT=["T"]) + distributions = {"A": [20, 25, 30]} # no T reads pvalues, stats = get_rank_sum_tests(distributions, variant) self.assertEqual(stats, []) - self.assertEqual(pvalues, []) \ No newline at end of file + self.assertEqual(pvalues, []) diff --git a/vafator/tests/utils.py b/vafator/tests/utils.py index 0564cb6..2d490d6 100755 --- a/vafator/tests/utils.py +++ b/vafator/tests/utils.py @@ -14,7 +14,9 @@ def _get_count_variants(input_file): def _get_mutation_at_position(input_file, chromosome, position): variant = None vcf = VCF(input_file) - for v in vcf: # we cannot query by specific positions as this requires a tabix index + for ( + v + ) in vcf: # we cannot query by specific positions as this requires a tabix index if v.CHROM == chromosome and v.POS == position: variant = v break @@ -24,7 +26,7 @@ def _get_mutation_at_position(input_file, chromosome, position): def _get_info_fields(input_file): vcf = VCF(input_file) - return [h.info().get("ID") for h in vcf.header_iter() if h['HeaderType'] == 'INFO'] + return [h.info().get("ID") for h in vcf.header_iter() if h["HeaderType"] == "INFO"] def _get_annotation_values(input_file, annotation): diff --git a/vafator/vafator2decifer.py b/vafator/vafator2decifer.py index 32ae4b8..39496d3 100644 --- a/vafator/vafator2decifer.py +++ b/vafator/vafator2decifer.py @@ -31,20 +31,29 @@ def filterByDepthAndVaf(variant: Variant, Filter, samples): for s in samples: # filter if genotype has low depth or is missing - if variant.INFO["{}_dp".format(s)] < Filter['MinDepth']: + if variant.INFO["{}_dp".format(s)] < Filter["MinDepth"]: missing += 1 # filter if alt allele isn't greater than the specified threshold in at least one sample - if not any(np.greater_equal([variant.INFO["{}_ac".format(s)] for s in samples], Filter['MinDepthAltAllele'])): + if not any( + np.greater_equal( + [variant.INFO["{}_ac".format(s)] for s in samples], + Filter["MinDepthAltAllele"], + ) + ): missing += 1 # filter if VAF isn't greater than the specified threshold in at least one sample - if not any(np.greater_equal([variant.INFO["{}_af".format(s)] for s in samples], Filter['MinVAF'])): + if not any( + np.greater_equal( + [variant.INFO["{}_af".format(s)] for s in samples], Filter["MinVAF"] + ) + ): missing += 1 if missing > 0: PASS = 0 - return (PASS) + return PASS def compute_ref_var_depths(vcf, FilterDP, samples): @@ -56,7 +65,9 @@ def compute_ref_var_depths(vcf, FilterDP, samples): PASS = filterByDepthAndVaf(variant, FilterDP, samples) # print(np.greater_equal(variant.gt_alt_depths,FilterDP['MinDepthAltAllele'])) if PASS: - char_label = ".".join(map(str, [variant.CHROM, variant.POS, variant.REF, variant.ALT[0]])) + char_label = ".".join( + map(str, [variant.CHROM, variant.POS, variant.REF, variant.ALT[0]]) + ) for s in samples: alt = variant.INFO["{}_ac".format(s)] ref = variant.INFO["{}_dp".format(s)] - alt @@ -69,14 +80,19 @@ def print_output(ref_var_depths, cna_overlaps, outdir, samples): chars = ref_var_depths.keys() & cna_overlaps.keys() header = [str(len(chars)) + " #characters"] header.append(str(len(samples)) + " #samples") - header.append("#sample_index\tsample_label\tcharacter_index\tcharacter_label\tref\tvar") + header.append( + "#sample_index\tsample_label\tcharacter_index\tcharacter_label\tref\tvar" + ) # print(header) - with open(f"{outdir}/decifer.input.tsv", 'w') as out: + with open(f"{outdir}/decifer.input.tsv", "w") as out: print("\n".join(header), file=out) for char_label in ref_var_depths: if char_label in cna_overlaps: for i in range(len(samples)): - r, v = ref_var_depths[char_label][i][0], ref_var_depths[char_label][i][1] + r, v = ( + ref_var_depths[char_label][i][0], + ref_var_depths[char_label][i][1], + ) to_print = [i, samples[i], char_index, char_label, r, v] cnas = cna_overlaps[char_label][i] to_print.extend(cnas) @@ -88,14 +104,14 @@ def print_output(ref_var_depths, cna_overlaps, outdir, samples): def get_purities(cna_df, num_samples, min_purity): purities = {} for i, row in cna_df.head(num_samples + 1).iterrows(): - purity = 1.0 - row['u_normal'] + purity = 1.0 - row["u_normal"] if purity >= min_purity: - purities[row['SAMPLE']] = purity + purities[row["SAMPLE"]] = purity return purities def print_purities(purities, sample_index, num_samples, outdir): - with open(f"{outdir}/decifer.purity.tsv", 'w') as out: + with open(f"{outdir}/decifer.purity.tsv", "w") as out: for sample in sample_index: print(sample_index[sample], purities[sample], file=out, sep="\t") @@ -113,21 +129,21 @@ def filter_high_CN_sites(cn_states_persite, max_CN): def print_unique_CN_states(cn_states, max_CN, outdir): # print unique copy number states for sites that are below the max_CN threshold cn_states = tuple(set(cn_states)) - with open(f"{outdir}/cn_states.txt", 'w') as out: + with open(f"{outdir}/cn_states.txt", "w") as out: for value in cn_states: PASS = 1 for i in value: if int(i[0]) + int(i[1]) > max_CN: PASS = 0 if PASS: - out.write(';'.join([','.join(i) for i in value]) + '\n') + out.write(";".join([",".join(i) for i in value]) + "\n") def print_filtered_sites(filtered_sites, cna_overlaps, outdir): - with open(f"{outdir}/filtered_sites.txt", 'w') as out: + with open(f"{outdir}/filtered_sites.txt", "w") as out: out.write("\n".join(filtered_sites)) print(file=out) - with open(f"{outdir}/filtered_stats.txt", 'w') as out: + with open(f"{outdir}/filtered_stats.txt", "w") as out: filtered = len(filtered_sites) total = len(cna_overlaps.keys()) print("# sites that were filtered due to copy-number states > max_CN", file=out) @@ -163,7 +179,10 @@ def overlap_cna_snp(vcf_samples, max_CN, snps, out_dir): if filter_high_CN_sites(cn_info.keys(), max_CN): # store results, converting from dict to a list for later printing cna_info = [] - [cna_info.extend([c.split("|")[0], c.split("|")[1], cn_info[c]]) for c in cn_info] + [ + cna_info.extend([c.split("|")[0], c.split("|")[1], cn_info[c]]) + for c in cn_info + ] cna_overlaps[char_label].append(cna_info) else: filtered_sites.add(char_label) @@ -180,22 +199,24 @@ def run_vafator2decifer(args): num_samples = len(args.samples.split(",")) if args.samples is not None else 0 # Load in CNA information - cna_df = pd.read_csv(args.cna_file, sep='\t', index_col=False) + cna_df = pd.read_csv(args.cna_file, sep="\t", index_col=False) # get purities and filter by min_purity purities = get_purities(cna_df, num_samples, args.min_purity) # restrict samples considered in VCF and CNA file to those that have purity > min_purity restricted_samples = list(purities.keys()) - cna_df = cna_df.loc[cna_df['SAMPLE'].isin(list(purities.keys()))] + cna_df = cna_df.loc[cna_df["SAMPLE"].isin(list(purities.keys()))] # print new CNA file, filtering out samples below min_purity cna_df.to_csv(f"{args.out_dir}/best.seg.ucn", sep="\t", index=False) if args.snp_file: - snp_df = pd.read_csv(args.snp_file, sep='\t', index_col=False, header=None) + snp_df = pd.read_csv(args.snp_file, sep="\t", index_col=False, header=None) snp_df = snp_df.loc[snp_df[2].isin(list(purities.keys()))] # rearrange columns for decifer snp_df = snp_df[[2, 0, 1, 3, 4]] - snp_df.to_csv(f"{args.out_dir}/snpfile.1bed", sep="\t", index=False, header=False) + snp_df.to_csv( + f"{args.out_dir}/snpfile.1bed", sep="\t", index=False, header=False + ) num_samples = len(restricted_samples) # print purity information @@ -204,21 +225,25 @@ def run_vafator2decifer(args): # Filtering criteria Filter = {} - Filter['MinDepth'] = args.min_depth - Filter['MinDepthAltAllele'] = args.min_alt_depth - Filter['MinVAF'] = args.min_vaf + Filter["MinDepth"] = args.min_depth + Filter["MinDepthAltAllele"] = args.min_alt_depth + Filter["MinVAF"] = args.min_vaf # ref_var_depths[char_label] = list of (ref,alt) tuples, one for each sample, in same order as vcf.samples ref_var_depths = compute_ref_var_depths(vcf, Filter, samples=restricted_samples) # print BED file for SNPs - with open(f"{args.out_dir}/snps.bed", 'w') as out: + with open(f"{args.out_dir}/snps.bed", "w") as out: print("chrom\tstart\tend\tREF\tALT", file=out) # sort ref_var_depths by the first two parts of chr_label - for chr_label in sorted(ref_var_depths, key=lambda x: (x.split('.')[0], int(x.split('.')[1]))): + for chr_label in sorted( + ref_var_depths, key=lambda x: (x.split(".")[0], int(x.split(".")[1])) + ): pos = chr_label.split(".") # subtract 1 from position to create interval in BED format - print(pos[0], int(pos[1]) - 1, int(pos[1]), pos[2], pos[3], sep="\t", file=out) + print( + pos[0], int(pos[1]) - 1, int(pos[1]), pos[2], pos[3], sep="\t", file=out + ) snps = pbt.BedTool(f"{args.out_dir}/snps.bed") if args.exclude_list: @@ -227,16 +252,18 @@ def run_vafator2decifer(args): # prepare BED files for CNA intervals for each sample, for overlapping with SNPs for sample in restricted_samples: - df = cna_df[cna_df['SAMPLE'] == sample] + df = cna_df[cna_df["SAMPLE"] == sample] # consider subtracting 1 from start of interval to be compatible with BED format, leave end interval alone - df.loc[:, 'START'] = df['START'] - df = df.drop('SAMPLE', axis=1) + df.loc[:, "START"] = df["START"] + df = df.drop("SAMPLE", axis=1) df.to_csv(f"{args.out_dir}/{sample}_cna.bed", index=False, sep="\t") # overlap SNPs with CNA intervals for each sample # cna_overlaps[char_label] = list of tuples of CNA info (one tuple for each sample, in same order as vcf.samples) # this function also prints the observed CN state trees for the generatestatetrees function - cna_overlaps, cn_states_allsites, filtered_sites = overlap_cna_snp(restricted_samples, args.max_CN, snps, args.out_dir) + cna_overlaps, cn_states_allsites, filtered_sites = overlap_cna_snp( + restricted_samples, args.max_CN, snps, args.out_dir + ) # sites may have unique CN states that are duplicate; set them to find unique CN states across sites print_unique_CN_states(cn_states_allsites, args.max_CN, args.out_dir) From a3e5e20a46eaa79689bfae52c63a244131f34684 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Wed, 8 Apr 2026 15:24:53 +0200 Subject: [PATCH 28/32] fix unused variable bam --- vafator/annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vafator/annotator.py b/vafator/annotator.py index 519b8eb..2a6c482 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -452,7 +452,7 @@ def _get_headers(input_bams: dict) -> list: Annotator._make_header(suffix, description, typ, number, sample=s) ) if len(bams) > 1: - for i, bam in enumerate(bams, start=1): + for i in range(1, len(bams) + 1): # n = os.path.basename(bam).split(".")[0] for suffix, description, typ, number in _REPLICATE_HEADER_TEMPLATES: headers.append( From 2bcc602505e173391671bc7accb4e24b59ee4eb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Thu, 9 Apr 2026 11:37:31 +0200 Subject: [PATCH 29/32] fix Codacy errors --- setup.py | 2 +- vafator/annotator.py | 9 +++++++-- vafator/command_line.py | 1 - vafator/hatchet2bed.py | 10 +++------- vafator/vafator2decifer.py | 6 +----- 5 files changed, 12 insertions(+), 16 deletions(-) diff --git a/setup.py b/setup.py index 0610c88..c29369c 100755 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -from setuptools import find_packages, setup +from setuptools import setup import vafator diff --git a/vafator/annotator.py b/vafator/annotator.py index 2a6c482..3fa3f50 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -72,10 +72,10 @@ def __init__( input_vcf: str, output_vcf: str, input_bams: dict, - purities: dict = {}, + purities: dict = None, mapping_qual_thr: int = 0, base_call_qual_thr: int = 29, - tumor_ploidies: dict = {}, + tumor_ploidies: dict = None, normal_ploidy: int = 2, fpr: float = DEFAULT_FPR, error_rate: float = DEFAULT_ERROR_RATE, @@ -98,6 +98,11 @@ def __init__( num_processes: number of parallel processes for chromosome-level annotation (default: 1) """ + if purities is None: + purities = {} + if tumor_ploidies is None: + tumor_ploidies = {} + self.mapping_quality_threshold = mapping_qual_thr self.base_call_quality_threshold = base_call_qual_thr self.purities = purities diff --git a/vafator/command_line.py b/vafator/command_line.py index e09e9b0..ca74b0e 100755 --- a/vafator/command_line.py +++ b/vafator/command_line.py @@ -1,6 +1,5 @@ #!/usr/bin/env python import argparse -import sys import logging import vafator from vafator.power import DEFAULT_FPR, DEFAULT_ERROR_RATE diff --git a/vafator/hatchet2bed.py b/vafator/hatchet2bed.py index bef4ba3..cbc5989 100644 --- a/vafator/hatchet2bed.py +++ b/vafator/hatchet2bed.py @@ -3,12 +3,8 @@ def run_hatchet2bed(input_file, output_prefix): input_df = pd.read_csv(input_file, sep="\t") - cn_columns = sorted( - list(filter(lambda c: c.startswith("cn_clone"), input_df.columns)) - ) - u_columns = sorted( - list(filter(lambda c: c.startswith("u_clone"), input_df.columns)) - ) + cn_columns = sorted([c for c in input_df.columns if c.startswith("cn_clone")]) + u_columns = sorted([c for c in input_df.columns if c.startswith("u_clone")]) for sample in input_df.SAMPLE.unique(): data = [] @@ -18,7 +14,7 @@ def run_hatchet2bed(input_file, output_prefix): for cn_column, u_column in zip(cn_columns, u_columns): u = float(row[u_column]) total_u += u - cn = sum(map(lambda c: float(c), row[cn_column].split("|"))) + cn = sum(map(float, row[cn_column].split("|"))) numerator.append(u * cn) data.append( [row["#CHR"], row["START"], row["END"], sum(numerator) / total_u] diff --git a/vafator/vafator2decifer.py b/vafator/vafator2decifer.py index 39496d3..6ebf447 100644 --- a/vafator/vafator2decifer.py +++ b/vafator/vafator2decifer.py @@ -13,7 +13,6 @@ python vcf_2_decifer.py [OPTIONS] """ -import re import sys import pybedtools as pbt from cyvcf2 import VCF, Variant @@ -21,7 +20,6 @@ import pandas as pd import numpy as np from collections import defaultdict -import argparse def filterByDepthAndVaf(variant: Variant, Filter, samples): @@ -179,10 +177,8 @@ def overlap_cna_snp(vcf_samples, max_CN, snps, out_dir): if filter_high_CN_sites(cn_info.keys(), max_CN): # store results, converting from dict to a list for later printing cna_info = [] - [ + for c in cn_info: cna_info.extend([c.split("|")[0], c.split("|")[1], cn_info[c]]) - for c in cn_info - ] cna_overlaps[char_label].append(cna_info) else: filtered_sites.add(char_label) From c0254d00fd825b867d642403ddc8576512454616 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Fri, 10 Apr 2026 10:31:32 +0200 Subject: [PATCH 30/32] add seed for random class. fixes #54 --- vafator/multiallelic_filter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vafator/multiallelic_filter.py b/vafator/multiallelic_filter.py index 00b0f26..6b998d6 100755 --- a/vafator/multiallelic_filter.py +++ b/vafator/multiallelic_filter.py @@ -39,6 +39,7 @@ def __init__(self, input_vcf, output_vcf, tumor_sample_name="tumor"): } ) self.vcf_writer = Writer(output_vcf, self.vcf) + self.random = random.Random(42) def run(self): batch = [] @@ -67,7 +68,7 @@ def run(self): # chooses a variant at random alt1 = variant.ALT[0] alt2 = prev_variant.ALT[0] - prev_variant = random.sample([variant, prev_variant], k=1)[0] + prev_variant = self.random.sample([variant, prev_variant], k=1)[0] self.set_multiallelic_annotation( prev_variant, alt1 if alt1 != prev_variant.ALT[0] else alt2, af1 ) From eb176144faa76785152a1f3473f0500bb0e4392b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Fri, 10 Apr 2026 10:32:15 +0200 Subject: [PATCH 31/32] change version to 3.0.0 --- setup.cfg | 2 +- vafator/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index bc22ccb..4f08792 100755 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = VAFator -version = 3.1.0 +version = 3.0.0 description = Annotate variants in a VCF file with technical annotations from one or more BAMs description-file = README.md long_description = file: README.md diff --git a/vafator/__init__.py b/vafator/__init__.py index 273ffca..ea9d694 100755 --- a/vafator/__init__.py +++ b/vafator/__init__.py @@ -1 +1 @@ -VERSION = "3.1.0" +VERSION = "3.0.0" From f239072dba1482c3c2c8c90e9a51f982781e967e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zlem=20Muslu?= Date: Fri, 10 Apr 2026 10:42:27 +0200 Subject: [PATCH 32/32] add support for CRAM files. closes #45 --- vafator/annotator.py | 4 ++-- vafator/command_line.py | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/vafator/annotator.py b/vafator/annotator.py index 3fa3f50..5351e50 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -52,7 +52,7 @@ def _collect_metrics_worker( ] for sample, bam_files in bam_paths.items(): for i, bam_path in enumerate(bam_files): - bam = pysam.AlignmentFile(bam_path, "rb") + bam = pysam.AlignmentFile(bam_path, "r") all_metrics[(sample, i)] = collect_metrics_for_chrom( chrom=chrom, variants=variants, @@ -159,7 +159,7 @@ def __init__( self.bam_paths = input_bams self.bam_readers = { - s: [pysam.AlignmentFile(b, "rb") for b in bams] + s: [pysam.AlignmentFile(b, "r") for b in bams] for s, bams in input_bams.items() } diff --git a/vafator/command_line.py b/vafator/command_line.py index ca74b0e..b8f3ea1 100755 --- a/vafator/command_line.py +++ b/vafator/command_line.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import argparse import logging +from pathlib import Path import vafator from vafator.power import DEFAULT_FPR, DEFAULT_ERROR_RATE from vafator.hatchet2bed import run_hatchet2bed @@ -12,6 +13,16 @@ epilog = "Copyright (c) 2019-2021 TRON gGmbH (See LICENSE for licensing details)" +def _validate_alignment_file_extension(alignment_file): + suffixes = [suffix.lower() for suffix in Path(alignment_file).suffixes] + if ".sam" in suffixes: + raise ValueError( + "SAM input is not supported: {}. Please provide BAM or CRAM files.".format( + alignment_file + ) + ) + + def annotator(): # set up logger @@ -40,8 +51,8 @@ def annotator(): nargs=2, metavar=("sample_name", "bam_file"), default=[], - help="A sample name and a BAM file. Can be used multiple times to input multiple samples and " - "multiple BAM files. The same sample name can be used multiple times with different BAMs, " + help="A sample name and a BAM/CRAM file. Can be used multiple times to input multiple samples and " + "multiple BAM/CRAM files. The same sample name can be used multiple times with different BAMs/CRAMs, " "this will treated as replicates.", ) parser.add_argument( @@ -127,6 +138,7 @@ def annotator(): bams = {} for sample_name, bam in args.bam: + _validate_alignment_file_extension(bam) if sample_name in bams: bams[sample_name].append(bam) else: