diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..0102123 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,60 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + quality: + name: Lint, type-check, test + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: Install bcftools + run: | + sudo apt-get update + sudo apt-get install -y bcftools + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + + - name: Install package + run: | + python -m pip install -U pip + python -m pip install -e ".[dev,test]" + + - name: Ruff lint + run: ruff check . + + - name: Mypy + run: mypy src + + - name: Pytest + run: pytest + + - name: bcftools round-trip check + shell: bash + run: | + set -euo pipefail + tmp=$(mktemp -d) + svforge gen --caller manta --out "$tmp/manta.vcf.gz" --n 50 --sample-name T01 --seed 42 + svforge gen --caller delly --out "$tmp/delly.bcf" --n 50 --sample-name T01 --seed 42 + bcftools view -h "$tmp/manta.vcf.gz" > /dev/null + bcftools view -h "$tmp/delly.bcf" > /dev/null + test "$(bcftools view -H "$tmp/manta.vcf.gz" | wc -l)" -gt 0 + test "$(bcftools view -H "$tmp/delly.bcf" | wc -l)" -gt 0 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..6dbe7b5 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,123 @@ +name: Publish + +on: + push: + tags: + - "v*" + +jobs: + verify-tag: + name: Verify tag matches pyproject version + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Check tag == project version + shell: bash + run: | + set -euo pipefail + tag="${GITHUB_REF_NAME}" + tag_version="${tag#v}" + project_version=$(python -c "import tomllib, pathlib; print(tomllib.loads(pathlib.Path('pyproject.toml').read_text())['project']['version'])") + echo "Tag: $tag (parsed: $tag_version)" + echo "Project: $project_version" + if [[ "$tag_version" != "$project_version" ]]; then + echo "::error::Tag ($tag_version) does not match pyproject.toml version ($project_version)" + exit 1 + fi + + quality: + name: Quality gate before publish + needs: verify-tag + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install bcftools + run: | + sudo apt-get update + sudo apt-get install -y bcftools + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + + - name: Install package + run: | + python -m pip install -U pip + python -m pip install -e ".[dev,test]" + + - name: Ruff lint + run: ruff check . + + - name: Mypy + run: mypy src + + - name: Pytest + run: pytest + + - name: bcftools round-trip check + shell: bash + run: | + set -euo pipefail + tmp=$(mktemp -d) + svforge gen --caller manta --out "$tmp/manta.vcf.gz" --n 50 --sample-name T01 --seed 42 + svforge gen --caller delly --out "$tmp/delly.bcf" --n 50 --sample-name T01 --seed 42 + bcftools view -h "$tmp/manta.vcf.gz" > /dev/null + bcftools view -h "$tmp/delly.bcf" > /dev/null + test "$(bcftools view -H "$tmp/manta.vcf.gz" | wc -l)" -gt 0 + test "$(bcftools view -H "$tmp/delly.bcf" | wc -l)" -gt 0 + + build: + name: Build sdist and wheel + needs: quality + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install build and twine + run: python -m pip install -U pip build twine + + - name: Build + run: python -m build + + - name: Twine check + run: twine check dist/* + + - name: Upload artefacts + uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ + + publish: + name: Publish to PyPI + needs: build + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/project/svforge + permissions: + id-token: write + + steps: + - name: Download artefacts + uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + + - name: Publish + uses: pypa/gh-action-pypi-publish@release/v1 \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c437d2b --- /dev/null +++ b/.gitignore @@ -0,0 +1,54 @@ +data_local/* +!data_local/README.md +!data_local/gen-test/ +data_local/gen-test/* +!data_local/gen-test/.gitkeep + +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +*.egg +*.egg-info/ +.eggs/ +dist/ +build/ +*.manifest +*.spec +pip-wheel-metadata/ +share/python-wheels/ +wheels/ + +.venv/ +venv/ +ENV/ +env/ +.env.venv + +.env +.env.local +.env.*.local +*.env + +.pytest_cache/ +.coverage +.coverage.* +htmlcov/ +.tox/ +.nox/ +coverage.xml +*.cover +.hypothesis/ +.mypy_cache/ +.ruff_cache/ +.dmypy.json +dmypy.json + +.ipynb_checkpoints/ + +.idea/ +.vscode/ +.DS_Store +Thumbs.db +*.log diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..8cc5aab --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,20 @@ +# Changelog + +## [1.0.0] — 2026-04-25 + +Initial release. + +- `svforge gen` and `svforge gen-pair` to generate synthetic VCFs (single-sample or coherent tumor/normal pair) +- Manta (VCFv4.1) and DELLY (VCFv4.2) writers +- Injection via `--gnomad-fraction` and `--blacklist-fraction` +- `INFO/SVFORGE_SOURCE` tag on every record (`bank` / `gnomad` / `blacklist`) for self-verifying pipelines +- `svforge validate` to confirm injected records match the bundled catalogs exactly +- Chromosome filtering via `--chromosomes` +- Custom header support via `--header-template PATH` +- Deterministic sampling via `--seed`; effective seed always logged in the output VCF +- BCF / VCF.gz / VCF output formats +- Default hg38 SV bank with weighted templates across DEL, DUP, INV, INS, BND +- Writer plugin system: third-party callers can register via `svforge.writers` entry point +- Non-configurable `##svforgeWarning=SYNTHETIC_DATA_DO_NOT_USE_FOR_CLINICAL_DIAGNOSIS` injected in every VCF header +- `sanitize_command()` strips absolute paths from the logged command line so user home directories and cluster paths never leak into generated VCFs +- Python 3.10+, hg38 only \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..2efdf56 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,37 @@ +# Contributing to svForge + +## Development setup + +```bash +python -m venv .venv +source .venv/bin/activate +pip install -e ".[dev,test]" +``` + +## Contribution workflow + +1. Open an issue first and describe the change (new caller, new template, fix, etc.). Wait for feedback before coding. +2. Create a dedicated branch from `main` named `{issue-number}-short-description`. Never work directly on `main`. +3. Use commit messages that reference the issue number: `feat: XXX #12`, `fix: XXX (#15)`. +4. Update `CHANGELOG.md`. +5. Run the local quality gate before pushing. If it is not all green, the PR is not reviewed: + +```bash +ruff check . +mypy src +pytest -q +``` + +6. Push and open a PR to `main`. In the PR description, add `Closes #` for automatic issue closing at merge. +7. Use squash merge. The squash commit message must be clean and final (not a draft branch history dump). +8. Delete the branch after merge. + +## Common contribution cases + +- Add a new caller (GRIDSS, SvABA, Lumpy, Smoove, ...). This is the main contribution path and a core modularity goal of the project. +- Add a new header template for an uncovered genome or scenario (for example `hg19`, germline mode, tumor-only mode). +- Enrich the bundled mini-catalogs (`gnomad_hg38_mini.tsv`, `blacklist_hg38_mini.tsv`) with more entries or more diversity. +- Improve the default bank (`default_hg38.yaml`) with more representative SV templates. +- Report a bug with a minimal command that reproduces it. +- Improve documentation. +- Add tests for an uncovered edge case (for example negative `SVLEN`, BND to alt contig, etc.). diff --git a/README.md b/README.md new file mode 100644 index 0000000..0520c05 --- /dev/null +++ b/README.md @@ -0,0 +1,69 @@ +

+ + + + svForge logo + +

+ +### Generate synthetic SV VCFs to stress-test your pipelines with confidence + +--- + +**svForge** produces caller-specific VCFs (Manta, DELLY) in VCF / VCF.gz / BCF format with fine-grained control over variability (HOMLEN, SVLEN, VAF) and realistic artefact injection (SVs in ENCODE blacklist regions, gnomAD germline SVs). + +Designed to be modular, it is easy to adapt to your own use case. +You can tune generation parameters, plug in new callers, and customize the workflow without reworking the whole tool. + +## Installation + +```bash +pip install svforge +``` + +Or from source: + +```bash +git clone https://github.com/pieetie/svforge +cd svforge +pip install -e ".[dev,test]" +``` + +## Quick start + +For ready-to-run command lines (single sample, tumor/normal pair, validation, banks, and dev checks), see [`docs/ready-to-use.md`](docs/ready-to-use.md). + +## Typical use cases + +- Validate downstream filters (for example, `SVFORGE_SOURCE=gnomad` records should disappear after your gnomAD filtering step). +- Validate ENCODE blacklist annotation logic (for example, `SVFORGE_SOURCE=blacklist` records should receive your expected poor-mappability label). +- Run reproducible CI regression tests with fixed seeds, without committing generated VCF files. +- Smoke-test deployments and pipeline changes in seconds instead of rerunning full variant callers on BAM files. +- Reproduce specific scenarios and edge cases (cross-chrom BNDs, contig-edge events, controlled VAF/HOMLEN ranges) for debugging and QA. +- Demo or onboard safely with realistic SV VCFs and no patient data. + +## CLI + +``` +svforge gen # one VCF for one sample +svforge gen-pair # tumor + normal VCFs for somatic pipelines +svforge validate # self-consistency check of injected SVs +svforge bank list # list built-in banks +svforge bank show # dump a bank as YAML +svforge callers # list registered writers +``` + +Run `svforge --help` for the full flag list. + +## Credits + +- **Logo and visual identity** - [Elisa Perrin](https://www.linkedin.com/in/elisaperrin/) +- **Claude** (Anthropic) - assisted with tests, documentation, refactoring, and release tooling (Ruff linting/formatting, CI cleanup) + +## License + + + + + svForge icon + Distributed under the MIT License. diff --git a/data_local/README.md b/data_local/README.md new file mode 100644 index 0000000..6f22580 --- /dev/null +++ b/data_local/README.md @@ -0,0 +1,12 @@ +# Run from the svforge project root + +mkdir -p data_local/gnomad data_local/blacklist + +## gnomAD SV v4.1 +wget -P data_local/gnomad/ https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz + +wget -P data_local/gnomad/ https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz.tbi + +## ENCODE blacklist v2 +wget -P data_local/blacklist/ https://raw.githubusercontent.com/Boyle-Lab/Blacklist/master/lists/hg38-blacklist.v2.bed.gz + diff --git a/data_local/gen-test/.gitkeep b/data_local/gen-test/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/docs/assets/README.md b/docs/assets/README.md new file mode 100644 index 0000000..b574bab --- /dev/null +++ b/docs/assets/README.md @@ -0,0 +1,9 @@ +# Logos and visual assets + +The graphics in this directory (logos, icons, related images) were made by Elisa Perrin for svForge. © Elisa Perrin, used with permission. They're **not** covered by the code licence in `LICENSE`. + +Use them freely for anything non-commercial that supports the project: papers, talks, blog posts, slides, docs, community stuff. A credit ("Logo by Elisa Perrin") is appreciated when it fits. + +Ask first for: merch, paid ads, rebranding, or anything where you're making money off our name or visuals. + +*Intent, not legal advice.* \ No newline at end of file diff --git a/docs/assets/raster-300ppi/icons-no-background/icon-black@300x.png b/docs/assets/raster-300ppi/icons-no-background/icon-black@300x.png new file mode 100644 index 0000000..7b8ea1b Binary files /dev/null and b/docs/assets/raster-300ppi/icons-no-background/icon-black@300x.png differ diff --git a/docs/assets/raster-300ppi/icons-no-background/icon-red@300x.png b/docs/assets/raster-300ppi/icons-no-background/icon-red@300x.png new file mode 100644 index 0000000..5d883a7 Binary files /dev/null and b/docs/assets/raster-300ppi/icons-no-background/icon-red@300x.png differ diff --git a/docs/assets/raster-300ppi/icons-no-background/icon-white@300x.png b/docs/assets/raster-300ppi/icons-no-background/icon-white@300x.png new file mode 100644 index 0000000..6bac61b Binary files /dev/null and b/docs/assets/raster-300ppi/icons-no-background/icon-white@300x.png differ diff --git a/docs/assets/raster-300ppi/icons-on-background/icon-bn-app@300x.png b/docs/assets/raster-300ppi/icons-on-background/icon-bn-app@300x.png new file mode 100644 index 0000000..9cb513d Binary files /dev/null and b/docs/assets/raster-300ppi/icons-on-background/icon-bn-app@300x.png differ diff --git a/docs/assets/raster-300ppi/icons-on-background/icon-bn@300x.png b/docs/assets/raster-300ppi/icons-on-background/icon-bn@300x.png new file mode 100644 index 0000000..72c9516 Binary files /dev/null and b/docs/assets/raster-300ppi/icons-on-background/icon-bn@300x.png differ diff --git a/docs/assets/raster-300ppi/icons-on-background/icon-dark-app@300x.png b/docs/assets/raster-300ppi/icons-on-background/icon-dark-app@300x.png new file mode 100644 index 0000000..020550c Binary files /dev/null and b/docs/assets/raster-300ppi/icons-on-background/icon-dark-app@300x.png differ diff --git a/docs/assets/raster-300ppi/icons-on-background/icon-dark@300x.png b/docs/assets/raster-300ppi/icons-on-background/icon-dark@300x.png new file mode 100644 index 0000000..888074f Binary files /dev/null and b/docs/assets/raster-300ppi/icons-on-background/icon-dark@300x.png differ diff --git a/docs/assets/raster-300ppi/icons-on-background/icon-light-app@300x.png b/docs/assets/raster-300ppi/icons-on-background/icon-light-app@300x.png new file mode 100644 index 0000000..91d6bdb Binary files /dev/null and b/docs/assets/raster-300ppi/icons-on-background/icon-light-app@300x.png differ diff --git a/docs/assets/raster-300ppi/icons-on-background/icon-light@300x.png b/docs/assets/raster-300ppi/icons-on-background/icon-light@300x.png new file mode 100644 index 0000000..bd0ace0 Binary files /dev/null and b/docs/assets/raster-300ppi/icons-on-background/icon-light@300x.png differ diff --git a/docs/assets/raster-300ppi/icons-on-background/icon-nb-app@300x.png b/docs/assets/raster-300ppi/icons-on-background/icon-nb-app@300x.png new file mode 100644 index 0000000..7516c0c Binary files /dev/null and b/docs/assets/raster-300ppi/icons-on-background/icon-nb-app@300x.png differ diff --git a/docs/assets/raster-300ppi/icons-on-background/icon-nb@300x.png b/docs/assets/raster-300ppi/icons-on-background/icon-nb@300x.png new file mode 100644 index 0000000..e3314df Binary files /dev/null and b/docs/assets/raster-300ppi/icons-on-background/icon-nb@300x.png differ diff --git a/docs/assets/raster-300ppi/logos-no-background/logo-black@300x.png b/docs/assets/raster-300ppi/logos-no-background/logo-black@300x.png new file mode 100644 index 0000000..f3f689c Binary files /dev/null and b/docs/assets/raster-300ppi/logos-no-background/logo-black@300x.png differ diff --git a/docs/assets/raster-300ppi/logos-no-background/logo-red-and-black@300x.png b/docs/assets/raster-300ppi/logos-no-background/logo-red-and-black@300x.png new file mode 100644 index 0000000..49b9391 Binary files /dev/null and b/docs/assets/raster-300ppi/logos-no-background/logo-red-and-black@300x.png differ diff --git a/docs/assets/raster-300ppi/logos-no-background/logo-red-and-white@300x.png b/docs/assets/raster-300ppi/logos-no-background/logo-red-and-white@300x.png new file mode 100644 index 0000000..431e521 Binary files /dev/null and b/docs/assets/raster-300ppi/logos-no-background/logo-red-and-white@300x.png differ diff --git a/docs/assets/raster-300ppi/logos-no-background/logo-white@300x.png b/docs/assets/raster-300ppi/logos-no-background/logo-white@300x.png new file mode 100644 index 0000000..5e4e0f7 Binary files /dev/null and b/docs/assets/raster-300ppi/logos-no-background/logo-white@300x.png differ diff --git a/docs/assets/raster-300ppi/logos-on-background/logo-bn@300x.png b/docs/assets/raster-300ppi/logos-on-background/logo-bn@300x.png new file mode 100644 index 0000000..a3e86fb Binary files /dev/null and b/docs/assets/raster-300ppi/logos-on-background/logo-bn@300x.png differ diff --git a/docs/assets/raster-300ppi/logos-on-background/logo-dark@300x.png b/docs/assets/raster-300ppi/logos-on-background/logo-dark@300x.png new file mode 100644 index 0000000..a1e2da7 Binary files /dev/null and b/docs/assets/raster-300ppi/logos-on-background/logo-dark@300x.png differ diff --git a/docs/assets/raster-300ppi/logos-on-background/logo-light@300x.png b/docs/assets/raster-300ppi/logos-on-background/logo-light@300x.png new file mode 100644 index 0000000..3046d39 Binary files /dev/null and b/docs/assets/raster-300ppi/logos-on-background/logo-light@300x.png differ diff --git a/docs/assets/raster-300ppi/logos-on-background/logo-nb@300x.png b/docs/assets/raster-300ppi/logos-on-background/logo-nb@300x.png new file mode 100644 index 0000000..6f26390 Binary files /dev/null and b/docs/assets/raster-300ppi/logos-on-background/logo-nb@300x.png differ diff --git a/docs/assets/vector-svg/icons-no-background/icon-black.svg b/docs/assets/vector-svg/icons-no-background/icon-black.svg new file mode 100644 index 0000000..e1a1358 --- /dev/null +++ b/docs/assets/vector-svg/icons-no-background/icon-black.svg @@ -0,0 +1,9 @@ + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/icons-no-background/icon-red.svg b/docs/assets/vector-svg/icons-no-background/icon-red.svg new file mode 100644 index 0000000..0a01971 --- /dev/null +++ b/docs/assets/vector-svg/icons-no-background/icon-red.svg @@ -0,0 +1,9 @@ + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/icons-no-background/icon-white.svg b/docs/assets/vector-svg/icons-no-background/icon-white.svg new file mode 100644 index 0000000..9b0f34e --- /dev/null +++ b/docs/assets/vector-svg/icons-no-background/icon-white.svg @@ -0,0 +1,9 @@ + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/icons-on-background/icon-bn-app.svg b/docs/assets/vector-svg/icons-on-background/icon-bn-app.svg new file mode 100644 index 0000000..ddfd06e --- /dev/null +++ b/docs/assets/vector-svg/icons-on-background/icon-bn-app.svg @@ -0,0 +1,12 @@ + + + + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/icons-on-background/icon-bn.svg b/docs/assets/vector-svg/icons-on-background/icon-bn.svg new file mode 100644 index 0000000..0f180a6 --- /dev/null +++ b/docs/assets/vector-svg/icons-on-background/icon-bn.svg @@ -0,0 +1,12 @@ + + + + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/icons-on-background/icon-dark-app.svg b/docs/assets/vector-svg/icons-on-background/icon-dark-app.svg new file mode 100644 index 0000000..cf00a57 --- /dev/null +++ b/docs/assets/vector-svg/icons-on-background/icon-dark-app.svg @@ -0,0 +1,12 @@ + + + + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/icons-on-background/icon-dark.svg b/docs/assets/vector-svg/icons-on-background/icon-dark.svg new file mode 100644 index 0000000..a78b47c --- /dev/null +++ b/docs/assets/vector-svg/icons-on-background/icon-dark.svg @@ -0,0 +1,12 @@ + + + + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/icons-on-background/icon-light-app.svg b/docs/assets/vector-svg/icons-on-background/icon-light-app.svg new file mode 100644 index 0000000..2f6c7e4 --- /dev/null +++ b/docs/assets/vector-svg/icons-on-background/icon-light-app.svg @@ -0,0 +1,12 @@ + + + + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/icons-on-background/icon-light.svg b/docs/assets/vector-svg/icons-on-background/icon-light.svg new file mode 100644 index 0000000..fdde4c9 --- /dev/null +++ b/docs/assets/vector-svg/icons-on-background/icon-light.svg @@ -0,0 +1,12 @@ + + + + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/icons-on-background/icon-nb-app.svg b/docs/assets/vector-svg/icons-on-background/icon-nb-app.svg new file mode 100644 index 0000000..4f198f3 --- /dev/null +++ b/docs/assets/vector-svg/icons-on-background/icon-nb-app.svg @@ -0,0 +1,12 @@ + + + + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/icons-on-background/icon-nb.svg b/docs/assets/vector-svg/icons-on-background/icon-nb.svg new file mode 100644 index 0000000..dc89ea1 --- /dev/null +++ b/docs/assets/vector-svg/icons-on-background/icon-nb.svg @@ -0,0 +1,12 @@ + + + + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/logos-no-background/logo-black.svg b/docs/assets/vector-svg/logos-no-background/logo-black.svg new file mode 100644 index 0000000..ca9e5de --- /dev/null +++ b/docs/assets/vector-svg/logos-no-background/logo-black.svg @@ -0,0 +1,21 @@ + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/logos-no-background/logo-red-and-black.svg b/docs/assets/vector-svg/logos-no-background/logo-red-and-black.svg new file mode 100644 index 0000000..21301c7 --- /dev/null +++ b/docs/assets/vector-svg/logos-no-background/logo-red-and-black.svg @@ -0,0 +1,21 @@ + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/logos-no-background/logo-red-and-white.svg b/docs/assets/vector-svg/logos-no-background/logo-red-and-white.svg new file mode 100644 index 0000000..2ac9a7b --- /dev/null +++ b/docs/assets/vector-svg/logos-no-background/logo-red-and-white.svg @@ -0,0 +1,21 @@ + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/logos-no-background/logo-white.svg b/docs/assets/vector-svg/logos-no-background/logo-white.svg new file mode 100644 index 0000000..6e5b082 --- /dev/null +++ b/docs/assets/vector-svg/logos-no-background/logo-white.svg @@ -0,0 +1,21 @@ + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/logos-on-background/logo-bn.svg b/docs/assets/vector-svg/logos-on-background/logo-bn.svg new file mode 100644 index 0000000..7c34eff --- /dev/null +++ b/docs/assets/vector-svg/logos-on-background/logo-bn.svg @@ -0,0 +1,24 @@ + + + + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/logos-on-background/logo-dark.svg b/docs/assets/vector-svg/logos-on-background/logo-dark.svg new file mode 100644 index 0000000..3212912 --- /dev/null +++ b/docs/assets/vector-svg/logos-on-background/logo-dark.svg @@ -0,0 +1,24 @@ + + + + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/logos-on-background/logo-light.svg b/docs/assets/vector-svg/logos-on-background/logo-light.svg new file mode 100644 index 0000000..9b8b1eb --- /dev/null +++ b/docs/assets/vector-svg/logos-on-background/logo-light.svg @@ -0,0 +1,24 @@ + + + + \ No newline at end of file diff --git a/docs/assets/vector-svg/logos-on-background/logo-nb.svg b/docs/assets/vector-svg/logos-on-background/logo-nb.svg new file mode 100644 index 0000000..41af7f8 --- /dev/null +++ b/docs/assets/vector-svg/logos-on-background/logo-nb.svg @@ -0,0 +1,24 @@ + + + + \ No newline at end of file diff --git a/docs/ready-to-use.md b/docs/ready-to-use.md new file mode 100644 index 0000000..bf00e37 --- /dev/null +++ b/docs/ready-to-use.md @@ -0,0 +1,215 @@ +# svForge - ready-to-use commands + +Test **outputs** in the examples below are written under `data_local/gen-test/` + +```bash +mkdir -p data_local/gen-test +``` + +--- + +## 1. Environment and install + +```bash +python -m venv .venv +source .venv/bin/activate # Windows: .venv\Scripts\activate +pip install -U pip +pip install -e ".[dev,test]" # app + ruff, mypy, pytest +``` + +```bash +svforge --version +``` + +**More logging** (optional): + +```bash +svforge -v gen --help +svforge -vv gen --help +``` + +--- + +## 2. Discover built-in options + +| Command | What it does | +|--------|----------------| +| `svforge callers` | Prints every **writer** name for `--caller` (e.g. `manta`, `delly`). | +| `svforge bank list` | Lists **built-in bank** names. | +| `svforge bank show` | Dumps the default bank (`default_hg38`) as YAML. | +| `svforge bank show ` | Dumps a specific built-in bank. | + +```bash +svforge callers +svforge bank list +svforge bank show +svforge bank show default_hg38 +``` + +--- + +## 3. Generate one VCF (`gen`) + +**Minimal example** (Manta-style VCF, 50 records, one sample): + +```bash +svforge gen --caller manta --out data_local/gen-test/synthetic.vcf.gz --n 50 --sample-name TUMOR01 +``` + +**Same result every time** (fixed seed): + +```bash +svforge gen --caller manta --out data_local/gen-test/out.vcf.gz --n 100 --sample-name S1 --seed 42 +``` + +**DELLY** (output can be `.bcf` if you set the path): + +```bash +svforge gen --caller delly --out data_local/gen-test/synthetic.bcf --n 30 --sample-name S1 --seed 1 +``` + +**Only some SV types** (comma-separated, upper case) : + +```bash +svforge gen --caller manta --out data_local/gen-test/only_del_dup.vcf.gz --n 20 --sample-name S1 \ + --svtypes DEL,DUP +``` + +**Custom bank (YAML on disk):** + +```bash +svforge gen --caller manta --out data_local/gen-test/out.vcf.gz --n 20 --sample-name S1 --bank /path/to/bank.yaml +``` + +**Custom VCF header template:** + +```bash +svforge gen --caller manta --out data_local/gen-test/out.vcf.gz --n 10 --sample-name S1 \ + --header-template /path/to/header.template +``` + +**Reference build** (v1 is mainly **hg38**) : + +```bash +svforge gen --caller manta --out data_local/gen-test/out.vcf.gz --n 10 --sample-name S1 --genome hg38 +``` + +**Sampling ranges** (examples): + +```bash +# SV length in bp, inclusive range +svforge gen --caller manta --out data_local/gen-test/out.vcf.gz --n 15 --sample-name S1 --svlen-range 100,5000 + +# Microhomology length +svforge gen --caller manta --out data_local/gen-test/out.vcf.gz --n 15 --sample-name S1 --homlen-range 5,30 + +# Variant allele frequency range (0–1) +svforge gen --caller manta --out data_local/gen-test/out.vcf.gz --n 15 --sample-name S1 --vaf-range 0.2,0.9 +``` + +**Only some chromosomes** (with or without `chr`) : + +```bash +svforge gen --caller manta --out data_local/gen-test/out.vcf.gz --n 20 --sample-name S1 --chromosomes chr1,chr7 +svforge gen --caller manta --out data_local/gen-test/out.vcf.gz --n 20 --sample-name S1 --chromosomes 1,7 +``` + +**Bundled mini-catalogs** (fractions in `[0, 1]`; the rest is synthetic) : + +```bash +# gnomAD-style mini TSV +svforge gen --caller manta --out data_local/gen-test/out.vcf.gz --n 50 --sample-name S1 --gnomad-fraction 0.2 + +# ENCODE blacklist-style mini TSV +svforge gen --caller manta --out data_local/gen-test/out.vcf.gz --n 50 --sample-name S1 --blacklist-fraction 0.1 + +# Both +svforge gen --caller manta --out data_local/gen-test/out.vcf.gz --n 100 --sample-name S1 --gnomad-fraction 0.15 --blacklist-fraction 0.1 --seed 7 +``` + +--- + +## 4. Tumor + normal pair (`gen-pair`) + +**Example:** + +```bash +svforge gen-pair \ + --caller manta \ + --out-tumor data_local/gen-test/tumor.vcf.gz \ + --out-normal data_local/gen-test/normal.vcf.gz \ + --n-somatic 30 \ + --n-germline 10 \ + --tumor-sample-name TUMOR_01 \ + --normal-sample-name NORMAL_01 \ + --seed 99 +``` + +Same **sampling flags** as `gen` work here (`--bank`, `--gnomad-fraction`, `--blacklist-fraction`, `--svtypes`, ranges, etc.). + +--- + +## 5. Check a VCF you generated (`validate`) + +Exit code `0` = pass; non-zero = fail. + +**Print to the terminal:** + +```bash +svforge validate --vcf data_local/gen-test/synthetic.vcf.gz +``` + +**Write a TSV report:** + +```bash +svforge validate --vcf data_local/gen-test/synthetic.vcf.gz --report-tsv data_local/gen-test/validate_report.tsv +``` + +If `bcftools index` fails, sort first: + +```bash +bcftools sort -Oz -o data_local/gen-test/synthetic.sorted.vcf.gz data_local/gen-test/synthetic.vcf.gz +bcftools index -t data_local/gen-test/synthetic.sorted.vcf.gz +svforge validate --vcf data_local/gen-test/synthetic.sorted.vcf.gz --report-tsv data_local/gen-test/validate_report.tsv +``` + +--- + +## 6. Developer commands (from repository root) + +| Step | Command | +|------|--------| +| Lint | `ruff check .` | +| Auto-fix (safe) | `ruff check . --fix` | +| Format | `ruff format .` | +| Types | `mypy src` | +| Tests | `pytest -q` | +| Tests + coverage | `pytest --cov=svforge --cov-report=term-missing` | +| One file | `pytest tests/test_cli.py` | + +**All in one go:** + +```bash +ruff check . && ruff format --check . && mypy src && pytest -q +``` + +--- + +## 7. Optional: **bcftools** (sanity on the file) + +```bash +bcftools view -h data_local/gen-test/synthetic.vcf.gz +bcftools view -H data_local/gen-test/synthetic.vcf.gz | head +``` + +--- + +## 8. Help in the terminal + +```bash +svforge --help +svforge gen --help +svforge gen-pair --help +svforge validate --help +svforge bank --help +``` diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c2df056 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,146 @@ +[build-system] +requires = ["hatchling>=1.24"] +build-backend = "hatchling.build" + +[project] +name = "svforge" +version = "1.0.0" +description = "Synthetic VCF generator for structural variants (Manta, DELLY) with controlled variability and realistic artefact injection" +readme = "README.md" +requires-python = ">=3.10" +license = { file = "LICENSE" } +authors = [ + { name = "Pierre NATIEZ" }, +] +keywords = [ + "bioinformatics", + "structural-variants", + "vcf", + "manta", + "delly", + "synthetic-data", + "testing", +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Operating System :: POSIX :: Linux", + "Operating System :: MacOS", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +dependencies = [ + "pysam>=0.22", + "intervaltree>=3.1", + "PyYAML>=6.0", +] + +[project.optional-dependencies] +dev = [ + "ruff>=0.5", + "mypy>=1.10", + "types-PyYAML", +] +test = [ + "pytest>=8.0", + "pytest-cov>=5.0", +] + +[project.scripts] +svforge = "svforge.cli:main" + +[project.entry-points."svforge.writers"] +manta = "svforge.writers.manta:MantaWriter" +delly = "svforge.writers.delly:DellyWriter" + +[project.urls] +Homepage = "https://github.com/pieetie/svforge" +Repository = "https://github.com/pieetie/svforge" +Issues = "https://github.com/pieetie/svforge/issues" +Changelog = "https://github.com/pieetie/svforge/blob/main/CHANGELOG.md" + +[tool.hatch.build.targets.wheel] +packages = ["src/svforge"] + +[tool.hatch.build.targets.wheel.force-include] +"src/svforge/data/banks/default_hg38.yaml" = "svforge/data/banks/default_hg38.yaml" +"src/svforge/data/headers/manta_somatic_hg38.vcf.template" = "svforge/data/headers/manta_somatic_hg38.vcf.template" +"src/svforge/data/headers/delly_somatic_hg38.vcf.template" = "svforge/data/headers/delly_somatic_hg38.vcf.template" +"src/svforge/data/injections/gnomad_hg38_mini.tsv" = "svforge/data/injections/gnomad_hg38_mini.tsv" +"src/svforge/data/injections/blacklist_hg38_mini.tsv" = "svforge/data/injections/blacklist_hg38_mini.tsv" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", + "/README.md", + "/CHANGELOG.md", + "/CONTRIBUTING.md", + "/LICENSE", + "/pyproject.toml", +] + +[tool.ruff] +line-length = 100 +target-version = "py310" +src = ["src", "tests"] + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "B", # bugbear + "UP", # pyupgrade + "SIM", # simplify + "PL", # pylint subset + "RUF", # ruff-specific + "C4", # comprehensions + "PTH", # pathlib + "TID", # tidy imports +] +ignore = [ + "E501", # line length handled by formatter + "PLR0913", # too many arguments (CLI code needs this) + "PLR2004", # magic values (tolerated in tests and VCF spec constants) +] + +[tool.ruff.lint.per-file-ignores] +"tests/**/*.py" = ["PLR2004", "S101"] + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" + +[tool.mypy] +python_version = "3.10" +strict = true +warn_unused_configs = true +warn_unreachable = true +disallow_untyped_defs = true +disallow_incomplete_defs = true +check_untyped_defs = true +no_implicit_optional = true +show_error_codes = true + +[[tool.mypy.overrides]] +module = [ + "pysam", + "pysam.*", + "intervaltree", +] +ignore_missing_imports = true + +[tool.pytest.ini_options] +minversion = "8.0" +testpaths = ["tests"] +addopts = [ + "-ra", + "--strict-markers", + "--strict-config", +] diff --git a/src/svforge/__init__.py b/src/svforge/__init__.py new file mode 100644 index 0000000..e1fb9f5 --- /dev/null +++ b/src/svforge/__init__.py @@ -0,0 +1,15 @@ +""" +svForge -- synthetic structural-variant VCF generator +""" + +from __future__ import annotations + +from importlib.metadata import PackageNotFoundError +from importlib.metadata import version as _pkg_version + +try: + __version__ = _pkg_version("svforge") +except PackageNotFoundError: # pragma: no cover -- running from a source checkout + __version__ = "0.0.0+local" + +__all__ = ["__version__"] diff --git a/src/svforge/__main__.py b/src/svforge/__main__.py new file mode 100644 index 0000000..72fc52a --- /dev/null +++ b/src/svforge/__main__.py @@ -0,0 +1,10 @@ +""" +Allow ``python -m svforge`` to invoke the CLI +""" + +from __future__ import annotations + +from svforge.cli import main + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/svforge/cli.py b/src/svforge/cli.py new file mode 100644 index 0000000..62d15e0 --- /dev/null +++ b/src/svforge/cli.py @@ -0,0 +1,396 @@ +""" +svforge command-line interface + +Subcommands: + +- ``gen`` single-sample synthetic VCF +- ``gen-pair`` tumor + normal paired VCFs +- ``validate`` self-consistency check against bundled injection catalogs +- ``bank`` list / show built-in banks +- ``callers`` list every registered writer +""" + +from __future__ import annotations + +import argparse +import logging +import secrets +import sys +from collections.abc import Sequence +from pathlib import Path + +import yaml + +from svforge import __version__ +from svforge.core.bank import Bank, list_builtin_banks, load_bank +from svforge.core.genome import GenomeBuild, normalize_chromosomes, validate_genome +from svforge.core.models import SV +from svforge.core.provenance import build_svforge_tags +from svforge.core.sampler import SamplerConfig, sample, sample_pair +from svforge.io.vcf_writer import write_vcf +from svforge.validate.annotate import ValidationResult, validate_vcf +from svforge.writers import CallerWriter, available_writers, get_writer + +_SEED_MAX = 2**31 - 1 + +log = logging.getLogger("svforge") + + +def main(argv: Sequence[str] | None = None) -> int: + """ + Entry point invoked by the ``svforge`` console script + """ + parser = _build_parser() + args = parser.parse_args(argv) + _setup_logging(args.verbose if hasattr(args, "verbose") else 0) + + handler = _DISPATCH.get(args.command) + if handler is None: + parser.print_help() + return 2 + try: + return handler(args) + except (ValueError, FileNotFoundError, KeyError) as exc: + log.error("%s: %s", type(exc).__name__, exc) + return 1 + + +def _build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + prog="svforge", + description="Synthetic VCF generator for structural variants", + ) + p.add_argument("--version", action="version", version=f"svforge {__version__}") + p.add_argument("-v", "--verbose", action="count", default=0, help="Increase log verbosity") + + sub = p.add_subparsers(dest="command", required=True) + + _add_gen_parser(sub) + _add_gen_pair_parser(sub) + _add_validate_parser(sub) + _add_bank_parser(sub) + _add_callers_parser(sub) + + return p + + +def _svtype_list(value: str) -> frozenset[str]: + items = [v.strip().upper() for v in value.split(",") if v.strip()] + if not items: + raise argparse.ArgumentTypeError("svtypes must not be empty") + return frozenset(items) + + +def _int_pair(value: str) -> tuple[int, int]: + try: + lo, hi = (int(v) for v in value.split(",")) + except ValueError as exc: + raise argparse.ArgumentTypeError(f"expected MIN,MAX ints, got {value!r}") from exc + if hi < lo: + raise argparse.ArgumentTypeError(f"MAX < MIN in {value!r}") + return lo, hi + + +def _float_pair(value: str) -> tuple[float, float]: + try: + lo, hi = (float(v) for v in value.split(",")) + except ValueError as exc: + raise argparse.ArgumentTypeError(f"expected MIN,MAX floats, got {value!r}") from exc + if hi < lo: + raise argparse.ArgumentTypeError(f"MAX < MIN in {value!r}") + return lo, hi + + +def _chromosome_list(value: str) -> list[str]: + items = [v.strip() for v in value.split(",") if v.strip()] + if not items: + raise argparse.ArgumentTypeError("--chromosomes must not be empty") + return items + + +def _add_common_sampling_args(p: argparse.ArgumentParser) -> None: + p.add_argument("--caller", required=True, choices=sorted(available_writers())) + p.add_argument("--bank", default="default_hg38", help="Built-in bank name or path to YAML bank") + p.add_argument("--seed", type=int, default=None) + p.add_argument( + "--genome", + default="hg38", + help="Reference build for contig lengths (V1: hg38 only)", + ) + p.add_argument( + "--header-template", + type=Path, + default=None, + metavar="PATH", + help="Override the bundled VCF header template with a user-supplied one", + ) + p.add_argument("--svtypes", type=_svtype_list, default=None) + p.add_argument( + "--blacklist-fraction", + type=float, + default=0.0, + help="Fraction of SVs drawn from the bundled ENCODE blacklist mini catalog", + ) + p.add_argument( + "--gnomad-fraction", + type=float, + default=0.0, + help="Fraction of SVs drawn from the bundled gnomAD mini catalog", + ) + p.add_argument("--homlen-range", type=_int_pair, default=None, metavar="MIN,MAX") + p.add_argument("--svlen-range", type=_int_pair, default=None, metavar="MIN,MAX") + p.add_argument("--vaf-range", type=_float_pair, default=(0.3, 1.0), metavar="MIN,MAX") + p.add_argument( + "--chromosomes", + type=_chromosome_list, + default=None, + metavar="CHR[,CHR...]", + help="Restrict SV generation to this subset of chromosomes (e.g. chr1,chr7 or 1,7)", + ) + + +def _add_gen_parser(sub: argparse._SubParsersAction[argparse.ArgumentParser]) -> None: + p = sub.add_parser("gen", help="Generate one synthetic VCF for one sample") + p.add_argument("--out", type=Path, required=True, help="Output .vcf / .vcf.gz / .bcf") + p.add_argument("--n", type=int, required=True) + p.add_argument("--sample-name", required=True) + _add_common_sampling_args(p) + + +def _add_gen_pair_parser(sub: argparse._SubParsersAction[argparse.ArgumentParser]) -> None: + p = sub.add_parser("gen-pair", help="Generate paired tumor/normal VCFs") + p.add_argument("--out-tumor", type=Path, required=True) + p.add_argument("--out-normal", type=Path, required=True) + p.add_argument("--n-somatic", type=int, required=True) + p.add_argument("--n-germline", type=int, required=True) + p.add_argument("--tumor-sample-name", required=True) + p.add_argument("--normal-sample-name", required=True) + _add_common_sampling_args(p) + + +def _add_validate_parser(sub: argparse._SubParsersAction[argparse.ArgumentParser]) -> None: + p = sub.add_parser( + "validate", + help="Self-consistency check: every SVFORGE_SOURCE-tagged record " + "matches the bundled mini catalog", + ) + p.add_argument("--vcf", type=Path, required=True) + p.add_argument( + "--report-tsv", + type=Path, + default=None, + help="Write the divergence report to this TSV (default: stdout)", + ) + + +def _add_bank_parser(sub: argparse._SubParsersAction[argparse.ArgumentParser]) -> None: + p = sub.add_parser("bank", help="Inspect built-in banks") + bsub = p.add_subparsers(dest="bank_cmd", required=True) + bsub.add_parser("list", help="List built-in banks") + show = bsub.add_parser("show", help="Dump a bank as YAML") + show.add_argument("name", nargs="?", default="default_hg38") + + +def _add_callers_parser(sub: argparse._SubParsersAction[argparse.ArgumentParser]) -> None: + sub.add_parser("callers", help="List every registered writer") + + +def _cmd_gen(args: argparse.Namespace) -> int: + bank = load_bank(args.bank) + effective_seed = _effective_seed(args.seed) + args.seed = effective_seed + cfg = _make_sampler_config(args, bank) + svs = sample(bank, cfg) + writer = get_writer(args.caller) + provenance = build_svforge_tags(caller=args.caller, seed=effective_seed, argv=sys.argv) + _write_sample_vcf( + writer, + svs, + args.sample_name, + args.out, + args.genome, + provenance, + header_template_override=args.header_template, + ) + log.info("Wrote %d SVs to %s (seed=%d)", len(svs), args.out, effective_seed) + return 0 + + +def _cmd_gen_pair(args: argparse.Namespace) -> int: + bank = load_bank(args.bank) + effective_seed = _effective_seed(args.seed) + args.seed = effective_seed + cfg = _make_sampler_config(args, bank, n_override=0) + pair = sample_pair(bank, args.n_somatic, args.n_germline, cfg) + writer = get_writer(args.caller) + provenance = build_svforge_tags(caller=args.caller, seed=effective_seed, argv=sys.argv) + _write_sample_vcf( + writer, + pair.tumor, + args.tumor_sample_name, + args.out_tumor, + args.genome, + provenance, + header_template_override=args.header_template, + ) + _write_sample_vcf( + writer, + pair.normal, + args.normal_sample_name, + args.out_normal, + args.genome, + provenance, + header_template_override=args.header_template, + ) + log.info( + "Tumor: %d SVs (%d somatic + %d germline), Normal: %d SVs -> %s, %s (seed=%d)", + len(pair.tumor), + len(pair.somatic_ids), + len(pair.germline_ids), + len(pair.normal), + args.out_tumor, + args.out_normal, + effective_seed, + ) + return 0 + + +def _effective_seed(seed: int | None) -> int: + """ + Pick the effective seed: the user's if supplied, otherwise a fresh one + + The result is always logged in the ``##svforgeSeed`` provenance tag so a + run is always reproducible from its output VCF + """ + if seed is not None: + return int(seed) + return secrets.randbelow(_SEED_MAX) + + +def _cmd_validate(args: argparse.Namespace) -> int: + result = validate_vcf(args.vcf) + text = _format_validation_report(result) + if args.report_tsv: + args.report_tsv.parent.mkdir(parents=True, exist_ok=True) + args.report_tsv.write_text(text, encoding="utf-8") + else: + sys.stdout.write(text) + return 0 if result.passed else 1 + + +def _format_validation_report(result: ValidationResult) -> str: + lines = ["metric\ttotal\tmatched\tstatus"] + for metric, total, matched in ( + ("gnomad_self_consistency", result.gnomad_total, result.gnomad_matched), + ("blacklist_self_consistency", result.blacklist_total, result.blacklist_matched), + ): + status = "PASS" if total == matched else "FAIL" + lines.append(f"{metric}\t{total}\t{matched}\t{status}") + if result.divergences: + lines.append("") + lines.append("# divergences (record_id, source, chrom, pos, end)") + for div in result.divergences: + lines.append(f"{div.record_id}\t{div.source}\t{div.chrom}\t{div.pos}\t{div.end}") + return "\n".join(lines) + "\n" + + +def _cmd_bank(args: argparse.Namespace) -> int: + if args.bank_cmd == "list": + for name in list_builtin_banks(): + print(name) + return 0 + if args.bank_cmd == "show": + bank = load_bank(args.name) + yaml.safe_dump(_bank_to_dict(bank), sys.stdout, sort_keys=False) + return 0 + return 2 # pragma: no cover -- argparse enforces required subcommand + + +def _cmd_callers(_args: argparse.Namespace) -> int: + for name in available_writers(): + print(name) + return 0 + + +_DISPATCH = { + "gen": _cmd_gen, + "gen-pair": _cmd_gen_pair, + "validate": _cmd_validate, + "bank": _cmd_bank, + "callers": _cmd_callers, +} + + +def _make_sampler_config( + args: argparse.Namespace, + bank: Bank, + n_override: int | None = None, +) -> SamplerConfig: + genome: GenomeBuild = validate_genome(args.genome) + + chroms: frozenset[str] | None = None + raw_chroms = getattr(args, "chromosomes", None) + if raw_chroms: + chroms = frozenset(normalize_chromosomes(raw_chroms, genome)) + + return SamplerConfig( + n=n_override if n_override is not None else args.n, + genome=genome, + svtypes=args.svtypes, + svlen_range=args.svlen_range, + homlen_range=args.homlen_range, + vaf_range=args.vaf_range, + blacklist_fraction=args.blacklist_fraction, + gnomad_fraction=args.gnomad_fraction, + seed=args.seed, + chroms=chroms, + ) + + +def _write_sample_vcf( + writer: CallerWriter, + svs: list[SV], + sample_name: str, + out_path: Path, + genome: GenomeBuild, + provenance_tags: Sequence[str], + *, + header_template_override: Path | None = None, +) -> None: + header = writer.header_lines( + sample_name, + genome=genome, + provenance_tags=provenance_tags, + template_override=header_template_override, + ) + records = writer.format_records(svs, sample_name) + write_vcf(out_path, header, records) + + +def _bank_to_dict(bank: Bank) -> dict[str, object]: + return { + "name": bank.name, + "genome": bank.genome, + "templates": [ + { + "svtype": t.svtype, + "svlen": [t.svlen_min, t.svlen_max], + "homlen": [t.homlen_min, t.homlen_max], + "weight": t.weight, + "chroms": list(t.chroms), + } + for t in bank.templates + ], + } + + +def _setup_logging(verbose: int) -> None: + level = logging.WARNING + if verbose == 1: + level = logging.INFO + elif verbose >= 2: + level = logging.DEBUG + logging.basicConfig( + level=level, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + ) diff --git a/src/svforge/core/__init__.py b/src/svforge/core/__init__.py new file mode 100644 index 0000000..eaf5902 --- /dev/null +++ b/src/svforge/core/__init__.py @@ -0,0 +1,5 @@ +""" +Core domain: models, sampler, bank, regions, genome +""" + +from __future__ import annotations diff --git a/src/svforge/core/bank.py b/src/svforge/core/bank.py new file mode 100644 index 0000000..ba472f4 --- /dev/null +++ b/src/svforge/core/bank.py @@ -0,0 +1,152 @@ +""" +SV banks: YAML-backed catalogue of weighted SV templates + +A bank is a set of :class:`SVTemplate` entries. Each template describes a +family of SVs (an svtype plus coordinate / length ranges) from which the +sampler draws concrete SVs. Banks are pure metadata: no genomic sequence +ever ships with the package +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from importlib import resources +from pathlib import Path +from typing import Any + +import yaml + +from svforge.core.genome import GenomeBuild, validate_genome + +BUILTIN_BANKS_PACKAGE = "svforge.data.banks" +BUILTIN_BANK_SUFFIX = ".yaml" + + +@dataclass(slots=True, frozen=True) +class SVTemplate: + """ + A weighted template describing how to draw one SV + """ + + svtype: str + svlen_min: int + svlen_max: int + homlen_min: int = 0 + homlen_max: int = 0 + weight: float = 1.0 + chroms: tuple[str, ...] = () + + def __post_init__(self) -> None: + if self.svlen_min < 1: + raise ValueError(f"svlen_min must be >= 1, got {self.svlen_min}") + if self.svlen_max < self.svlen_min: + raise ValueError( + f"svlen_max ({self.svlen_max}) must be >= svlen_min ({self.svlen_min})" + ) + if self.homlen_min < 0 or self.homlen_max < self.homlen_min: + raise ValueError(f"invalid homlen range [{self.homlen_min}, {self.homlen_max}]") + if self.weight <= 0: + raise ValueError(f"weight must be > 0, got {self.weight}") + + +@dataclass(slots=True) +class Bank: + """ + A named collection of SV templates bound to a genome build + """ + + name: str + genome: GenomeBuild + templates: list[SVTemplate] = field(default_factory=list) + + def __post_init__(self) -> None: + if not self.templates: + raise ValueError(f"Bank {self.name!r} has no templates") + + def by_svtype(self, svtype: str) -> list[SVTemplate]: + """ + Return all templates that produce ``svtype`` SVs + """ + return [t for t in self.templates if t.svtype == svtype] + + def svtypes(self) -> set[str]: + """ + Distinct svtypes covered by this bank + """ + return {t.svtype for t in self.templates} + + +def _template_from_dict(raw: dict[str, Any]) -> SVTemplate: + try: + svtype = str(raw["svtype"]) + svlen = raw["svlen"] + svlen_min, svlen_max = int(svlen[0]), int(svlen[1]) + except (KeyError, TypeError, ValueError) as exc: + raise ValueError(f"Invalid template entry: {raw!r} ({exc})") from exc + + homlen = raw.get("homlen", [0, 0]) + homlen_min, homlen_max = int(homlen[0]), int(homlen[1]) + weight = float(raw.get("weight", 1.0)) + chroms = tuple(str(c) for c in raw.get("chroms", [])) + + return SVTemplate( + svtype=svtype, + svlen_min=svlen_min, + svlen_max=svlen_max, + homlen_min=homlen_min, + homlen_max=homlen_max, + weight=weight, + chroms=chroms, + ) + + +def load_bank(source: str | Path) -> Bank: + """ + Load a bank by name (built-in) or by filesystem path + + If ``source`` names an existing file, it is parsed as YAML. Otherwise + ``source`` is looked up among the banks packaged under + ``svforge.data.banks`` + """ + text = _read_bank_text(source) + data = yaml.safe_load(text) + if not isinstance(data, dict): + raise ValueError(f"Bank source {source!r} must be a YAML mapping at top level") + + name = str(data.get("name", str(source))) + genome_raw = str(data.get("genome", "hg38")) + genome: GenomeBuild = validate_genome(genome_raw) + + raw_templates = data.get("templates") + if not isinstance(raw_templates, list) or not raw_templates: + raise ValueError(f"Bank {name!r} must define a non-empty 'templates' list") + + templates = [_template_from_dict(t) for t in raw_templates] + return Bank(name=name, genome=genome, templates=templates) + + +def _read_bank_text(source: str | Path) -> str: + path = Path(source) + if path.exists(): + return path.read_text(encoding="utf-8") + + resource = resources.files(BUILTIN_BANKS_PACKAGE).joinpath(f"{source}{BUILTIN_BANK_SUFFIX}") + if not resource.is_file(): + raise FileNotFoundError( + f"Bank {source!r} not found: neither a file path nor a built-in bank. " + f"Available: {sorted(list_builtin_banks())}" + ) + return resource.read_text(encoding="utf-8") + + +def list_builtin_banks() -> list[str]: + """ + Return the names of banks shipped with the package + """ + root = resources.files(BUILTIN_BANKS_PACKAGE) + names: list[str] = [] + for entry in root.iterdir(): + entry_name = entry.name + if entry_name.endswith(BUILTIN_BANK_SUFFIX): + names.append(entry_name.removesuffix(BUILTIN_BANK_SUFFIX)) + return sorted(names) diff --git a/src/svforge/core/genome.py b/src/svforge/core/genome.py new file mode 100644 index 0000000..772a417 --- /dev/null +++ b/src/svforge/core/genome.py @@ -0,0 +1,101 @@ +""" +Reference genome contig sizes + +svForge V1 supports hg38 only: the bundled header templates and contig +lengths are hg38-specific. Attempting to use any other build raises a +clear error pointing to ``--header-template`` for self-service override +""" + +from __future__ import annotations + +from typing import Literal + +GenomeBuild = Literal["hg38"] + +HG38_CONTIGS: dict[str, int] = { + "chr1": 248_956_422, + "chr2": 242_193_529, + "chr3": 198_295_559, + "chr4": 190_214_555, + "chr5": 181_538_259, + "chr6": 170_805_979, + "chr7": 159_345_973, + "chr8": 145_138_636, + "chr9": 138_394_717, + "chr10": 133_797_422, + "chr11": 135_086_622, + "chr12": 133_275_309, + "chr13": 114_364_328, + "chr14": 107_043_718, + "chr15": 101_991_189, + "chr16": 90_338_345, + "chr17": 83_257_441, + "chr18": 80_373_285, + "chr19": 58_617_616, + "chr20": 64_444_167, + "chr21": 46_709_983, + "chr22": 50_818_468, + "chrX": 156_040_895, + "chrY": 57_227_415, + "chrM": 16_569, +} + +SUPPORTED_GENOMES: tuple[str, ...] = ("hg38",) + + +def validate_genome(genome: str) -> GenomeBuild: + """ + Return ``genome`` if supported, else raise ``ValueError`` with a + message pointing the user at the ``--header-template`` escape hatch + """ + if genome in SUPPORTED_GENOMES: + return "hg38" + raise ValueError( + f"No bundled header template for genome {genome!r}.\n" + f"Provide your own via --header-template PATH, or use " + f"--genome {SUPPORTED_GENOMES[0]}." + ) + + +def get_contigs(genome: GenomeBuild) -> dict[str, int]: + """ + Return the contig -> length mapping for the given build + """ + validate_genome(genome) + return dict(HG38_CONTIGS) + + +def contig_length(genome: GenomeBuild, chrom: str) -> int: + """ + Length of ``chrom`` on ``genome``, raises KeyError if unknown + """ + return get_contigs(genome)[chrom] + + +def normalize_chromosomes( + chroms: list[str] | tuple[str, ...], + genome: GenomeBuild, +) -> list[str]: + """ + Normalise a list of chromosome names against ``genome`` + + Accepts short aliases (``"1"``, ``"X"``, ``"M"``, ``"MT"``) and adds the + ``chr`` prefix as needed. Unknown chromosomes raise :class:`ValueError` + with the list of valid names + """ + valid = get_contigs(genome) + out: list[str] = [] + for raw in chroms: + name = raw.strip() + if not name: + continue + candidate = name if name.startswith("chr") else f"chr{name}" + if candidate == "chrMT": + candidate = "chrM" + if candidate not in valid: + raise ValueError( + f"Unknown chromosome {raw!r} for genome {genome}. Valid: {', '.join(valid.keys())}" + ) + if candidate not in out: + out.append(candidate) + return out diff --git a/src/svforge/core/injection_catalogs.py b/src/svforge/core/injection_catalogs.py new file mode 100644 index 0000000..22d75c9 --- /dev/null +++ b/src/svforge/core/injection_catalogs.py @@ -0,0 +1,103 @@ +""" +Load and expose the bundled real-catalog injection entries + +Two TSV files ship in :mod:`svforge.data.injections`: + +- ``gnomad_hg38_mini.tsv`` -- curated real gnomAD v4.1 SV sites +- ``blacklist_hg38_mini.tsv`` -- curated ENCODE hg38 blacklist v2 regions + +When ``svforge gen`` is invoked with ``--gnomad-fraction`` or +``--blacklist-fraction`` the sampler pulls the requested number of +entries from these catalogs verbatim (no synthesis, no spatial bias). +Downstream pipelines recognise the injected SVs directly because the +CHROM/POS/END coincide with real catalog rows, giving ``svforge +validate`` a zero-ambiguity self-consistency check to verify that a +generated VCF was actually produced by this build +""" + +from __future__ import annotations + +import csv +from collections.abc import Iterator +from dataclasses import dataclass +from functools import cache +from importlib import resources + +GNOMAD_TSV = "gnomad_hg38_mini.tsv" +BLACKLIST_TSV = "blacklist_hg38_mini.tsv" + +@dataclass(frozen=True, slots=True) +class GnomadEntry: + """ + One curated gnomAD SV site + """ + + chrom: str + pos: int + end: int + end_chrom: str + svtype: str + source_id: str + +@dataclass(frozen=True, slots=True) +class BlacklistEntry: + """ + One curated ENCODE blacklist region + """ + + chrom: str + pos: int + end: int + region_type: str + source_id: str + + +@cache +def load_gnomad_catalog() -> tuple[GnomadEntry, ...]: + """ + Return the bundled gnomAD mini catalog, cached for the process life + """ + return tuple(_iter_gnomad(_read_tsv(GNOMAD_TSV))) + + +@cache +def load_blacklist_catalog() -> tuple[BlacklistEntry, ...]: + """ + Return the bundled blacklist mini catalog, cached for the process life + """ + return tuple(_iter_blacklist(_read_tsv(BLACKLIST_TSV))) + +def _read_tsv(name: str) -> list[dict[str, str]]: + resource = resources.files("svforge.data.injections").joinpath(name) + if not resource.is_file(): + raise RuntimeError(f"Injection catalog {name!r} is missing from the package") + text = resource.read_text(encoding="utf-8") + reader = csv.DictReader(text.splitlines(), delimiter="\t") + rows = list(reader) + if not rows: + raise RuntimeError(f"Injection catalog {name!r} is empty") + return rows + + + +def _iter_gnomad(rows: list[dict[str, str]]) -> Iterator[GnomadEntry]: + for row in rows: + yield GnomadEntry( + chrom=row["chrom"], + pos=int(row["pos"]), + end=int(row["end"]), + end_chrom=row.get("end_chrom") or row["chrom"], + svtype=row["svtype"].upper(), + source_id=row["source_id"], + ) + + +def _iter_blacklist(rows: list[dict[str, str]]) -> Iterator[BlacklistEntry]: + for row in rows: + yield BlacklistEntry( + chrom=row["chrom"], + pos=int(row["pos"]), + end=int(row["end"]), + region_type=row["region_type"], + source_id=row["source_id"], + ) diff --git a/src/svforge/core/models.py b/src/svforge/core/models.py new file mode 100644 index 0000000..9408ba1 --- /dev/null +++ b/src/svforge/core/models.py @@ -0,0 +1,136 @@ +""" +Domain models shared by sampler, writers and validator + +These dataclasses are the contract that every other module depends on. +A writer only ever receives :class:`SV` instances and must never assume +anything beyond what is declared here +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Literal + +SVType = Literal["DEL", "DUP", "INV", "INS", "BND"] +Genotype = Literal["0/0", "0/1", "1/1"] +Origin = Literal["somatic", "germline"] +SVSource = Literal["bank", "gnomad", "blacklist"] + +VALID_SVTYPES: frozenset[str] = frozenset({"DEL", "DUP", "INV", "INS", "BND"}) +VALID_GENOTYPES: frozenset[str] = frozenset({"0/0", "0/1", "1/1"}) +VALID_SOURCES: frozenset[str] = frozenset({"bank", "gnomad", "blacklist"}) + + +@dataclass(slots=True) +class Breakpoint: + """ + A single breakpoint on the reference + + ``strand`` encodes the orientation at the breakpoint: ``+`` means the + joined sequence extends to the right of ``pos``, ``-`` means to the left + """ + + chrom: str + pos: int + strand: Literal["+", "-"] = "+" + + def __post_init__(self) -> None: + if self.pos < 1: + raise ValueError(f"Breakpoint pos must be 1-based positive, got {self.pos}") + if self.strand not in {"+", "-"}: + raise ValueError(f"Breakpoint strand must be '+' or '-', got {self.strand!r}") + + +@dataclass(slots=True) +class SV: + """ + Caller-agnostic structural-variant record + + Writers format this into their own VCF dialect. The sampler produces it + with deterministic fields so two runs with the same seed yield identical + records + """ + + id: str + svtype: str + chrom: str + pos: int + end: int + svlen: int + mate_chrom: str | None = None + mate_pos: int | None = None + strands: str = "+-" + homlen: int = 0 + homseq: str = "" + vaf: float = 0.5 + genotype: str = "0/1" + ref_base: str = "N" + ins_seq: str = "" + filter: str = "PASS" + origin: Origin = "somatic" + source: SVSource = "bank" + info_extra: dict[str, str] = field(default_factory=dict) + + def __post_init__(self) -> None: + if self.svtype not in VALID_SVTYPES: + raise ValueError(f"Unknown svtype {self.svtype!r} (valid: {sorted(VALID_SVTYPES)})") + if self.genotype not in VALID_GENOTYPES: + raise ValueError( + f"Unknown genotype {self.genotype!r} (valid: {sorted(VALID_GENOTYPES)})" + ) + if self.pos < 1: + raise ValueError(f"SV pos must be 1-based positive, got {self.pos}") + if self.end < self.pos and self.svtype != "BND": + raise ValueError(f"SV end ({self.end}) must be >= pos ({self.pos}) for {self.svtype}") + if self.homlen < 0: + raise ValueError(f"HOMLEN must be non-negative, got {self.homlen}") + if not 0.0 <= self.vaf <= 1.0: + raise ValueError(f"VAF must be in [0, 1], got {self.vaf}") + if self.svtype == "BND" and (self.mate_chrom is None or self.mate_pos is None): + raise ValueError("BND SV requires mate_chrom and mate_pos") + if len(self.strands) != 2 or any(c not in "+-" for c in self.strands): + raise ValueError(f"strands must be two chars in '+-', got {self.strands!r}") + if self.source not in VALID_SOURCES: + raise ValueError(f"Unknown source {self.source!r} (valid: {sorted(VALID_SOURCES)})") + + @property + def breakpoint1(self) -> Breakpoint: + """ + Primary breakpoint (chrom, pos, strand1) + """ + return Breakpoint(self.chrom, self.pos, self.strands[0]) # type: ignore[arg-type] + + @property + def breakpoint2(self) -> Breakpoint: + """ + Secondary breakpoint (mate for BND, end for intra-chrom SVs) + """ + chrom = self.mate_chrom if self.svtype == "BND" else self.chrom + pos = self.mate_pos if self.svtype == "BND" else self.end + assert chrom is not None and pos is not None + return Breakpoint(chrom, pos, self.strands[1]) # type: ignore[arg-type] + + +@dataclass(slots=True) +class SVPair: + """ + Tumor + normal SV sets for paired somatic pipelines + + ``tumor`` contains all SVs present in the tumor VCF (somatic + germline) + and ``normal`` contains only the germline subset. Germline SVs appear in + both lists with the same id and coordinates; only VAF/genotype may + differ between tumor and normal for a given germline SV + """ + + tumor: list[SV] = field(default_factory=list) + normal: list[SV] = field(default_factory=list) + + @property + def somatic_ids(self) -> set[str]: + normal_ids = {sv.id for sv in self.normal} + return {sv.id for sv in self.tumor if sv.id not in normal_ids} + + @property + def germline_ids(self) -> set[str]: + normal_ids = {sv.id for sv in self.normal} + return {sv.id for sv in self.tumor if sv.id in normal_ids} diff --git a/src/svforge/core/provenance.py b/src/svforge/core/provenance.py new file mode 100644 index 0000000..cdf1a93 --- /dev/null +++ b/src/svforge/core/provenance.py @@ -0,0 +1,105 @@ +""" +Synthetic-data provenance helpers + +Every VCF produced by svforge carries six ``##svforge*`` tags immediately +after ``##fileformat`` so that a downstream consumer can always tell the +data is synthetic and trace it back to the exact command line and seed +that generated it + +The warning tag :data:`WARNING_VALUE` is deliberately not configurable: no +CLI flag, no environment variable, no hidden parameter can remove it. +That invariant is enforced by :func:`build_svforge_tags` +""" + +from __future__ import annotations + +from collections.abc import Sequence +from pathlib import PurePosixPath, PureWindowsPath + +from svforge import __version__ + +WARNING_VALUE = "SYNTHETIC_DATA_DO_NOT_USE_FOR_CLINICAL_DIAGNOSIS" +DOCUMENTATION_URL = "https://github.com/pieetie/svforge" + +_SVFORGE_TAG_ORDER: tuple[str, ...] = ( + "##svforgeVersion", + "##svforgeCommand", + "##svforgeSeed", + "##svforgeCaller", + "##svforgeWarning", + "##svforgeDocumentation", +) + + +def build_svforge_tags( + *, + caller: str, + seed: int, + argv: Sequence[str], + version: str = __version__, +) -> list[str]: + """ + Return the six provenance header lines, in spec order + + ``seed`` is the effective seed, even when the user did not pass one + explicitly. ``argv`` is typically :data:`sys.argv`; it is piped through + :func:`sanitize_command` before being embedded in the header + """ + command = sanitize_command(argv) + tags = [ + f"##svforgeVersion={version}", + f"##svforgeCommand={command}", + f"##svforgeSeed={seed}", + f"##svforgeCaller={caller}", + f"##svforgeWarning={WARNING_VALUE}", + f"##svforgeDocumentation={DOCUMENTATION_URL}", + ] + _assert_ordered(tags) + return tags + + +def sanitize_command(argv: Sequence[str]) -> str: + """ + Return a shell-safe command line with absolute paths stripped + + Each argv token is inspected individually: if the whole token (or the + value portion of a ``--flag=VALUE`` form) is an absolute path -- POSIX + ``/foo/bar``, macOS ``/Users/alice/...`` (which may contain whitespace + on the dev workstation), or Windows-style ``C:\\path\\bar`` -- it is + replaced by its basename so that cluster-internal or user-home + directories never leak into a VCF header + """ + parts: list[str] = [] + for token in argv: + if "=" in token and token.startswith("--"): + flag, _, value = token.partition("=") + if _is_absolute_path(value): + parts.append(f"{flag}={_basename(value)}") + continue + if _is_absolute_path(token): + parts.append(_basename(token)) + continue + parts.append(token) + return " ".join(parts) + + +def _is_absolute_path(s: str) -> bool: + if not s: + return False + if s.startswith("/"): + return True + return len(s) >= 3 and s[0].isalpha() and s[1:3] in (":\\", ":/") + + +def _basename(path: str) -> str: + if len(path) >= 3 and path[0].isalpha() and path[1:3] in (":\\", ":/"): + name = PureWindowsPath(path).name + else: + name = PurePosixPath(path).name + return name or path + + +def _assert_ordered(tags: Sequence[str]) -> None: + keys = [t.split("=", 1)[0] for t in tags] + if tuple(keys) != _SVFORGE_TAG_ORDER: + raise AssertionError(f"svforge tag order drift: {keys} != {_SVFORGE_TAG_ORDER}") diff --git a/src/svforge/core/regions.py b/src/svforge/core/regions.py new file mode 100644 index 0000000..008f04c --- /dev/null +++ b/src/svforge/core/regions.py @@ -0,0 +1,119 @@ +""" +BED region index with pure-Python interval trees + +Designed for blacklist- and gnomAD-region overlap queries. Supports +plain and gzip-compressed BED files via the standard library only; no +system dependency on tabix or bgzip is introduced here +""" + +from __future__ import annotations + +import gzip +from collections import defaultdict +from collections.abc import Iterable, Iterator +from pathlib import Path + +from intervaltree import IntervalTree + + +class RegionSet: + """ + In-memory set of half-open [start, end) intervals per chromosome + + BED uses 0-based half-open coordinates. We keep that convention + internally and convert at the boundaries (callers usually work in + 1-based closed VCF coordinates) + """ + + __slots__ = ("_trees",) + + def __init__(self, intervals: Iterable[tuple[str, int, int]] | None = None) -> None: + self._trees: dict[str, IntervalTree] = defaultdict(IntervalTree) + if intervals is not None: + for chrom, start, end in intervals: + self.add(chrom, start, end) + + def add(self, chrom: str, start: int, end: int) -> None: + """ + Add one half-open BED interval + """ + if end <= start: + return + self._trees[chrom].addi(start, end) + + def overlaps_bed(self, chrom: str, start: int, end: int) -> bool: + """ + True if any indexed region overlaps BED interval [start, end) + """ + tree = self._trees.get(chrom) + if tree is None or end <= start: + return False + return bool(tree.overlaps(start, end)) + + def overlaps_vcf(self, chrom: str, pos: int, end: int) -> bool: + """ + True if any indexed region overlaps a 1-based closed [pos, end] SV + """ + return self.overlaps_bed(chrom, pos - 1, end) + + def overlaps_point(self, chrom: str, pos: int) -> bool: + """ + True if any indexed region contains the 1-based VCF position ``pos`` + """ + tree = self._trees.get(chrom) + if tree is None: + return False + return bool(tree.at(pos - 1)) + + def __len__(self) -> int: + return sum(len(t) for t in self._trees.values()) + + @property + def chromosomes(self) -> set[str]: + return set(self._trees.keys()) + + def intervals_on(self, chrom: str) -> list[tuple[int, int]]: + """ + Return ``[(start, end)]`` BED intervals indexed on ``chrom`` + """ + tree = self._trees.get(chrom) + if tree is None: + return [] + return [(iv.begin, iv.end) for iv in tree] + + def all_intervals(self) -> list[tuple[str, int, int]]: + """ + Return ``[(chrom, start, end)]`` for every indexed interval + """ + return [(chrom, iv.begin, iv.end) for chrom, tree in self._trees.items() for iv in tree] + + +def load_bed(path: str | Path) -> RegionSet: + """ + Load a (optionally gzip-compressed) BED file into a :class:`RegionSet` + + Only the first three columns are read. Lines starting with ``#`` or + the ``browser``/``track`` header lines are skipped + """ + rs = RegionSet() + for chrom, start, end in _iter_bed(Path(path)): + rs.add(chrom, start, end) + return rs + + +def _iter_bed(path: Path) -> Iterator[tuple[str, int, int]]: + opener = gzip.open if path.suffix == ".gz" else open + with opener(path, "rt", encoding="utf-8") as fh: + for raw in fh: + line = raw.strip() + if not line or line.startswith(("#", "browser", "track")): + continue + parts = line.split("\t") + if len(parts) < 3: + continue + try: + start = int(parts[1]) + end = int(parts[2]) + except ValueError: + continue + yield parts[0], start, end diff --git a/src/svforge/core/sampler.py b/src/svforge/core/sampler.py new file mode 100644 index 0000000..680f6e8 --- /dev/null +++ b/src/svforge/core/sampler.py @@ -0,0 +1,503 @@ +""" +Sample SVs from a :class:`Bank` with reproducible RNG + +Knobs: + +- ``svtypes``: restrict to a subset of svtypes (e.g. ``{"DEL", "DUP"}``) +- ``svlen_range`` / ``homlen_range``: hard CLI overrides, clamp template ranges +- ``vaf_range``: uniform VAF draw +- ``gnomad_fraction`` / ``blacklist_fraction``: fraction of the requested SV + count drawn directly from the bundled mini catalogs (Phase C). The + catalog rows (CHROM/POS/END) are reproduced verbatim so downstream + pipelines match them exactly. Sampling is without replacement when + possible; with replacement otherwise + +``sample_pair`` composes ``sample`` twice to produce a consistent tumor/normal +SV set where germline events are duplicated across both samples with the same +coordinates but possibly different VAFs +""" + +from __future__ import annotations + +import logging +import random +import uuid +from collections.abc import Sequence +from dataclasses import dataclass +from typing import Literal, TypeVar + +from svforge.core.bank import Bank, SVTemplate +from svforge.core.genome import GenomeBuild, get_contigs +from svforge.core.injection_catalogs import ( + BlacklistEntry, + GnomadEntry, + load_blacklist_catalog, + load_gnomad_catalog, +) +from svforge.core.models import SV, SVPair + +_CHROM_SAMPLE_MARGIN = 10_000 +_BLACKLIST_SVTYPES = ("DEL", "DUP", "INV") + +_T = TypeVar("_T") + +log = logging.getLogger(__name__) + + +@dataclass(slots=True) +class SamplerConfig: + """ + Knobs that shape a sampling run + """ + + n: int + genome: GenomeBuild = "hg38" + svtypes: frozenset[str] | None = None + svlen_range: tuple[int, int] | None = None + homlen_range: tuple[int, int] | None = None + vaf_range: tuple[float, float] = (0.3, 1.0) + blacklist_fraction: float = 0.0 + gnomad_fraction: float = 0.0 + seed: int | None = None + origin: Literal["somatic", "germline"] = "somatic" + id_prefix: str = "svforge" + chroms: frozenset[str] | None = None + + def __post_init__(self) -> None: + if self.n < 0: + raise ValueError(f"n must be >= 0, got {self.n}") + if not 0.0 <= self.blacklist_fraction <= 1.0: + raise ValueError("blacklist_fraction must be in [0, 1]") + if not 0.0 <= self.gnomad_fraction <= 1.0: + raise ValueError("gnomad_fraction must be in [0, 1]") + total = self.blacklist_fraction + self.gnomad_fraction + if total > 1.0 + 1e-9: + raise ValueError( + f"blacklist_fraction + gnomad_fraction must be <= 1.0, got {total:.3f}" + ) + if self.vaf_range[0] > self.vaf_range[1] or not ( + 0.0 <= self.vaf_range[0] <= 1.0 and 0.0 <= self.vaf_range[1] <= 1.0 + ): + raise ValueError(f"invalid vaf_range: {self.vaf_range}") + + +def sample(bank: Bank, cfg: SamplerConfig) -> list[SV]: + """ + Draw ``cfg.n`` SVs from ``bank`` respecting ``cfg`` constraints + """ + rng = random.Random(cfg.seed) + contigs = get_contigs(cfg.genome) + if cfg.chroms is not None: + contigs = {c: n for c, n in contigs.items() if c in cfg.chroms} + + templates = _filter_templates(bank.templates, cfg.svtypes, cfg.chroms) + if not templates: + if cfg.chroms is not None: + raise ValueError( + f"No SV in bank matches the requested chromosomes: " + f"{sorted(cfg.chroms)}. Enrich the bank or broaden the filter" + ) + raise ValueError(f"No template in bank {bank.name!r} matches svtypes={cfg.svtypes}") + + weights = [t.weight for t in templates] + + n_blacklist = round(cfg.n * cfg.blacklist_fraction) + n_gnomad = round(cfg.n * cfg.gnomad_fraction) + if n_blacklist + n_gnomad > cfg.n: + n_gnomad = max(0, cfg.n - n_blacklist) + n_bank = max(0, cfg.n - n_blacklist - n_gnomad) + + svs: list[SV] = [] + svs.extend(_draw_bank(n_bank, templates, weights, contigs, cfg, rng)) + if n_gnomad: + svs.extend(_draw_gnomad(n_gnomad, cfg, rng)) + if n_blacklist: + svs.extend(_draw_blacklist(n_blacklist, cfg, rng)) + + rng.shuffle(svs) + svs.sort(key=lambda sv: (_chrom_key(sv.chrom), sv.pos)) + return svs + + +def sample_pair( + bank: Bank, + n_somatic: int, + n_germline: int, + cfg: SamplerConfig, + germline_vaf_range: tuple[float, float] = (0.45, 1.0), +) -> SVPair: + """ + Draw a coherent tumor + normal SV pair + + Germline SVs appear in both VCFs with the same id/coords. Their VAF in + the normal is drawn from ``germline_vaf_range`` (~0.5 het / ~1.0 hom) + while their VAF in the tumor stays within ``cfg.vaf_range`` + """ + if n_somatic < 0 or n_germline < 0: + raise ValueError("n_somatic and n_germline must be >= 0") + + base_seed = cfg.seed + + somatic_cfg = _replace(cfg, n=n_somatic, origin="somatic", seed=base_seed) + somatic = sample(bank, somatic_cfg) if n_somatic else [] + + germline_seed = None if base_seed is None else base_seed + 1 + germline_cfg = _replace(cfg, n=n_germline, origin="germline", seed=germline_seed) + germline_tumor = sample(bank, germline_cfg) if n_germline else [] + + rng_normal = random.Random(None if base_seed is None else base_seed + 2) + germline_normal: list[SV] = [] + for sv in germline_tumor: + vaf = rng_normal.uniform(*germline_vaf_range) + genotype = "1/1" if vaf >= 0.9 else "0/1" + germline_normal.append( + SV( + id=sv.id, + svtype=sv.svtype, + chrom=sv.chrom, + pos=sv.pos, + end=sv.end, + svlen=sv.svlen, + mate_chrom=sv.mate_chrom, + mate_pos=sv.mate_pos, + strands=sv.strands, + homlen=sv.homlen, + homseq=sv.homseq, + vaf=vaf, + genotype=genotype, + ref_base=sv.ref_base, + ins_seq=sv.ins_seq, + filter=sv.filter, + origin="germline", + source=sv.source, + info_extra=dict(sv.info_extra), + ) + ) + + tumor = sorted(somatic + germline_tumor, key=lambda sv: (_chrom_key(sv.chrom), sv.pos)) + normal = sorted(germline_normal, key=lambda sv: (_chrom_key(sv.chrom), sv.pos)) + return SVPair(tumor=tumor, normal=normal) + + +def _filter_templates( + templates: Sequence[SVTemplate], + svtypes: frozenset[str] | None, + chroms: frozenset[str] | None, +) -> list[SVTemplate]: + out: list[SVTemplate] = [] + for t in templates: + if svtypes is not None and t.svtype not in svtypes: + continue + if chroms is not None and t.chroms and not any(c in chroms for c in t.chroms): + continue + out.append(t) + return out + + +def _draw_bank( + count: int, + templates: Sequence[SVTemplate], + weights: Sequence[float], + contigs: dict[str, int], + cfg: SamplerConfig, + rng: random.Random, +) -> list[SV]: + out: list[SV] = [] + for _ in range(count): + template = rng.choices(list(templates), weights=list(weights), k=1)[0] + sv = _materialize(template, contigs, cfg, rng) + out.append(sv) + return out + + +def _draw_gnomad(count: int, cfg: SamplerConfig, rng: random.Random) -> list[SV]: + catalog = list(load_gnomad_catalog()) + eligible = _filter_gnomad_catalog(catalog, cfg) + if not eligible: + raise ValueError( + "No gnomAD catalog entry matches the active filter " + f"(svtypes={cfg.svtypes}, chroms={sorted(cfg.chroms) if cfg.chroms else None})" + ) + picks = _sample_without_replacement(eligible, count, rng) + return [_gnomad_entry_to_sv(entry, cfg, rng) for entry in picks] + + +def _draw_blacklist(count: int, cfg: SamplerConfig, rng: random.Random) -> list[SV]: + catalog = list(load_blacklist_catalog()) + eligible = _filter_blacklist_catalog(catalog, cfg) + if not eligible: + raise ValueError( + "No blacklist catalog entry matches the active filter " + f"(chroms={sorted(cfg.chroms) if cfg.chroms else None})" + ) + picks = _sample_without_replacement(eligible, count, rng) + return [_blacklist_entry_to_sv(entry, cfg, rng) for entry in picks] + + +def _filter_gnomad_catalog(catalog: list[GnomadEntry], cfg: SamplerConfig) -> list[GnomadEntry]: + out: list[GnomadEntry] = [] + for entry in catalog: + if cfg.svtypes is not None and entry.svtype not in cfg.svtypes: + continue + if cfg.chroms is not None and entry.chrom not in cfg.chroms: + continue + if cfg.chroms is not None and entry.end_chrom not in cfg.chroms: + continue + out.append(entry) + return out + + +def _filter_blacklist_catalog( + catalog: list[BlacklistEntry], cfg: SamplerConfig +) -> list[BlacklistEntry]: + out: list[BlacklistEntry] = [] + eligible_svtypes = tuple(cfg.svtypes) if cfg.svtypes is not None else _BLACKLIST_SVTYPES + if not any(t in _BLACKLIST_SVTYPES for t in eligible_svtypes): + raise ValueError( + "Blacklist injection requires at least one of DEL/DUP/INV in svtypes; " + f"got {sorted(eligible_svtypes)}" + ) + for entry in catalog: + if cfg.chroms is not None and entry.chrom not in cfg.chroms: + continue + out.append(entry) + return out + + +def _sample_without_replacement(pool: Sequence[_T], count: int, rng: random.Random) -> list[_T]: + if count <= len(pool): + return rng.sample(list(pool), count) + log.warning( + "Requested %d injections from catalog of size %d; falling back to sampling " + "with replacement", + count, + len(pool), + ) + return [rng.choice(list(pool)) for _ in range(count)] + + +def _gnomad_entry_to_sv(entry: GnomadEntry, cfg: SamplerConfig, rng: random.Random) -> SV: + vaf = rng.uniform(*cfg.vaf_range) + genotype = "1/1" if vaf >= 0.9 else "0/1" + sv_id = f"{cfg.id_prefix}_{uuid.UUID(int=rng.getrandbits(128)).hex[:10]}" + + if entry.svtype == "BND": + return SV( + id=sv_id, + svtype="BND", + chrom=entry.chrom, + pos=entry.pos, + end=entry.pos, + svlen=1, + mate_chrom=entry.end_chrom, + mate_pos=entry.end, + strands="+-", + vaf=vaf, + genotype=genotype, + origin=cfg.origin, + source="gnomad", + info_extra={"SVFORGE_SOURCE_ID": entry.source_id}, + ) + + svlen = max(1, entry.end - entry.pos) + return SV( + id=sv_id, + svtype=entry.svtype, + chrom=entry.chrom, + pos=entry.pos, + end=entry.end, + svlen=svlen, + strands=_default_strands(entry.svtype), + vaf=vaf, + genotype=genotype, + origin=cfg.origin, + source="gnomad", + info_extra={"SVFORGE_SOURCE_ID": entry.source_id}, + ) + + +def _blacklist_entry_to_sv(entry: BlacklistEntry, cfg: SamplerConfig, rng: random.Random) -> SV: + vaf = rng.uniform(*cfg.vaf_range) + genotype = "1/1" if vaf >= 0.9 else "0/1" + sv_id = f"{cfg.id_prefix}_{uuid.UUID(int=rng.getrandbits(128)).hex[:10]}" + + svtype_pool = ( + [t for t in _BLACKLIST_SVTYPES if cfg.svtypes is None or t in cfg.svtypes] + if cfg.svtypes is not None + else list(_BLACKLIST_SVTYPES) + ) + svtype = rng.choice(svtype_pool) + svlen = max(1, entry.end - entry.pos) + return SV( + id=sv_id, + svtype=svtype, + chrom=entry.chrom, + pos=entry.pos, + end=entry.end, + svlen=svlen, + strands=_default_strands(svtype), + vaf=vaf, + genotype=genotype, + origin=cfg.origin, + source="blacklist", + info_extra={"SVFORGE_SOURCE_ID": entry.source_id}, + ) + + +def _materialize( + template: SVTemplate, + contigs: dict[str, int], + cfg: SamplerConfig, + rng: random.Random, +) -> SV: + allowed_chroms = _template_chrom_pool(template, contigs, cfg) + if not allowed_chroms: + raise ValueError( + f"Template {template.svtype!r} has no chromosome compatible with the " + f"current filter (chroms={sorted(cfg.chroms) if cfg.chroms else None})" + ) + chrom = rng.choice(allowed_chroms) + chrom_len = contigs[chrom] + + svlen_min, svlen_max = _clamp_range((template.svlen_min, template.svlen_max), cfg.svlen_range) + homlen_min, homlen_max = _clamp_range( + (template.homlen_min, template.homlen_max), cfg.homlen_range + ) + + svlen = 1 if template.svtype == "BND" else rng.randint(svlen_min, svlen_max) + + homlen = rng.randint(homlen_min, homlen_max) + vaf = rng.uniform(*cfg.vaf_range) + genotype = "1/1" if vaf >= 0.9 else "0/1" + sv_id = f"{cfg.id_prefix}_{uuid.UUID(int=rng.getrandbits(128)).hex[:10]}" + + if template.svtype == "BND": + mate_candidates = _bnd_mate_candidates(chrom, contigs, cfg) + mate_chrom = rng.choice(mate_candidates) + pos = rng.randint( + _CHROM_SAMPLE_MARGIN, max(_CHROM_SAMPLE_MARGIN + 1, chrom_len - _CHROM_SAMPLE_MARGIN) + ) + mate_pos = rng.randint( + _CHROM_SAMPLE_MARGIN, + max(_CHROM_SAMPLE_MARGIN + 1, contigs[mate_chrom] - _CHROM_SAMPLE_MARGIN), + ) + strands = rng.choice(("+-", "-+", "++", "--")) + return SV( + id=sv_id, + svtype="BND", + chrom=chrom, + pos=pos, + end=pos, + svlen=svlen, + mate_chrom=mate_chrom, + mate_pos=mate_pos, + strands=strands, + homlen=homlen, + vaf=vaf, + genotype=genotype, + origin=cfg.origin, + source="bank", + ) + + max_pos = chrom_len - svlen - _CHROM_SAMPLE_MARGIN + if max_pos <= _CHROM_SAMPLE_MARGIN: + pos = _CHROM_SAMPLE_MARGIN + else: + pos = rng.randint(_CHROM_SAMPLE_MARGIN, max_pos) + end = pos + svlen + + strands = _default_strands(template.svtype) + return SV( + id=sv_id, + svtype=template.svtype, + chrom=chrom, + pos=pos, + end=end, + svlen=svlen, + strands=strands, + homlen=homlen, + vaf=vaf, + genotype=genotype, + origin=cfg.origin, + source="bank", + ) + + +def _template_chrom_pool( + template: SVTemplate, contigs: dict[str, int], cfg: SamplerConfig +) -> list[str]: + base = template.chroms if template.chroms else tuple(contigs.keys()) + pool = [c for c in base if c in contigs] + if cfg.chroms is not None: + pool = [c for c in pool if c in cfg.chroms] + return pool + + +def _bnd_mate_candidates(chrom: str, contigs: dict[str, int], cfg: SamplerConfig) -> list[str]: + """ + Return chroms eligible as a BND mate: different from ``chrom`` when + possible, restricted to the user filter when one is active + """ + universe = list(contigs.keys()) + if cfg.chroms is not None: + universe = [c for c in universe if c in cfg.chroms] + others = [c for c in universe if c != chrom] + if others: + return others + return universe or [chrom] + + +def _default_strands(svtype: str) -> str: + match svtype: + case "DEL": + return "+-" + case "DUP": + return "-+" + case "INV": + return "++" + case "INS": + return "+-" + case _: + return "+-" + + +def _clamp_range( + template_range: tuple[int, int], override: tuple[int, int] | None +) -> tuple[int, int]: + lo, hi = template_range + if override is None: + return lo, hi + o_lo, o_hi = override + new_lo = max(lo, o_lo) + new_hi = min(hi, o_hi) + if new_hi < new_lo: + new_lo = new_hi = max(o_lo, min(o_hi, lo)) + return new_lo, new_hi + + +def _replace(cfg: SamplerConfig, **kw: object) -> SamplerConfig: + data: dict[str, object] = { + "n": cfg.n, + "genome": cfg.genome, + "svtypes": cfg.svtypes, + "svlen_range": cfg.svlen_range, + "homlen_range": cfg.homlen_range, + "vaf_range": cfg.vaf_range, + "blacklist_fraction": cfg.blacklist_fraction, + "gnomad_fraction": cfg.gnomad_fraction, + "seed": cfg.seed, + "origin": cfg.origin, + "id_prefix": cfg.id_prefix, + "chroms": cfg.chroms, + } + for k, v in kw.items(): + data[k] = v + return SamplerConfig(**data) # type: ignore[arg-type] + + +def _chrom_key(chrom: str) -> tuple[int, str]: + stripped = chrom.removeprefix("chr") + if stripped.isdigit(): + return (int(stripped), "") + order = {"X": 23, "Y": 24, "M": 25, "MT": 25}.get(stripped, 99) + return (order, stripped) diff --git a/src/svforge/data/__init__.py b/src/svforge/data/__init__.py new file mode 100644 index 0000000..4f1602a --- /dev/null +++ b/src/svforge/data/__init__.py @@ -0,0 +1,5 @@ +""" +Packaged data (default banks) +""" + +from __future__ import annotations diff --git a/src/svforge/data/banks/default_hg38.yaml b/src/svforge/data/banks/default_hg38.yaml new file mode 100644 index 0000000..496765f --- /dev/null +++ b/src/svforge/data/banks/default_hg38.yaml @@ -0,0 +1,108 @@ +# Default hg38 SV bank shipped with svforge +# +# Each template draws an SV family. The sampler picks a template +# proportionally to its weight, then draws svlen uniformly within the +# range. chroms=[] means any primary contig is allowed + +name: default_hg38 +genome: hg38 + +templates: + # Deletions - dominant in most SV callsets + - svtype: DEL + svlen: [50, 500] + homlen: [0, 10] + weight: 4.0 + - svtype: DEL + svlen: [500, 5000] + homlen: [0, 20] + weight: 3.0 + - svtype: DEL + svlen: [5000, 50000] + homlen: [0, 30] + weight: 2.0 + - svtype: DEL + svlen: [50000, 500000] + homlen: [0, 50] + weight: 1.0 + + # Duplications + - svtype: DUP + svlen: [200, 5000] + homlen: [0, 15] + weight: 2.0 + - svtype: DUP + svlen: [5000, 50000] + homlen: [0, 25] + weight: 1.5 + - svtype: DUP + svlen: [50000, 500000] + homlen: [0, 40] + weight: 0.8 + + # Inversions + - svtype: INV + svlen: [500, 10000] + homlen: [0, 15] + weight: 1.5 + - svtype: INV + svlen: [10000, 200000] + homlen: [0, 30] + weight: 1.0 + - svtype: INV + svlen: [200000, 2000000] + homlen: [0, 50] + weight: 0.5 + + # Insertions + - svtype: INS + svlen: [50, 300] + homlen: [0, 10] + weight: 2.0 + - svtype: INS + svlen: [300, 2000] + homlen: [0, 15] + weight: 1.0 + + # BND - translocations and complex events + - svtype: BND + svlen: [1, 1] + homlen: [0, 10] + weight: 1.5 + - svtype: BND + svlen: [1, 1] + homlen: [0, 10] + weight: 1.0 + chroms: [chr1, chr2, chr3, chr4, chr5] + - svtype: BND + svlen: [1, 1] + homlen: [0, 10] + weight: 1.0 + chroms: [chr17, chr18, chr19, chr20, chr21, chr22] + + # Small representative extras to reach ~20 templates + - svtype: DEL + svlen: [50, 150] + homlen: [0, 5] + weight: 2.0 + chroms: [chrX] + - svtype: DUP + svlen: [200, 2000] + homlen: [0, 10] + weight: 0.5 + chroms: [chrX, chrY] + - svtype: INV + svlen: [1000, 50000] + homlen: [0, 20] + weight: 0.5 + chroms: [chrX] + - svtype: INS + svlen: [50, 500] + homlen: [0, 8] + weight: 0.5 + chroms: [chr21, chr22] + - svtype: DEL + svlen: [10000, 100000] + homlen: [0, 30] + weight: 1.0 + chroms: [chr13, chr14, chr15] diff --git a/src/svforge/data/headers/__init__.py b/src/svforge/data/headers/__init__.py new file mode 100644 index 0000000..c9921db --- /dev/null +++ b/src/svforge/data/headers/__init__.py @@ -0,0 +1,3 @@ +""" +Reference VCF header templates shipped with svforge +""" diff --git a/src/svforge/data/headers/delly_somatic_hg38.vcf.template b/src/svforge/data/headers/delly_somatic_hg38.vcf.template new file mode 100644 index 0000000..36daaea --- /dev/null +++ b/src/svforge/data/headers/delly_somatic_hg38.vcf.template @@ -0,0 +1,503 @@ +##fileformat=VCFv4.2 +##FILTER= +##fileDate={FILE_DATE} +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##reference={REFERENCE_FASTA} +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT {TUMOR_SAMPLE} {NORMAL_SAMPLE} diff --git a/src/svforge/data/headers/manta_somatic_hg38.vcf.template b/src/svforge/data/headers/manta_somatic_hg38.vcf.template new file mode 100644 index 0000000..1150923 --- /dev/null +++ b/src/svforge/data/headers/manta_somatic_hg38.vcf.template @@ -0,0 +1,491 @@ +##fileformat=VCFv4.1 +##fileDate={FILE_DATE} +##source=GenerateSVCandidates 1.6.0 +##reference=file:///{REFERENCE_FASTA} +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT=0.999"> +##FILTER= +##FILTER= +##FILTER= +##ALT= +##ALT= +##ALT= +##cmdline={CMDLINE} +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT {NORMAL_SAMPLE} {TUMOR_SAMPLE} diff --git a/src/svforge/data/injections/__init__.py b/src/svforge/data/injections/__init__.py new file mode 100644 index 0000000..b7033a3 --- /dev/null +++ b/src/svforge/data/injections/__init__.py @@ -0,0 +1,3 @@ +""" +Bundled real-catalog injection TSVs: gnomAD SVs + ENCODE blacklist regions +""" diff --git a/src/svforge/data/injections/blacklist_hg38_mini.tsv b/src/svforge/data/injections/blacklist_hg38_mini.tsv new file mode 100644 index 0000000..6ddbb4c --- /dev/null +++ b/src/svforge/data/injections/blacklist_hg38_mini.tsv @@ -0,0 +1,51 @@ +chrom pos end region_type source_id +chr1 1 792500 High_Signal_Region ENCODE-blacklist-v2-156 +chr1 121605201 124938900 High_Signal_Region ENCODE-blacklist-v2-159 +chr1 125130201 143562200 High_Signal_Region ENCODE-blacklist-v2-161 +chr2 89235301 89947100 High_Signal_Region ENCODE-blacklist-v2-207 +chr2 90246301 91735500 High_Signal_Region ENCODE-blacklist-v2-208 +chr2 91969001 94569500 High_Signal_Region ENCODE-blacklist-v2-210 +chr3 90455401 91297100 High_Signal_Region ENCODE-blacklist-v2-267 +chr3 91516201 93749200 High_Signal_Region ENCODE-blacklist-v2-268 +chr3 198099801 198295500 High_Signal_Region ENCODE-blacklist-v2-320 +chr4 9212701 9369600 High_Signal_Region ENCODE-blacklist-v2-326 +chr4 49077201 51816100 High_Signal_Region ENCODE-blacklist-v2-328 +chr4 190048601 190214500 High_Signal_Region ENCODE-blacklist-v2-339 +chr5 46708101 50165300 High_Signal_Region ENCODE-blacklist-v2-355 +chr5 69540701 71359500 High_Signal_Region ENCODE-blacklist-v2-358 +chr5 181172601 181538200 High_Signal_Region ENCODE-blacklist-v2-386 +chr6 26669201 26832300 High_Signal_Region ENCODE-blacklist-v2-390 +chr6 58432201 60242300 High_Signal_Region ENCODE-blacklist-v2-399 +chr6 61321801 61493000 High_Signal_Region ENCODE-blacklist-v2-400 +chr7 58031801 60997400 High_Signal_Region ENCODE-blacklist-v2-420 +chr7 61102901 61725200 Low_Mappability ENCODE-blacklist-v2-422 +chr7 102474701 102686400 High_Signal_Region ENCODE-blacklist-v2-428 +chr8 7209801 7914700 High_Signal_Region ENCODE-blacklist-v2-437 +chr8 12136901 12614300 High_Signal_Region ENCODE-blacklist-v2-440 +chr8 43937901 45969600 High_Signal_Region ENCODE-blacklist-v2-442 +chr9 43263101 61518900 High_Signal_Region ENCODE-blacklist-v2-470 +chr9 61735301 63548000 High_Signal_Region ENCODE-blacklist-v2-471 +chr9 66959001 68398100 High_Signal_Region ENCODE-blacklist-v2-475 +chr10 38782601 38967900 High_Signal_Region ENCODE-blacklist-v2-3 +chr10 39901301 41712900 High_Signal_Region ENCODE-blacklist-v2-4 +chr10 41838901 42107300 High_Signal_Region ENCODE-blacklist-v2-5 +chr11 1 194500 Low_Mappability ENCODE-blacklist-v2-9 +chr11 50813601 54383000 High_Signal_Region ENCODE-blacklist-v2-18 +chr11 61084501 61130400 High_Signal_Region ENCODE-blacklist-v2-19 +chr12 1 77800 High_Signal_Region ENCODE-blacklist-v2-29 +chr12 371801 422400 High_Signal_Region ENCODE-blacklist-v2-30 +chr12 34715401 37269100 High_Signal_Region ENCODE-blacklist-v2-37 +chr13 16087601 16165300 High_Signal_Region ENCODE-blacklist-v2-51 +chr13 16226301 18171400 High_Signal_Region ENCODE-blacklist-v2-52 +chr13 57140501 57172500 High_Signal_Region ENCODE-blacklist-v2-54 +chr14 1 18670900 High_Signal_Region ENCODE-blacklist-v2-57 +chr14 18695401 19724300 High_Signal_Region ENCODE-blacklist-v2-58 +chr14 23033301 23098600 High_Signal_Region ENCODE-blacklist-v2-59 +chr15 1 17035000 High_Signal_Region ENCODE-blacklist-v2-84 +chr15 17058501 19790100 High_Signal_Region ENCODE-blacklist-v2-85 +chr15 20005601 22606300 High_Signal_Region ENCODE-blacklist-v2-86 +chr16 34272001 34633100 High_Signal_Region ENCODE-blacklist-v2-106 +chr16 36225201 46423000 High_Signal_Region ENCODE-blacklist-v2-112 +chr16 90100501 90338300 Low_Mappability ENCODE-blacklist-v2-114 +chr17 21783301 22054000 High_Signal_Region ENCODE-blacklist-v2-121 +chr17 22745201 26629800 High_Signal_Region ENCODE-blacklist-v2-123 diff --git a/src/svforge/data/injections/gnomad_hg38_mini.tsv b/src/svforge/data/injections/gnomad_hg38_mini.tsv new file mode 100644 index 0000000..2df4b13 --- /dev/null +++ b/src/svforge/data/injections/gnomad_hg38_mini.tsv @@ -0,0 +1,51 @@ +chrom pos end end_chrom svtype source_id +chr1 11000 51000 chr1 DUP gnomAD-SV_v3_DUP_chr1_7d73682f +chr1 40000 47000 chr1 DEL gnomAD-SV_v3_DEL_chr1_b26f63f7 +chr1 48000 128000 chr1 DUP gnomAD-SV_v3_DUP_chr1_3d99dd1e +chr1 413968 420000 chr1 DEL gnomAD-SV_v3_DEL_chr1_9ccc15ff +chr1 788630 788842 chr1 INS gnomAD-SV_v3_INS_chr1_2de0b574 +chr1 904659 904661 chr1 INS gnomAD-SV_v3_INS_chr1_a12a6a8a +chr1 21203905 21204753 chr1 INV gnomAD-SV_v3_INV_chr1_dc645849 +chr1 83019127 83019322 chr1 INV gnomAD-SV_v3_INV_chr1_25560503 +chr2 10297 10596 chr2 DEL gnomAD-SV_v3_DEL_chr2_d39246cd +chr2 32148 34838 chr2 DEL gnomAD-SV_v3_DEL_chr2_942daf4f +chr2 108204 108269 chr2 DEL gnomAD-SV_v3_DEL_chr2_685cd0f1 +chr2 126760 126864 chr2 DUP gnomAD-SV_v3_DUP_chr2_51e5ae64 +chr2 126879 127037 chr2 DUP gnomAD-SV_v3_DUP_chr2_9d0bd14b +chr2 293737 294344 chr2 DEL gnomAD-SV_v3_DEL_chr2_fe24586f +chr2 306268 306784 chr2 DEL gnomAD-SV_v3_DEL_chr2_97cf8ee8 +chr2 306468 307318 chr2 DEL gnomAD-SV_v3_DEL_chr2_6bffebce +chr8 64900 107200 chr8 DUP gnomAD-SV_v3_DUP_chr8_ca5f17ef +chr8 65500 111250 chr8 DEL gnomAD-SV_v3_DEL_chr8_e6b32082 +chr8 76450 82000 chr8 DEL gnomAD-SV_v3_DEL_chr8_9ca25b62 +chr8 145000 151100 chr8 DUP gnomAD-SV_v3_DUP_chr8_c2284cfa +chr8 305613 305624 chr8 INS gnomAD-SV_v3_INS_chr8_01e27349 +chr8 329084 329130 chr8 INS gnomAD-SV_v3_INS_chr8_9dccb13f +chr8 20919207 20920924 chr8 INV gnomAD-SV_v3_INV_chr8_0a983474 +chr8 74175660 74183142 chr8 INV gnomAD-SV_v3_INV_chr8_1af72e78 +chr14 16022637 16037637 chr14 DUP gnomAD-SV_v3_DUP_chr14_673a2a3e +chr14 16038537 16043637 chr14 DEL gnomAD-SV_v3_DEL_chr14_465f6e53 +chr14 16070187 16075637 chr14 DEL gnomAD-SV_v3_DEL_chr14_f4c39155 +chr14 18225523 18249523 chr14 DUP gnomAD-SV_v3_DUP_chr14_0f14c9c8 +chr14 19098423 19098426 chr14 INS gnomAD-SV_v3_INS_chr14_22ab2005 +chr14 19434997 19435012 chr14 INS gnomAD-SV_v3_INS_chr14_83c766c7 +chr14 65375819 65376644 chr14 INV gnomAD-SV_v3_INV_chr14_7d152e48 +chr17 66000 122000 chr17 DEL gnomAD-SV_v3_DEL_chr17_b42efb31 +chr17 117310 117368 chr17 DUP gnomAD-SV_v3_DUP_chr17_93a44dec +chr17 121000 132000 chr17 DUP gnomAD-SV_v3_DUP_chr17_1e3b5a87 +chr17 135827 136376 chr17 DEL gnomAD-SV_v3_DEL_chr17_bd5ea421 +chr17 198647 198656 chr17 INS gnomAD-SV_v3_INS_chr17_757f4338 +chr17 227349 227365 chr17 INS gnomAD-SV_v3_INS_chr17_07e4dcab +chr17 37415089 37415701 chr17 INV gnomAD-SV_v3_INV_chr17_a6243630 +chr22 10510000 10587800 chr22 DUP gnomAD-SV_v3_DUP_chr22_e84c04b9 +chr22 10525801 10526000 chr22 DEL gnomAD-SV_v3_DEL_chr22_f605c0e4 +chr22 10526487 10526995 chr22 DEL gnomAD-SV_v3_DEL_chr22_3ef8bb14 +chr22 10571952 10572007 chr22 DUP gnomAD-SV_v3_DUP_chr22_1475627f +chr22 10572350 10572393 chr22 INS gnomAD-SV_v3_INS_chr22_69b1fb48 +chr22 10698837 10698869 chr22 INS gnomAD-SV_v3_INS_chr22_d15d476e +chrX 10000 43825 chrX DEL gnomAD-SV_v3_DEL_chrX_1b8d89f3 +chrX 12810 16459 chrX DEL gnomAD-SV_v3_DEL_chrX_2a4e1536 +chrX 13559 13759 chrX DUP gnomAD-SV_v3_DUP_chrX_24a451b4 +chrX 13789 13981 chrX DUP gnomAD-SV_v3_DUP_chrX_f71d7ae2 +chrX 226050 226068 chrX INS gnomAD-SV_v3_INS_chrX_a8e79c48 +chrX 226530 226618 chrX INS gnomAD-SV_v3_INS_chrX_488f71fb diff --git a/src/svforge/io/__init__.py b/src/svforge/io/__init__.py new file mode 100644 index 0000000..5fdc5be --- /dev/null +++ b/src/svforge/io/__init__.py @@ -0,0 +1,5 @@ +""" +Unified VCF/VCF.gz/BCF output helpers +""" + +from __future__ import annotations diff --git a/src/svforge/io/vcf_writer.py b/src/svforge/io/vcf_writer.py new file mode 100644 index 0000000..6ea07a5 --- /dev/null +++ b/src/svforge/io/vcf_writer.py @@ -0,0 +1,89 @@ +""" +Unified VCF / VCF.gz / BCF writer + +Writers produce text VCF lines. Plain ``.vcf`` and ``.vcf.gz`` outputs are +emitted as raw bytes so that the header the writer built (``##fileformat`` +immediately followed by the six ``##svforge*`` tags, then the rest of the +template) reaches disk verbatim. pysam is only used for ``.bcf``, where +BCF encoding is required; in that mode pysam always declares +``##FILTER=`` itself, which is an accepted drift for BCF and is +documented in the release notes +""" + +from __future__ import annotations + +import tempfile +from collections.abc import Iterable +from pathlib import Path + +import pysam + + +def detect_mode(path: Path) -> str: + """ + Map a path suffix to the pysam write-mode string + + Returns one of ``"w"`` (plain VCF), ``"wz"`` (bgzipped VCF) or + ``"wb"`` (BCF). Raises :class:`ValueError` for unknown suffixes + """ + name = path.name.lower() + if name.endswith(".vcf.gz"): + return "wz" + if name.endswith(".bcf"): + return "wb" + if name.endswith(".vcf"): + return "w" + raise ValueError(f"Cannot infer output format from {path!r}: expected .vcf, .vcf.gz or .bcf") + + +def write_vcf( + out_path: str | Path, + header_lines: Iterable[str], + record_lines: Iterable[str], +) -> Path: + """ + Materialise a text VCF to a file in the format implied by ``out_path`` + + Plain ``.vcf`` is written as raw text and the input header bytes hit + disk verbatim. ``.vcf.gz`` is bgzipped from the same text buffer. + ``.bcf`` is encoded via pysam (which introduces a pysam-managed + ``##FILTER=`` header line; accepted drift, see module docstring) + """ + out = Path(out_path) + out.parent.mkdir(parents=True, exist_ok=True) + mode = detect_mode(out) + + header_text = _lines_to_text(header_lines) + record_text = _lines_to_text(record_lines) + + if mode == "w": + out.write_text(header_text + record_text, encoding="utf-8") + return out + + if mode == "wz": + with tempfile.TemporaryDirectory(prefix="svforge_") as tmpdir: + staging = Path(tmpdir) / "staging.vcf" + staging.write_text(header_text + record_text, encoding="utf-8") + if out.exists(): + out.unlink() + pysam.tabix_compress(str(staging), str(out), force=True) + return out + + with tempfile.TemporaryDirectory(prefix="svforge_") as tmpdir: + staging = Path(tmpdir) / "staging.vcf" + staging.write_text(header_text + record_text, encoding="utf-8") + with ( + pysam.VariantFile(str(staging)) as vin, + pysam.VariantFile(str(out), mode, header=vin.header) as vout, # type: ignore[arg-type] + ): + for rec in vin: + vout.write(rec) + + return out + + +def _lines_to_text(lines: Iterable[str]) -> str: + parts: list[str] = [] + for line in lines: + parts.append(line if line.endswith("\n") else line + "\n") + return "".join(parts) diff --git a/src/svforge/validate/__init__.py b/src/svforge/validate/__init__.py new file mode 100644 index 0000000..c24e9d7 --- /dev/null +++ b/src/svforge/validate/__init__.py @@ -0,0 +1,5 @@ +""" +End-to-end sanity check of svforge-generated VCFs +""" + +from __future__ import annotations diff --git a/src/svforge/validate/annotate.py b/src/svforge/validate/annotate.py new file mode 100644 index 0000000..bfda2e6 --- /dev/null +++ b/src/svforge/validate/annotate.py @@ -0,0 +1,195 @@ +""" +Self-consistency validation for svforge-generated VCFs + +Reads a VCF produced by ``svforge gen`` / ``gen-pair`` and, for every +record tagged ``INFO/SVFORGE_SOURCE=gnomad`` or ``INFO/SVFORGE_SOURCE=blacklist``, +asserts that the record's ``CHROM/POS/END`` matches an entry in the +corresponding bundled mini catalog. The check is deterministic and +tolerance-free: either the VCF was generated by this svforge build and +matches, or it didn't and doesn't +""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path + +import pysam + +from svforge.core.injection_catalogs import ( + BlacklistEntry, + GnomadEntry, + load_blacklist_catalog, + load_gnomad_catalog, +) + + +@dataclass(frozen=True, slots=True) +class ValidationDivergence: + """ + One record whose coordinates do not match the bundled catalog + """ + + record_id: str + source: str + chrom: str + pos: int + end: int + + +@dataclass(frozen=True, slots=True) +class ValidationResult: + """ + Outcome of a self-consistency validation run + """ + + gnomad_total: int + gnomad_matched: int + blacklist_total: int + blacklist_matched: int + divergences: tuple[ValidationDivergence, ...] + + @property + def passed(self) -> bool: + return not self.divergences + + +def validate_vcf(vcf_path: str | Path) -> ValidationResult: + """ + Validate every ``SVFORGE_SOURCE``-tagged record in ``vcf_path`` + """ + gnomad_index = _index_gnomad(load_gnomad_catalog()) + blacklist_index = _index_blacklist(load_blacklist_catalog()) + + gnomad_total = 0 + gnomad_matched = 0 + blacklist_total = 0 + blacklist_matched = 0 + divergences: list[ValidationDivergence] = [] + + with pysam.VariantFile(str(vcf_path)) as vin: + for rec in vin: + source = _source_tag(rec) + if source is None: + continue + + chrom = str(rec.chrom) + pos = int(rec.pos) + end = _record_end(rec) + + if source == "gnomad": + gnomad_total += 1 + if _matches_gnomad(chrom, pos, end, gnomad_index): + gnomad_matched += 1 + else: + divergences.append( + ValidationDivergence( + record_id=str(rec.id or ""), + source=source, + chrom=chrom, + pos=pos, + end=end, + ) + ) + elif source == "blacklist": + blacklist_total += 1 + if _matches_blacklist(chrom, pos, end, blacklist_index): + blacklist_matched += 1 + else: + divergences.append( + ValidationDivergence( + record_id=str(rec.id or ""), + source=source, + chrom=chrom, + pos=pos, + end=end, + ) + ) + + return ValidationResult( + gnomad_total=gnomad_total, + gnomad_matched=gnomad_matched, + blacklist_total=blacklist_total, + blacklist_matched=blacklist_matched, + divergences=tuple(divergences), + ) + + +def _source_tag(rec: pysam.VariantRecord) -> str | None: + if "SVFORGE_SOURCE" not in rec.header.info: + return None + try: + value = rec.info.get("SVFORGE_SOURCE") + except (KeyError, ValueError): + return None + if isinstance(value, tuple): + value = value[0] if value else None + if value is None: + return None + source = str(value).lower() + if source in {"gnomad", "blacklist"}: + return source + return None + + +def _record_end(rec: pysam.VariantRecord) -> int: + svtype = _first_info(rec, "SVTYPE") + if svtype == "BND": + chr2 = _first_info(rec, "CHR2") + pos2 = _first_info(rec, "POS2") + if chr2 is not None and pos2 is not None: + return _to_int(pos2) + if svtype == "INS": + # Manta emits END=POS for insertions; the real end is POS + SVLEN + svlen = _first_info(rec, "SVLEN") + if svlen is not None: + return int(rec.pos) + abs(_to_int(svlen)) + return int(rec.stop) + + +def _index_gnomad( + catalog: tuple[GnomadEntry, ...], +) -> dict[tuple[str, int, int], GnomadEntry]: + return {(e.chrom, e.pos, e.end): e for e in catalog} + + +def _index_blacklist( + catalog: tuple[BlacklistEntry, ...], +) -> dict[tuple[str, int, int], BlacklistEntry]: + return {(e.chrom, e.pos, e.end): e for e in catalog} + + +def _matches_gnomad( + chrom: str, + pos: int, + end: int, + index: dict[tuple[str, int, int], GnomadEntry], +) -> bool: + return (chrom, pos, end) in index + + +def _matches_blacklist( + chrom: str, + pos: int, + end: int, + index: dict[tuple[str, int, int], BlacklistEntry], +) -> bool: + return (chrom, pos, end) in index + + +def _first_info(rec: pysam.VariantRecord, key: str) -> object: + if key not in rec.header.info: + return None + try: + value = rec.info.get(key) + except (KeyError, ValueError): + return None + if isinstance(value, tuple): + return value[0] if value else None + return value + + +def _to_int(value: object) -> int: + if isinstance(value, (int, str, float)): + return int(value) + return int(str(value)) diff --git a/src/svforge/writers/__init__.py b/src/svforge/writers/__init__.py new file mode 100644 index 0000000..8a4bf39 --- /dev/null +++ b/src/svforge/writers/__init__.py @@ -0,0 +1,27 @@ +""" +Writer registry: in-tree @register_writer + external entry-points + +External packages add a caller by exposing a subclass of +:class:`~svforge.writers.base.CallerWriter` under the +``svforge.writers`` entry-point group in their ``pyproject.toml``:: + + [project.entry-points."svforge.writers"] + mycaller = "mypkg.writers:MyCallerWriter" +""" + +from __future__ import annotations + +from svforge.writers import delly, manta # noqa: F401 ensure in-tree registration +from svforge.writers._registry import ( + available_writers, + get_writer, + register_writer, +) +from svforge.writers.base import CallerWriter + +__all__ = [ + "CallerWriter", + "available_writers", + "get_writer", + "register_writer", +] diff --git a/src/svforge/writers/_registry.py b/src/svforge/writers/_registry.py new file mode 100644 index 0000000..4fd342c --- /dev/null +++ b/src/svforge/writers/_registry.py @@ -0,0 +1,66 @@ +""" +Writer registry -- kept in its own module to avoid circular imports +""" + +from __future__ import annotations + +from collections.abc import Iterable +from functools import cache +from importlib.metadata import entry_points + +from svforge.writers.base import CallerWriter + +_ENTRY_POINT_GROUP = "svforge.writers" +_REGISTRY: dict[str, type[CallerWriter]] = {} + + +def register_writer(name: str): # type: ignore[no-untyped-def] + """ + Decorator that registers a :class:`CallerWriter` subclass under ``name`` + """ + + def decorator(cls: type[CallerWriter]) -> type[CallerWriter]: + if not issubclass(cls, CallerWriter): + raise TypeError(f"{cls!r} is not a CallerWriter subclass") + _REGISTRY[name] = cls + cls.name = name + return cls + + return decorator + + +def get_writer(name: str) -> CallerWriter: + """ + Instantiate the writer registered under ``name`` + """ + cls = _lookup(name) + if cls is None: + raise KeyError(f"No writer registered under {name!r}. Known: {sorted(available_writers())}") + return cls() + + +def available_writers() -> Iterable[str]: + """ + Names of every known writer (in-tree + entry-point) + """ + names = set(_REGISTRY.keys()) | set(_load_entry_point_classes().keys()) + return sorted(names) + + +def _lookup(name: str) -> type[CallerWriter] | None: + if name in _REGISTRY: + return _REGISTRY[name] + return _load_entry_point_classes().get(name) + + +@cache +def _load_entry_point_classes() -> dict[str, type[CallerWriter]]: + found: dict[str, type[CallerWriter]] = {} + for ep in entry_points(group=_ENTRY_POINT_GROUP): + try: + cls = ep.load() + except Exception: # pragma: no cover -- defensive + continue + if isinstance(cls, type) and issubclass(cls, CallerWriter): + found[ep.name] = cls + return found diff --git a/src/svforge/writers/base.py b/src/svforge/writers/base.py new file mode 100644 index 0000000..9871155 --- /dev/null +++ b/src/svforge/writers/base.py @@ -0,0 +1,243 @@ +""" +Abstract writer: every caller formatter inherits from :class:`CallerWriter` + +Writers know nothing of the sampler, the bank or the CLI. They only know +the :class:`SV` contract and produce VCF text lines (header + records) +which :mod:`svforge.io.vcf_writer` later materialises as VCF / VCF.gz / BCF + +Headers are built from real-world reference templates shipped under +:mod:`svforge.data.headers` so that the output is indistinguishable from +the target caller except for the ``##svforge*`` provenance tags injected +right after ``##fileformat``. Users who need a different build or a custom +caller variant can supply their own template via ``--header-template PATH`` +as long as it carries the placeholders listed in :attr:`CallerWriter.REQUIRED_PLACEHOLDERS` +""" + +from __future__ import annotations + +import datetime as _dt +from abc import ABC, abstractmethod +from collections.abc import Iterable, Sequence +from functools import cache +from importlib import resources +from pathlib import Path +from typing import ClassVar + +from svforge.core.genome import GenomeBuild, validate_genome +from svforge.core.models import SV + +SYNTHETIC_REFERENCE = "svforge_synthetic_reference.fa" + + +class CallerWriter(ABC): + """ + Caller-specific VCF dialect writer + + Attributes: + name: registry key, set by :func:`register_writer` + supported_svtypes: subset of ``{"DEL", "DUP", "INV", "INS", "BND"}`` + FILEFORMAT: VCF version string emitted by the caller (``VCFv4.1`` or + ``VCFv4.2``) + SOURCE_STRING: value of the ``##source`` header line; svforge always + acknowledges that it is mimicking the upstream caller + SAMPLE_COLUMN_ORDER: order of the sample columns in paired + (somatic) headers, e.g. ``("NORMAL", "TUMOR")`` for Manta + TEMPLATE_BASENAME: stem used to locate the reference header + template in :mod:`svforge.data.headers` (``f"{stem}_{genome}.vcf.template"``) + REQUIRED_PLACEHOLDERS: placeholders a user-supplied template must + contain to be considered valid for this writer + """ + + name: ClassVar[str] = "" + supported_svtypes: ClassVar[frozenset[str]] = frozenset({"DEL", "DUP", "INV", "INS", "BND"}) + + FILEFORMAT: ClassVar[str] = "VCFv4.2" + SOURCE_STRING: ClassVar[str] = "svforge" + SAMPLE_COLUMN_ORDER: ClassVar[tuple[str, ...]] = ("TUMOR", "NORMAL") + TEMPLATE_BASENAME: ClassVar[str] = "" + REQUIRED_PLACEHOLDERS: ClassVar[tuple[str, ...]] = ( + "{FILE_DATE}", + "{REFERENCE_FASTA}", + "{TUMOR_SAMPLE}", + "{NORMAL_SAMPLE}", + ) + + def header_lines( + self, + sample_name: str, + genome: GenomeBuild = "hg38", + *, + provenance_tags: Sequence[str] = (), + template_override: Path | None = None, + ) -> list[str]: + """ + Return the full VCF header for a single-sample VCF + + Built from the caller's reference template with the ``NORMAL`` column + dropped and the remaining sample slot filled with ``sample_name`` + """ + template = self._load_template(genome, template_override) + return _render_template( + template, + tumor_sample=sample_name, + normal_sample=None, + sample_order=self.SAMPLE_COLUMN_ORDER, + source_string=self.SOURCE_STRING, + provenance_tags=provenance_tags, + ) + + def header_lines_paired( + self, + tumor_sample: str, + normal_sample: str, + genome: GenomeBuild = "hg38", + *, + provenance_tags: Sequence[str] = (), + template_override: Path | None = None, + ) -> list[str]: + """ + Return the full VCF header for a paired tumor/normal (somatic) VCF + + Column order follows :attr:`SAMPLE_COLUMN_ORDER`. The rest of the + header is preserved verbatim from the reference template + """ + template = self._load_template(genome, template_override) + return _render_template( + template, + tumor_sample=tumor_sample, + normal_sample=normal_sample, + sample_order=self.SAMPLE_COLUMN_ORDER, + source_string=self.SOURCE_STRING, + provenance_tags=provenance_tags, + ) + + def _load_template( + self, + genome: GenomeBuild, + override: Path | None, + ) -> tuple[str, ...]: + if not self.TEMPLATE_BASENAME: + raise RuntimeError(f"{type(self).__name__} did not set TEMPLATE_BASENAME") + if override is not None: + return _load_override_template(override, self.name, self.REQUIRED_PLACEHOLDERS) + validate_genome(genome) + name = f"{self.TEMPLATE_BASENAME}_{genome}.vcf.template" + return _load_bundled_template(name) + + @abstractmethod + def format_record(self, sv: SV, sample_name: str) -> list[str]: + """ + Return one or more text VCF records for ``sv`` + + Manta returns two lines for a BND event (one per mate). DELLY and + simple symbolic records (DEL/DUP/INV/INS) return a single-line list + """ + + def format_records(self, svs: Iterable[SV], sample_name: str) -> list[str]: + """ + Format a whole SV collection to a list of text VCF records + """ + records: list[str] = [] + for sv in svs: + if sv.svtype not in self.supported_svtypes: + raise ValueError(f"Writer {self.name!r} does not support svtype {sv.svtype!r}") + records.extend(self.format_record(sv, sample_name)) + return records + + +@cache +def _load_bundled_template(name: str) -> tuple[str, ...]: + try: + resource = resources.files("svforge.data.headers").joinpath(name) + except ModuleNotFoundError as exc: # pragma: no cover -- packaging-time guard + raise RuntimeError("svforge.data.headers not installed") from exc + if not resource.is_file(): + raise ValueError( + f"No bundled header template {name!r}. Provide your own via --header-template PATH." + ) + text = resource.read_text(encoding="utf-8") + return tuple(text.splitlines()) + + +def _load_override_template( + path: Path, + caller: str, + required: Sequence[str], +) -> tuple[str, ...]: + if not path.is_file(): + raise FileNotFoundError(f"Header template {path} does not exist") + text = path.read_text(encoding="utf-8") + missing = [ph for ph in required if ph not in text] + if missing: + raise ValueError( + f"Header template {path} is missing required placeholders for " + f"caller {caller!r}: {', '.join(missing)}" + ) + return tuple(text.splitlines()) + + +def _render_template( + template: Sequence[str], + *, + tumor_sample: str, + normal_sample: str | None, + sample_order: Sequence[str], + source_string: str, + provenance_tags: Sequence[str], +) -> list[str]: + """ + Apply placeholder substitutions and inject svforge + source lines + """ + lines = list(template) + if not lines or not lines[0].startswith("##fileformat="): + raise RuntimeError("template must start with ##fileformat=") + + file_date = _dt.datetime.now(_dt.timezone.utc).strftime("%Y%m%d") + cmdline = _cmdline_from_tags(provenance_tags) + out: list[str] = [lines[0]] + out.extend(provenance_tags) + + saw_source = False + for raw in lines[1:]: + line = ( + raw.replace("{FILE_DATE}", file_date) + .replace("{CMDLINE}", cmdline) + .replace("{REFERENCE_FASTA}", SYNTHETIC_REFERENCE) + ) + if line.startswith("#CHROM"): + line = _render_chrom_line(line, tumor_sample, normal_sample, sample_order) + if not saw_source: + out.append(f"##source={source_string}") + saw_source = True + out.append(line) + continue + if line.startswith("##source"): + out.append(f"##source={source_string}") + saw_source = True + continue + out.append(line) + return out + + +def _render_chrom_line( + line: str, + tumor_sample: str, + normal_sample: str | None, + sample_order: Sequence[str], +) -> str: + fields = line.split("\t") + if len(fields) < 10: + raise RuntimeError(f"malformed #CHROM line in template: {line!r}") + header = fields[:9] + if normal_sample is None: + return "\t".join([*header, tumor_sample]) + mapping = {"TUMOR": tumor_sample, "NORMAL": normal_sample} + ordered = [mapping[role] for role in sample_order] + return "\t".join([*header, *ordered]) + + +def _cmdline_from_tags(provenance_tags: Sequence[str]) -> str: + for tag in provenance_tags: + if tag.startswith("##svforgeCommand="): + return tag.split("=", 1)[1] + return "" diff --git a/src/svforge/writers/delly.py b/src/svforge/writers/delly.py new file mode 100644 index 0000000..ea0db82 --- /dev/null +++ b/src/svforge/writers/delly.py @@ -0,0 +1,139 @@ +""" +DELLY VCF writer + +Reproduces the VCF 4.2 dialect emitted by DELLY 1.x for somatic SV calls. +The header is loaded verbatim from the reference template shipped under +:mod:`svforge.data.headers`; only ``##svforge*`` provenance tags, the +``##source`` line and dynamic fields (``##fileDate``) are rewritten. +Unlike Manta, DELLY emits a single record per BND event and encodes mate +coordinates via ``CHR2``/``POS2`` INFO fields plus a connection type +``CT`` (e.g. ``3to5`` for DEL-like, ``5to3`` for DUP-like) +""" + +from __future__ import annotations + +from typing import ClassVar + +from svforge import __version__ +from svforge.core.models import SV +from svforge.writers.base import CallerWriter + + +class DellyWriter(CallerWriter): + """ + DELLY 1.x VCF 4.2 writer + """ + + name: ClassVar[str] = "delly" + supported_svtypes: ClassVar[frozenset[str]] = frozenset({"DEL", "DUP", "INV", "INS", "BND"}) + + FILEFORMAT: ClassVar[str] = "VCFv4.2" + SOURCE_STRING: ClassVar[str] = f"svforge-{__version__} (mimics DELLY)" + SAMPLE_COLUMN_ORDER: ClassVar[tuple[str, ...]] = ("TUMOR", "NORMAL") + TEMPLATE_BASENAME: ClassVar[str] = "delly_somatic" + REQUIRED_PLACEHOLDERS: ClassVar[tuple[str, ...]] = ( + "{FILE_DATE}", + "{REFERENCE_FASTA}", + "{TUMOR_SAMPLE}", + "{NORMAL_SAMPLE}", + ) + + def format_record(self, sv: SV, sample_name: str) -> list[str]: + return [_delly_record(sv)] + + +def _ct_for(sv: SV) -> str: + """ + Return the DELLY connection-type string for an SV + """ + match sv.svtype: + case "DEL": + return "3to5" + case "DUP": + return "5to3" + case "INV": + return "3to3" if sv.strands == "++" else "5to5" + case "INS": + return "3to5" + case "BND": + s1, s2 = sv.strands[0], sv.strands[1] + left = "3" if s1 == "+" else "5" + right = "5" if s2 == "-" else "3" + return f"{left}to{right}" + case _: # pragma: no cover + return "NtoN" + + +def _sample_column(sv: SV) -> tuple[str, str]: + total = 30 + alt = max(1, round(total * sv.vaf)) + ref = max(0, total - alt) + junction_alt = max(1, alt // 2) + junction_ref = max(0, ref // 2) + gq = 60 + gl = ( + "-50,-5,0" + if sv.genotype == "1/1" + else ("0,-5,-50" if sv.genotype == "0/0" else "-10,-1,-10") + ) + fmt = "GT:GL:GQ:FT:RC:RCL:RCR:RDCN:DR:DV:RR:RV" + sample = f"{sv.genotype}:{gl}:{gq}:PASS:40:20:20:2:{ref}:{alt}:{junction_ref}:{junction_alt}" + return fmt, sample + + +def _base_info(sv: SV) -> list[str]: + ct = _ct_for(sv) + end = sv.end if sv.svtype != "BND" else sv.pos + chr2 = sv.mate_chrom if sv.svtype == "BND" else sv.chrom + pos2 = sv.mate_pos if sv.svtype == "BND" else sv.end + info = [ + "PRECISE", + f"SVTYPE={sv.svtype}", + "SVMETHOD=EMBL.DELLYv1.2.6-svforge", + f"CHR2={chr2}", + f"END={end}", + f"POS2={pos2}", + "PE=20", + "MAPQ=60", + f"CT={ct}", + "CIPOS=-10,10", + "CIEND=-10,10", + "SR=5", + "SRQ=1", + "CE=1.9", + ] + if sv.svtype == "INS": + info.append(f"SVLEN={sv.svlen}") + if sv.homlen: + info.append(f"HOMLEN={sv.homlen}") + if sv.origin == "somatic": + info.append("SOMATIC") + info.append(f"SVFORGE_SOURCE={sv.source}") + for key, value in sv.info_extra.items(): + info.append(f"{key}={value}" if value else key) + return info + + +def _delly_record(sv: SV) -> str: + alt = f"<{sv.svtype}>" + info = ";".join(_base_info(sv)) + fmt, sample = _sample_column(sv) + return "\t".join( + [ + sv.chrom, + str(sv.pos), + sv.id, + sv.ref_base, + alt, + ".", + sv.filter, + info, + fmt, + sample, + ] + ) + + +from svforge.writers._registry import register_writer # noqa: E402 + +register_writer("delly")(DellyWriter) diff --git a/src/svforge/writers/manta.py b/src/svforge/writers/manta.py new file mode 100644 index 0000000..2bc7d2f --- /dev/null +++ b/src/svforge/writers/manta.py @@ -0,0 +1,204 @@ +""" +Manta VCF writer + +Reproduces the VCF 4.1 dialect emitted by Illumina Manta 1.6.0 for +somatic SV calls. The header is loaded verbatim from the reference +template shipped under :mod:`svforge.data.headers` so that downstream +parsers cannot distinguish svforge output from a real Manta run, except +for the ``##svforge*`` provenance tags + +BND events are emitted as two mate records with ``MATEID`` cross-reference, +matching real Manta output. DEL/DUP/INV/INS events use symbolic alleles +""" + +from __future__ import annotations + +from typing import ClassVar + +from svforge import __version__ +from svforge.core.models import SV +from svforge.writers.base import CallerWriter + + +class MantaWriter(CallerWriter): + """ + Manta 1.6.x VCF 4.1 writer + """ + + name: ClassVar[str] = "manta" + supported_svtypes: ClassVar[frozenset[str]] = frozenset({"DEL", "DUP", "INV", "INS", "BND"}) + + FILEFORMAT: ClassVar[str] = "VCFv4.1" + SOURCE_STRING: ClassVar[str] = f"svforge-{__version__} (mimics GenerateSVCandidates 1.6.0)" + SAMPLE_COLUMN_ORDER: ClassVar[tuple[str, ...]] = ("NORMAL", "TUMOR") + TEMPLATE_BASENAME: ClassVar[str] = "manta_somatic" + REQUIRED_PLACEHOLDERS: ClassVar[tuple[str, ...]] = ( + "{FILE_DATE}", + "{REFERENCE_FASTA}", + "{CMDLINE}", + "{TUMOR_SAMPLE}", + "{NORMAL_SAMPLE}", + ) + + def format_record(self, sv: SV, sample_name: str) -> list[str]: + if sv.svtype == "BND": + return _bnd_mate_records(sv) + return [_symbolic_record(sv)] + + +def _sample_depth(vaf: float) -> tuple[int, int]: + """ + Return (ref_count, alt_count) consistent with ``vaf`` at 30x total + """ + total = 30 + alt = max(1, round(total * vaf)) + ref = max(0, total - alt) + return ref, alt + + +def _sample_column(sv: SV) -> tuple[str, str]: + """ + Return (FORMAT, sample) columns for a Manta record + + Only the FORMAT fields declared in the reference template (``PR``, ``SR``) + are emitted so that the output parses cleanly through :mod:`pysam` + """ + ref, alt = _sample_depth(sv.vaf) + split_alt = max(1, alt // 2) + split_ref = max(0, ref // 2) + fmt = "PR:SR" + sample = f"{ref},{alt}:{split_ref},{split_alt}" + return fmt, sample + + +def _base_info(sv: SV) -> list[str]: + info: list[str] = [f"SVTYPE={sv.svtype}"] + if sv.svtype in {"DEL", "DUP", "INV"}: + signed_len = -sv.svlen if sv.svtype == "DEL" else sv.svlen + info.append(f"END={sv.end}") + info.append(f"SVLEN={signed_len}") + elif sv.svtype == "INS": + info.append(f"END={sv.pos}") + info.append(f"SVLEN={sv.svlen}") + info.append(f"SVINSLEN={sv.svlen}") + if sv.ins_seq: + info.append(f"SVINSSEQ={sv.ins_seq}") + if sv.homlen: + info.append(f"HOMLEN={sv.homlen}") + if sv.homseq: + info.append(f"HOMSEQ={sv.homseq}") + if sv.origin == "somatic" and sv.svtype != "BND": + info.append("SOMATIC") + info.append("SOMATICSCORE=60") + info.append(f"SVFORGE_SOURCE={sv.source}") + for key, value in sv.info_extra.items(): + info.append(f"{key}={value}" if value else key) + return info + + +def _symbolic_record(sv: SV) -> str: + alt = f"<{sv.svtype}>" + info = ";".join(_base_info(sv)) + fmt, sample = _sample_column(sv) + return "\t".join( + [ + sv.chrom, + str(sv.pos), + sv.id, + sv.ref_base, + alt, + ".", + sv.filter, + info, + fmt, + sample, + ] + ) + + +def _bnd_alt(ref_base: str, mate_chrom: str, mate_pos: int, strands: str) -> str: + """ + Return the Manta-style BND ALT string for a breakend + + VCF 4.2 mate syntax: + + - ``++`` (``t[p[``): REF joins region right of ``p`` + - ``+-`` (``t]p]``): REF joins region left of ``p`` + - ``-+`` (``[p[t``): region right of ``p`` joins REF + - ``--`` (``]p]t``): region left of ``p`` joins REF + """ + loc = f"{mate_chrom}:{mate_pos}" + s1, s2 = strands[0], strands[1] + if s1 == "+" and s2 == "+": + return f"{ref_base}[{loc}[" + if s1 == "+" and s2 == "-": + return f"{ref_base}]{loc}]" + if s1 == "-" and s2 == "+": + return f"[{loc}[{ref_base}" + return f"]{loc}]{ref_base}" + + +def _mate_strands(strands: str) -> str: + """ + Flip strands for the mate record (breakend pair) + """ + s1, s2 = strands[0], strands[1] + return f"{s2}{s1}" + + +def _bnd_mate_records(sv: SV) -> list[str]: + if sv.mate_chrom is None or sv.mate_pos is None: + raise ValueError(f"BND {sv.id!r} missing mate coordinates") + id1 = f"{sv.id}_1" + id2 = f"{sv.id}_2" + + alt1 = _bnd_alt(sv.ref_base, sv.mate_chrom, sv.mate_pos, sv.strands) + alt2 = _bnd_alt(sv.ref_base, sv.chrom, sv.pos, _mate_strands(sv.strands)) + + info1 = ["SVTYPE=BND", f"MATEID={id2}", f"EVENT={sv.id}"] + info2 = ["SVTYPE=BND", f"MATEID={id1}", f"EVENT={sv.id}"] + if sv.homlen: + info1.append(f"HOMLEN={sv.homlen}") + info2.append(f"HOMLEN={sv.homlen}") + if sv.origin == "somatic": + for info in (info1, info2): + info.append("SOMATIC") + info.append("SOMATICSCORE=60") + info1.append(f"SVFORGE_SOURCE={sv.source}") + info2.append(f"SVFORGE_SOURCE={sv.source}") + + fmt, sample = _sample_column(sv) + rec1 = "\t".join( + [ + sv.chrom, + str(sv.pos), + id1, + sv.ref_base, + alt1, + ".", + sv.filter, + ";".join(info1), + fmt, + sample, + ] + ) + rec2 = "\t".join( + [ + sv.mate_chrom, + str(sv.mate_pos), + id2, + sv.ref_base, + alt2, + ".", + sv.filter, + ";".join(info2), + fmt, + sample, + ] + ) + return [rec1, rec2] + + +from svforge.writers._registry import register_writer # noqa: E402 + +register_writer("manta")(MantaWriter) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..9652f74 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,36 @@ +""" +Shared fixtures for the svforge test suite +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from svforge.core.bank import Bank, load_bank +from svforge.core.models import SV +from svforge.core.sampler import SamplerConfig, sample + +FIXTURES_DIR = Path(__file__).parent / "fixtures" + +@pytest.fixture +def fixtures_dir() -> Path: + return FIXTURES_DIR + +@pytest.fixture +def mini_bank_path(fixtures_dir: Path) -> Path: + return fixtures_dir / "mini_bank.yaml" + +@pytest.fixture +def mini_blacklist_path(fixtures_dir: Path) -> Path: + return fixtures_dir / "mini_blacklist.bed" + +@pytest.fixture +def mini_bank(mini_bank_path: Path) -> Bank: + return load_bank(mini_bank_path) + +@pytest.fixture +def sample_svs(mini_bank: Bank) -> list[SV]: + cfg = SamplerConfig(n=10, seed=42) + return sample(mini_bank, cfg) diff --git a/tests/fixtures/broken_manta.template b/tests/fixtures/broken_manta.template new file mode 100644 index 0000000..abad62c --- /dev/null +++ b/tests/fixtures/broken_manta.template @@ -0,0 +1,7 @@ +##fileformat=VCFv4.1 +##fileDate={FILE_DATE} +##source=GenerateSVCandidates 1.6.0 +##contig= +##INFO= +##cmdline={CMDLINE} +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT {NORMAL_SAMPLE} {TUMOR_SAMPLE} diff --git a/tests/fixtures/custom_manta.template b/tests/fixtures/custom_manta.template new file mode 100644 index 0000000..a80a994 --- /dev/null +++ b/tests/fixtures/custom_manta.template @@ -0,0 +1,27 @@ +##fileformat=VCFv4.1 +##fileDate={FILE_DATE} +##source=GenerateSVCandidates 1.6.0 +##reference=file:///{REFERENCE_FASTA} +##contig= +##contig= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FORMAT= +##FORMAT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##cmdline={CMDLINE} +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT {NORMAL_SAMPLE} {TUMOR_SAMPLE} diff --git a/tests/fixtures/mini_bank.yaml b/tests/fixtures/mini_bank.yaml new file mode 100644 index 0000000..8825200 --- /dev/null +++ b/tests/fixtures/mini_bank.yaml @@ -0,0 +1,24 @@ +name: mini_bank +genome: hg38 + +templates: + - svtype: DEL + svlen: [100, 1000] + homlen: [0, 5] + weight: 2.0 + - svtype: DUP + svlen: [500, 5000] + homlen: [0, 10] + weight: 1.0 + - svtype: INV + svlen: [1000, 10000] + homlen: [0, 15] + weight: 1.0 + - svtype: INS + svlen: [50, 500] + homlen: [0, 5] + weight: 1.0 + - svtype: BND + svlen: [1, 1] + homlen: [0, 5] + weight: 1.0 diff --git a/tests/fixtures/mini_blacklist.bed b/tests/fixtures/mini_blacklist.bed new file mode 100644 index 0000000..c0299e5 --- /dev/null +++ b/tests/fixtures/mini_blacklist.bed @@ -0,0 +1,11 @@ +# Hand-crafted mini blacklist for testing, BED format (0-based half-open) +chr1 1000000 1100000 +chr1 50000000 50010000 +chr2 2000000 2050000 +chr3 3000000 3100000 +chr4 4000000 4020000 +chr5 5000000 5050000 +chr10 10000000 10100000 +chr17 17000000 17200000 +chrX 30000000 30100000 +chrY 5000000 5010000 diff --git a/tests/test_bank.py b/tests/test_bank.py new file mode 100644 index 0000000..cd442ad --- /dev/null +++ b/tests/test_bank.py @@ -0,0 +1,43 @@ +""" +Tests for bank loading and the built-in default bank +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from svforge.core.bank import list_builtin_banks, load_bank + + +def test_load_builtin_default_bank() -> None: + bank = load_bank("default_hg38") + assert bank.name == "default_hg38" + assert bank.genome == "hg38" + assert len(bank.templates) >= 15 + assert bank.svtypes() == {"DEL", "DUP", "INV", "INS", "BND"} + +def test_list_builtin_banks_contains_default() -> None: + assert "default_hg38" in list_builtin_banks() + +def test_load_mini_bank_from_path(mini_bank_path: Path) -> None: + bank = load_bank(mini_bank_path) + assert bank.name == "mini_bank" + assert len(bank.templates) == 5 + +def test_load_bank_missing() -> None: + with pytest.raises(FileNotFoundError): + load_bank("does_not_exist_anywhere") + +def test_invalid_bank_missing_templates(tmp_path: Path) -> None: + bad = tmp_path / "bad.yaml" + bad.write_text("name: bad\ngenome: hg38\ntemplates: []\n") + with pytest.raises(ValueError, match="templates"): + load_bank(bad) + +def test_invalid_genome(tmp_path: Path) -> None: + bad = tmp_path / "bad.yaml" + bad.write_text("name: bad\ngenome: t2t\ntemplates:\n - {svtype: DEL, svlen: [100, 200]}\n") + with pytest.raises(ValueError, match="genome"): + load_bank(bad) diff --git a/tests/test_chromosome_filter.py b/tests/test_chromosome_filter.py new file mode 100644 index 0000000..546b971 --- /dev/null +++ b/tests/test_chromosome_filter.py @@ -0,0 +1,219 @@ +""" +Chromosome-filter tests for ``--chromosomes`` on gen and gen-pair + +Requirements from the V1 robustness pass: + +1. ``--chromosomes chr1`` only produces records on ``chr1`` and no + cross-chromosome BND +2. ``--chromosomes chr8,chr14`` permits chr8<->chr14 BND events +3. ``--chromosomes 1,7`` is normalised to ``chr1,chr7`` +4. ``--chromosomes chrZZ`` raises a validation error with a non-zero exit code +5. A filter that empties the pool raises an explicit error +""" + +from __future__ import annotations + +import re +from pathlib import Path + +import pysam + +from svforge.cli import main +from svforge.core.genome import normalize_chromosomes + +FIXTURES = Path(__file__).parent / "fixtures" +BANK = FIXTURES / "mini_bank.yaml" + +def _records(vcf: Path) -> list[pysam.VariantRecord]: + with pysam.VariantFile(str(vcf)) as vf: + return list(vf) + +def test_single_chromosome_filters_all_records(tmp_path: Path) -> None: + out = tmp_path / "out.vcf" + rc = main( + [ + "gen", + "--caller", + "manta", + "--bank", + str(BANK), + "--out", + str(out), + "--n", + "20", + "--sample-name", + "S", + "--chromosomes", + "chr1", + "--seed", + "7", + "--svtypes", + "DEL,DUP,INV,INS", + ] + ) + assert rc == 0 + recs = _records(out) + assert recs + for rec in recs: + assert rec.chrom == "chr1" + +def test_two_chromosomes_permit_bnd_between_them(tmp_path: Path) -> None: + out = tmp_path / "out.vcf" + rc = main( + [ + "gen", + "--caller", + "delly", + "--bank", + str(BANK), + "--out", + str(out), + "--n", + "30", + "--sample-name", + "S", + "--chromosomes", + "chr8,chr14", + "--seed", + "11", + ] + ) + assert rc == 0 + recs = _records(out) + assert recs + allowed = {"chr8", "chr14"} + for rec in recs: + assert rec.chrom in allowed + if rec.info.get("SVTYPE") == "BND": + alt = str(rec.alts[0]) if rec.alts else "" + mate_match = re.search(r"[\[\]]([^:\[\]]+):\d+[\[\]]", alt) + if mate_match: + assert mate_match.group(1) in allowed + +def test_alias_digits_normalize_to_chr_prefix() -> None: + assert normalize_chromosomes(["1", "7"], "hg38") == ["chr1", "chr7"] + assert normalize_chromosomes(["X", "MT"], "hg38") == ["chrX", "chrM"] + assert normalize_chromosomes(["chr3"], "hg38") == ["chr3"] + +def test_cli_alias_digits_normalize(tmp_path: Path) -> None: + out = tmp_path / "out.vcf" + rc = main( + [ + "gen", + "--caller", + "manta", + "--bank", + str(BANK), + "--out", + str(out), + "--n", + "5", + "--sample-name", + "S", + "--chromosomes", + "1", + "--seed", + "0", + "--svtypes", + "DEL", + ] + ) + assert rc == 0 + for rec in _records(out): + assert rec.chrom == "chr1" + +def test_unknown_chromosome_exits_non_zero(tmp_path: Path) -> None: + out = tmp_path / "out.vcf" + rc = main( + [ + "gen", + "--caller", + "manta", + "--bank", + str(BANK), + "--out", + str(out), + "--n", + "1", + "--sample-name", + "S", + "--chromosomes", + "chrZZ", + "--seed", + "0", + ] + ) + assert rc != 0 + +def test_empty_pool_raises_explicit_error(tmp_path: Path) -> None: + """ + Filter requests a chromosome that no template in the bank covers + """ + constrained = tmp_path / "bank.yaml" + constrained.write_text( + "name: bounded\n" + "genome: hg38\n" + "templates:\n" + " - svtype: DEL\n" + " svlen: [100, 1000]\n" + " homlen: [0, 5]\n" + " weight: 1.0\n" + " chroms: [chr1, chr2]\n", + encoding="utf-8", + ) + out = tmp_path / "out.vcf" + rc = main( + [ + "gen", + "--caller", + "manta", + "--bank", + str(constrained), + "--out", + str(out), + "--n", + "5", + "--sample-name", + "S", + "--chromosomes", + "chr21,chr22", + "--seed", + "0", + ] + ) + assert rc != 0 + +def test_gen_pair_accepts_chromosome_filter(tmp_path: Path) -> None: + tumor = tmp_path / "t.vcf" + normal = tmp_path / "n.vcf" + rc = main( + [ + "gen-pair", + "--caller", + "manta", + "--bank", + str(BANK), + "--out-tumor", + str(tumor), + "--out-normal", + str(normal), + "--n-somatic", + "5", + "--n-germline", + "5", + "--tumor-sample-name", + "T", + "--normal-sample-name", + "N", + "--chromosomes", + "chr1,chr2", + "--seed", + "3", + "--svtypes", + "DEL,DUP", + ] + ) + assert rc == 0 + for path in (tumor, normal): + for rec in _records(path): + assert rec.chrom in {"chr1", "chr2"} diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..4ec21b3 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,139 @@ +""" +CLI integration tests -- each subcommand end-to-end +""" + +from __future__ import annotations + +from pathlib import Path + +import pysam + +from svforge.cli import main + + +def test_cli_callers(capsys) -> None: # type: ignore[no-untyped-def] + rc = main(["callers"]) + assert rc == 0 + out = capsys.readouterr().out.splitlines() + assert "manta" in out + assert "delly" in out + +def test_cli_bank_list(capsys) -> None: # type: ignore[no-untyped-def] + rc = main(["bank", "list"]) + assert rc == 0 + assert "default_hg38" in capsys.readouterr().out + +def test_cli_bank_show(capsys) -> None: # type: ignore[no-untyped-def] + rc = main(["bank", "show", "default_hg38"]) + assert rc == 0 + out = capsys.readouterr().out + assert "name: default_hg38" in out + assert "templates:" in out + +def test_cli_gen_manta_vcf_gz(tmp_path: Path, mini_bank_path: Path) -> None: + out = tmp_path / "out.vcf.gz" + rc = main( + [ + "gen", + "--caller", + "manta", + "--out", + str(out), + "--n", + "15", + "--sample-name", + "T01", + "--bank", + str(mini_bank_path), + "--seed", + "42", + ] + ) + assert rc == 0 + assert out.exists() + with pysam.VariantFile(str(out)) as vf: + assert sum(1 for _ in vf) > 0 + +def test_cli_gen_pair(tmp_path: Path, mini_bank_path: Path) -> None: + tumor = tmp_path / "tumor.vcf.gz" + normal = tmp_path / "normal.vcf.gz" + rc = main( + [ + "gen-pair", + "--caller", + "delly", + "--out-tumor", + str(tumor), + "--out-normal", + str(normal), + "--n-somatic", + "5", + "--n-germline", + "6", + "--tumor-sample-name", + "T01", + "--normal-sample-name", + "N01", + "--bank", + str(mini_bank_path), + "--seed", + "99", + "--svtypes", + "DEL,DUP,INV", + ] + ) + assert rc == 0 + with pysam.VariantFile(str(tumor)) as vf: + tumor_ids = {r.id for r in vf if r.id} + with pysam.VariantFile(str(normal)) as vf: + normal_ids = {r.id for r in vf if r.id} + assert normal_ids.issubset(tumor_ids) + assert len(tumor_ids - normal_ids) >= 5 + +def test_cli_validate_pass( + tmp_path: Path, + mini_bank_path: Path, +) -> None: + gen_out = tmp_path / "gen.vcf.gz" + rc = main( + [ + "gen", + "--caller", + "manta", + "--out", + str(gen_out), + "--n", + "40", + "--sample-name", + "T01", + "--bank", + str(mini_bank_path), + "--seed", + "7", + "--svtypes", + "DEL,DUP,INV", + "--blacklist-fraction", + "0.10", + "--gnomad-fraction", + "0.20", + ] + ) + assert rc == 0 + + report = tmp_path / "report.tsv" + rc = main( + [ + "validate", + "--vcf", + str(gen_out), + "--report-tsv", + str(report), + ] + ) + assert rc == 0 + content = report.read_text() + assert "metric" in content + assert "gnomad_self_consistency\t" in content + assert "blacklist_self_consistency\t" in content + assert content.count("PASS") == 2 + assert "FAIL" not in content diff --git a/tests/test_delly_writer.py b/tests/test_delly_writer.py new file mode 100644 index 0000000..a75e81c --- /dev/null +++ b/tests/test_delly_writer.py @@ -0,0 +1,83 @@ +""" +Tests for the DELLY writer with pysam round-trip validation +""" + +from __future__ import annotations + +from pathlib import Path + +import pysam + +from svforge.core.models import SV +from svforge.io.vcf_writer import write_vcf +from svforge.writers import get_writer + + +def _svs() -> list[SV]: + return [ + SV(id="del1", svtype="DEL", chrom="chr1", pos=100_000, end=101_000, svlen=1_000), + SV(id="dup1", svtype="DUP", chrom="chr2", pos=200_000, end=205_000, svlen=5_000), + SV( + id="inv1", + svtype="INV", + chrom="chr3", + pos=300_000, + end=310_000, + svlen=10_000, + strands="++", + ), + SV(id="ins1", svtype="INS", chrom="chr4", pos=400_000, end=400_000, svlen=200), + SV( + id="bnd1", + svtype="BND", + chrom="chr5", + pos=500_000, + end=500_000, + svlen=1, + mate_chrom="chr7", + mate_pos=700_000, + strands="+-", + ), + ] + +def test_delly_one_record_per_event(tmp_path: Path) -> None: + writer = get_writer("delly") + svs = _svs() + header = writer.header_lines("SAMP01") + records = writer.format_records(svs, "SAMP01") + assert len(records) == len(svs) + + out = tmp_path / "out.vcf" + write_vcf(out, header, records) + with pysam.VariantFile(str(out)) as vf: + got = list(vf) + assert len(got) == len(svs) + +def test_delly_bnd_has_chr2_pos2(tmp_path: Path) -> None: + writer = get_writer("delly") + sv = SV( + id="bnd1", + svtype="BND", + chrom="chr5", + pos=500_000, + end=500_000, + svlen=1, + mate_chrom="chr7", + mate_pos=700_000, + strands="+-", + ) + out = tmp_path / "d.vcf" + write_vcf(out, writer.header_lines("S"), writer.format_records([sv], "S")) + with pysam.VariantFile(str(out)) as vf: + rec = next(iter(vf)) + assert rec.info["CHR2"] == "chr7" + assert int(rec.info["POS2"]) == 700_000 + assert "CT" in rec.info + +def test_delly_roundtrip_bcf(tmp_path: Path) -> None: + writer = get_writer("delly") + svs = _svs() + out = tmp_path / "out.bcf" + write_vcf(out, writer.header_lines("S"), writer.format_records(svs, "S")) + with pysam.VariantFile(str(out)) as vf: + assert sum(1 for _ in vf) == len(svs) diff --git a/tests/test_genome_validation.py b/tests/test_genome_validation.py new file mode 100644 index 0000000..f78b87c --- /dev/null +++ b/tests/test_genome_validation.py @@ -0,0 +1,98 @@ +""" +Validate the ``--genome`` CLI option + +V1 supports hg38 only. Any other value must exit non-zero with a clear +message that points the user at ``--header-template`` for self-service +override +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from svforge.cli import main +from svforge.core.genome import validate_genome + +BANK = Path(__file__).parent / "fixtures" / "mini_bank.yaml" + +def test_validate_genome_accepts_hg38() -> None: + assert validate_genome("hg38") == "hg38" + +@pytest.mark.parametrize("bad", ["hg19", "t2t", "chm13", "invalid"]) +def test_validate_genome_rejects_everything_else(bad: str) -> None: + with pytest.raises(ValueError, match="No bundled header template"): + validate_genome(bad) + +def test_validate_genome_error_points_to_header_template() -> None: + with pytest.raises(ValueError, match="--header-template PATH"): + validate_genome("hg19") + +def test_cli_gen_with_hg38_succeeds(tmp_path: Path) -> None: + rc = main( + [ + "gen", + "--caller", + "manta", + "--bank", + str(BANK), + "--out", + str(tmp_path / "out.vcf"), + "--n", + "2", + "--sample-name", + "S", + "--genome", + "hg38", + "--seed", + "1", + "--svtypes", + "DEL", + ] + ) + assert rc == 0 + +def test_cli_gen_with_hg19_exits_non_zero(tmp_path: Path) -> None: + rc = main( + [ + "gen", + "--caller", + "manta", + "--bank", + str(BANK), + "--out", + str(tmp_path / "out.vcf"), + "--n", + "1", + "--sample-name", + "S", + "--genome", + "hg19", + "--seed", + "0", + ] + ) + assert rc != 0 + +def test_cli_gen_with_invalid_genome_exits_non_zero(tmp_path: Path) -> None: + rc = main( + [ + "gen", + "--caller", + "manta", + "--bank", + str(BANK), + "--out", + str(tmp_path / "out.vcf"), + "--n", + "1", + "--sample-name", + "S", + "--genome", + "does_not_exist", + "--seed", + "0", + ] + ) + assert rc != 0 diff --git a/tests/test_header_fidelity.py b/tests/test_header_fidelity.py new file mode 100644 index 0000000..7b43003 --- /dev/null +++ b/tests/test_header_fidelity.py @@ -0,0 +1,129 @@ +""" +Header-fidelity test: svforge output must match the real-caller template + +The test generates a paired tumor/normal header for each writer with zero +records and compares it line-by-line to the reference template. Only five +categories of lines are ignored (they are either dynamic or deliberately +different): + +- ``##svforge*`` (not present in the reference) +- ``##source`` (delibarately different from real output) +- ``##fileDate`` (dynamic) +- ``##cmdline`` (dynamic) +- ``##reference`` (dynamic: substituted with a synthetic placeholder) + +Any other divergence means the writer drifted from the real caller and +must be fixed before release +""" + +from __future__ import annotations + +from importlib import resources +from pathlib import Path + +import pytest + +from svforge.core.provenance import build_svforge_tags +from svforge.io.vcf_writer import write_vcf +from svforge.writers import get_writer +from svforge.writers.base import CallerWriter + + +def _load_template(name: str) -> list[str]: + return ( + resources.files("svforge.data.headers") + .joinpath(name) + .read_text(encoding="utf-8") + .splitlines() + ) + +def _filter_ignored(lines: list[str]) -> list[str]: + return [ + line + for line in lines + if not line.startswith("##svforge") + and not line.startswith("##source") + and not line.startswith("##fileDate") + and not line.startswith("##cmdline") + and not line.startswith("##reference") + ] + +def _substitute_samples(lines: list[str], tumor: str, normal: str) -> list[str]: + out: list[str] = [] + for line in lines: + if line.startswith("#CHROM"): + out.append(line.replace("{TUMOR_SAMPLE}", tumor).replace("{NORMAL_SAMPLE}", normal)) + else: + out.append(line) + return out + +@pytest.mark.parametrize( + ("caller", "template_name"), + [ + ("manta", "manta_somatic_hg38.vcf.template"), + ("delly", "delly_somatic_hg38.vcf.template"), + ], +) +def test_paired_header_matches_template(caller: str, template_name: str) -> None: + writer: CallerWriter = get_writer(caller) + tumor, normal = "TUMOR01", "NORMAL01" + tags = build_svforge_tags(caller=caller, seed=42, argv=["svforge", "gen-pair"]) + + generated = writer.header_lines_paired(tumor, normal, provenance_tags=tags) + template = _substitute_samples(_load_template(template_name), tumor, normal) + + assert _filter_ignored(generated) == _filter_ignored(template) + +@pytest.mark.parametrize("caller", ["manta", "delly"]) +def test_svforge_tags_are_injected_right_after_fileformat(caller: str) -> None: + writer: CallerWriter = get_writer(caller) + tags = build_svforge_tags(caller=caller, seed=0, argv=["svforge"]) + lines = writer.header_lines_paired("T", "N", provenance_tags=tags) + + assert lines[0].startswith("##fileformat=") + for idx, tag in enumerate(tags, start=1): + assert lines[idx] == tag + +def test_templates_contain_no_sensitive_data() -> None: + """ + Guard the committed templates against leaks of internal run metadata + """ + for name in ( + "manta_somatic_hg38.vcf.template", + "delly_somatic_hg38.vcf.template", + ): + text = resources.files("svforge.data.headers").joinpath(name).read_text(encoding="utf-8") + for bad in ("P69", "pmichonn", "beegfs", "/mnt/"): + assert bad not in text, f"{bad!r} leaked into {name}" + +@pytest.mark.parametrize("caller", ["manta", "delly"]) +def test_single_sample_header_keeps_one_sample_column(caller: str) -> None: + writer: CallerWriter = get_writer(caller) + lines = writer.header_lines("ONLY") + chrom = next(line for line in lines if line.startswith("#CHROM")) + assert chrom.split("\t")[-1] == "ONLY" + assert chrom.count("\t") == 9 # 9 fixed + 1 sample = 10 fields + +def test_hg19_has_no_bundled_template() -> None: + writer = get_writer("manta") + with pytest.raises(ValueError, match="No bundled header template"): + writer.header_lines("S", genome="hg19") + +def test_no_pass_filter_in_manta_output(tmp_path: Path) -> None: + """ + Manta template does not declare ``##FILTER=`` so the + generated plain VCF must not either (the real Manta somatic header + doesn't carry it; pysam was previously injecting it, see B1) + """ + writer = get_writer("manta") + tags = build_svforge_tags(caller="manta", seed=0, argv=["svforge", "gen"]) + header = writer.header_lines("S", provenance_tags=tags) + + out = tmp_path / "manta_header_only.vcf" + write_vcf(out, header, []) + + text = out.read_text(encoding="utf-8") + assert "##FILTER=; Manta's somatic " + "template doesn't declare it" + ) diff --git a/tests/test_header_template_override.py b/tests/test_header_template_override.py new file mode 100644 index 0000000..f6b3407 --- /dev/null +++ b/tests/test_header_template_override.py @@ -0,0 +1,111 @@ +""" +User-supplied header template override: ``--header-template PATH`` + +1. A valid template with all required placeholders is used as-is +2. A template missing a required placeholder raises an explicit error +3. Omitting ``--header-template`` keeps the bundled template active +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from svforge.cli import main +from svforge.writers import get_writer + +FIXTURES = Path(__file__).parent / "fixtures" +BANK = FIXTURES / "mini_bank.yaml" +CUSTOM = FIXTURES / "custom_manta.template" +BROKEN = FIXTURES / "broken_manta.template" + +def test_override_template_is_used(tmp_path: Path) -> None: + out = tmp_path / "out.vcf" + rc = main( + [ + "gen", + "--caller", + "manta", + "--bank", + str(BANK), + "--out", + str(out), + "--n", + "1", + "--sample-name", + "S", + "--seed", + "0", + "--svtypes", + "DEL", + "--chromosomes", + "chr1", + "--header-template", + str(CUSTOM), + ] + ) + assert rc == 0 + text = out.read_text(encoding="utf-8") + assert "CUSTOM_SVFORGE_TAG" in text, "custom template marker not propagated" + +def test_broken_template_raises_missing_placeholder(tmp_path: Path) -> None: + rc = main( + [ + "gen", + "--caller", + "manta", + "--bank", + str(BANK), + "--out", + str(tmp_path / "out.vcf"), + "--n", + "1", + "--sample-name", + "S", + "--seed", + "0", + "--header-template", + str(BROKEN), + ] + ) + assert rc != 0 + +def test_missing_placeholder_raised_via_writer_api() -> None: + writer = get_writer("manta") + with pytest.raises(ValueError, match=r"missing required placeholders.*\{REFERENCE_FASTA\}"): + writer.header_lines("S", template_override=BROKEN) + +def test_bundled_template_used_when_override_absent(tmp_path: Path) -> None: + out = tmp_path / "out.vcf" + rc = main( + [ + "gen", + "--caller", + "manta", + "--bank", + str(BANK), + "--out", + str(out), + "--n", + "1", + "--sample-name", + "S", + "--seed", + "0", + "--svtypes", + "DEL", + "--chromosomes", + "chr1", + ] + ) + assert rc == 0 + text = out.read_text(encoding="utf-8") + assert "CUSTOM_SVFORGE_TAG" not in text + # the bundled Manta template advertises GenerateSVCandidates + assert "GenerateSVCandidates" in text + +def test_nonexistent_template_raises(tmp_path: Path) -> None: + writer = get_writer("manta") + with pytest.raises(FileNotFoundError): + writer.header_lines("S", template_override=tmp_path / "nope.template") diff --git a/tests/test_injection_catalogs.py b/tests/test_injection_catalogs.py new file mode 100644 index 0000000..98cea1d --- /dev/null +++ b/tests/test_injection_catalogs.py @@ -0,0 +1,47 @@ +""" +Tests for the bundled mini injection catalogs +""" + +from __future__ import annotations + +from svforge.core.genome import HG38_CONTIGS +from svforge.core.injection_catalogs import load_blacklist_catalog, load_gnomad_catalog + + +def test_gnomad_catalog_parses_with_min_entries() -> None: + catalog = load_gnomad_catalog() + assert len(catalog) >= 50 + +def test_blacklist_catalog_parses_with_min_entries() -> None: + catalog = load_blacklist_catalog() + assert len(catalog) >= 50 + +def test_gnomad_chroms_and_positions_within_contigs() -> None: + for entry in load_gnomad_catalog(): + assert entry.chrom in HG38_CONTIGS, entry + assert entry.end_chrom in HG38_CONTIGS, entry + assert 1 <= entry.pos <= HG38_CONTIGS[entry.chrom] + assert 1 <= entry.end <= HG38_CONTIGS[entry.end_chrom] + +def test_blacklist_chroms_and_positions_within_contigs() -> None: + for entry in load_blacklist_catalog(): + assert entry.chrom in HG38_CONTIGS, entry + assert 1 <= entry.pos <= HG38_CONTIGS[entry.chrom] + assert 1 <= entry.end <= HG38_CONTIGS[entry.chrom] + assert entry.end >= entry.pos + +def test_gnomad_has_four_or_more_svtypes() -> None: + svtypes = {entry.svtype for entry in load_gnomad_catalog()} + assert len(svtypes) >= 4, svtypes + +def test_catalogs_cover_at_least_10_chroms_combined() -> None: + chroms = {e.chrom for e in load_gnomad_catalog()} | {e.chrom for e in load_blacklist_catalog()} + assert len(chroms) >= 10, chroms + +def test_gnomad_source_ids_unique() -> None: + ids = [e.source_id for e in load_gnomad_catalog()] + assert len(ids) == len(set(ids)) + +def test_blacklist_source_ids_unique() -> None: + ids = [e.source_id for e in load_blacklist_catalog()] + assert len(ids) == len(set(ids)) diff --git a/tests/test_injection_sampling.py b/tests/test_injection_sampling.py new file mode 100644 index 0000000..30540f3 --- /dev/null +++ b/tests/test_injection_sampling.py @@ -0,0 +1,176 @@ +""" +Injection sampling: SVFORGE_SOURCE counts + exact catalog match +""" + +from __future__ import annotations + +from pathlib import Path + +import pysam +import pytest + +from svforge.cli import main +from svforge.core.bank import Bank +from svforge.core.injection_catalogs import load_blacklist_catalog, load_gnomad_catalog +from svforge.core.sampler import SamplerConfig, sample + + +def _record_coords(rec: pysam.VariantRecord) -> tuple[str, int, int]: + svtype = rec.info.get("SVTYPE") + if isinstance(svtype, tuple): + svtype = svtype[0] if svtype else None + pos = int(rec.pos) + if str(svtype) == "INS": + svlen = rec.info.get("SVLEN") + if isinstance(svlen, tuple): + svlen = svlen[0] if svlen else None + if svlen is not None: + return (str(rec.chrom), pos, pos + abs(int(svlen))) + return (str(rec.chrom), pos, int(rec.stop)) + +def _count_sources(vcf_path: Path) -> dict[str, int]: + counts: dict[str, int] = {"bank": 0, "gnomad": 0, "blacklist": 0} + seen_events: set[str] = set() + with pysam.VariantFile(str(vcf_path)) as vf: + for rec in vf: + event = rec.info.get("EVENT") + if isinstance(event, tuple): + event = event[0] if event else None + key = str(event) if event else f"{rec.chrom}:{rec.pos}:{rec.id}" + if event and event in seen_events: + continue + seen_events.add(key) + src = rec.info.get("SVFORGE_SOURCE") + if isinstance(src, tuple): + src = src[0] if src else None + if src: + counts[str(src)] = counts.get(str(src), 0) + 1 + return counts + +def test_gnomad_fraction_produces_matching_records(tmp_path: Path, mini_bank_path: Path) -> None: + out = tmp_path / "g.vcf" + rc = main( + [ + "gen", + "--caller", + "manta", + "--out", + str(out), + "--n", + "50", + "--sample-name", + "T", + "--bank", + str(mini_bank_path), + "--seed", + "42", + "--svtypes", + "DEL,DUP,INV,INS", + "--gnomad-fraction", + "0.20", + ] + ) + assert rc == 0 + counts = _count_sources(out) + assert counts["gnomad"] == 10 + + gnomad_coords = {(e.chrom, e.pos, e.end) for e in load_gnomad_catalog()} + with pysam.VariantFile(str(out)) as vf: + for rec in vf: + src = rec.info.get("SVFORGE_SOURCE") + if isinstance(src, tuple): + src = src[0] if src else None + if str(src) != "gnomad": + continue + coord = _record_coords(rec) + assert coord in gnomad_coords, coord + +def test_blacklist_fraction_produces_matching_records(tmp_path: Path, mini_bank_path: Path) -> None: + out = tmp_path / "b.vcf" + rc = main( + [ + "gen", + "--caller", + "manta", + "--out", + str(out), + "--n", + "50", + "--sample-name", + "T", + "--bank", + str(mini_bank_path), + "--seed", + "99", + "--svtypes", + "DEL,DUP,INV", + "--blacklist-fraction", + "0.20", + ] + ) + assert rc == 0 + counts = _count_sources(out) + assert counts["blacklist"] == 10 + + bl_coords = {(e.chrom, e.pos, e.end) for e in load_blacklist_catalog()} + with pysam.VariantFile(str(out)) as vf: + for rec in vf: + src = rec.info.get("SVFORGE_SOURCE") + if isinstance(src, tuple): + src = src[0] if src else None + if str(src) != "blacklist": + continue + coord = _record_coords(rec) + assert coord in bl_coords, coord + +def test_combined_fractions(tmp_path: Path, mini_bank_path: Path) -> None: + out = tmp_path / "c.vcf" + rc = main( + [ + "gen", + "--caller", + "manta", + "--out", + str(out), + "--n", + "100", + "--sample-name", + "T", + "--bank", + str(mini_bank_path), + "--seed", + "1", + "--svtypes", + "DEL,DUP,INV", + "--gnomad-fraction", + "0.20", + "--blacklist-fraction", + "0.10", + ] + ) + assert rc == 0 + counts = _count_sources(out) + assert counts["gnomad"] == 20 + assert counts["blacklist"] == 10 + assert counts["bank"] >= 70 + +def test_sum_of_fractions_over_one_is_rejected(mini_bank: Bank) -> None: + with pytest.raises(ValueError, match=r"<= 1\.0"): + sample( + mini_bank, + SamplerConfig(n=10, seed=0, gnomad_fraction=0.6, blacklist_fraction=0.5), + ) + +def test_sampling_without_replacement_when_possible(mini_bank: Bank) -> None: + svs = sample( + mini_bank, + SamplerConfig( + n=50, + seed=3, + svtypes=frozenset({"DEL", "DUP", "INV", "INS"}), + gnomad_fraction=0.20, + ), + ) + gnomad_ids = [sv.info_extra.get("SVFORGE_SOURCE_ID") for sv in svs if sv.source == "gnomad"] + assert len(gnomad_ids) == 10 + assert len(set(gnomad_ids)) == 10 diff --git a/tests/test_manta_writer.py b/tests/test_manta_writer.py new file mode 100644 index 0000000..ac065eb --- /dev/null +++ b/tests/test_manta_writer.py @@ -0,0 +1,116 @@ +""" +Tests for the Manta writer with pysam round-trip validation +""" + +from __future__ import annotations + +from pathlib import Path + +import pysam +import pytest + +from svforge.core.models import SV +from svforge.io.vcf_writer import write_vcf +from svforge.writers import get_writer + + +def _svs() -> list[SV]: + return [ + SV(id="del1", svtype="DEL", chrom="chr1", pos=100_000, end=101_000, svlen=1_000), + SV(id="dup1", svtype="DUP", chrom="chr2", pos=200_000, end=205_000, svlen=5_000), + SV( + id="inv1", + svtype="INV", + chrom="chr3", + pos=300_000, + end=310_000, + svlen=10_000, + strands="++", + ), + SV( + id="ins1", + svtype="INS", + chrom="chr4", + pos=400_000, + end=400_000, + svlen=200, + ins_seq="ACGT", + ), + SV( + id="bnd1", + svtype="BND", + chrom="chr5", + pos=500_000, + end=500_000, + svlen=1, + mate_chrom="chr7", + mate_pos=700_000, + strands="+-", + ), + ] + +def test_manta_header_and_records_parse(tmp_path: Path) -> None: + writer = get_writer("manta") + svs = _svs() + header = writer.header_lines("TUMOR01") + records = writer.format_records(svs, "TUMOR01") + + out = tmp_path / "out.vcf" + write_vcf(out, header, records) + + with pysam.VariantFile(str(out)) as vf: + got = list(vf) + # BND emits two records (mate pair); other svtypes emit one each + assert len(got) == len(svs) + 1 + + for rec in got: + assert "SVTYPE" in rec.info + +def test_manta_bnd_mates_have_mateid() -> None: + writer = get_writer("manta") + svs = [ + SV( + id="bnd1", + svtype="BND", + chrom="chr5", + pos=500_000, + end=500_000, + svlen=1, + mate_chrom="chr7", + mate_pos=700_000, + strands="+-", + ) + ] + records = writer.format_records(svs, "S") + assert len(records) == 2 + assert "MATEID=bnd1_2" in records[0] + assert "MATEID=bnd1_1" in records[1] + +def test_manta_writes_vcf_gz_and_bcf(tmp_path: Path) -> None: + writer = get_writer("manta") + svs = _svs() + header = writer.header_lines("S") + records = writer.format_records(svs, "S") + + for suffix in (".vcf", ".vcf.gz", ".bcf"): + out = tmp_path / f"out{suffix}" + write_vcf(out, header, records) + with pysam.VariantFile(str(out)) as vf: + assert sum(1 for _ in vf) == len(svs) + 1 + +def test_manta_rejects_unknown_svtype() -> None: + writer = get_writer("manta") + bad = [ + SV( + id="b1", + svtype="DEL", + chrom="chr1", + pos=1_000, + end=2_000, + svlen=1_000, + ) + ] + # sanity: valid svtypes are accepted; CallerWriter validates itself + writer.format_records(bad, "S") + with pytest.raises(ValueError, match="svtype"): + SV(id="b2", svtype="XYZ", chrom="chr1", pos=1, end=2, svlen=1) diff --git a/tests/test_models.py b/tests/test_models.py new file mode 100644 index 0000000..c8db52c --- /dev/null +++ b/tests/test_models.py @@ -0,0 +1,83 @@ +""" +Tests for core domain models +""" + +from __future__ import annotations + +import pytest + +from svforge.core.models import SV, Breakpoint, SVPair + + +class TestBreakpoint: + def test_valid(self) -> None: + bp = Breakpoint("chr1", 100, "+") + assert bp.chrom == "chr1" + assert bp.pos == 100 + assert bp.strand == "+" + + def test_invalid_pos(self) -> None: + with pytest.raises(ValueError, match="pos"): + Breakpoint("chr1", 0) + + def test_invalid_strand(self) -> None: + with pytest.raises(ValueError, match="strand"): + Breakpoint("chr1", 100, "x") #type : ignore[arg-type] + +class TestSV: + def test_valid_deletion(self) -> None: + sv = SV(id="x", svtype="DEL", chrom="chr1", pos=100, end=200, svlen=100) + assert sv.svlen == 100 + assert sv.breakpoint1 == Breakpoint("chr1", 100, "+") + assert sv.breakpoint2 == Breakpoint("chr1", 200, "-") + + def test_valid_bnd(self) -> None: + sv = SV( + id="x", + svtype="BND", + chrom="chr1", + pos=100, + end=100, + svlen=1, + mate_chrom="chr2", + mate_pos=200, + strands="+-", + ) + assert sv.breakpoint2.chrom == "chr2" + assert sv.breakpoint2.pos == 200 + + def test_unknown_svtype(self) -> None: + with pytest.raises(ValueError, match="svtype"): + SV(id="x", svtype="WAT", chrom="chr1", pos=1, end=2, svlen=1) + + def test_bnd_requires_mate(self) -> None: + with pytest.raises(ValueError, match="mate"): + SV(id="x", svtype="BND", chrom="chr1", pos=1, end=1, svlen=1) + + def test_end_before_pos(self) -> None: + with pytest.raises(ValueError, match="end"): + SV(id="x", svtype="DEL", chrom="chr1", pos=200, end=100, svlen=1) + + def test_invalid_vaf(self) -> None: + with pytest.raises(ValueError, match="VAF"): + SV(id="x", svtype="DEL", chrom="chr1", pos=1, end=2, svlen=1, vaf=1.5) + + def test_invalid_strands(self) -> None: + with pytest.raises(ValueError, match="strands"): + SV(id="x", svtype="DEL", chrom="chr1", pos=1, end=2, svlen=1, strands="xy") + +class TestSVPair: + def _mk(self, sv_id: str, pos: int) -> SV: + return SV(id=sv_id, svtype="DEL", chrom="chr1", pos=pos, end=pos + 50, svlen=50) + + def test_somatic_and_germline_partition(self) -> None: + germ = self._mk("g1", 1_000) + som = self._mk("s1", 2_000) + pair = SVPair(tumor=[germ, som], normal=[germ]) + assert pair.germline_ids == {"g1"} + assert pair.somatic_ids == {"s1"} + + def test_empty(self) -> None: + pair = SVPair() + assert pair.somatic_ids == set() + assert pair.germline_ids == set() diff --git a/tests/test_pair_mode.py b/tests/test_pair_mode.py new file mode 100644 index 0000000..33d9ed5 --- /dev/null +++ b/tests/test_pair_mode.py @@ -0,0 +1,53 @@ +""" +End-to-end tests for the pair (tumor + normal) mode +""" + +from __future__ import annotations + +from pathlib import Path + +import pysam + +from svforge.core.bank import Bank +from svforge.core.sampler import SamplerConfig, sample_pair +from svforge.io.vcf_writer import write_vcf +from svforge.writers import get_writer + + +def test_sample_pair_writes_two_consistent_vcfs(tmp_path: Path, mini_bank: Bank) -> None: + pair = sample_pair( + mini_bank, + n_somatic=8, + n_germline=10, + cfg=SamplerConfig( + n=0, + seed=13, + svtypes=frozenset({"DEL", "DUP", "INV"}), + vaf_range=(0.3, 0.9), + ), + ) + + writer = get_writer("manta") + tumor_out = tmp_path / "tumor.vcf.gz" + normal_out = tmp_path / "normal.vcf.gz" + + write_vcf( + tumor_out, + writer.header_lines("TUMOR01"), + writer.format_records(pair.tumor, "TUMOR01"), + ) + write_vcf( + normal_out, + writer.header_lines("NORMAL01"), + writer.format_records(pair.normal, "NORMAL01"), + ) + + with pysam.VariantFile(str(tumor_out)) as vf: + tumor_ids = {rec.id for rec in vf if rec.id} + with pysam.VariantFile(str(normal_out)) as vf: + normal_ids = {rec.id for rec in vf if rec.id} + + assert normal_ids.issubset(tumor_ids) + somatic_ids = tumor_ids - normal_ids + assert len(somatic_ids) >= 8 + assert len(normal_ids) >= 10 diff --git a/tests/test_provenance.py b/tests/test_provenance.py new file mode 100644 index 0000000..5fb4811 --- /dev/null +++ b/tests/test_provenance.py @@ -0,0 +1,154 @@ +""" +Provenance tests: every VCF produced by svforge must carry the six +``##svforge*`` header tags, in the spec order, right after ``##fileformat``. +The warning tag must be the exact invariant string +``SYNTHETIC_DATA_DO_NOT_USE_FOR_CLINICAL_DIAGNOSIS`` and must not be +removable via CLI or otherwise. ``sanitize_command`` must strip absolute +paths to their basename on POSIX, macOS and Windows-style inputs +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from svforge import __version__ +from svforge.cli import main +from svforge.core.provenance import ( + WARNING_VALUE, + build_svforge_tags, + sanitize_command, +) +from svforge.io.vcf_writer import write_vcf +from svforge.writers import get_writer + +_EXPECTED_KEYS = ( + "##svforgeVersion", + "##svforgeCommand", + "##svforgeSeed", + "##svforgeCaller", + "##svforgeWarning", + "##svforgeDocumentation", +) + +def test_build_tags_returns_six_tags_in_order() -> None: + tags = build_svforge_tags(caller="manta", seed=42, argv=["svforge", "gen"]) + assert len(tags) == 6 + keys = [t.split("=", 1)[0] for t in tags] + assert tuple(keys) == _EXPECTED_KEYS + +def test_warning_tag_value_is_the_invariant_string() -> None: + tags = build_svforge_tags(caller="delly", seed=0, argv=["svforge"]) + warning = next(t for t in tags if t.startswith("##svforgeWarning=")) + assert warning == f"##svforgeWarning={WARNING_VALUE}" + assert WARNING_VALUE == "SYNTHETIC_DATA_DO_NOT_USE_FOR_CLINICAL_DIAGNOSIS" + +def test_version_tag_matches_package_version() -> None: + tags = build_svforge_tags(caller="manta", seed=1, argv=["svforge"]) + assert f"##svforgeVersion={__version__}" in tags + +def test_caller_tag_reflects_caller_argument() -> None: + assert "##svforgeCaller=manta" in build_svforge_tags(caller="manta", seed=0, argv=["x"]) + assert "##svforgeCaller=delly" in build_svforge_tags(caller="delly", seed=0, argv=["x"]) + +@pytest.mark.parametrize( + ("argv", "expected"), + [ + ( + ["svforge", "gen", "--bank", "/mnt/beegfs/user/secret/bank.yaml"], + "svforge gen --bank bank.yaml", + ), + ( + ["svforge", "gen", "--bank=/home/alice/banks/my_bank.yaml"], + "svforge gen --bank=my_bank.yaml", + ), + ( + ["svforge", "--out", "C:\\Users\\bob\\run\\out.vcf"], + "svforge --out out.vcf", + ), + ( + ["svforge", "--seed", "42"], + "svforge --seed 42", + ), + ( + ["/Users/name with space/.venv/bin/svforge", "gen"], + "svforge gen", + ), + ( + ["svforge", "--bank=/home/alice/my banks/bank.yaml"], + "svforge --bank=bank.yaml", + ), + ( + ["svforge", "--out", "C:\\Users\\Name With Space\\svforge.exe"], + "svforge --out svforge.exe", + ), + ( + ["svforge", "--out", "C:/Users/Name With Space/out.vcf"], + "svforge --out out.vcf", + ), + ], +) +def test_sanitize_command_strips_absolute_paths(argv: list[str], expected: str) -> None: + assert sanitize_command(argv) == expected + +def test_sanitize_command_preserves_relative_paths() -> None: + argv = ["svforge", "gen", "--bank", "banks/local.yaml", "--out", "./out.vcf"] + assert sanitize_command(argv) == "svforge gen --bank banks/local.yaml --out ./out.vcf" + +@pytest.mark.parametrize("caller", ["manta", "delly"]) +def test_header_lines_inject_tags_in_order_after_fileformat(caller: str) -> None: + writer = get_writer(caller) + tags = build_svforge_tags(caller=caller, seed=123, argv=["svforge", "gen"]) + header = writer.header_lines("S1", provenance_tags=tags) + + assert header[0].startswith("##fileformat=") + injected = header[1 : 1 + len(_EXPECTED_KEYS)] + assert [line.split("=", 1)[0] for line in injected] == list(_EXPECTED_KEYS) + +@pytest.mark.parametrize("caller", ["manta", "delly"]) +def test_generated_vcf_contains_all_six_tags(tmp_path: Path, caller: str) -> None: + writer = get_writer(caller) + tags = build_svforge_tags(caller=caller, seed=123, argv=["svforge", "gen"]) + header = writer.header_lines("S1", provenance_tags=tags) + + out = tmp_path / f"{caller}.vcf" + write_vcf(out, header, []) + + lines = out.read_text(encoding="utf-8").splitlines() + assert lines[0].startswith("##fileformat=") + injected = lines[1 : 1 + len(_EXPECTED_KEYS)] + assert [line.split("=", 1)[0] for line in injected] == list(_EXPECTED_KEYS), ( + f"six ##svforge* tags must appear on lines 2-7 of the generated {caller} VCF; " + f"found {injected!r}" + ) + +def test_cli_gen_injects_provenance_and_warning(tmp_path: Path) -> None: + """ + End-to-end: ``svforge gen`` writes a VCF with all six tags + """ + bank = Path(__file__).parent / "fixtures" / "mini_bank.yaml" + out = tmp_path / "out.vcf" + rc = main( + [ + "gen", + "--caller", + "manta", + "--bank", + str(bank), + "--out", + str(out), + "--n", + "3", + "--sample-name", + "S1", + "--seed", + "42", + ] + ) + assert rc == 0 + text = out.read_text(encoding="utf-8") + for key in _EXPECTED_KEYS: + assert key in text, f"missing {key}" + assert WARNING_VALUE in text + assert "##svforgeSeed=42" in text diff --git a/tests/test_regions.py b/tests/test_regions.py new file mode 100644 index 0000000..d688210 --- /dev/null +++ b/tests/test_regions.py @@ -0,0 +1,48 @@ +""" +Tests for BED region overlap index +""" + +from __future__ import annotations + +import gzip +from pathlib import Path + +from svforge.core.regions import RegionSet, load_bed + + +def test_overlap_point() -> None: + rs = RegionSet([("chr1", 100, 200)]) + assert rs.overlaps_point("chr1", 150) + assert not rs.overlaps_point("chr1", 250) + assert not rs.overlaps_point("chr2", 150) + +def test_overlap_vcf_coords() -> None: + rs = RegionSet([("chr1", 1000, 2000)]) + assert rs.overlaps_vcf("chr1", 1500, 1600) + assert rs.overlaps_vcf("chr1", 500, 1500) + assert not rs.overlaps_vcf("chr1", 2001, 3000) + +def test_overlap_bed_half_open() -> None: + rs = RegionSet([("chr1", 100, 200)]) + assert rs.overlaps_bed("chr1", 100, 101) + assert rs.overlaps_bed("chr1", 199, 200) + assert not rs.overlaps_bed("chr1", 200, 300) + +def test_empty_regionset() -> None: + rs = RegionSet() + assert len(rs) == 0 + assert not rs.overlaps_point("chr1", 1) + +def test_load_bed_plain(mini_blacklist_path: Path) -> None: + rs = load_bed(mini_blacklist_path) + assert len(rs) >= 8 + assert "chr1" in rs.chromosomes + assert rs.overlaps_bed("chr1", 1_050_000, 1_060_000) + +def test_load_bed_gzipped(tmp_path: Path, mini_blacklist_path: Path) -> None: + src = mini_blacklist_path.read_bytes() + gz = tmp_path / "bl.bed.gz" + gz.write_bytes(gzip.compress(src)) + rs = load_bed(gz) + assert len(rs) >= 8 + assert rs.overlaps_bed("chr2", 2_010_000, 2_020_000) diff --git a/tests/test_sampler.py b/tests/test_sampler.py new file mode 100644 index 0000000..f82e983 --- /dev/null +++ b/tests/test_sampler.py @@ -0,0 +1,71 @@ +""" +Tests for the sampler +""" + +from __future__ import annotations + +from svforge.core.bank import Bank +from svforge.core.sampler import SamplerConfig, sample, sample_pair + + +def test_sample_deterministic_with_seed(mini_bank: Bank) -> None: + cfg = SamplerConfig(n=30, seed=42) + a = sample(mini_bank, cfg) + b = sample(mini_bank, cfg) + assert [sv.id for sv in a] == [sv.id for sv in b] + assert [(sv.chrom, sv.pos, sv.svtype, sv.svlen) for sv in a] == [ + (sv.chrom, sv.pos, sv.svtype, sv.svlen) for sv in b + ] + +def test_sample_different_seeds_differ(mini_bank: Bank) -> None: + a = sample(mini_bank, SamplerConfig(n=20, seed=1)) + b = sample(mini_bank, SamplerConfig(n=20, seed=2)) + assert [sv.id for sv in a] != [sv.id for sv in b] + +def test_sample_count(mini_bank: Bank) -> None: + svs = sample(mini_bank, SamplerConfig(n=50, seed=0)) + assert len(svs) == 50 + +def test_sample_restricted_svtypes(mini_bank: Bank) -> None: + svs = sample(mini_bank, SamplerConfig(n=40, seed=0, svtypes=frozenset({"DEL"}))) + assert {sv.svtype for sv in svs} == {"DEL"} + +def test_sample_svlen_range_clamp(mini_bank: Bank) -> None: + svs = sample( + mini_bank, + SamplerConfig(n=40, seed=0, svtypes=frozenset({"DEL"}), svlen_range=(200, 300)), + ) + for sv in svs: + assert 200 <= sv.svlen <= 300 + +def test_sample_vaf_range(mini_bank: Bank) -> None: + svs = sample(mini_bank, SamplerConfig(n=30, seed=0, vaf_range=(0.2, 0.8))) + for sv in svs: + assert 0.2 <= sv.vaf <= 0.8 + +def test_sample_blacklist_injection(mini_bank: Bank) -> None: + cfg = SamplerConfig( + n=20, + seed=0, + svtypes=frozenset({"DEL", "DUP", "INV"}), + blacklist_fraction=0.4, + ) + svs = sample(mini_bank, cfg) + assert sum(1 for sv in svs if sv.source == "blacklist") == 8 + +def test_sample_pair_germline_shared(mini_bank: Bank) -> None: + pair = sample_pair( + mini_bank, + n_somatic=10, + n_germline=15, + cfg=SamplerConfig(n=0, seed=7, svtypes=frozenset({"DEL", "DUP", "INV"})), + ) + germline_ids = pair.germline_ids + somatic_ids = pair.somatic_ids + assert len(germline_ids) == 15 + assert len(somatic_ids) == 10 + assert germline_ids.isdisjoint(somatic_ids) + + normal_coords = {(sv.id, sv.chrom, sv.pos, sv.end) for sv in pair.normal} + tumor_germ = {(sv.id, sv.chrom, sv.pos, sv.end) for sv in pair.tumor if sv.id in germline_ids} + assert normal_coords == tumor_germ