From 197b977a03250c7776f6cfcd342c3b00a621a7f5 Mon Sep 17 00:00:00 2001 From: Rene Snajder Date: Wed, 19 May 2021 16:43:43 +0200 Subject: [PATCH] Versipy auto bump-up --- .travis.yml | 22 +-- README.md | 346 +++++++++++++++++++++++--------------------- meta.yaml | 74 +++++----- setup.py | 76 +++++----- versipy.yaml | 76 +++++----- versipy_history.txt | 33 +++-- 6 files changed, 320 insertions(+), 307 deletions(-) diff --git a/.travis.yml b/.travis.yml index 56e0a05..1bfb3f2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,11 +1,11 @@ -dist: xenial -language: python -python: 3.7 -branches: - only: - - main - -install: - - pip install meth5 - -script: true +dist: xenial +language: python +python: 3.7 +branches: + only: + - main + +install: + - pip install meth5 + +script: true diff --git a/README.md b/README.md index d2cff8f..ff410ec 100644 --- a/README.md +++ b/README.md @@ -1,167 +1,179 @@ -from re import match# MetH5Format 0.3.1 - -[![GitHub license](https://img.shields.io/github/license/snajder-r/meth5format.svg)](https://github.com/snajder-r/meth5format/blob/master/LICENSE) -[![DOI](https://zenodo.org/badge/303672813.svg)](https://zenodo.org/badge/latestdoi/303672813) -[![Language](https://img.shields.io/badge/Language-Python3.7+-yellow.svg)](https://www.python.org/) -[![Build Status](https://travis-ci.com/snajder-r/meth5format.svg?branch=main)](https://travis-ci.com/snajder-r/meth5format) -[![Code style: black](https://img.shields.io/badge/code%20style-black-black.svg?style=flat)](https://github.com/snajder-r/black "Black (modified)") - - -[![PyPI version](https://badge.fury.io/py/meth5.svg)](https://badge.fury.io/py/meth5) -[![PyPI downloads](https://pepy.tech/badge/meth5)](https://pepy.tech/project/meth5) -[![Anaconda Version](https://img.shields.io/conda/v/snajder-r/meth5?color=blue)](https://anaconda.org/snajder-r/meth5) -[![Anaconda Downloads](https://anaconda.org/snajder-r/meth5/badges/downloads.svg)](https://anaconda.org/snajder-r/meth5) - -MetH5 is an HDF5-based container format for methylation calls from long reads. - -In the current version, the MetH5 format can store the following information: -* Log-likelihood ratio of each methylation call -* Genomic coordinates (start and end) of each methylation call -* The read name associated with each call -* Read grouping (i.e. annotation such as samples or haplotypes) - -## Installation - -Through pip: - -``` -pip install meth5 -```` - -Through anaconda: - -``` -conda install -c snajder-r meth5 -``` - -## Usage - -### Creating a MetH5 file from nanopolish methylation calls - -Assuming you have nanopolish methylation calls with filenames `*.tsv`, you can create a MetH5 file with the following command: - -``` -meth5 create_h5 --input_dir INPUT_DIR/ --output_file OUTPUT_FILE.m5 -``` - -In order to annotate reads with read grouping (for example as samples or haplotypes) you can do so by running: - -``` -meth annotate_reads --m5file M5FILE.m5 --read_groups_key READ_GROUPS_KEY --read_group_file READ_GROUP_FILE -``` - -Where the `READ_GROUPS_KEY` is the key under which you want to store the annotation (you can store multiple read annotations), -and `READ_GROUP_FILE` is a tab-delimited file containg read name and read group. For example: - -``` -read_name group -7741f9ee-ad41-42a4-99b2-290c66960410 1 -4f18b48e-a1d3-49ad-ace3-cfb96b78ad79 2 -... -``` - -### Quick start for python API - -Here an example on how to access methylation values from a MetH5 file: - -```python -from meth5.meth5 import MetH5File - -with MetH5File(filename, mode="r") as m: - # List chromosomes in the MetH5 file - m.get_chromosomes() - - # Access chromosome 7 - chr7 = m["chr7"] - - # Get number of chunks - chr7.get_number_of_chunks() - - # Get a container that manages the values of chunk 3 - # (note that the data is not yet loaded into memory) - values = chr7.get_chunk(3) - - # Get the log-likelihood ratios in the container as a numpy array of shape (n,) - llrs = values.get_llrs() - - # Get the genomic start and end locations for each methylation call in the - # chunk as a numpy array of shape (n,2) - ranges = values.get_ranges() - - # Compute methylation rate (beta-score of methylation) for each genomic location, - # as well as the respective coordinates - met_rates, met_rate_ranges = values.get_llr_site_rate() - - # You can also compute other aggregates if you like - met_count, met_count_ranges = values.get_llr_site_aggregate(aggregation_fun=lambda llrs: (llrs>2).sum()) - - # Instead of accessing chunk wise, you can query a genomic range - values = chr7.get_values_in_range(36852906, 37449223) -``` - -A more detailed API documentation is in the works. Stay tuned! - -### Sparse methylation matrix - -In addition to accessing methylation calls in its unraveled form, the `meth5` library also contains a way to represent -the methylation calls as a sparse matrix. Seeing how the values are already stored in the MetH5 file in the same way a -coordinate sparse matrix would be stored in memory, this is a very cheap operation. Example: - -```python -from meth5.meth5 import MetH5File - -with MetH5File(filename, mode="r") as m: - values = m["chr7"].get_values_in_range(36852906, 37449223) - - # The parameter "read_read_names" allows is to choose whether we want to load the actual - # read names into memory. It's slightly more expensive than not reading it, so only load them - # if you are interested in them - matrix = values.to_sparse_methylation_matrix(read_read_names=True) - - # This is a scipy.sparse.csc_matrix matrix of dimension (r,s), containing the log-likelihood ratios of methylation - # where r is the number of reads covering the genomic range we selected, and s is the number of unique genomic - # ranges for which we have methylation calls. Since an LLR of 0 means total uncertainty, a 0 indicates no call. - matrix.met_matrix - - # A numpy array of shape (s, ) containing the start position for each unique genomic range - matrix.genomic_coord - # A numpy array of shape (s, ) containing the end position for each unique genomic range - matrix.genomic_coord_end - - # A numpy array of shape (r, ) containing the read names - matrix.read_names - - # Get a submatrix containing only the first 10 genomic locations - submatrix = matrix.get_submatrix(0, 10) - - # Get a submatrix containing only the reads in the provided list of read names - submatrix = matrix.get_submatrix_from_read_names(allowed_read_names) -``` - - - -## The MetH5 Format - -A MetH5 file is an HDF5 container that stores methylation calls for long reads. The structure of the HDF5 file is as follows: - -``` -/ -├─ chromosomes -│ ├─ CHROMOSOME_NAME1 -│ │ ├─ llr (float dataset of shape (n,)) -│ │ ├─ read_id (int dataset of shape (n,)) -│ │ ├─ range (int dataset of shape (n,2)) -│ │ └─ chunk_ranges (dataset of shape (c, 2)) -│ │ -│ ├─ CHROMOSOME_NAME2 -│ │ └─ ... -│ └─ ... -└─ reads - ├─ read_name_mapping (string dataset of shape (r,)) - └─ read_groups - ├─ READ_GROUP_KEY1 (int dataset of shape (r,)) - ├─ READ_GROUP_KEY2 (int dataset of shape (r,)) - └─ ... -``` - -Where `n` is the number of methylation calls in the respective chromosome, `c` is the number of chunks, and `r`is the total number of reads across all chromosomes. +# MetH5Format 0.3.1 + +[![GitHub license](https://img.shields.io/github/license/snajder-r/meth5format.svg)](https://github.com/snajder-r/meth5format/blob/master/LICENSE) +[![DOI](https://zenodo.org/badge/303672813.svg)](https://zenodo.org/badge/latestdoi/303672813) +[![Language](https://img.shields.io/badge/Language-Python3.7+-yellow.svg)](https://www.python.org/) +[![Build Status](https://travis-ci.com/snajder-r/meth5format.svg?branch=main)](https://travis-ci.com/snajder-r/meth5format) +[![Code style: black](https://img.shields.io/badge/code%20style-black-black.svg?style=flat)](https://github.com/snajder-r/black "Black (modified)") + + +[![PyPI version](https://badge.fury.io/py/meth5.svg)](https://badge.fury.io/py/meth5) +[![PyPI downloads](https://pepy.tech/badge/meth5)](https://pepy.tech/project/meth5) +[![Anaconda Version](https://img.shields.io/conda/v/snajder-r/meth5?color=blue)](https://anaconda.org/snajder-r/meth5) +[![Anaconda Downloads](https://anaconda.org/snajder-r/meth5/badges/downloads.svg)](https://anaconda.org/snajder-r/meth5) + +MetH5 is an HDF5-based container format for methylation calls from long reads. + +In the current version, the MetH5 format can store the following information: +* Log-likelihood ratio of each methylation call +* Genomic coordinates (start and end) of each methylation call +* The read name associated with each call +* Read grouping (i.e. annotation such as samples or haplotypes) + +## Installation + +Through pip: + +``` +pip install meth5 +```` + +Through anaconda: + +``` +conda install -c snajder-r meth5 +``` + +## Usage + +### Creating a MetH5 file from nanopolish methylation calls + +Assuming you have nanopolish methylation calls with filenames `*.tsv`, you can create a MetH5 file with the following command: + +``` +meth5 create_h5 --input_dir INPUT_DIR/ --output_file OUTPUT_FILE.m5 +``` + +In order to annotate reads with read grouping (for example as samples or haplotypes) you can do so by running: + +``` +meth annotate_reads --m5file M5FILE.m5 --read_groups_key READ_GROUPS_KEY --read_group_file READ_GROUP_FILE +``` + +Where the `READ_GROUPS_KEY` is the key under which you want to store the annotation (you can store multiple read annotations), +and `READ_GROUP_FILE` is a tab-delimited file containg read name and read group. For example: + +``` +read_name group +7741f9ee-ad41-42a4-99b2-290c66960410 1 +4f18b48e-a1d3-49ad-ace3-cfb96b78ad79 2 +... +``` + +### Quick start for python API + +Here an example on how to access methylation values from a MetH5 file: + +```python +from meth5.meth5 import MetH5File + +with MetH5File(filename, mode="r") as m: + # List chromosomes in the MetH5 file + m.get_chromosomes() + + # Access chromosome 7 + chr7 = m["chr7"] + + # Get number of chunks + chr7.get_number_of_chunks() + + # Get a container that manages the values of chunk 3 + # (note that the data is not yet loaded into memory) + values = chr7.get_chunk(3) + + # Get the log-likelihood ratios in the container as a numpy array of shape (n,) + llrs = values.get_llrs() + + # Get the genomic start and end locations for each methylation call in the + # chunk as a numpy array of shape (n,2) + ranges = values.get_ranges() + + # Compute methylation rate (beta-score of methylation) for each genomic location, + # as well as the respective coordinates + met_rates, met_rate_ranges = values.get_llr_site_rate() + + # You can also compute other aggregates if you like + met_count, met_count_ranges = values.get_llr_site_aggregate(aggregation_fun=lambda llrs: (llrs>2).sum()) + + # Instead of accessing chunk wise, you can query a genomic range + values = chr7.get_values_in_range(36852906, 37449223) +``` + +A more detailed API documentation is in the works. Stay tuned! + +### Sparse methylation matrix + +In addition to accessing methylation calls in its unraveled form, the `meth5` library also contains a way to represent +the methylation calls as a sparse matrix. Seeing how the values are already stored in the MetH5 file in the same way a +coordinate sparse matrix would be stored in memory, this is a very cheap operation. Example: + +```python +from meth5.meth5 import MetH5File + +with MetH5File(filename, mode="r") as m: + values = m["chr7"].get_values_in_range(36852906, 37449223) + + # The parameter "read_read_names" allows is to choose whether we want to load the actual + # read names into memory. It's slightly more expensive than not reading it, so only load them + # if you are interested in them + matrix = values.to_sparse_methylation_matrix(read_read_names=True) + + # This is a scipy.sparse.csc_matrix matrix of dimension (r,s), containing the log-likelihood ratios of methylation + # where r is the number of reads covering the genomic range we selected, and s is the number of unique genomic + # ranges for which we have methylation calls. Since an LLR of 0 means total uncertainty, a 0 indicates no call. + matrix.met_matrix + + # A numpy array of shape (s, ) containing the start position for each unique genomic range + matrix.genomic_coord + # A numpy array of shape (s, ) containing the end position for each unique genomic range + matrix.genomic_coord_end + + # A numpy array of shape (r, ) containing the read names + matrix.read_names + + # Get a submatrix containing only the first 10 genomic locations + submatrix = matrix.get_submatrix(0, 10) + + # Get a submatrix containing only the reads in the provided list of read names + submatrix = matrix.get_submatrix_from_read_names(allowed_read_names) +``` + + + +## The MetH5 Format + +A MetH5 file is an HDF5 container that stores methylation calls for long reads. The structure of the HDF5 file is as follows: + +``` +/ +├─ chromosomes +│ ├─ CHROMOSOME_NAME1 +│ │ ├─ llr (float dataset of shape (n,)) +│ │ ├─ read_id (int dataset of shape (n,)) +│ │ ├─ range (int dataset of shape (n,2)) +│ │ └─ chunk_ranges (dataset of shape (c, 2)) +│ │ +│ ├─ CHROMOSOME_NAME2 +│ │ └─ ... +│ └─ ... +└─ reads + ├─ read_name_mapping (string dataset of shape (r,)) + └─ read_groups + ├─ READ_GROUP_KEY1 (int dataset of shape (r,)) + ├─ READ_GROUP_KEY2 (int dataset of shape (r,)) + └─ ... +``` + +Where `n` is the number of methylation calls in the respective chromosome, `c` is the number of chunks, and `r`is the total number of reads across all chromosomes. + +--- + +## Citing + +The repository is archived at Zenodo. If you use `meth5` please cite as follow: + +Rene Snajder. (2021, May 18). snajder-r/meth5. Zenodo. https://doi.org/10.5281/zenodo.4772327 + +## Authors and contributors + +* Rene Snajder (@snajder-r): rene.snajder(at)dkfz-heidelberg.de \ No newline at end of file diff --git a/meta.yaml b/meta.yaml index 56f18fa..453af8b 100644 --- a/meta.yaml +++ b/meta.yaml @@ -1,38 +1,38 @@ -{% set version = "0.3.1" %} -{% set name = "meth5" %} - -package: - name: {{ name|lower }} - version: {{ version }} - -source: - path: dist/{{ name }}-{{ version }}.tar.gz - -build: - number: 0 - script: "pip install {{ name }}-{{ version }}.tar.gz --no-deps --ignore-installed -vv " - - entry_points: - - meth5=meth5.__main__:main - noarch: "python" - -requirements: - build: - - python>=3.7 - - pip>=19.2.1 - - ripgrep>=11.0.1 - run: - - numpy>=1.19.2 - - scipy==1.4.1 - - pandas>=1.1.3 - - h5py>=2.10.0 -about: - home: https://github.com/snajder-r/meth5format - license: MIT - licence_url: https://opensource.org/licenses/MIT - summary: HDF5 based file format for storage, retrieval, and analysis of modification predictions from Nanopore - -extra: - author: Rene Snajder - author_email: r.snajder@dkfz-heidelberg.de +{% set version = "0.3.1" %} +{% set name = "meth5" %} + +package: + name: {{ name|lower }} + version: {{ version }} + +source: + path: dist/{{ name }}-{{ version }}.tar.gz + +build: + number: 0 + script: "pip install {{ name }}-{{ version }}.tar.gz --no-deps --ignore-installed -vv " + + entry_points: + - meth5=meth5.__main__:main + noarch: "python" + +requirements: + build: + - python>=3.7 + - pip>=19.2.1 + - ripgrep>=11.0.1 + run: + - numpy>=1.19.2 + - scipy==1.4.1 + - pandas>=1.1.3 + - h5py>=2.10.0 +about: + home: https://github.com/snajder-r/meth5format + license: MIT + licence_url: https://opensource.org/licenses/MIT + summary: HDF5 based file format for storage, retrieval, and analysis of modification predictions from Nanopore + +extra: + author: Rene Snajder + author_email: r.snajder@dkfz-heidelberg.de author_url: https://github.com/snajder-r \ No newline at end of file diff --git a/setup.py b/setup.py index f740faf..9677129 100644 --- a/setup.py +++ b/setup.py @@ -1,38 +1,38 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -from setuptools import setup - -# Long description from README file -with open("README.md", "r") as fh: - long_description = fh.read() - -# Collect info in a dictionary for setup.py -setup( - name="meth5", - description="HDF5 based file format for storage, retrieval, and analysis of modification predictions from Nanopore", - version="0.3.1", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/snajder-r/meth5format", - author="Rene Snajder", - author_email="r.snajder@dkfz-heidelberg.de", - license="MIT", - python_requires=">=3.7", - classifiers=[ - "Development Status :: 4 - Beta", - "Intended Audience :: Science/Research", - "Topic :: Scientific/Engineering :: Bio-Informatics", - "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3" - ], - install_requires=[ - "numpy>=1.19.2", - "scipy==1.4.1", - "pandas>=1.1.3", - "h5py>=2.10.0" - ], - packages=["meth5"], - package_dir={"meth5": "meth5"}, - entry_points={"console_scripts": ["meth5=meth5.__main__:main"]}, -) +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from setuptools import setup + +# Long description from README file +with open("README.md", "r") as fh: + long_description = fh.read() + +# Collect info in a dictionary for setup.py +setup( + name="meth5", + description="HDF5 based file format for storage, retrieval, and analysis of modification predictions from Nanopore", + version="0.3.1", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/snajder-r/meth5format", + author="Rene Snajder", + author_email="r.snajder@dkfz-heidelberg.de", + license="MIT", + python_requires=">=3.7", + classifiers=[ + "Development Status :: 4 - Beta", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3" + ], + install_requires=[ + "numpy>=1.19.2", + "scipy==1.4.1", + "pandas>=1.1.3", + "h5py>=2.10.0" + ], + packages=["meth5"], + package_dir={"meth5": "meth5"}, + entry_points={"console_scripts": ["meth5=meth5.__main__:main"]}, +) diff --git a/versipy.yaml b/versipy.yaml index aaf6c95..9d7783b 100644 --- a/versipy.yaml +++ b/versipy.yaml @@ -1,38 +1,38 @@ -version: - major: 0 - minor: 3 - micro: 1 - a: null - b: null - rc: null - post: null - dev: null -managed_values: - __package_name__: meth5 - __package_description__: HDF5 based file format for storage, retrieval, and analysis - of modification predictions from Nanopore - __package_url__: https://github.com/snajder-r/meth5format - __author_name__: Rene Snajder - __author_email__: r.snajder@dkfz-heidelberg.de - __author_url__: https://github.com/snajder-r - __package_licence__: MIT - __package_licence_url__: https://opensource.org/licenses/MIT - __minimal_python__: '3.7' - __entry_point1__: meth5=meth5.__main__:main - __dependencies__: - - numpy>=1.19.2 - - scipy==1.4.1 - - pandas>=1.1.3 - - h5py>=2.10.0 - __classifiers__: - - 'Development Status :: 4 - Beta' - - 'Intended Audience :: Science/Research' - - 'Topic :: Scientific/Engineering :: Bio-Informatics' - - 'License :: OSI Approved :: MIT License' - - 'Programming Language :: Python :: 3' - __citation__: Rene Snajder. (2021, May 4). snajder-r/meth5format -managed_files: - versipy_templates/setup.py: setup.py - versipy_templates/meta.yaml: meta.yaml - versipy_templates/.travis.yml: .travis.yml - versipy_templates/README.md: README.md +version: + major: 0 + minor: 3 + micro: 1 + a: null + b: null + rc: null + post: null + dev: null +managed_values: + __package_name__: meth5 + __package_description__: HDF5 based file format for storage, retrieval, and analysis + of modification predictions from Nanopore + __package_url__: https://github.com/snajder-r/meth5format + __author_name__: Rene Snajder + __author_email__: r.snajder@dkfz-heidelberg.de + __author_url__: https://github.com/snajder-r + __package_licence__: MIT + __package_licence_url__: https://opensource.org/licenses/MIT + __minimal_python__: '3.7' + __entry_point1__: meth5=meth5.__main__:main + __dependencies__: + - numpy>=1.19.2 + - scipy==1.4.1 + - pandas>=1.1.3 + - h5py>=2.10.0 + __classifiers__: + - 'Development Status :: 4 - Beta' + - 'Intended Audience :: Science/Research' + - 'Topic :: Scientific/Engineering :: Bio-Informatics' + - 'License :: OSI Approved :: MIT License' + - 'Programming Language :: Python :: 3' + __citation__: Rene Snajder. (2021, May 4). snajder-r/meth5format +managed_files: + versipy_templates/setup.py: setup.py + versipy_templates/meta.yaml: meta.yaml + versipy_templates/.travis.yml: .travis.yml + versipy_templates/README.md: README.md diff --git a/versipy_history.txt b/versipy_history.txt index 342a435..2fcd1cb 100644 --- a/versipy_history.txt +++ b/versipy_history.txt @@ -1,16 +1,17 @@ -2021-05-04 12:46:46.832974 0.0.0 Initialise versipy history -2021-05-04 16:01:08.562201 0.2.0.dev1 Versipy auto bump-up -2021-05-04 16:14:26.817354 0.2.1.dev1 Versipy auto bump-up -2021-05-04 16:15:20.837250 0.2.2.dev1 Versipy auto bump-up -2021-05-04 16:28:50.379984 0.2.3.dev1 Versipy auto bump-up -2021-05-04 17:00:00.655716 0.2.4.dev1 Versipy auto bump-up -2021-05-04 17:00:25.481294 0.2.5.dev1 Versipy auto bump-up -2021-05-10 12:18:20.138114 0.2.6.dev1 Versipy auto bump-up -2021-05-10 14:45:55.563354 0.2.6.dev1 Versipy auto bump-up -2021-05-10 14:47:04.584037 0.2.6.dev2 Versipy auto bump-up -2021-05-10 14:49:18.885224 0.2.7 Versipy auto bump-up -2021-05-19 12:12:36.729407 0.3.0 Versipy auto bump-up -2021-05-19 12:19:10.398917 0.3.1 Versipy auto bump-up -2021-05-19 12:22:55.538224 0.3.1 Versipy auto bump-up -2021-05-19 16:03:38.592471 0.3.1 Versipy auto bump-up -2021-05-19 16:21:04.697323 0.3.1 Versipy auto bump-up +2021-05-04 12:46:46.832974 0.0.0 Initialise versipy history +2021-05-04 16:01:08.562201 0.2.0.dev1 Versipy auto bump-up +2021-05-04 16:14:26.817354 0.2.1.dev1 Versipy auto bump-up +2021-05-04 16:15:20.837250 0.2.2.dev1 Versipy auto bump-up +2021-05-04 16:28:50.379984 0.2.3.dev1 Versipy auto bump-up +2021-05-04 17:00:00.655716 0.2.4.dev1 Versipy auto bump-up +2021-05-04 17:00:25.481294 0.2.5.dev1 Versipy auto bump-up +2021-05-10 12:18:20.138114 0.2.6.dev1 Versipy auto bump-up +2021-05-10 14:45:55.563354 0.2.6.dev1 Versipy auto bump-up +2021-05-10 14:47:04.584037 0.2.6.dev2 Versipy auto bump-up +2021-05-10 14:49:18.885224 0.2.7 Versipy auto bump-up +2021-05-19 12:12:36.729407 0.3.0 Versipy auto bump-up +2021-05-19 12:19:10.398917 0.3.1 Versipy auto bump-up +2021-05-19 12:22:55.538224 0.3.1 Versipy auto bump-up +2021-05-19 16:03:38.592471 0.3.1 Versipy auto bump-up +2021-05-19 16:21:04.697323 0.3.1 Versipy auto bump-up +2021-05-19 16:43:43.414729 0.3.1 Versipy auto bump-up