diff --git a/README.md b/README.md index 9594318..aaea4bd 100644 --- a/README.md +++ b/README.md @@ -7,25 +7,25 @@ # cstag -`cstag` is a Python library tailored for the manipulation and handling of [minimap2's CS tags](https://github.com/lh3/minimap2#cs). +`cstag` is a Python library tailored for manipulating and visualizing [minimap2's cs tags](https://github.com/lh3/minimap2#cs). ## 🌟 Features -- `cstag.call()`: Generate a CS tag -- `cstag.shorten()`: Convert a CS tag from its long to short format -- `cstag.lengthen()`: Convert a CS tag from its short to long format -- `cstag.consensus()`: Create a consensus CS tag from multiple CS tags -- `cstag.mask()`: Mask low-quality bases within a CS tag -- `cstag.split()`: Break down a CS tag into its constituent parts -- `cstag.revcomp()`: Convert a CS tag to its reverse complement +- `cstag.call()`: Generate a cs tag +- `cstag.shorten()`: Convert a cs tag from its long to short format +- `cstag.lengthen()`: Convert a cs tag from its short to long format +- `cstag.consensus()`: Create a consensus cs tag from multiple cs tags +- `cstag.mask()`: Mask low-quality bases within a cs tag +- `cstag.split()`: Break down a cs tag into its constituent parts +- `cstag.revcomp()`: Convert a cs tag to its reverse complement - `cstag.to_sequence()`: Reconstruct a reference subsequence from the alignment - `cstag.to_vcf()`: Generate a VCF representation - `cstag.to_html()`: Generate an HTML representation - `cstag.to_pdf()`: Produce a PDF file For comprehensive documentation, please visit [our docs](https://akikuno.github.io/cstag/cstag/). -To add CS tags to SAM/BAM files, check out [`cstag-cli`](https://github.com/akikuno/cstag-cli). +To add cs tags to SAM/BAM files, check out [`cstag-cli`](https://github.com/akikuno/cstag-cli). 
## 🛠 Installation @@ -44,7 +44,7 @@ conda install -c bioconda cstag ## 💡 Usage -### Generating CS Tags +### Generating cs Tags ```python import cstag @@ -60,19 +60,19 @@ print(cstag.call(cigar, md, seq, long=True)) # =AC*ag=TACGT-ag=ACGT+ac~nn3nn=G ``` -### Shortening or Lengthening CS Tags +### Shortening or Lengthening cs Tags ```python import cstag -# Convert a CS tag from long to short +# Convert a cs tag from long to short cs_tag = "=ACGT*ag=CGT" print(cstag.shorten(cs_tag)) # :4*ag:3 -# Convert a CS tag from short to long +# Convert a cs tag from short to long cs_tag = ":4*ag:3" cigar = "8M" seq = "ACGTACGT" @@ -106,7 +106,7 @@ print(cstag.mask(cs_tag, cigar, qual, phred_threshold)) # =ACNN*an+ng-cc=T ``` -### Splitting a CS Tag +### Splitting a cs Tag ```python import cstag @@ -116,7 +116,7 @@ print(cstag.split(cs_tag)) # ['=ACGT', '*ac', '+gg', '-cc', '=T'] ``` -### Reverse Complement of a CS Tag +### Reverse Complement of a cs Tag ```python import cstag @@ -152,7 +152,7 @@ chr1 5 . C CTT . . . """ ``` -The multiple CS tags enable reporting of the variant allele frequency (VAF). +The multiple cs tags enable reporting of the variant allele frequency (VAF). 
```python import cstag @@ -186,7 +186,7 @@ Path("report.html").write_text(cs_tag_html) # Output "report.html" ``` -You can visualize mutations indicated by the CS tag using the generated `report.html` file as shown below: +You can visualize mutations indicated by the cs tag using the generated `report.html` file as shown below: image diff --git a/src/cstag/call.py b/src/cstag/call.py index 2de3b61..5aa0381 100644 --- a/src/cstag/call.py +++ b/src/cstag/call.py @@ -74,8 +74,10 @@ def trim_clips(cigar: str, seq: str) -> tuple[str, str]: ########################################################### -# Generate CS long +# Generate cs tag in long format ########################################################### + + def expand_cigar_operations(cigar: str) -> list[str]: parsed_cigar = parse_cigar(cigar) expanded_list = [] diff --git a/src/cstag/consensus.py b/src/cstag/consensus.py index 9cd4ec4..4526736 100644 --- a/src/cstag/consensus.py +++ b/src/cstag/consensus.py @@ -25,20 +25,20 @@ def expand_deletion_tags(tags_combined: list[str]) -> list[str]: def split_cs_tags(cs_tags: list[str]) -> list[list[str]]: """ - Split and process each CS tag in cs_tags. + Split and process each cs tag in cs_tags. Args: - cs_tags (list[str]): list of CS tags in the long format. + cs_tags (list[str]): list of cs tags in the long format. Returns: - list[list[str]]: list of processed CS tags. + list[list[str]]: list of processed cs tags. 
""" cs_tags_splitted = [] for cs_tag in cs_tags: # Remove the prefix "cs:Z:" if present cs_tag = cs_tag.replace("cs:Z:", "") - # Split the CS tag using special symbols (-, *, ~, =) + # Split the cs tag using special symbols (-, *, ~, =) # insertion symbol (+) is ignored because it is not observed in reference sequence tags_splitted = re.split(r"([-*~=])", cs_tag)[1:] # Combine the symbol with the corresponding sequence @@ -70,7 +70,7 @@ def normalize_read_lengths(cs_tags: list[str], positions: list[int]) -> list[lis Normalize the lengths of each read in cs_tags based on their starts positions. If the length is insufficient, fill in with `None`. Args: - cs_tags (list[str]): list of CS tags. + cs_tags (list[str]): list of cs tags. positions (list[int]): Starting positions of each read. Returns: @@ -109,7 +109,7 @@ def get_consensus(cs_tags: list[list[str]]) -> str: for cs in zip(*cs_tags): # Remove the None that is compensating for the insufficient lead length. cs = [c for c in cs if c] - # Get the most common CS tag(s) + # Get the most common cs tag(s) most_common_tags = Counter(cs).most_common() # If there's a unique most common tag, return it @@ -134,13 +134,13 @@ def get_consensus(cs_tags: list[list[str]]) -> str: def consensus(cs_tags: list[str], positions: list[int], prefix: bool = False) -> str: - """generate consensus of CS tags + """generate consensus of cs tags Args: - cs_tags (list): CS tags in the **long** format + cs_tags (list): cs tags in the **long** format positions (list): 1-based leftmost mapping position (4th column in SAM file) - prefix (bool, optional): Whether to add the prefix 'cs:Z:' to the CS tag. Defaults to False + prefix (bool, optional): Whether to add the prefix 'cs:Z:' to the cs tag. 
Defaults to False Return: - str: a consensus of CS tag in the **long** format + str: a consensus of cs tag in the **long** format Example: >>> import cstag >>> cs_tags = ["=ACGT", "=AC*gt=T", "=C*gt=T", "=C*gt=T", "=ACT+ccc=T"] diff --git a/src/cstag/to_html.py b/src/cstag/to_html.py index 7ef315b..bc6e31e 100644 --- a/src/cstag/to_html.py +++ b/src/cstag/to_html.py @@ -83,7 +83,7 @@ def append_mark_to_n(cs_tag: str) -> str: - """Process each CS tag by adding specific markers `@` to `N`.""" + """Process each cs tag by adding specific markers `@` to `N`.""" def append_mark(cs: str) -> str: if cs.startswith("N"): @@ -138,7 +138,7 @@ def process_cs_tag(cs_tag: str) -> str: def to_html(cs_tag: str, description: str = "") -> str: """Output HTML string showing a sequence with mutations colored Args: - cs_tag (str): CS tag in the **long** format + cs_tag (str): cs tag in the **long** format description (str): (optional) header information in the output string Return: HTML string diff --git a/src/cstag/to_pdf.py b/src/cstag/to_pdf.py index 15098c6..75b8369 100644 --- a/src/cstag/to_pdf.py +++ b/src/cstag/to_pdf.py @@ -7,15 +7,15 @@ def to_pdf(cs_tag: str, description: str, path_out: str | Path) -> None: """ - Convert a CS tag and its description to a PDF file. + Convert a cs tag and its description to a PDF file. - This function takes a CS (custom string) tag and its description, converts + This function takes a cs (custom string) tag and its description, converts it to HTML using the `to_html` function, and then writes it to a PDF file using WeasyPrint. Args: - cs_tag (str): The CS tag to be converted. - description (str): The description associated with the CS tag. + cs_tag (str): The cs tag to be converted. + description (str): The description associated with the cs tag. path_out (str | Path): The path where the output PDF file will be saved. 
Returns: diff --git a/src/cstag/to_sequence.py b/src/cstag/to_sequence.py index ef3e6e2..c7ec1c5 100644 --- a/src/cstag/to_sequence.py +++ b/src/cstag/to_sequence.py @@ -8,10 +8,10 @@ def to_sequence(cs_tag: str) -> str: """Reconstruct the reference subsequence in the alignment Args: - cs_tag (str): CS tag in the **long** format + cs_tag (str): cs tag in the **long** format Returns: - str: The sequence string derived from the CS tag. + str: The sequence string derived from the cs tag. Example: >>> import cstag diff --git a/src/cstag/to_vcf.py b/src/cstag/to_vcf.py index 223db4e..409d16f 100644 --- a/src/cstag/to_vcf.py +++ b/src/cstag/to_vcf.py @@ -96,7 +96,7 @@ def get_variant_annotations(cs_tag_split: list[str], position: int) -> list[Vcf] ########################################################### -# Format the CS tags +# Format the cs tags ########################################################### @@ -146,7 +146,7 @@ def format_cs_tags(cs_tags: list[str], chroms: list[str] | list[int], positions: def group_by_chrom(cs_tags_formatted: list[tuple]) -> dict[str, tuple]: - """Group CS tags by chromosomes""" + """Group cs tags by chromosomes""" cs_tags_grouped = defaultdict(list) for cs in cs_tags_formatted: cs_tags_grouped[cs.chrom].append( @@ -234,7 +234,7 @@ def add_vcf_fields( ########################################################### -# Process CS tag (One) +# Process cs tag (One) ########################################################### @@ -259,7 +259,7 @@ def process_cs_tag(cs_tag: str, chrom: str | int, pos: int) -> str: ########################################################### -# Process CS tags (Many) +# Process cs tags (Many) ########################################################### @@ -319,10 +319,10 @@ def process_cs_tags(cs_tags: list[str], chroms: list[str], positions: list[int]) def to_vcf(cs_tags: str | list[str], chroms: str | int | list[str] | list[int], positions: int | list[int]) -> str: """ - Convert CS tag(s) to VCF (Variant Call 
Format) string. + Convert cs tag(s) to VCF (Variant Call Format) string. Args: - cs_tag (str | list[str]): The CS tag representing the sequence alignment. + cs_tag (str | list[str]): The cs tag representing the sequence alignment. chrom (str | list[str]): The chromosome name. pos (int | list[int]): The starting position for the sequence. diff --git a/src/cstag/utils/validator.py b/src/cstag/utils/validator.py index a57e04c..4c2e147 100644 --- a/src/cstag/utils/validator.py +++ b/src/cstag/utils/validator.py @@ -9,17 +9,17 @@ def validate_cs_tag(cs_tag: str) -> None: ) if not pattern.fullmatch(cs_tag.replace("cs:Z:", "")): - raise ValueError(f"Invalid CS tag: {cs_tag}") + raise ValueError(f"Invalid cs tag: {cs_tag}") def validate_short_format(cs_tag: str) -> None: if re.search(r"=[ACGTN]+", cs_tag): - raise ValueError("CS tag must be in short format") + raise ValueError("cs tag must be in short format") def validate_long_format(cs_tag: str) -> None: if re.search(r":[0-9]+", cs_tag): - raise ValueError("CS tag must be in long format") + raise ValueError("cs tag must be in long format") def validate_threshold(threshold: int) -> None: diff --git a/tests/test_to_vcf.py b/tests/test_to_vcf.py index 5500859..c7e1b61 100644 --- a/tests/test_to_vcf.py +++ b/tests/test_to_vcf.py @@ -79,7 +79,7 @@ def test_get_variant_annotations(): ########################################################### -# Format the CS tags +# Format the cs tags ########################################################### @@ -202,7 +202,7 @@ def test_add_vcf_fields(): ########################################################### -# process_cs_tag: Single CS tag +# process_cs_tag: Single cs tag ########################################################### @@ -227,7 +227,7 @@ def test_process_cs_tag(): ########################################################### -# process_cs_tags: Multuple CS tags +# process_cs_tags: Multiple cs tags ###########################################################