Skip to content

Commit

Permalink
Refactor modules and improve FASTA parsing
Browse files Browse the repository at this point in the history
- Break into multiple modules
- Support passing Chain args in FASTA parsing
- Support getting unique germlines
- Raise error on ANARCI IMGT position 61A bug
  • Loading branch information
prihoda committed Jun 29, 2021
1 parent b191a3a commit 5eec3c1
Show file tree
Hide file tree
Showing 7 changed files with 519 additions and 455 deletions.
5 changes: 4 additions & 1 deletion abnumber/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from abnumber.__version__ import __version__
from abnumber.chain import Chain, Position, Alignment, SUPPORTED_SCHEMES, SUPPORTED_CDR_DEFINITIONS
from abnumber.chain import Chain
from abnumber.position import Position, sort_positions
from abnumber.alignment import Alignment
from abnumber.common import SUPPORTED_SCHEMES, SUPPORTED_CDR_DEFINITIONS
from abnumber.exceptions import ChainParseError
195 changes: 195 additions & 0 deletions abnumber/alignment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
from typing import Union

from abnumber.common import is_similar_residue, is_integer
from abnumber.position import Position


class Alignment:
    """Antibody chain alignment of two or more chains

    >>> from abnumber import Chain
    >>>
    >>> seq1 = 'QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSSAKTTAP'
    >>> chain1 = Chain(seq1, scheme='imgt')
    >>>
    >>> seq2 = 'QVQLVQSGAELDRPGATVKMSCKASGYTTTRYTMHWVKQRPGQGLDWIGYINPSDRSYTNYNQKFKDKATLTTDKSSSTAYMQKTSLTSEDSAVYYCARYYDDYLDRWGQGTTLTVSSAKTTAP'
    >>> chain2 = Chain(seq2, scheme='imgt')
    >>> alignment = chain1.align(chain2)

    Alignment can be sliced and iterated:

    >>> for pos, (aa, bb) in alignment[:'5']:
    ...     print(pos, aa, bb)
    H1 Q Q
    H2 V V
    H3 Q Q
    H4 L L
    H5 Q V
    """

    def __init__(self, positions, residues, scheme, chain_type):
        """Create an alignment from parallel lists of positions and residues.

        Prefer ``chain.align(other)`` over calling this constructor directly.

        :param positions: list of position objects, one per alignment column
        :param residues: sequence of per-column residue groups, same length as ``positions``;
                         each group is indexable by chain index
        :param scheme: numbering scheme, passed to :meth:`Position.from_string` when parsing string keys
        :param chain_type: chain type, passed to :meth:`Position.from_string` when parsing string keys
        """
        assert isinstance(positions, list), 'Expected list of positions and residues. ' \
                                            'Use chain.align(other) to create an alignment.'
        assert len(positions) == len(residues)
        unique_cdr_definitions = set(pos.cdr_definition for pos in positions)
        assert len(unique_cdr_definitions) <= 1, f'Aligned chains should use the same CDR definitions, got: {unique_cdr_definitions}'
        self.positions = positions
        self.residues = residues
        self.scheme = scheme
        self.chain_type = chain_type
        # Snapshot of (position, residues) pairs used for iteration and formatting
        self._zipped = list(zip(self.positions, self.residues))

    def __repr__(self):
        return self.format()

    def __iter__(self):
        """Iterate over ``(position, residues)`` pairs."""
        return iter(self._zipped)

    def __len__(self):
        """Number of aligned positions (columns)."""
        return len(self.positions)

    def __getitem__(self, item):
        """Get residues at a position, or a sliced :class:`Alignment` when ``item`` is a slice.

        :param item: :class:`Position`, position string (e.g. ``'111A'``) or slice of those
        :return: residues at the given position, or a new :class:`Alignment` for a slice
        :raises IndexError: on slice step other than 1 or on a plain integer key
        :raises ValueError: if the position is not present in this alignment (from ``list.index``)
        """
        if isinstance(item, slice):
            if item.step is not None and item.step != 1:
                raise IndexError(f'Slicing with step != 1 is not implemented, got: {item}')
            return self.slice(start=item.start, stop=item.stop)
        pos = self._parse_position(item)
        raw_pos = self.positions.index(pos)
        return self.residues[raw_pos]

    def slice(self, start: Union[str, int, 'Position'] = None, stop: Union[str, int, 'Position'] = None,
              stop_inclusive: bool = True, allow_raw: bool = False):
        """Create a slice of this alignment

        You can also slice directly using ``alignment['111':'112A']`` or ``alignment.raw[10:20]``.

        :param start: Slice start position (inclusive), :class:`Position` or string (e.g. '111A')
        :param stop: Slice stop position, :class:`Position` or string (e.g. '112A');
                     inclusive unless ``stop_inclusive`` is False
        :param stop_inclusive: Include stop position in slice
        :param allow_raw: Allow unaligned numeric indexing from 0 to length of sequence - 1
        :return: new sliced Alignment object
        """
        start_given = start is not None
        start = self._parse_position(start, allow_raw=allow_raw) if start_given else None
        stop = self._parse_position(stop, allow_raw=allow_raw) if stop is not None else None

        if start_given and start is None:
            # _parse_position returns None only for a raw index past the end;
            # as a start bound that selects nothing (Python slice semantics),
            # whereas treating it as "no lower bound" would return everything.
            return Alignment(positions=[], residues=[], scheme=self.scheme, chain_type=self.chain_type)

        new_positions = []
        new_residues = []
        for pos, residues in zip(self.positions, self.residues):
            if start is not None and pos < start:
                continue
            if stop is not None and (pos > stop or (not stop_inclusive and pos >= stop)):
                # Stops at the first position beyond the bound, so positions are expected to be ordered
                break
            new_positions.append(pos)
            new_residues.append(residues)

        return Alignment(positions=new_positions, residues=new_residues, scheme=self.scheme, chain_type=self.chain_type)

    def _parse_position(self, position: Union[int, str, 'Position'], allow_raw=False):
        """Create :class:`Position` key object from string or int.

        Note: The position should only be used for indexing, CDR definition is not preserved!

        :param position: Numeric or string position representation
        :param allow_raw: Also allow unaligned numeric (int) indexing from 0 to length of sequence - 1
        :return: new Position object, should only be used for indexing, CDR definition is not preserved!
                 ``None`` when ``allow_raw`` is set and the integer index is past the last position.
        :raises IndexError: if the key cannot be interpreted, or is an integer while ``allow_raw`` is False
        """
        if isinstance(position, str):
            return Position.from_string(position, chain_type=self.chain_type, scheme=self.scheme)
        if isinstance(position, Position):
            return position
        try:
            position = int(position)
        except (TypeError, ValueError) as err:
            # int() raises ValueError for e.g. bytes or NaN; report all invalid keys uniformly
            raise IndexError(f'Invalid position key, expected Position, string or integer, got {type(position)}: "{position}"') from err
        if not allow_raw:
            raise IndexError("Use chain.raw[i] for raw numeric indexing or pass allow_raw=True. "
                             "For named position indexing, use string (e.g. chain['111A'] or chain['H111A'])")
        if position >= len(self.positions):
            return None
        return self.positions[position]

    def format(self, mark_identity=True, mark_cdrs=True):
        """Format alignment to string

        :param mark_identity: Add BLAST style middle line showing identity (``|``), similar residue (``+``) or different residue (``.``)
        :param mark_cdrs: Add line highlighting CDR regions using ``^``
        :return: formatted string, empty string for an alignment with no positions
        """
        if not self.positions or not self.residues:
            # Nothing to render; also keeps repr() of an empty slice from raising IndexError
            return ''

        def _identity_symbol(a, b):
            return '|' if a == b else ('+' if is_similar_residue(a, b) else '.')

        lines = []
        num_chains = len(self.residues[0])
        for i in range(num_chains):
            if mark_identity and i != 0:
                # Each chain after the first is compared to the chain printed directly above it
                lines.append(''.join(_identity_symbol(aas[i], aas[i - 1]) for _, aas in self._zipped))
            lines.append(''.join(aas[i] for _, aas in self._zipped))
        if mark_cdrs:
            # Vernier positions are only marked when the kabat CDR definition is used
            show_vernier = self.positions[0].cdr_definition == 'kabat'
            lines.append(''.join(
                '^' if pos.is_in_cdr() else ("°" if show_vernier and pos.is_in_vernier() else ' ')
                for pos in self.positions
            ))
        return '\n'.join(lines)

    def print(self, mark_identity=True, mark_cdrs=True):
        """Print string representation of alignment created using :meth:`Alignment.format`

        >>> alignment.print()
        QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPS-RGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS
        ||||.||||||.||||+|||||||||||.||||||||||||||||+||||||||.|.||||||||||||||||||||||||||.+|||||||||||||||||....||.|||||||||||
        QVQLVQSGAELDRPGATVKMSCKASGYTTTRYTMHWVKQRPGQGLDWIGYINPSDRSYTNYNQKFKDKATLTTDKSSSTAYMQKTSLTSEDSAVYYCARYYD--DYLDRWGQGTTLTVSS
        ^^^^^^^^ ^^^^^^^^^ ^^^^^^^^^^^^
        >>> alignment.print(mark_identity=False, mark_cdrs=False)
        QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPS-RGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS
        QVQLVQSGAELDRPGATVKMSCKASGYTTTRYTMHWVKQRPGQGLDWIGYINPSDRSYTNYNQKFKDKATLTTDKSSSTAYMQKTSLTSEDSAVYYCARYYD--DYLDRWGQGTTLTVSS

        :param mark_identity: Add BLAST style middle line showing identity (``|``), similar residue (``+``) or different residue (``.``)
        :param mark_cdrs: Add line highlighting CDR regions using ``^``
        """
        print(self.format(mark_identity=mark_identity, mark_cdrs=mark_cdrs))

    def has_mutation(self):
        """Check if there is a mutation in the alignment or not"""
        return any(len(set(aas)) != 1 for aas in self.residues)

    def num_mutations(self):
        """Get number of mutations (positions with more than one type of residue)"""
        return sum(len(set(aas)) != 1 for aas in self.residues)

    @property
    def raw(self):
        """Access raw representation of this alignment to allow unaligned numeric indexing and slicing

        >>> # Numbering of ``alignment.raw`` starts at 0; an int index returns the residues of that column
        >>> first_column = alignment.raw[0]
        >>> # Slicing with string is based on scheme numbering, the end is inclusive
        >>> first_ten = alignment['1':'10']
        >>> # Slicing with ``alignment.raw`` starts at 0, the end is exclusive (Python style)
        >>> first_ten = alignment.raw[0:10]

        :return: Raw alignment accessor that can be sliced to produce a new :class:`Alignment` object
                 or indexed with an int to get the residues at that column
        """
        return RawAlignmentAccessor(self)


class RawAlignmentAccessor:
    """Accessor providing 0-based integer indexing and slicing of an :class:`Alignment`.

    An int key returns whatever the wrapped alignment returns for the position stored
    at that index; a slice delegates to ``alignment.slice`` with an exclusive stop.
    """

    def __init__(self, alignment: 'Alignment'):
        self.alignment = alignment

    def __getitem__(self, item):
        """Index or slice the wrapped alignment by raw (unaligned) integer index.

        :param item: int index, or slice with int (or omitted) bounds and step of 1
        :return: result of ``alignment[position]`` for an int, or of ``alignment.slice(...)`` for a slice
        :raises IndexError: on non-integer index or bounds, or on slice step other than 1
        """
        if not isinstance(item, slice):
            # Single element: resolve the index to its position, then look it up in the alignment
            if not is_integer(item):
                raise IndexError(f'Expected int indexing for alignment.raw, got {type(item)}: {item}')
            return self.alignment[self.alignment.positions[item]]

        lower, upper = item.start, item.stop
        if item.step is not None and item.step != 1:
            raise IndexError(f'Slicing with step != 1 is not implemented, got: {item}')
        if not (lower is None or is_integer(lower)):
            raise IndexError(f'Expected int start index for alignment.raw, got {type(item.start)}: {item.start}')
        if not (upper is None or is_integer(upper)):
            raise IndexError(f'Expected int end index for alignment.raw, got {type(item.stop)}: {item.stop}')
        # Raw slices are end-exclusive, unlike position-based slices
        return self.alignment.slice(start=lower, stop=upper, stop_inclusive=False, allow_raw=True)
Loading

0 comments on commit 5eec3c1

Please sign in to comment.