Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: cnv translator supports uncertain ranges for g. and m. #387

Merged
merged 2 commits into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ extras =
psycopg2-binary
biocommons.seqrepo>=0.5.1
bioutils>=0.5.2
hgvs>=1.4
hgvs@git+https://github.com/biocommons/hgvs@225-uncertain-ranges
requests
dill~=0.3.7
click
Expand Down
50 changes: 47 additions & 3 deletions src/ga4gh/vrs/extras/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from typing import Optional, Union
from ga4gh.vrs.dataproxy import create_dataproxy, _DataProxy
from ga4gh.vrs.extras.decorators import lazy_property
from hgvs.location import Interval, SimplePosition, AAPosition, BaseOffsetInterval
larrybabb marked this conversation as resolved.
Show resolved Hide resolved
import logging
import re

Expand Down Expand Up @@ -438,11 +439,14 @@ def _from_hgvs(self, hgvs_dup_del_expr: str, **kwargs):
copy_change: Copy change. If not provided, default is efo:0030067 for
deletions and efo:0030070 for duplications
"""
# sv = self._get_parsed_hgvs(hgvs_dup_del_expr)
sv = self.hgvs_tools.parse(hgvs_dup_del_expr)
if not sv:
return None

if sv.type not in {"g", "m"}:
err_msg = "Only 'g' and 'm' reference sequences are supported"
raise ValueError(err_msg)

larrybabb marked this conversation as resolved.
Show resolved Hide resolved
sv_type = self.hgvs_tools.get_edit_type(sv)
if sv_type not in {"del", "dup"}:
raise ValueError("Must provide a 'del' or 'dup'")
Expand All @@ -454,10 +458,13 @@ def _from_hgvs(self, hgvs_dup_del_expr: str, **kwargs):
if not refget_accession:
return None

start = self._get_vrs_loc_start_end_val(sv.posedit.pos.start, is_start=True)
end = self._get_vrs_loc_start_end_val(sv.posedit.pos.end, is_start=False)

location = models.SequenceLocation(
sequenceReference=models.SequenceReference(refgetAccession=refget_accession),
start=sv.posedit.pos.start.base - 1,
end=sv.posedit.pos.end.base
start=start,
end=end
)

copies = kwargs.get("copies")
Expand All @@ -472,6 +479,43 @@ def _from_hgvs(self, hgvs_dup_del_expr: str, **kwargs):
cnv =self._post_process_imported_cnv(cnv)
return cnv

@staticmethod
def _get_vrs_loc_start_end_val(
    pos: Union[SimplePosition, AAPosition, BaseOffsetInterval, Interval],
    is_start: bool = True
) -> Union[int, models.Range]:
    """Convert a biocommons hgvs position into a VRS Location start/end value.

    When ``is_start`` is ``True`` each known base is shifted down by one;
    when it is ``False`` the base is returned unchanged.

    :param pos: biocommons hgvs location instance for position
    :param is_start: ``True`` if ``pos`` represents VRS Sequence Location start.
        ``False`` if ``pos`` represents VRS Sequence Location end.
    :raise ValueError: If unsupported biocommons hgvs location is passed
    :return: A single value for a single position, or for an ``Interval``
        whose two adjusted bounds are equal; otherwise a ``models.Range``
        built from ``[start, end]``. Interval bounds whose base is ``None``
        are left as ``None``.
    """
    # NOTE(review): BaseOffsetInterval is handled here via ``pos.base``;
    # confirm that type actually exposes ``base`` (BaseOffsetPosition may
    # have been intended).
    if isinstance(pos, (SimplePosition, AAPosition, BaseOffsetInterval)):
        base = pos.base
        return base - 1 if is_start else base

    if not isinstance(pos, Interval):
        err_msg = f"HGVS Location is not supported: {type(pos)}"
        raise ValueError(err_msg)

    # Interval: adjust each bound independently, leaving ``None`` bounds
    # untouched so they are never used in arithmetic.
    lower = pos.start.base
    if is_start and lower is not None:
        lower = lower - 1

    upper = pos.end.base
    if is_start and upper is not None:
        upper = upper - 1

    # Identical bounds collapse to a single value instead of a Range.
    if lower == upper:
        return lower
    return models.Range([lower, upper])

def _post_process_imported_cnv(self, copy_number):
"""Provide common post-processing for imported Copy Numbers IN-PLACE."""
if self.identify:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
interactions:
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- python-requests/2.31.0
method: GET
uri: http://localhost:5000/seqrepo/1/metadata/refseq:NC_000005.9
response:
body:
string: "{\n \"added\": \"2016-08-24T05:17:01Z\",\n \"aliases\": [\n \"GRCh37:5\",\n
\ \"GRCh37:chr5\",\n \"GRCh37.p10:5\",\n \"GRCh37.p10:chr5\",\n \"GRCh37.p11:5\",\n
\ \"GRCh37.p11:chr5\",\n \"GRCh37.p12:5\",\n \"GRCh37.p12:chr5\",\n
\ \"GRCh37.p13:5\",\n \"GRCh37.p13:chr5\",\n \"GRCh37.p2:5\",\n \"GRCh37.p2:chr5\",\n
\ \"GRCh37.p5:5\",\n \"GRCh37.p5:chr5\",\n \"GRCh37.p9:5\",\n \"GRCh37.p9:chr5\",\n
\ \"MD5:0740173db9ffd264d728f32784845cd7\",\n \"NCBI:NC_000005.9\",\n
\ \"refseq:NC_000005.9\",\n \"SEGUID:Ja+pA+dtRy6jSKdOZXN58wY0rK4\",\n
\ \"SHA1:25afa903e76d472ea348a74e657379f30634acae\",\n \"VMC:GS_vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX\",\n
\ \"sha512t24u:vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX\",\n \"ga4gh:SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX\",\n
\ \"hs37-1kg:5\",\n \"hs37d5:5\"\n ],\n \"alphabet\": \"ACGNT\",\n
\ \"length\": 180915260\n}\n"
headers:
Connection:
- close
Content-Length:
- '803'
Content-Type:
- application/json
Date:
- Fri, 05 Apr 2024 19:19:57 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
interactions:
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- python-requests/2.31.0
method: GET
uri: http://localhost:5000/seqrepo/1/metadata/refseq:NC_000009.11
response:
body:
string: "{\n \"added\": \"2016-08-24T05:18:51Z\",\n \"aliases\": [\n \"GRCh37:9\",\n
\ \"GRCh37:chr9\",\n \"GRCh37.p10:9\",\n \"GRCh37.p10:chr9\",\n \"GRCh37.p11:9\",\n
\ \"GRCh37.p11:chr9\",\n \"GRCh37.p12:9\",\n \"GRCh37.p12:chr9\",\n
\ \"GRCh37.p13:9\",\n \"GRCh37.p13:chr9\",\n \"GRCh37.p2:9\",\n \"GRCh37.p2:chr9\",\n
\ \"GRCh37.p5:9\",\n \"GRCh37.p5:chr9\",\n \"GRCh37.p9:9\",\n \"GRCh37.p9:chr9\",\n
\ \"MD5:3e273117f15e0a400f01055d9f393768\",\n \"NCBI:NC_000009.11\",\n
\ \"refseq:NC_000009.11\",\n \"SEGUID:06BhLSlH1xeVNSYu0zRK9qIVmFg\",\n
\ \"SHA1:d3a0612d2947d7179535262ed3344af6a2159858\",\n \"VMC:GS_HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt\",\n
\ \"sha512t24u:HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt\",\n \"ga4gh:SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt\",\n
\ \"hs37-1kg:9\",\n \"hs37d5:9\"\n ],\n \"alphabet\": \"ACGNT\",\n
\ \"length\": 141213431\n}\n"
headers:
Connection:
- close
Content-Length:
- '805'
Content-Type:
- application/json
Date:
- Fri, 05 Apr 2024 19:12:51 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ interactions:
Content-Type:
- application/json
Date:
- Tue, 26 Mar 2024 16:28:06 GMT
- Fri, 05 Apr 2024 19:12:51 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
interactions:
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- python-requests/2.31.0
method: GET
uri: http://localhost:5000/seqrepo/1/metadata/refseq:NC_000023.11
response:
body:
string: "{\n \"added\": \"2016-08-27T23:57:18Z\",\n \"aliases\": [\n \"GRCh38:X\",\n
\ \"GRCh38:chrX\",\n \"GRCh38.p1:X\",\n \"GRCh38.p1:chrX\",\n \"GRCh38.p10:X\",\n
\ \"GRCh38.p10:chrX\",\n \"GRCh38.p11:X\",\n \"GRCh38.p11:chrX\",\n
\ \"GRCh38.p12:X\",\n \"GRCh38.p12:chrX\",\n \"GRCh38.p2:X\",\n \"GRCh38.p2:chrX\",\n
\ \"GRCh38.p3:X\",\n \"GRCh38.p3:chrX\",\n \"GRCh38.p4:X\",\n \"GRCh38.p4:chrX\",\n
\ \"GRCh38.p5:X\",\n \"GRCh38.p5:chrX\",\n \"GRCh38.p6:X\",\n \"GRCh38.p6:chrX\",\n
\ \"GRCh38.p7:X\",\n \"GRCh38.p7:chrX\",\n \"GRCh38.p8:X\",\n \"GRCh38.p8:chrX\",\n
\ \"GRCh38.p9:X\",\n \"GRCh38.p9:chrX\",\n \"MD5:2b3a55ff7f58eb308420c8a9b11cac50\",\n
\ \"NCBI:NC_000023.11\",\n \"refseq:NC_000023.11\",\n \"SEGUID:Z9QbQrrPjpjXSMJesDYqC3A43lA\",\n
\ \"SHA1:67d41b42bacf8e98d748c25eb0362a0b7038de50\",\n \"VMC:GS_w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP\",\n
\ \"sha512t24u:w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP\",\n \"ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP\"\n
\ ],\n \"alphabet\": \"ACGNRSTWY\",\n \"length\": 156040895\n}\n"
headers:
Connection:
- close
Content-Length:
- '978'
Content-Type:
- application/json
Date:
- Fri, 05 Apr 2024 19:12:51 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
code: 200
message: OK
version: 1
95 changes: 94 additions & 1 deletion tests/extras/test_cnv_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,91 @@ def tlr(rest_dataproxy):
'start': 32344742,
'type': 'SequenceLocation'},
'type': 'CopyNumberChange'}
)
),
(
"NC_000023.11:g.(31060227_31100351)_(33274278_33417151)dup",
None,
{
"copyChange": "efo:0030070",
"digest": "H0-_q06in6rsvLfq_5b-CSmP4ZQ6r7-Q",
"id": "ga4gh:CX.H0-_q06in6rsvLfq_5b-CSmP4ZQ6r7-Q",
"location": {
"digest": "R3FeXqOiAu8Vms7QngINQwIxW904fdWY",
"end": [33274278, 33417151],
"id": "ga4gh:SL.R3FeXqOiAu8Vms7QngINQwIxW904fdWY",
"sequenceReference": {
"refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP",
"type": "SequenceReference"
},
"start": [31060226, 31100350],
"type": "SequenceLocation"
},
"type": "CopyNumberChange"
}
),
(
"NC_000009.11:g.(?_108337304)_(108337428_?)del",
None,
{
"copyChange": "efo:0030067",
"digest": "ANthOqEGX8MIn0kXuyQcn9bouYfbFgjH",
"id": "ga4gh:CX.ANthOqEGX8MIn0kXuyQcn9bouYfbFgjH",
"location": {
"digest": "lpDGeQvnz80iis8xSxoCX_Pulnu7wx4M",
"end": [108337428, None],
"id": "ga4gh:SL.lpDGeQvnz80iis8xSxoCX_Pulnu7wx4M",
"sequenceReference": {
"refgetAccession": "SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt",
"type": "SequenceReference"
},
"start": [None, 108337303],
"type": "SequenceLocation"
},
"type": "CopyNumberChange"
}
),
(
"NC_000005.9:g.(90136803)_(90159675)dup",
None,
{
"copyChange": "efo:0030070",
"digest": "YcbXUe21Bt1wQDV7zGM0lacOupkxduFS",
"id": "ga4gh:CX.YcbXUe21Bt1wQDV7zGM0lacOupkxduFS",
"location": {
"digest": "r82CARuf8IxOidMdvQCUcsXNp3XiHEVH",
"end": 90159675,
"id": "ga4gh:SL.r82CARuf8IxOidMdvQCUcsXNp3XiHEVH",
"sequenceReference": {
"refgetAccession": "SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX",
"type": "SequenceReference"
},
"start": 90136802,
"type": "SequenceLocation"
},
"type": "CopyNumberChange"
}
),
(
"NC_000009.11:g.108337304_(108337428_?)del",
None,
{
"copyChange": "efo:0030067",
"digest": "brfJaiKCnSw-mvc3K9sUIEAyCN620PuD",
"id": "ga4gh:CX.brfJaiKCnSw-mvc3K9sUIEAyCN620PuD",
"location": {
"digest": "6myLdODZ8WgbEDXc3HLp88ZbG536NCM-",
"end": [108337428, None],
"id": "ga4gh:SL.6myLdODZ8WgbEDXc3HLp88ZbG536NCM-",
"sequenceReference": {
"refgetAccession": "SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt",
"type": "SequenceReference"
},
"start": 108337303,
"type": "SequenceLocation"
},
"type": "CopyNumberChange"
}
)
)


Expand All @@ -76,6 +160,15 @@ def test_from_hgvs_cx(tlr, hgvsexpr ,copy_change, expected):
cx = tlr._from_hgvs(hgvsexpr, copy_change=copy_change)
assert cx.model_dump(exclude_none=True) == expected

@pytest.mark.vcf
def test_from_hgvs_cx_invalid(tlr):
    """Check that ``_from_hgvs`` raises ``ValueError`` for a non g./m. expression."""
    # A coding (c.) expression is neither 'g' nor 'm', so it should fail
    unsupported_expr = "NM_001197320.1:c.281_283dup"
    expected_msg = "Only 'g' and 'm' reference sequences are supported"

    with pytest.raises(ValueError, match=expected_msg):
        tlr._from_hgvs(unsupported_expr)


from_hgvs_cn_tests = (
("NC_000013.11:g.26440969_26443305del", 1,
Expand Down
Loading