@staticmethod
def _get_vrs_loc_start_end_val(
    pos: Union[SimplePosition, AAPosition, BaseOffsetInterval, Interval],
    is_start: bool = True
) -> Optional[Union[int, models.Range]]:
    """Get VRS Location start or end value

    When ``is_start`` is ``True`` every known position value is decremented by 1;
    when it is ``False`` the values are used unchanged. Unknown position values
    (``None``) are always passed through untouched.

    :param pos: biocommons hgvs location instance for position
    :param is_start: ``True`` if ``pos`` represents VRS Sequence Location start.
        ``False`` if ``pos`` represents VRS Sequence Location end.
    :raise ValueError: If unsupported biocommons hgvs location is passed
    :return: VRS Location start or end value using inter-residue positions. A
        single value is returned for a single position, or for an interval whose
        two adjusted bounds are equal; otherwise a ``models.Range`` holding the
        adjusted ``[start, end]`` bounds of the interval.
    """
    def _get_pos_value(
        position: Optional[int],
        do_subtract_1: bool
    ) -> Optional[int]:
        """Get position value

        :param position: Position, or ``None`` when the position is unknown
        :param do_subtract_1: Whether or not we need to subtract 1 for ``position``
        :return: Adjusted position value. ``None`` is returned as is, since there
            is nothing to subtract from.
        """
        if position is None:
            return None
        return position - 1 if do_subtract_1 else position

    # Intervals are tested first on purpose.
    # NOTE(review): in biocommons hgvs, BaseOffsetInterval is presumably an
    # Interval subclass exposing ``start``/``end`` rather than ``base`` -- confirm.
    # Testing Interval first keeps such instances out of the ``pos.base`` branch.
    if isinstance(pos, Interval):
        start_val = _get_pos_value(pos.start.base, is_start)
        end_val = _get_pos_value(pos.end.base, is_start)
        # An interval whose bounds coincide collapses to one definite value
        if start_val == end_val:
            return start_val
        return models.Range([start_val, end_val])

    if isinstance(pos, (SimplePosition, AAPosition, BaseOffsetInterval)):
        return _get_pos_value(pos.base, is_start)

    err_msg = f"HGVS Location is not supported: {type(pos)}"
    raise ValueError(err_msg)
+ headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/metadata/refseq:NC_000005.9 + response: + body: + string: "{\n \"added\": \"2016-08-24T05:17:01Z\",\n \"aliases\": [\n \"GRCh37:5\",\n + \ \"GRCh37:chr5\",\n \"GRCh37.p10:5\",\n \"GRCh37.p10:chr5\",\n \"GRCh37.p11:5\",\n + \ \"GRCh37.p11:chr5\",\n \"GRCh37.p12:5\",\n \"GRCh37.p12:chr5\",\n + \ \"GRCh37.p13:5\",\n \"GRCh37.p13:chr5\",\n \"GRCh37.p2:5\",\n \"GRCh37.p2:chr5\",\n + \ \"GRCh37.p5:5\",\n \"GRCh37.p5:chr5\",\n \"GRCh37.p9:5\",\n \"GRCh37.p9:chr5\",\n + \ \"MD5:0740173db9ffd264d728f32784845cd7\",\n \"NCBI:NC_000005.9\",\n + \ \"refseq:NC_000005.9\",\n \"SEGUID:Ja+pA+dtRy6jSKdOZXN58wY0rK4\",\n + \ \"SHA1:25afa903e76d472ea348a74e657379f30634acae\",\n \"VMC:GS_vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX\",\n + \ \"sha512t24u:vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX\",\n \"ga4gh:SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX\",\n + \ \"hs37-1kg:5\",\n \"hs37d5:5\"\n ],\n \"alphabet\": \"ACGNT\",\n + \ \"length\": 180915260\n}\n" + headers: + Connection: + - close + Content-Length: + - '803' + Content-Type: + - application/json + Date: + - Fri, 05 Apr 2024 19:19:57 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/extras/cassettes/test_from_hgvs_cx[NC_000009.11:g.(?_108337304)_(108337428_?)del-None-expected5].yaml b/tests/extras/cassettes/test_from_hgvs_cx[NC_000009.11:g.(?_108337304)_(108337428_?)del-None-expected5].yaml new file mode 100644 index 00000000..46de454d --- /dev/null +++ b/tests/extras/cassettes/test_from_hgvs_cx[NC_000009.11:g.(?_108337304)_(108337428_?)del-None-expected5].yaml @@ -0,0 +1,42 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: 
http://localhost:5000/seqrepo/1/metadata/refseq:NC_000009.11 + response: + body: + string: "{\n \"added\": \"2016-08-24T05:18:51Z\",\n \"aliases\": [\n \"GRCh37:9\",\n + \ \"GRCh37:chr9\",\n \"GRCh37.p10:9\",\n \"GRCh37.p10:chr9\",\n \"GRCh37.p11:9\",\n + \ \"GRCh37.p11:chr9\",\n \"GRCh37.p12:9\",\n \"GRCh37.p12:chr9\",\n + \ \"GRCh37.p13:9\",\n \"GRCh37.p13:chr9\",\n \"GRCh37.p2:9\",\n \"GRCh37.p2:chr9\",\n + \ \"GRCh37.p5:9\",\n \"GRCh37.p5:chr9\",\n \"GRCh37.p9:9\",\n \"GRCh37.p9:chr9\",\n + \ \"MD5:3e273117f15e0a400f01055d9f393768\",\n \"NCBI:NC_000009.11\",\n + \ \"refseq:NC_000009.11\",\n \"SEGUID:06BhLSlH1xeVNSYu0zRK9qIVmFg\",\n + \ \"SHA1:d3a0612d2947d7179535262ed3344af6a2159858\",\n \"VMC:GS_HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt\",\n + \ \"sha512t24u:HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt\",\n \"ga4gh:SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt\",\n + \ \"hs37-1kg:9\",\n \"hs37d5:9\"\n ],\n \"alphabet\": \"ACGNT\",\n + \ \"length\": 141213431\n}\n" + headers: + Connection: + - close + Content-Length: + - '805' + Content-Type: + - application/json + Date: + - Fri, 05 Apr 2024 19:12:51 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/extras/cassettes/test_from_hgvs_cx[NC_000013.11:g.26440969_26443305del-CopyChange.EFO_0030069-expected0].yaml b/tests/extras/cassettes/test_from_hgvs_cx[NC_000013.11:g.26440969_26443305del-CopyChange.EFO_0030069-expected0].yaml index 27462278..e8810be2 100644 --- a/tests/extras/cassettes/test_from_hgvs_cx[NC_000013.11:g.26440969_26443305del-CopyChange.EFO_0030069-expected0].yaml +++ b/tests/extras/cassettes/test_from_hgvs_cx[NC_000013.11:g.26440969_26443305del-CopyChange.EFO_0030069-expected0].yaml @@ -35,7 +35,7 @@ interactions: Content-Type: - application/json Date: - - Tue, 26 Mar 2024 16:28:06 GMT + - Fri, 05 Apr 2024 19:12:51 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: diff --git 
a/tests/extras/cassettes/test_from_hgvs_cx[NC_000023.11:g.(31060227_31100351)_(33274278_33417151)dup-None-expected4].yaml b/tests/extras/cassettes/test_from_hgvs_cx[NC_000023.11:g.(31060227_31100351)_(33274278_33417151)dup-None-expected4].yaml new file mode 100644 index 00000000..473c0656 --- /dev/null +++ b/tests/extras/cassettes/test_from_hgvs_cx[NC_000023.11:g.(31060227_31100351)_(33274278_33417151)dup-None-expected4].yaml @@ -0,0 +1,43 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/metadata/refseq:NC_000023.11 + response: + body: + string: "{\n \"added\": \"2016-08-27T23:57:18Z\",\n \"aliases\": [\n \"GRCh38:X\",\n + \ \"GRCh38:chrX\",\n \"GRCh38.p1:X\",\n \"GRCh38.p1:chrX\",\n \"GRCh38.p10:X\",\n + \ \"GRCh38.p10:chrX\",\n \"GRCh38.p11:X\",\n \"GRCh38.p11:chrX\",\n + \ \"GRCh38.p12:X\",\n \"GRCh38.p12:chrX\",\n \"GRCh38.p2:X\",\n \"GRCh38.p2:chrX\",\n + \ \"GRCh38.p3:X\",\n \"GRCh38.p3:chrX\",\n \"GRCh38.p4:X\",\n \"GRCh38.p4:chrX\",\n + \ \"GRCh38.p5:X\",\n \"GRCh38.p5:chrX\",\n \"GRCh38.p6:X\",\n \"GRCh38.p6:chrX\",\n + \ \"GRCh38.p7:X\",\n \"GRCh38.p7:chrX\",\n \"GRCh38.p8:X\",\n \"GRCh38.p8:chrX\",\n + \ \"GRCh38.p9:X\",\n \"GRCh38.p9:chrX\",\n \"MD5:2b3a55ff7f58eb308420c8a9b11cac50\",\n + \ \"NCBI:NC_000023.11\",\n \"refseq:NC_000023.11\",\n \"SEGUID:Z9QbQrrPjpjXSMJesDYqC3A43lA\",\n + \ \"SHA1:67d41b42bacf8e98d748c25eb0362a0b7038de50\",\n \"VMC:GS_w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP\",\n + \ \"sha512t24u:w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP\",\n \"ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP\"\n + \ ],\n \"alphabet\": \"ACGNRSTWY\",\n \"length\": 156040895\n}\n" + headers: + Connection: + - close + Content-Length: + - '978' + Content-Type: + - application/json + Date: + - Fri, 05 Apr 2024 19:12:51 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK 
+version: 1 diff --git a/tests/extras/test_cnv_translator.py b/tests/extras/test_cnv_translator.py index edb553d7..b9e45012 100644 --- a/tests/extras/test_cnv_translator.py +++ b/tests/extras/test_cnv_translator.py @@ -65,7 +65,91 @@ def tlr(rest_dataproxy): 'start': 32344742, 'type': 'SequenceLocation'}, 'type': 'CopyNumberChange'} - ) + ), + ( + "NC_000023.11:g.(31060227_31100351)_(33274278_33417151)dup", + None, + { + "copyChange": "efo:0030070", + "digest": "H0-_q06in6rsvLfq_5b-CSmP4ZQ6r7-Q", + "id": "ga4gh:CX.H0-_q06in6rsvLfq_5b-CSmP4ZQ6r7-Q", + "location": { + "digest": "R3FeXqOiAu8Vms7QngINQwIxW904fdWY", + "end": [33274278, 33417151], + "id": "ga4gh:SL.R3FeXqOiAu8Vms7QngINQwIxW904fdWY", + "sequenceReference": { + "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "type": "SequenceReference" + }, + "start": [31060226, 31100350], + "type": "SequenceLocation" + }, + "type": "CopyNumberChange" + } + ), + ( + "NC_000009.11:g.(?_108337304)_(108337428_?)del", + None, + { + "copyChange": "efo:0030067", + "digest": "ANthOqEGX8MIn0kXuyQcn9bouYfbFgjH", + "id": "ga4gh:CX.ANthOqEGX8MIn0kXuyQcn9bouYfbFgjH", + "location": { + "digest": "lpDGeQvnz80iis8xSxoCX_Pulnu7wx4M", + "end": [108337428, None], + "id": "ga4gh:SL.lpDGeQvnz80iis8xSxoCX_Pulnu7wx4M", + "sequenceReference": { + "refgetAccession": "SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt", + "type": "SequenceReference" + }, + "start": [None, 108337303], + "type": "SequenceLocation" + }, + "type": "CopyNumberChange" + } + ), + ( + "NC_000005.9:g.(90136803)_(90159675)dup", + None, + { + "copyChange": "efo:0030070", + "digest": "YcbXUe21Bt1wQDV7zGM0lacOupkxduFS", + "id": "ga4gh:CX.YcbXUe21Bt1wQDV7zGM0lacOupkxduFS", + "location": { + "digest": "r82CARuf8IxOidMdvQCUcsXNp3XiHEVH", + "end": 90159675, + "id": "ga4gh:SL.r82CARuf8IxOidMdvQCUcsXNp3XiHEVH", + "sequenceReference": { + "refgetAccession": "SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX", + "type": "SequenceReference" + }, + "start": 90136802, + "type": "SequenceLocation" + }, + 
@pytest.mark.vcf
def test_from_hgvs_cx_invalid(tlr):
    """Check that ``_from_hgvs`` rejects invalid copy number change input"""
    # The expression below is a c. expression, i.e. neither g. nor m.,
    # so translation is expected to fail with a ValueError.
    unsupported_expr = "NM_001197320.1:c.281_283dup"
    expected_msg = "Only 'g' and 'm' reference sequences are supported"

    with pytest.raises(ValueError, match=expected_msg):
        tlr._from_hgvs(unsupported_expr)