Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions openfold3/core/data/primitives/structure/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,29 @@ def get_asym_id_to_canonical_seq_dict(
}


def get_label_to_author_chain_id_dict(
cif_file: CIFFile | BinaryCIFFile,
) -> dict[str, str]:
"""Get a mapping from label asym_id to author (pdb_strand_id) chain ID.

Reads from ``pdbx_poly_seq_scheme`` so no atom array is needed.

Args:
cif_file:
Parsed mmCIF file containing the structure.

Returns:
A dictionary mapping label asym IDs to author chain IDs.
"""
block = cif_file.block
poly_scheme = block["pdbx_poly_seq_scheme"]
asym_ids = poly_scheme["asym_id"].as_array()
author_ids = poly_scheme["pdb_strand_id"].as_array()

_, idx = np.unique(asym_ids, return_index=True)
return dict(zip(asym_ids[idx].tolist(), author_ids[idx].tolist(), strict=True))


def get_entity_to_three_letter_codes_dict(cif_data: CIFBlock) -> dict[int, list[str]]:
"""Get a dictionary mapping entity IDs to their three-letter-code sequences.

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
101 1b27_B 1.0 110 0 0 1 110 1 110 1.022e-45 160 110M
101 6pqk_A 0.981 108 2 0 3 110 1 108 2.6399999999999998e-45 159 108M
101 1bse_C 0.99 108 1 0 3 110 1 108 4.9679999999999996e-45 158 108M
101 1b2s_C 0.99 110 1 0 1 110 1 110 6.815e-45 158 110M
101 1bsb_C 0.99 108 1 0 3 110 1 108 9.349999999999999e-45 158 108M
101 1x1y_B 0.99 110 1 0 1 110 1 110 9.349999999999999e-45 158 110M
101 2za4_A 0.99 108 1 0 3 110 1 108 1.76e-44 157 108M
101 1ban_B 0.99 108 1 0 3 110 1 108 3.3119999999999996e-44 156 108M
101 1b20_B 0.99 108 1 0 3 110 1 108 3.3119999999999996e-44 156 108M
101 1buj_A 0.842 108 17 0 3 110 2 109 3.3119999999999996e-44 156 108M
101 1bsd_C 0.99 107 1 0 4 110 1 107 4.544e-44 156 107M
101 1brg_C 0.99 108 1 0 3 110 1 108 4.544e-44 156 108M
101 1brk_C 0.99 107 1 0 4 110 1 107 6.233e-44 155 107M
101 1bsa_C 0.99 107 1 0 4 110 1 107 6.233e-44 155 107M
101 1b2z_C 0.99 107 1 0 4 110 1 107 8.552e-44 155 107M
101 1b21_B 0.981 108 2 0 3 110 1 108 8.552e-44 155 108M
101 1rnb_A 0.981 109 2 0 2 110 1 109 8.552e-44 155 109M
101 1bsc_B 0.99 107 1 0 4 110 1 107 1.173e-43 154 107M
101 1bao_C 0.99 107 1 0 4 110 1 107 1.173e-43 154 107M
101 1brj_C 0.99 108 1 0 3 110 1 108 1.173e-43 154 108M
101 1brh_C 0.99 108 1 0 3 110 1 108 1.173e-43 154 108M
101 4haa_D 0.833 108 18 0 3 110 2 109 1.173e-43 154 108M
101 1bri_C 0.99 107 1 0 4 110 1 107 1.61e-43 154 107M
101 1bns_C 0.99 107 1 0 4 110 1 107 2.208e-43 154 107M
101 1bnf_A 0.981 108 2 0 3 110 1 108 4.156e-43 153 108M
101 2rbi_B 0.833 108 18 0 3 110 1 108 5.701e-43 153 108M
101 2kf3_A 0.99 108 1 0 3 110 1 108 5.701e-43 153 108M
101 2c4b_B 0.99 108 1 0 3 110 1 108 5.701e-43 153 108M
101 1bng_C 0.981 107 2 0 4 110 1 107 5.2150000000000006e-42 150 107M
101 3q3f_A 0.936 110 7 0 1 110 1 110 5.2150000000000006e-42 150 110M
101 1goy_B 0.796 108 22 0 3 110 1 108 7.155e-42 149 108M
101 3da7_A 1.0 66 0 0 1 66 44 109 4.0910000000000004e-25 101 66M
101 3da7_E 1.0 60 0 0 7 66 44 103 2.286e-22 93 60M
101 3da7_B 1.0 57 0 0 8 64 44 100 1.525e-21 91 57M
101 3da7_G 0.892 65 7 0 2 66 38 102 1.525e-21 91 65M
101 3d5g_C 0.31 58 39 1 53 109 35 92 5.437e-14 69 30M1D27M
101 1mgr_A 0.322 59 38 2 53 109 35 93 7.458e-14 69 30M1D11M1D16M
101 3dgy_C 0.35 57 36 1 53 109 32 87 1.023e-13 68 24M1I32M
101 3dgy_A 0.35 57 36 1 53 109 33 88 1.023e-13 68 24M1I32M
101 3d5i_C 0.315 57 37 1 53 109 32 86 4.964e-13 66 27M2I28M
101 3d4a_C 0.315 57 36 1 53 109 32 85 6.808e-13 66 27M3I27M
101 4j5g_B 0.338 59 37 2 53 109 34 92 9.337e-13 66 30M1D9M1D18M
101 1ay7_A 0.338 59 37 2 53 109 34 92 9.337e-13 66 30M1D9M1D18M
101 1ynv_X 0.338 59 37 2 53 109 34 92 9.337e-13 66 30M1D9M1D18M
101 1c54_A 0.338 59 37 2 53 109 34 92 9.337e-13 66 30M1D9M1D18M
101 1uci_B 0.338 59 37 2 53 109 34 92 9.337e-13 66 30M1D9M1D18M
101 4gho_B 0.338 59 37 2 53 109 34 92 9.337e-13 66 30M1D9M1D18M
101 4j5g_A 0.338 59 37 2 53 109 34 92 9.337e-13 66 30M1D9M1D18M
101 1rsn_A 0.338 59 37 2 53 109 34 92 1.281e-12 65 30M1D9M1D18M
101 1ucj_B 0.338 59 37 2 53 109 34 92 1.281e-12 65 30M1D9M1D18M
101 1t2i_A 0.338 59 37 2 53 109 34 92 1.756e-12 65 30M1D11M1D16M
101 1uck_B 0.338 59 37 2 53 109 34 92 1.756e-12 65 30M1D9M1D18M
101 4j5k_A 0.338 59 37 2 53 109 34 92 3.303e-12 64 30M1D9M1D18M
101 4j5k_B 0.338 59 37 2 53 109 34 92 3.303e-12 64 30M1D9M1D18M
101 1i8v_B 0.322 59 38 2 53 109 34 92 4.529e-12 64 30M1D9M1D18M
101 3a5e_A 0.322 59 38 2 53 109 34 92 4.529e-12 64 30M1D9M1D18M
101 1t2h_B 0.338 59 37 2 53 109 34 92 6.211e-12 63 30M1D9M1D18M
101 1box_A 0.322 59 38 2 53 109 33 91 8.517e-12 63 30M1D9M1D18M
101 1i70_B 0.322 59 38 2 53 109 34 92 8.517e-12 63 30M1D9M1D18M
101 1ucl_B 0.355 59 36 2 53 109 34 92 8.517e-12 63 22M1D17M1D18M
101 1zgx_A 0.321 28 19 0 53 80 34 61 0.002519 38 28M
101 1zgx_B 0.392 28 16 1 83 109 2 29 0.01637 36 9M1D18M
83 changes: 83 additions & 0 deletions openfold3/tests/core/data/pipelines/preprocessing/test_template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from openfold3.core.data.io.sequence.template import (
A3mParser,
parse_template_alignment,
)
import pytest

Check failure on line 5 in openfold3/tests/core/data/pipelines/preprocessing/test_template.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (F401)

openfold3/tests/core/data/pipelines/preprocessing/test_template.py:5:8: F401 `pytest` imported but unused

Check failure on line 5 in openfold3/tests/core/data/pipelines/preprocessing/test_template.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (F401)

openfold3/tests/core/data/pipelines/preprocessing/test_template.py:5:8: F401 `pytest` imported but unused
from openfold3.core.data.io.structure.cif import _load_ciffile

from biotite.database.rcsb import fetch

from openfold3.core.data.primitives.structure.metadata import (
get_chain_to_canonical_seq_dict,

Check failure on line 11 in openfold3/tests/core/data/pipelines/preprocessing/test_template.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (F401)

openfold3/tests/core/data/pipelines/preprocessing/test_template.py:11:5: F401 `openfold3.core.data.primitives.structure.metadata.get_chain_to_canonical_seq_dict` imported but unused

Check failure on line 11 in openfold3/tests/core/data/pipelines/preprocessing/test_template.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (F401)

openfold3/tests/core/data/pipelines/preprocessing/test_template.py:11:5: F401 `openfold3.core.data.primitives.structure.metadata.get_chain_to_canonical_seq_dict` imported but unused
get_cif_block,

Check failure on line 12 in openfold3/tests/core/data/pipelines/preprocessing/test_template.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (F401)

openfold3/tests/core/data/pipelines/preprocessing/test_template.py:12:5: F401 `openfold3.core.data.primitives.structure.metadata.get_cif_block` imported but unused

Check failure on line 12 in openfold3/tests/core/data/pipelines/preprocessing/test_template.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (F401)

openfold3/tests/core/data/pipelines/preprocessing/test_template.py:12:5: F401 `openfold3.core.data.primitives.structure.metadata.get_cif_block` imported but unused

)

from openfold3.core.data.io.sequence.template import (
A3mParser

Check failure on line 17 in openfold3/tests/core/data/pipelines/preprocessing/test_template.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (F811)

openfold3/tests/core/data/pipelines/preprocessing/test_template.py:17:1: F811 Redefinition of unused `A3mParser` from line 2: `A3mParser` redefined here

Check failure on line 17 in openfold3/tests/core/data/pipelines/preprocessing/test_template.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (F811)

openfold3/tests/core/data/pipelines/preprocessing/test_template.py:17:1: F811 Redefinition of unused `A3mParser` from line 2: `A3mParser` redefined here
)
from openfold3.core.data.primitives.structure.metadata import (
get_asym_id_to_canonical_seq_dict,
get_label_to_author_chain_id_dict,
)


from pathlib import Path

Check failure on line 25 in openfold3/tests/core/data/pipelines/preprocessing/test_template.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (I001)

openfold3/tests/core/data/pipelines/preprocessing/test_template.py:1:1: I001 Import block is un-sorted or un-formatted

Check failure on line 25 in openfold3/tests/core/data/pipelines/preprocessing/test_template.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (I001)

openfold3/tests/core/data/pipelines/preprocessing/test_template.py:1:1: I001 Import block is un-sorted or un-formatted

class TestTemplatePreprocessor():

Check failure on line 27 in openfold3/tests/core/data/pipelines/preprocessing/test_template.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (UP039)

openfold3/tests/core/data/pipelines/preprocessing/test_template.py:27:31: UP039 Unnecessary parentheses after class definition

Check failure on line 27 in openfold3/tests/core/data/pipelines/preprocessing/test_template.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (UP039)

openfold3/tests/core/data/pipelines/preprocessing/test_template.py:27:31: UP039 Unnecessary parentheses after class definition

def test_template_has_author_chain_id(self, tmp_path):
"""
https://github.com/aqlaboratory/openfold-3/issues/101

"""

alignment_file = Path(__file__).parent / "colabfold_template.m8"
query_seq_str = "AQVINTFDGVADYLQTYHKLPDNYITKSEAQALGWVASKGNLADVAPGKSIGGDIFSNREGKLPGKSGRTWREADINYTSGFRNSDRILYSSDWLIYKTTDHYQTFTKIR"
templates = parse_template_alignment(
aln_path=Path(alignment_file),
query_seq_str=query_seq_str,
max_sequences=200

)

# find the offending "1rnb_A"
template = templates[16]
assert template.chain_id == "A" and template.entry_id == "1rnb"

fetch(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we mock this call instead of explicitly calling the RCSB database using fetch?

As this is a unit test, it would be good to remove dependencies on web servers so that we don't have latency issues / failures due to the availability of the service.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Switched to just a cif file as fixture

pdb_ids=template.entry_id,
format="cif",
target_path=tmp_path,
)


template_structure_file = tmp_path / f"{template.entry_id}.cif"

cif_file = _load_ciffile(template_structure_file)

chain_id_seq_map = get_asym_id_to_canonical_seq_dict(cif_file)

# template.chain_id is an author chain ID; map it to label asym_id
label_to_author = get_label_to_author_chain_id_dict(cif_file)
author_to_label = {v: k for k, v in label_to_author.items()}
label_chain_id = author_to_label[template.chain_id]
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the re-mapping from "auth" chain IDs to "label" chain IDs... very wishful in assuming the inputs are not pathological.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So if there are multiple label IDs mapping to the same author ID, this will just always map an author ID to the last label ID. @ljarosch can confirm, but I think this only happens with homomeric chains, so it should be fine. We should just document this behavior.

A way to make this more robust would be to explicitly sort the label_to_author dict when iterating over it, so maybe add that here so we are not relying on the dict ordering for this mapping.


template_sequence = chain_id_seq_map.get(label_chain_id)

parser = A3mParser(max_sequences=None)
parsed = parser(
(
f">query_X/1-{len(query_seq_str)}\n"
f"{query_seq_str}\n"
f">{template.entry_id}_{label_chain_id}/{1}-{len(template_sequence)}\n"
f"{template_sequence}\n"
),
query_seq_str,
realign=True,
)

assert len(parsed) == 2
assert parsed[0].seq_id == 1
assert parsed[1].seq_id < 1

Loading