direct-phonology · thatbudakguy · May 30, 2026 · May 12, 2026 · May 30, 2026 · May 30, 2026
diff --git a/src/dphon/align.py b/src/dphon/align.py
@@ -65,8 +65,8 @@ def __call__(self, match: Match) -> Match:
         # ---->                <----
         u, v = match.utxt.doc, match.vtxt.doc
         us, vs = match.utxt.start + len(lu), match.vtxt.start + len(lv)
-        utxt = u[us : us + len(cu)]
-        vtxt = v[vs : vs + len(cv)]
+        utxt = u[us : us + sum(1 for c in cu if c != "-")]
+        vtxt = v[vs : vs + sum(1 for c in cv if c != "-")]
 
         # use the gaps in the alignment to construct a new sequence of token
         # texts, inserting gap_char wherever the aligner created a gap
@@ -88,9 +88,9 @@ def __call__(self, match: Match) -> Match:
 
         # trim back the sequence boundaries further to remove any non-alphanum.
         # tokens from the start and end of both alignment and orig. sequence
-        while not au[-1].isalnum() or not av[-1].isalnum():
+        while (not au[-1].isalnum() and au[-1] != self.gap_char) or (not av[-1].isalnum() and av[-1] != self.gap_char):
             utxt, vtxt, au, av = utxt[:-1], vtxt[:-1], au[:-1], av[:-1]
-        while not au[0].isalnum() or not av[0].isalnum():
+        while (not au[0].isalnum() and au[0] != self.gap_char) or (not av[0].isalnum() and av[0] != self.gap_char):
             utxt, vtxt, au, av = utxt[1:], vtxt[1:], au[1:], av[1:]
 
         # normalize score to length; 1.0 is perfect

diff --git a/src/dphon/console.py b/src/dphon/console.py
@@ -96,7 +96,7 @@ def _mark_span(
         marked_span: List[str] = []
         span_ptr = 0
         other_ptr = 0
-        for i in range(len(span)):
+        for i in range(len(alignment)):
             # gap in u: insertion in v (if not punctuation)
             if alignment[i] == self.gap_char and other_alignment[i].isalnum():
                 other_ptr += 1
@@ -108,6 +108,13 @@ def _mark_span(
                 span_ptr += 1
                 continue
 
+            # if either pointer is out of bounds, just append the character
+            if span_ptr >= len(span) or other_ptr >= len(other):
+                marked_span.append(alignment[i])
+                span_ptr += 1
+                other_ptr += 1
+                continue
+
             # variants (both u and v)
             if self.g2p.are_graphic_variants(span[span_ptr], other[other_ptr]):
                 marked_span.append(f"[variant]{alignment[i]}[/variant]")

diff --git a/src/dphon/g2p.py b/src/dphon/g2p.py
@@ -5,7 +5,7 @@
 from importlib.resources.abc import Traversable
 import json
 import logging
-from typing import Iterable, Iterator, Mapping, Optional, Tuple, List
+from typing import Iterable, Iterator, Optional, Tuple, List
 
 from spacy.language import Language
 from spacy.lookups import Table
@@ -18,7 +18,7 @@
 
 # types for sound tables: map a string to a tuple of syllable phonemes
 Phonemes_T = Tuple[Optional[str], ...]
-SoundTable_T = Mapping[str, Phonemes_T]
+SoundTable_T = dict[str, Phonemes_T]
 
 
 class GraphemesToPhonemes:
@@ -45,17 +45,13 @@ def __init__(self, nlp: Language, sound_table: SoundTable_T):
         syllable_parts = len(next(iter(sound_table.values())))
         self.empty_phonemes = tuple(None for _ in range(syllable_parts))
 
-        # register extensions on spaCy primitives
-        if not Doc.has_extension("phonemes"):
-            Doc.set_extension("phonemes", getter=self.get_all_phonemes)
-        if not Span.has_extension("phonemes"):
-            Span.set_extension("phonemes", getter=self.get_all_phonemes)
-        if not Span.has_extension("syllables"):
-            Span.set_extension("syllables", getter=self._get_syllables)
-        if not Token.has_extension("phonemes"):
-            Token.set_extension("phonemes", getter=self.get_token_phonemes)
-        if not Token.has_extension("is_oov"):
-            Token.set_extension("is_oov", getter=self.is_token_oov)
+        # register extensions on spaCy primitives; use force=True because
+        # registration is global and we want to overwrite any old state
+        Doc.set_extension("phonemes", getter=self.get_all_phonemes, force=True)
+        Span.set_extension("phonemes", getter=self.get_all_phonemes, force=True)
+        Span.set_extension("syllables", getter=self._get_syllables, force=True)
+        Token.set_extension("phonemes", getter=self.get_token_phonemes, force=True)
+        Token.set_extension("is_oov", getter=self.is_token_oov, force=True)
 
         # store the sound table in the vocab's Lookups
         self.table = nlp.vocab.lookups.add_table("phonemes", sound_table)

diff --git a/tests/unit/test_align.py b/tests/unit/test_align.py
@@ -2,10 +2,11 @@
 """Aligner unit tests."""
 
 from unittest import TestCase
-
+from pathlib import Path
 import spacy
+from dphon.g2p import get_sound_table_json
 from dphon.match import Match
-from dphon.align import SmithWatermanAligner
+from dphon.align import SmithWatermanAligner, SmithWatermanPhoneticAligner
 from lingpy.align.pairwise import _get_scorer
 
 
@@ -130,3 +131,43 @@ def test_scorer(self) -> None:
         self.assertEqual(aligned.au, list("CABACABACABA"))
         self.assertEqual(aligned.av, list("CBBBCBBBCBBB"))
         self.assertEqual(aligned.weight, 1.0)
+
+class TestSmithWatermanPhoneticAligner(TestCase):
+    """Test the SmithWatermanPhoneticAligner."""
+
+    maxDiff = None  # don't limit length of diff output for failures
+
+    def setUp(self) -> None:
+        """Create a spaCy pipeline and aligner for use in testing."""
+        # chinese pipeline with sound table g2p
+        self.nlp = spacy.blank(
+            "zh", meta={"tokenizer": {"config": {"use_jieba": False}}}
+        )
+        # locate the repo root from this file; don't assume the checkout dir name
+        self.project_dir = Path(__file__).resolve().parents[2]
+        sound_table = get_sound_table_json(self.project_dir / "src" / "dphon" / "data" / "sound_table_v2.json")
+        self.nlp.add_pipe("g2p", config={"sound_table": sound_table})
+
+        # aligner with default phonetic scoring matrix
+        self.align = SmithWatermanPhoneticAligner()
+
+    def test_spacing(self) -> None:
+        """Matches with gaps should maintain correct transcription alignment.
+
+        Text sources:
+        - CHANT 尚書-0013
+        - CHANT 尚書-0059-0023"""
+
+        # create a mock match and align it
+        u = self.nlp.make_doc("罔有天災山川鬼神亦莫不寧暨鳥獸魚鱉咸若于")
+        v = self.nlp.make_doc("胡敢異心山川鬼神亦莫敢不寧若能共允住天下")
+        match = Match("shangshu13", "shangshu59", u[:], v[:])
+        aligned = self.align(match)
+
+        # first string gets a gap for the insertion of "敢" (kˤamʔ)
+        self.assertEqual(aligned.au, list("山川鬼神亦莫-不寧"))
+        self.assertEqual(aligned.u_transcription, "*s-ŋrar t-◦lun k-ʔujʔ Cə-lin ɢrAk mˤak pə nˤeŋ")
+
+        # second string has the inserted character
+        self.assertEqual(aligned.av, list("山川鬼神亦莫敢不寧"))
+        self.assertEqual(aligned.v_transcription, "*s-ŋrar t-◦lun k-ʔujʔ Cə-lin ɢrAk mˤak kˤamʔ pə nˤeŋ")