diff --git a/src/dphon/align.py b/src/dphon/align.py index 9fd67b2..ab4f94e 100644 --- a/src/dphon/align.py +++ b/src/dphon/align.py @@ -65,8 +65,8 @@ def __call__(self, match: Match) -> Match: # ----> <---- u, v = match.utxt.doc, match.vtxt.doc us, vs = match.utxt.start + len(lu), match.vtxt.start + len(lv) - utxt = u[us : us + len(cu)] - vtxt = v[vs : vs + len(cv)] + utxt = u[us : us + sum(1 for c in cu if c != "-")] + vtxt = v[vs : vs + sum(1 for c in cv if c != "-")] # use the gaps in the alignment to construct a new sequence of token # texts, inserting gap_char wherever the aligner created a gap @@ -88,9 +88,9 @@ def __call__(self, match: Match) -> Match: # trim back the sequence boundaries further to remove any non-alphanum. # tokens from the start and end of both alignment and orig. sequence - while not au[-1].isalnum() or not av[-1].isalnum(): + while (not au[-1].isalnum() and au[-1] != self.gap_char) or (not av[-1].isalnum() and av[-1] != self.gap_char): utxt, vtxt, au, av = utxt[:-1], vtxt[:-1], au[:-1], av[:-1] - while not au[0].isalnum() or not av[0].isalnum(): + while (not au[0].isalnum() and au[0] != self.gap_char) or (not av[0].isalnum() and av[0] != self.gap_char): utxt, vtxt, au, av = utxt[1:], vtxt[1:], au[1:], av[1:] # normalize score to length; 1.0 is perfect diff --git a/src/dphon/console.py b/src/dphon/console.py index d178887..964ee13 100644 --- a/src/dphon/console.py +++ b/src/dphon/console.py @@ -96,7 +96,7 @@ def _mark_span( marked_span: List[str] = [] span_ptr = 0 other_ptr = 0 - for i in range(len(span)): + for i in range(len(alignment)): # gap in u: insertion in v (if not punctuation) if alignment[i] == self.gap_char and other_alignment[i].isalnum(): other_ptr += 1 @@ -108,6 +108,13 @@ def _mark_span( span_ptr += 1 continue + # if either pointer is out of bounds, just append the character + if span_ptr >= len(span) or other_ptr >= len(other): + marked_span.append(alignment[i]) + span_ptr += 1 + other_ptr += 1 + continue + # variants (both u and v) if self.g2p.are_graphic_variants(span[span_ptr], other[other_ptr]): marked_span.append(f"[variant]{alignment[i]}[/variant]") diff --git a/src/dphon/g2p.py b/src/dphon/g2p.py index 07ea2ac..d914666 100644 --- a/src/dphon/g2p.py +++ b/src/dphon/g2p.py @@ -5,7 +5,7 @@ from importlib.resources.abc import Traversable import json import logging -from typing import Iterable, Iterator, Mapping, Optional, Tuple, List +from typing import Iterable, Iterator, Optional, Tuple, List from spacy.language import Language from spacy.lookups import Table @@ -18,7 +18,7 @@ # types for sound tables: map a string to a tuple of syllable phonemes Phonemes_T = Tuple[Optional[str], ...] -SoundTable_T = Mapping[str, Phonemes_T] +SoundTable_T = dict[str, Phonemes_T] class GraphemesToPhonemes: @@ -45,17 +45,13 @@ def __init__(self, nlp: Language, sound_table: SoundTable_T): syllable_parts = len(next(iter(sound_table.values()))) self.empty_phonemes = tuple(None for _ in range(syllable_parts)) - # register extensions on spaCy primitives - if not Doc.has_extension("phonemes"): - Doc.set_extension("phonemes", getter=self.get_all_phonemes) - if not Span.has_extension("phonemes"): - Span.set_extension("phonemes", getter=self.get_all_phonemes) - if not Span.has_extension("syllables"): - Span.set_extension("syllables", getter=self._get_syllables) - if not Token.has_extension("phonemes"): - Token.set_extension("phonemes", getter=self.get_token_phonemes) - if not Token.has_extension("is_oov"): - Token.set_extension("is_oov", getter=self.is_token_oov) + # register extensions on spaCy primitives; use force=true because + # registration is global and we want to overwrite any old state + Doc.set_extension("phonemes", getter=self.get_all_phonemes, force=True) + Span.set_extension("phonemes", getter=self.get_all_phonemes, force=True) + Span.set_extension("syllables", getter=self._get_syllables, force=True) + Token.set_extension("phonemes", getter=self.get_token_phonemes, force=True) + Token.set_extension("is_oov", getter=self.is_token_oov, force=True) # store the sound table in the vocab's Lookups self.table = nlp.vocab.lookups.add_table("phonemes", sound_table) diff --git a/tests/unit/test_align.py b/tests/unit/test_align.py index ec7c382..a64bdbf 100644 --- a/tests/unit/test_align.py +++ b/tests/unit/test_align.py @@ -2,10 +2,11 @@ """Aligner unit tests.""" from unittest import TestCase - +from pathlib import Path import spacy +from dphon.g2p import get_sound_table_json from dphon.match import Match -from dphon.align import SmithWatermanAligner +from dphon.align import SmithWatermanAligner, SmithWatermanPhoneticAligner from lingpy.align.pairwise import _get_scorer @@ -130,3 +131,43 @@ def test_scorer(self) -> None: self.assertEqual(aligned.au, list("CABACABACABA")) self.assertEqual(aligned.av, list("CBBBCBBBCBBB")) self.assertEqual(aligned.weight, 1.0) + +class TestSmithWatermanPhoneticAligner(TestCase): + """Test the SmithWatermanPhoneticAligner.""" + + maxDiff = None # don't limit length of diff output for failures + + def setUp(self) -> None: + """Create a spaCy pipeline and aligner for use in testing.""" + + # chinese pipeline with sound table g2p + self.nlp = spacy.blank( + "zh", meta={"tokenizer": {"config": {"use_jieba": False}}} + ) + self.project_dir = Path(__file__).parent.parent.parent.parent + sound_table = get_sound_table_json(self.project_dir / "dphon" / "src" / "dphon" / "data" / "sound_table_v2.json") + self.nlp.add_pipe("g2p", config={"sound_table": sound_table}) + + # aligner with default phonetic scoring matrix + self.align = SmithWatermanPhoneticAligner() + + def test_spacing(self) -> None: + """Matches with gaps should maintain correct transcription alignment. + + Text sources: + - CHANT 尚書-0013 + - CHANT 尚書-0059-0023""" + + # create a mock match and align it + u = self.nlp.make_doc("罔有天災山川鬼神亦莫不寧暨鳥獸魚鱉咸若于") + v = self.nlp.make_doc("胡敢異心山川鬼神亦莫敢不寧若能共允住天下") + match = Match("shangshu13", "shangshu59", u[:], v[:]) + aligned = self.align(match) + + # first string gets a gap for the insertion of "敢" (kˤamʔ) + self.assertEqual(aligned.au, list("山川鬼神亦莫-不寧")) + self.assertEqual(aligned.u_transcription, "*s-ŋrar t-◦lun k-ʔujʔ Cə-lin ɢrAk mˤak pə nˤeŋ") + + # second string has the inserted character + self.assertEqual(aligned.av, list("山川鬼神亦莫敢不寧")) + self.assertEqual(aligned.v_transcription, "*s-ŋrar t-◦lun k-ʔujʔ Cə-lin ɢrAk mˤak kˤamʔ pə nˤeŋ")