Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions src/dphon/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ def __call__(self, match: Match) -> Match:
# ----> <----
u, v = match.utxt.doc, match.vtxt.doc
us, vs = match.utxt.start + len(lu), match.vtxt.start + len(lv)
utxt = u[us : us + len(cu)]
vtxt = v[vs : vs + len(cv)]
utxt = u[us : us + sum(1 for c in cu if c != "-")]
vtxt = v[vs : vs + sum(1 for c in cv if c != "-")]

# use the gaps in the alignment to construct a new sequence of token
# texts, inserting gap_char wherever the aligner created a gap
Expand All @@ -88,9 +88,9 @@ def __call__(self, match: Match) -> Match:

# trim back the sequence boundaries further to remove any non-alphanum.
# tokens from the start and end of both alignment and orig. sequence
while not au[-1].isalnum() or not av[-1].isalnum():
while (not au[-1].isalnum() and au[-1] != self.gap_char) or (not av[-1].isalnum() and av[-1] != self.gap_char):
utxt, vtxt, au, av = utxt[:-1], vtxt[:-1], au[:-1], av[:-1]
while not au[0].isalnum() or not av[0].isalnum():
while (not au[0].isalnum() and au[0] != self.gap_char) or (not av[0].isalnum() and av[0] != self.gap_char):
utxt, vtxt, au, av = utxt[1:], vtxt[1:], au[1:], av[1:]

# normalize score to length; 1.0 is perfect
Expand Down
9 changes: 8 additions & 1 deletion src/dphon/console.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def _mark_span(
marked_span: List[str] = []
span_ptr = 0
other_ptr = 0
for i in range(len(span)):
for i in range(len(alignment)):
# gap in u: insertion in v (if not punctuation)
if alignment[i] == self.gap_char and other_alignment[i].isalnum():
other_ptr += 1
Expand All @@ -108,6 +108,13 @@ def _mark_span(
span_ptr += 1
continue

# if either pointer is out of bounds, just append the character
if span_ptr >= len(span) or other_ptr >= len(other):
marked_span.append(alignment[i])
span_ptr += 1
other_ptr += 1
continue

# variants (both u and v)
if self.g2p.are_graphic_variants(span[span_ptr], other[other_ptr]):
marked_span.append(f"[variant]{alignment[i]}[/variant]")
Expand Down
22 changes: 9 additions & 13 deletions src/dphon/g2p.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from importlib.resources.abc import Traversable
import json
import logging
from typing import Iterable, Iterator, Mapping, Optional, Tuple, List
from typing import Iterable, Iterator, Optional, Tuple, List

from spacy.language import Language
from spacy.lookups import Table
Expand All @@ -18,7 +18,7 @@

# types for sound tables: map a string to a tuple of syllable phonemes
Phonemes_T = Tuple[Optional[str], ...]
SoundTable_T = Mapping[str, Phonemes_T]
SoundTable_T = dict[str, Phonemes_T]


class GraphemesToPhonemes:
Expand All @@ -45,17 +45,13 @@ def __init__(self, nlp: Language, sound_table: SoundTable_T):
syllable_parts = len(next(iter(sound_table.values())))
self.empty_phonemes = tuple(None for _ in range(syllable_parts))

# register extensions on spaCy primitives
if not Doc.has_extension("phonemes"):
Doc.set_extension("phonemes", getter=self.get_all_phonemes)
if not Span.has_extension("phonemes"):
Span.set_extension("phonemes", getter=self.get_all_phonemes)
if not Span.has_extension("syllables"):
Span.set_extension("syllables", getter=self._get_syllables)
if not Token.has_extension("phonemes"):
Token.set_extension("phonemes", getter=self.get_token_phonemes)
if not Token.has_extension("is_oov"):
Token.set_extension("is_oov", getter=self.is_token_oov)
# register extensions on spaCy primitives; use force=True because
# registration is global and we want to overwrite any old state
Doc.set_extension("phonemes", getter=self.get_all_phonemes, force=True)
Span.set_extension("phonemes", getter=self.get_all_phonemes, force=True)
Span.set_extension("syllables", getter=self._get_syllables, force=True)
Token.set_extension("phonemes", getter=self.get_token_phonemes, force=True)
Token.set_extension("is_oov", getter=self.is_token_oov, force=True)

# store the sound table in the vocab's Lookups
self.table = nlp.vocab.lookups.add_table("phonemes", sound_table)
Expand Down
45 changes: 43 additions & 2 deletions tests/unit/test_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
"""Aligner unit tests."""

from unittest import TestCase

from pathlib import Path
import spacy
from dphon.g2p import get_sound_table_json
from dphon.match import Match
from dphon.align import SmithWatermanAligner
from dphon.align import SmithWatermanAligner, SmithWatermanPhoneticAligner
from lingpy.align.pairwise import _get_scorer


Expand Down Expand Up @@ -130,3 +131,43 @@ def test_scorer(self) -> None:
self.assertEqual(aligned.au, list("CABACABACABA"))
self.assertEqual(aligned.av, list("CBBBCBBBCBBB"))
self.assertEqual(aligned.weight, 1.0)

class TestSmithWatermanPhoneticAligner(TestCase):
    """Test the SmithWatermanPhoneticAligner."""

    maxDiff = None  # don't limit length of diff output for failures

    def setUp(self) -> None:
        """Create a spaCy pipeline and aligner for use in testing."""

        # chinese pipeline with sound table g2p
        self.nlp = spacy.blank(
            "zh", meta={"tokenizer": {"config": {"use_jieba": False}}}
        )

        # This file lives at tests/unit/, so parents[2] is the repository
        # root. Anchoring there (instead of one level higher and re-entering
        # through a hard-coded "dphon" directory) keeps the fixture path
        # valid no matter what the checkout directory is called.
        self.project_dir = Path(__file__).resolve().parents[2]
        sound_table = get_sound_table_json(
            self.project_dir / "src" / "dphon" / "data" / "sound_table_v2.json"
        )
        self.nlp.add_pipe("g2p", config={"sound_table": sound_table})

        # aligner with default phonetic scoring matrix
        self.align = SmithWatermanPhoneticAligner()

    def test_spacing(self) -> None:
        """Matches with gaps should maintain correct transcription alignment.

        Text sources:
        - CHANT 尚書-0013
        - CHANT 尚書-0059-0023"""

        # create a mock match and align it
        u = self.nlp.make_doc("罔有天災山川鬼神亦莫不寧暨鳥獸魚鱉咸若于")
        v = self.nlp.make_doc("胡敢異心山川鬼神亦莫敢不寧若能共允住天下")
        match = Match("shangshu13", "shangshu59", u[:], v[:])
        aligned = self.align(match)

        # first string gets a gap for the insertion of "敢" (kˤamʔ)
        self.assertEqual(aligned.au, list("山川鬼神亦莫-不寧"))
        self.assertEqual(
            aligned.u_transcription,
            "*s-ŋrar t-◦lun k-ʔujʔ Cə-lin ɢrAk mˤak pə nˤeŋ",
        )

        # second string has the inserted character
        self.assertEqual(aligned.av, list("山川鬼神亦莫敢不寧"))
        self.assertEqual(
            aligned.v_transcription,
            "*s-ŋrar t-◦lun k-ʔujʔ Cə-lin ɢrAk mˤak kˤamʔ pə nˤeŋ",
        )
Loading