Skip to content

Commit 9e80ee9

Browse files
committed
Port unicode string-related tests to Python; address discrepancy regarding combining characters in Python strings
1 parent 0e707ce commit 9e80ee9

11 files changed

+172
-45
lines changed

machine/corpora/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@
8585
from .usx_file_text_corpus import UsxFileTextCorpus
8686
from .usx_memory_text import UsxMemoryText
8787
from .usx_zip_text import UsxZipText
88+
from .zip_paratext_project_quote_convention_detector import ZipParatextProjectQuoteConventionDetector
8889
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
8990
from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase
9091
from .zip_paratext_project_terms_parser import ZipParatextProjectTermsParser
@@ -188,6 +189,7 @@
188189
"UsxFileTextCorpus",
189190
"UsxMemoryText",
190191
"UsxZipText",
192+
"ZipParatextProjectQuoteConventionDetector",
191193
"ZipParatextProjectSettingsParser",
192194
"ZipParatextProjectSettingsParserBase",
193195
"ZipParatextProjectTermsParser",

machine/punctuation_analysis/quotation_mark_finder.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,15 @@ def find_all_potential_quotation_marks_in_text_segment(
3636
self, text_segment: TextSegment
3737
) -> List[QuotationMarkStringMatch]:
3838
quotation_matches: List[QuotationMarkStringMatch] = []
39-
for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(text_segment.text):
39+
for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(text_segment.text.string):
4040
if self._quote_conventions.is_valid_opening_quotation_mark(
4141
quotation_mark_match.group()
4242
) or self._quote_conventions.is_valid_closing_quotation_mark(quotation_mark_match.group()):
4343
quotation_matches.append(
44-
QuotationMarkStringMatch(text_segment, quotation_mark_match.start(), quotation_mark_match.end())
44+
QuotationMarkStringMatch(
45+
text_segment,
46+
text_segment.text.string_index_to_grapheme_index(quotation_mark_match.start()),
47+
text_segment.text.string_index_to_grapheme_index(quotation_mark_match.end()),
48+
)
4549
)
4650
return quotation_matches

machine/punctuation_analysis/quotation_mark_string_match.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def __eq__(self, value):
3535

3636
@property
3737
def quotation_mark(self) -> str:
38-
return self._text_segment.text[self._start_index : self._end_index]
38+
return self._text_segment.text[self._start_index : self._end_index].string
3939

4040
def is_valid_opening_quotation_mark(self, quote_conventions: QuoteConventionSet) -> bool:
4141
return quote_conventions.is_valid_opening_quotation_mark(self.quotation_mark)
@@ -59,18 +59,18 @@ def previous_character(self) -> Optional[str]:
5959
if previous_segment is not None and not self._text_segment.marker_is_in_preceding_context(
6060
UsfmMarkerType.PARAGRAPH
6161
):
62-
return previous_segment.text[-1]
62+
return previous_segment.text[-1].string
6363
return None
64-
return self._text_segment.text[self._start_index - 1]
64+
return self._text_segment.text[self._start_index - 1].string
6565

6666
@property
6767
def next_character(self) -> Optional[str]:
6868
if self.is_at_end_of_segment():
6969
next_segment = self._text_segment.next_segment
7070
if next_segment is not None and not next_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH):
71-
return next_segment.text[0]
71+
return next_segment.text[0].string
7272
return None
73-
return self._text_segment.text[self._end_index]
73+
return self._text_segment.text[self._end_index].string
7474

7575
def leading_substring_matches(self, regex_pattern: regex.Pattern) -> bool:
7676
return regex_pattern.search(self._text_segment.substring_before(self._start_index)) is not None
@@ -102,7 +102,7 @@ def end_index(self) -> int:
102102
def context(self) -> str:
103103
return self._text_segment.text[
104104
max(self._start_index - 10, 0) : min(self._end_index + 10, len(self._text_segment.text))
105-
]
105+
].string
106106

107107
def resolve(self, depth: int, direction: QuotationMarkDirection) -> QuotationMarkMetadata:
108108
return QuotationMarkMetadata(

machine/punctuation_analysis/text_segment.py

Lines changed: 74 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import unicodedata
12
from typing import Optional, Set
23

34
from ..corpora.usfm_token import UsfmToken
@@ -6,7 +7,7 @@
67

78
class TextSegment:
89
def __init__(self):
9-
self._text = ""
10+
self._text: GraphemeString = GraphemeString("")
1011
self._immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NO_MARKER
1112
self._markers_in_preceding_context: Set[UsfmMarkerType] = set()
1213
self.previous_segment: Optional[TextSegment] = None
@@ -31,18 +32,18 @@ def __eq__(self, value):
3132
return True
3233

3334
@property
34-
def text(self) -> str:
35+
def text(self) -> "GraphemeString":
3536
return self._text
3637

3738
@property
3839
def length(self) -> int:
3940
return len(self._text)
4041

4142
def substring_before(self, index: int) -> str:
42-
return self._text[:index]
43+
return self._text[:index].string
4344

4445
def substring_after(self, index: int) -> str:
45-
return self._text[index:]
46+
return self._text[index:].string
4647

4748
def marker_is_in_preceding_context(self, marker: UsfmMarkerType) -> bool:
4849
return marker in self._markers_in_preceding_context
@@ -54,9 +55,9 @@ def is_last_segment_in_verse(self) -> bool:
5455
return self.index_in_verse == self.num_segments_in_verse - 1
5556

5657
def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None:
57-
self._text = self.substring_before(start_index) + replacement + self.substring_after(end_index)
58+
self._text = GraphemeString(self.substring_before(start_index) + replacement + self.substring_after(end_index))
5859
if self._usfm_token is not None:
59-
self._usfm_token.text = self._text
60+
self._usfm_token.text = self._text.string
6061

6162
class Builder:
6263
def __init__(self):
@@ -76,8 +77,74 @@ def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder":
7677
return self
7778

7879
def set_text(self, text: str) -> "TextSegment.Builder":
79-
self._text_segment._text = text
80+
self._text_segment._text = GraphemeString(text)
8081
return self
8182

8283
def build(self) -> "TextSegment":
8384
return self._text_segment
85+
86+
87+
class GraphemeString:
    """A string that is indexed and sliced by grapheme instead of by code point.

    A grapheme here is a base character together with the combining marks
    (Unicode general categories ``Mc`` and ``Mn``) that directly follow it.
    This is a lightweight approximation: only those two categories are
    consulted, so it is not a full Unicode text-segmentation implementation.

    ``len()``, integer indexing and slicing all operate on grapheme indices;
    the underlying text is available through :attr:`string` or ``str()``.
    """

    # General categories of characters that attach to the preceding character.
    _COMBINING_CATEGORIES = frozenset({"Mc", "Mn"})

    def __init__(self, string: str) -> None:
        """Wrap ``string`` and precompute where each grapheme starts."""
        self._string = string
        # Index 0 always starts a grapheme, so a leading combining mark is kept
        # as part of the first grapheme rather than being silently dropped.
        grapheme_starts = [
            string_index
            for string_index, character in enumerate(string)
            if string_index == 0 or unicodedata.category(character) not in self._COMBINING_CATEGORIES
        ]
        self._string_index_by_grapheme_index = dict(enumerate(grapheme_starts))
        # Reverse lookup so string -> grapheme conversion is a dict access.
        self._grapheme_index_by_string_index = {
            string_index: grapheme_index for grapheme_index, string_index in enumerate(grapheme_starts)
        }

    def __len__(self) -> int:
        """Return the number of graphemes."""
        return len(self._string_index_by_grapheme_index)

    @property
    def string(self) -> str:
        """The underlying Python string."""
        return self._string

    def __str__(self) -> str:
        return self._string

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self._string!r})"

    def __eq__(self, other) -> bool:
        # Only another GraphemeString can be equal; a plain str never is.
        if not isinstance(other, GraphemeString):
            return False
        return self._string == other.string

    def __hash__(self) -> int:
        # Defined alongside __eq__ so instances remain hashable.
        return hash(self._string)

    def __getitem__(self, key) -> "GraphemeString":
        """Return the grapheme at an integer index, or the graphemes in a slice.

        Slices follow ``str`` semantics: negative bounds count from the end and
        out-of-range bounds are clamped. An out-of-range integer index yields an
        empty ``GraphemeString`` instead of raising ``IndexError``.

        Raises:
            TypeError: if ``key`` is neither an int nor a slice, or if the slice
                has a step other than 1.
        """
        if isinstance(key, int):
            grapheme_index = self._normalize_start_index(key)
            if not 0 <= grapheme_index < len(self):
                return GraphemeString("")
            string_start = self._to_string_index(grapheme_index)
            string_stop = self._to_string_index(grapheme_index + 1)
            return GraphemeString(self._string[string_start:string_stop])
        if isinstance(key, slice):
            if key.step is not None and key.step != 1:
                raise TypeError("Steps are not allowed in _GraphemeString slices")
            string_start = self._to_string_index(self._normalize_start_index(key.start))
            string_stop = self._to_string_index(self._normalize_stop_index(key.stop))
            return GraphemeString(self._string[string_start:string_stop])
        raise TypeError("Indices must be integers or slices")

    def _to_string_index(self, grapheme_index: int) -> int:
        """Map a grapheme index to a code-point offset, clamping out-of-range values."""
        if grapheme_index <= 0:
            return 0
        # Anything at or past the last grapheme maps to the end of the *string*
        # (not the grapheme count, which is smaller when combining marks exist).
        return self._string_index_by_grapheme_index.get(grapheme_index, len(self._string))

    def _normalize_start_index(self, index: Optional[int]) -> int:
        """Resolve a possibly missing or negative start index (missing means 0)."""
        if index is None:
            return 0
        if index < 0:
            return len(self) + index
        return index

    def _normalize_stop_index(self, index: Optional[int]) -> int:
        """Resolve a possibly missing or negative stop index (missing means the end)."""
        if index is None:
            return len(self)
        if index < 0:
            return len(self) + index
        return index

    def string_index_to_grapheme_index(self, string_index: int) -> int:
        """Convert a code-point offset into the matching grapheme index.

        The offset equal to the string length maps to ``len(self)`` so that
        exclusive end positions can be converted.

        Raises:
            ValueError: if ``string_index`` does not fall on the start of a grapheme.
        """
        if string_index == len(self._string):
            return len(self)
        try:
            return self._grapheme_index_by_string_index[string_index]
        except KeyError:
            raise ValueError(
                f"No corresponding grapheme index found for string index {string_index}."
            ) from None

tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -476,9 +476,12 @@ def test_process_scripture_element() -> None:
476476

477477
assert quote_convention_changer._quotation_mark_finder.num_times_called == 1
478478
assert mock_quotation_mark_resolver.num_times_called == 1
479-
assert quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment._text == "this is a ‘test"
480479
assert (
481-
quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment._text
480+
str(quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment.text)
481+
== "this is a ‘test"
482+
)
483+
assert (
484+
str(quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment.text)
482485
== "the test ends” here"
483486
)
484487

@@ -494,7 +497,7 @@ def test_create_text_segments_basic() -> None:
494497
text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element)
495498

496499
assert len(text_segments) == 1
497-
assert text_segments[0]._text == "test segment"
500+
assert str(text_segments[0].text) == "test segment"
498501
assert text_segments[0]._immediate_preceding_marker is UsfmMarkerType.NO_MARKER
499502
assert text_segments[0]._markers_in_preceding_context == set()
500503
assert text_segments[0].previous_segment is None
@@ -517,7 +520,7 @@ def test_create_text_segments_with_preceding_markers() -> None:
517520
text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element)
518521

519522
assert len(text_segments) == 1
520-
assert text_segments[0]._text == "test segment"
523+
assert str(text_segments[0].text) == "test segment"
521524
assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH
522525
assert text_segments[0]._markers_in_preceding_context == {
523526
UsfmMarkerType.VERSE,
@@ -547,15 +550,15 @@ def test_create_text_segments_with_multiple_text_tokens() -> None:
547550
text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element)
548551

549552
assert len(text_segments) == 2
550-
assert text_segments[0]._text == "test segment1"
553+
assert str(text_segments[0].text) == "test segment1"
551554
assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH
552555
assert text_segments[0]._markers_in_preceding_context == {
553556
UsfmMarkerType.VERSE,
554557
UsfmMarkerType.PARAGRAPH,
555558
}
556559
assert text_segments[0].previous_segment is None
557560
assert text_segments[0].next_segment == text_segments[1]
558-
assert text_segments[1]._text == "test segment2"
561+
assert str(text_segments[1].text) == "test segment2"
559562
assert text_segments[1]._immediate_preceding_marker == UsfmMarkerType.CHARACTER
560563
assert text_segments[1]._markers_in_preceding_context == {
561564
UsfmMarkerType.VERSE,
@@ -574,7 +577,7 @@ def test_create_text_segment() -> None:
574577
segment: Union[TextSegment, None] = quote_convention_changer._create_text_segment(usfm_token)
575578

576579
assert segment is not None
577-
assert segment._text == "test segment"
580+
assert str(segment.text) == "test segment"
578581
assert segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER
579582
assert segment._markers_in_preceding_context == set()
580583
assert segment._usfm_token == usfm_token
@@ -644,7 +647,7 @@ def test_update_quotation_marks() -> None:
644647

645648
multi_char_to_single_char_quote_convention_changer._update_quotation_marks(multi_character_quotation_marks)
646649

647-
assert multi_character_text_segment.text == "this “is ‘a test segment’ ”"
650+
assert str(multi_character_text_segment.text) == "this “is ‘a test segment’ ”"
648651

649652
assert multi_character_quotation_marks[0].start_index == 5
650653
assert multi_character_quotation_marks[0].end_index == 6
@@ -704,7 +707,7 @@ def test_update_quotation_marks() -> None:
704707

705708
single_char_to_multi_char_quote_convention_changer._update_quotation_marks(single_character_quotation_marks)
706709

707-
assert single_character_text_segment.text == "this <<is <a test segment> >>"
710+
assert str(single_character_text_segment.text) == "this <<is <a test segment> >>"
708711

709712
assert single_character_quotation_marks[0].start_index == 5
710713
assert single_character_quotation_marks[0].end_index == 7
@@ -765,7 +768,7 @@ def test_start_new_chapter() -> None:
765768
segment = quote_convention_changer._next_scripture_text_segment_builder.build()
766769
assert quote_convention_changer._current_strategy == QuotationMarkUpdateStrategy.SKIP
767770
assert segment._immediate_preceding_marker == UsfmMarkerType.CHAPTER
768-
assert segment._text == ""
771+
assert str(segment.text) == ""
769772
assert UsfmMarkerType.EMBED not in segment._markers_in_preceding_context
770773
assert quote_convention_changer._verse_text_quotation_mark_resolver._issues == set()
771774

tests/corpora/test_usfm_manual.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,13 @@
55
from typing import List, Optional
66

77
import pytest
8-
from testutils.corpora_test_helpers import TEST_DATA_PATH, USFM_SOURCE_PROJECT_PATH, USFM_TARGET_PROJECT_PATH
8+
from testutils.corpora_test_helpers import (
9+
TEST_DATA_PATH,
10+
USFM_SOURCE_PROJECT_PATH,
11+
USFM_SOURCE_PROJECT_ZIP_PATH,
12+
USFM_TARGET_PROJECT_PATH,
13+
USFM_TARGET_PROJECT_ZIP_PATH,
14+
)
915

1016
from machine.corpora import (
1117
FileParatextProjectSettingsParser,
@@ -15,9 +21,11 @@
1521
StandardParallelTextCorpus,
1622
UpdateUsfmRow,
1723
UpdateUsfmTextBehavior,
24+
ZipParatextProjectQuoteConventionDetector,
1825
ZipParatextProjectSettingsParser,
1926
ZipParatextProjectTextUpdater,
2027
)
28+
from machine.punctuation_analysis import QuoteConventionDetector
2129

2230

2331
@pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.")
@@ -124,3 +132,22 @@ def get_usfm(project_path: Path):
124132
assert False, f"Failed to process {subdir}: {e}"
125133
else:
126134
get_usfm(PARATEXT_PROJECT_PATH)
135+
136+
137+
@pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.")
def test_analyze_corpora_quote_conventions():
    """Manually run quote convention detection over the source and target project zips.

    Each archive is fed through a ZipParatextProjectQuoteConventionDetector into
    its own QuoteConventionDetector handler, and the test asserts that both
    handlers produce an analysis.
    """
    source_handler = QuoteConventionDetector()
    target_handler = QuoteConventionDetector()

    # Context managers guarantee both archives are closed even if analysis fails.
    # Detection stays inside the blocks so the archives are still open while the
    # handlers are queried, matching the original ordering.
    with zipfile.ZipFile(USFM_SOURCE_PROJECT_ZIP_PATH, "r") as source_archive:
        with zipfile.ZipFile(USFM_TARGET_PROJECT_ZIP_PATH, "r") as target_archive:
            source_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(source_archive)
            source_quote_convention_detector.get_quote_convention_analysis(source_handler)

            target_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(target_archive)
            target_quote_convention_detector.get_quote_convention_analysis(target_handler)

            source_analysis = source_handler.detect_quote_convention()
            target_analysis = target_handler.detect_quote_convention()

    assert source_analysis is not None
    assert target_analysis is not None

tests/punctuation_analysis/test_quotation_mark_finder.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,10 @@ def test_that_all_possible_quotation_marks_are_identified() -> None:
175175
),
176176
]
177177

178+
assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment(
179+
TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build()
180+
) == [QuotationMarkStringMatch(TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build(), 6, 7)]
181+
178182

179183
def test_that_it_uses_the_quote_convention_set() -> None:
180184
standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english")

tests/punctuation_analysis/test_quotation_mark_metadata.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def test_update_quotation_mark() -> None:
1919
end_index=23,
2020
)
2121
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english"))
22-
assert quotation_mark_metadata.text_segment._text == "He said to the woman, “Has God really said,"
22+
assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, “Has God really said,"
2323

2424
quotation_mark_metadata = QuotationMarkMetadata(
2525
quotation_mark='"',
@@ -30,7 +30,7 @@ def test_update_quotation_mark() -> None:
3030
end_index=23,
3131
)
3232
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european"))
33-
assert quotation_mark_metadata.text_segment._text == "He said to the woman, «Has God really said,"
33+
assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, «Has God really said,"
3434

3535
quotation_mark_metadata = QuotationMarkMetadata(
3636
quotation_mark='"',
@@ -41,7 +41,7 @@ def test_update_quotation_mark() -> None:
4141
end_index=24,
4242
)
4343
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european"))
44-
assert quotation_mark_metadata.text_segment._text == 'He said to the woman, "«as God really said,'
44+
assert str(quotation_mark_metadata.text_segment.text) == 'He said to the woman, "«as God really said,'
4545

4646

4747
def test_update_quotation_mark_with_multi_character_quotation_marks() -> None:
@@ -54,7 +54,7 @@ def test_update_quotation_mark_with_multi_character_quotation_marks() -> None:
5454
end_index=23,
5555
)
5656
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("typewriter_french"))
57-
assert quotation_mark_metadata.text_segment._text == "He said to the woman, <<Has God really said,"
57+
assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, <<Has God really said,"
5858
assert quotation_mark_metadata.start_index == 22
5959
assert quotation_mark_metadata.end_index == 24
6060

@@ -67,7 +67,7 @@ def test_update_quotation_mark_with_multi_character_quotation_marks() -> None:
6767
end_index=24,
6868
)
6969
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english"))
70-
assert quotation_mark_metadata.text_segment._text == "He said to the woman, “Has God really said,"
70+
assert str(quotation_mark_metadata.text_segment.text) == "He said to the woman, “Has God really said,"
7171
assert quotation_mark_metadata.start_index == 22
7272
assert quotation_mark_metadata.end_index == 23
7373

0 commit comments

Comments
 (0)