Skip to content

Commit f1778e1

Browse files
authored
Port quotation denormalization unicode tests (#228)
1 parent 3a79c67 commit f1778e1

11 files changed, with 89 additions and 31 deletions.

machine/corpora/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -85,6 +85,7 @@
8585
from .usx_file_text_corpus import UsxFileTextCorpus
8686
from .usx_memory_text import UsxMemoryText
8787
from .usx_zip_text import UsxZipText
88+
from .zip_paratext_project_quote_convention_detector import ZipParatextProjectQuoteConventionDetector
8889
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
8990
from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase
9091
from .zip_paratext_project_terms_parser import ZipParatextProjectTermsParser
@@ -188,6 +189,7 @@
188189
"UsxFileTextCorpus",
189190
"UsxMemoryText",
190191
"UsxZipText",
192+
"ZipParatextProjectQuoteConventionDetector",
191193
"ZipParatextProjectSettingsParser",
192194
"ZipParatextProjectSettingsParserBase",
193195
"ZipParatextProjectTermsParser",

machine/punctuation_analysis/quotation_mark_finder.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -36,11 +36,15 @@ def find_all_potential_quotation_marks_in_text_segment(
3636
self, text_segment: TextSegment
3737
) -> List[QuotationMarkStringMatch]:
3838
quotation_matches: List[QuotationMarkStringMatch] = []
39-
for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(text_segment.text):
39+
for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(str(text_segment.text)):
4040
if self._quote_conventions.is_valid_opening_quotation_mark(
4141
quotation_mark_match.group()
4242
) or self._quote_conventions.is_valid_closing_quotation_mark(quotation_mark_match.group()):
4343
quotation_matches.append(
44-
QuotationMarkStringMatch(text_segment, quotation_mark_match.start(), quotation_mark_match.end())
44+
QuotationMarkStringMatch(
45+
text_segment,
46+
quotation_mark_match.start(),
47+
quotation_mark_match.end(),
48+
)
4549
)
4650
return quotation_matches

machine/punctuation_analysis/quotation_mark_string_match.py

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ def __eq__(self, value):
3535

3636
@property
3737
def quotation_mark(self) -> str:
38-
return self._text_segment.text[self._start_index : self._end_index]
38+
return str(self._text_segment.text[self._start_index : self._end_index])
3939

4040
def is_valid_opening_quotation_mark(self, quote_conventions: QuoteConventionSet) -> bool:
4141
return quote_conventions.is_valid_opening_quotation_mark(self.quotation_mark)
@@ -59,18 +59,18 @@ def previous_character(self) -> Optional[str]:
5959
if previous_segment is not None and not self._text_segment.marker_is_in_preceding_context(
6060
UsfmMarkerType.PARAGRAPH
6161
):
62-
return previous_segment.text[-1]
62+
return str(previous_segment.text[-1])
6363
return None
64-
return self._text_segment.text[self._start_index - 1]
64+
return str(self._text_segment.text[self._start_index - 1])
6565

6666
@property
6767
def next_character(self) -> Optional[str]:
6868
if self.is_at_end_of_segment():
6969
next_segment = self._text_segment.next_segment
7070
if next_segment is not None and not next_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH):
71-
return next_segment.text[0]
71+
return str(next_segment.text[0])
7272
return None
73-
return self._text_segment.text[self._end_index]
73+
return str(self._text_segment.text[self._end_index])
7474

7575
def leading_substring_matches(self, regex_pattern: regex.Pattern) -> bool:
7676
return regex_pattern.search(self._text_segment.substring_before(self._start_index)) is not None
@@ -100,9 +100,11 @@ def end_index(self) -> int:
100100
# Not used, but a useful method for debugging
101101
@property
102102
def context(self) -> str:
103-
return self._text_segment.text[
104-
max(self._start_index - 10, 0) : min(self._end_index + 10, len(self._text_segment.text))
105-
]
103+
return str(
104+
self._text_segment.text[
105+
max(self._start_index - 10, 0) : min(self._end_index + 10, len(self._text_segment.text))
106+
]
107+
)
106108

107109
def resolve(self, depth: int, direction: QuotationMarkDirection) -> QuotationMarkMetadata:
108110
return QuotationMarkMetadata(

machine/punctuation_analysis/text_segment.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -39,10 +39,10 @@ def length(self) -> int:
3939
return len(self._text)
4040

4141
def substring_before(self, index: int) -> str:
42-
return self._text[:index]
42+
return str(self._text[:index])
4343

4444
def substring_after(self, index: int) -> str:
45-
return self._text[index:]
45+
return str(self._text[index:])
4646

4747
def marker_is_in_preceding_context(self, marker: UsfmMarkerType) -> bool:
4848
return marker in self._markers_in_preceding_context
@@ -56,7 +56,7 @@ def is_last_segment_in_verse(self) -> bool:
5656
def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None:
5757
self._text = self.substring_before(start_index) + replacement + self.substring_after(end_index)
5858
if self._usfm_token is not None:
59-
self._usfm_token.text = self._text
59+
self._usfm_token.text = str(self._text)
6060

6161
class Builder:
6262
def __init__(self):

tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py

Lines changed: 8 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -476,10 +476,9 @@ def test_process_scripture_element() -> None:
476476

477477
assert quote_convention_changer._quotation_mark_finder.num_times_called == 1
478478
assert mock_quotation_mark_resolver.num_times_called == 1
479-
assert quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment._text == "this is a ‘test"
479+
assert quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment.text == "this is a ‘test"
480480
assert (
481-
quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment._text
482-
== "the test ends” here"
481+
quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment.text == "the test ends” here"
483482
)
484483

485484

@@ -494,7 +493,7 @@ def test_create_text_segments_basic() -> None:
494493
text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element)
495494

496495
assert len(text_segments) == 1
497-
assert text_segments[0]._text == "test segment"
496+
assert text_segments[0].text == "test segment"
498497
assert text_segments[0]._immediate_preceding_marker is UsfmMarkerType.NO_MARKER
499498
assert text_segments[0]._markers_in_preceding_context == set()
500499
assert text_segments[0].previous_segment is None
@@ -517,7 +516,7 @@ def test_create_text_segments_with_preceding_markers() -> None:
517516
text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element)
518517

519518
assert len(text_segments) == 1
520-
assert text_segments[0]._text == "test segment"
519+
assert text_segments[0].text == "test segment"
521520
assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH
522521
assert text_segments[0]._markers_in_preceding_context == {
523522
UsfmMarkerType.VERSE,
@@ -547,15 +546,15 @@ def test_create_text_segments_with_multiple_text_tokens() -> None:
547546
text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element)
548547

549548
assert len(text_segments) == 2
550-
assert text_segments[0]._text == "test segment1"
549+
assert text_segments[0].text == "test segment1"
551550
assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH
552551
assert text_segments[0]._markers_in_preceding_context == {
553552
UsfmMarkerType.VERSE,
554553
UsfmMarkerType.PARAGRAPH,
555554
}
556555
assert text_segments[0].previous_segment is None
557556
assert text_segments[0].next_segment == text_segments[1]
558-
assert text_segments[1]._text == "test segment2"
557+
assert text_segments[1].text == "test segment2"
559558
assert text_segments[1]._immediate_preceding_marker == UsfmMarkerType.CHARACTER
560559
assert text_segments[1]._markers_in_preceding_context == {
561560
UsfmMarkerType.VERSE,
@@ -574,7 +573,7 @@ def test_create_text_segment() -> None:
574573
segment: Union[TextSegment, None] = quote_convention_changer._create_text_segment(usfm_token)
575574

576575
assert segment is not None
577-
assert segment._text == "test segment"
576+
assert segment.text == "test segment"
578577
assert segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER
579578
assert segment._markers_in_preceding_context == set()
580579
assert segment._usfm_token == usfm_token
@@ -765,7 +764,7 @@ def test_start_new_chapter() -> None:
765764
segment = quote_convention_changer._next_scripture_text_segment_builder.build()
766765
assert quote_convention_changer._current_strategy == QuotationMarkUpdateStrategy.SKIP
767766
assert segment._immediate_preceding_marker == UsfmMarkerType.CHAPTER
768-
assert segment._text == ""
767+
assert segment.text == ""
769768
assert UsfmMarkerType.EMBED not in segment._markers_in_preceding_context
770769
assert quote_convention_changer._verse_text_quotation_mark_resolver._issues == set()
771770

tests/corpora/test_usfm_manual.py

Lines changed: 28 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,13 @@
55
from typing import List, Optional
66

77
import pytest
8-
from testutils.corpora_test_helpers import TEST_DATA_PATH, USFM_SOURCE_PROJECT_PATH, USFM_TARGET_PROJECT_PATH
8+
from testutils.corpora_test_helpers import (
9+
TEST_DATA_PATH,
10+
USFM_SOURCE_PROJECT_PATH,
11+
USFM_SOURCE_PROJECT_ZIP_PATH,
12+
USFM_TARGET_PROJECT_PATH,
13+
USFM_TARGET_PROJECT_ZIP_PATH,
14+
)
915

1016
from machine.corpora import (
1117
FileParatextProjectSettingsParser,
@@ -15,9 +21,11 @@
1521
StandardParallelTextCorpus,
1622
UpdateUsfmRow,
1723
UpdateUsfmTextBehavior,
24+
ZipParatextProjectQuoteConventionDetector,
1825
ZipParatextProjectSettingsParser,
1926
ZipParatextProjectTextUpdater,
2027
)
28+
from machine.punctuation_analysis import QuoteConventionDetector
2129

2230

2331
@pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.")
@@ -124,3 +132,22 @@ def get_usfm(project_path: Path):
124132
assert False, f"Failed to process {subdir}: {e}"
125133
else:
126134
get_usfm(PARATEXT_PROJECT_PATH)
135+
136+
137+
@pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.")
138+
def test_analyze_corpora_quote_conventions():
139+
source_handler = QuoteConventionDetector()
140+
source_archive = zipfile.ZipFile(USFM_SOURCE_PROJECT_ZIP_PATH, "r")
141+
source_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(source_archive)
142+
source_quote_convention_detector.get_quote_convention_analysis(source_handler)
143+
144+
target_handler = QuoteConventionDetector()
145+
target_archive = zipfile.ZipFile(USFM_TARGET_PROJECT_ZIP_PATH, "r")
146+
target_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(target_archive)
147+
target_quote_convention_detector.get_quote_convention_analysis(target_handler)
148+
149+
source_analysis = source_handler.detect_quote_convention()
150+
target_analysis = target_handler.detect_quote_convention()
151+
152+
assert source_analysis is not None
153+
assert target_analysis is not None

tests/punctuation_analysis/test_quotation_mark_finder.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -175,6 +175,10 @@ def test_that_all_possible_quotation_marks_are_identified() -> None:
175175
),
176176
]
177177

178+
assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment(
179+
TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build()
180+
) == [QuotationMarkStringMatch(TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build(), 9, 10)]
181+
178182

179183
def test_that_it_uses_the_quote_convention_set() -> None:
180184
standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english")

tests/punctuation_analysis/test_quotation_mark_metadata.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ def test_update_quotation_mark() -> None:
1919
end_index=23,
2020
)
2121
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english"))
22-
assert quotation_mark_metadata.text_segment._text == "He said to the woman, “Has God really said,"
22+
assert quotation_mark_metadata.text_segment.text == "He said to the woman, “Has God really said,"
2323

2424
quotation_mark_metadata = QuotationMarkMetadata(
2525
quotation_mark='"',
@@ -30,7 +30,7 @@ def test_update_quotation_mark() -> None:
3030
end_index=23,
3131
)
3232
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european"))
33-
assert quotation_mark_metadata.text_segment._text == "He said to the woman, «Has God really said,"
33+
assert quotation_mark_metadata.text_segment.text == "He said to the woman, «Has God really said,"
3434

3535
quotation_mark_metadata = QuotationMarkMetadata(
3636
quotation_mark='"',
@@ -41,7 +41,7 @@ def test_update_quotation_mark() -> None:
4141
end_index=24,
4242
)
4343
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european"))
44-
assert quotation_mark_metadata.text_segment._text == 'He said to the woman, "«as God really said,'
44+
assert quotation_mark_metadata.text_segment.text == 'He said to the woman, "«as God really said,'
4545

4646

4747
def test_update_quotation_mark_with_multi_character_quotation_marks() -> None:
@@ -54,7 +54,7 @@ def test_update_quotation_mark_with_multi_character_quotation_marks() -> None:
5454
end_index=23,
5555
)
5656
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("typewriter_french"))
57-
assert quotation_mark_metadata.text_segment._text == "He said to the woman, <<Has God really said,"
57+
assert quotation_mark_metadata.text_segment.text == "He said to the woman, <<Has God really said,"
5858
assert quotation_mark_metadata.start_index == 22
5959
assert quotation_mark_metadata.end_index == 24
6060

@@ -67,7 +67,7 @@ def test_update_quotation_mark_with_multi_character_quotation_marks() -> None:
6767
end_index=24,
6868
)
6969
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english"))
70-
assert quotation_mark_metadata.text_segment._text == "He said to the woman, “Has God really said,"
70+
assert quotation_mark_metadata.text_segment.text == "He said to the woman, “Has God really said,"
7171
assert quotation_mark_metadata.start_index == 22
7272
assert quotation_mark_metadata.end_index == 23
7373

tests/punctuation_analysis/test_quotation_mark_string_match.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -121,6 +121,16 @@ def test_get_previous_character() -> None:
121121
quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2)
122122
assert quotation_mark_string_match.previous_character == "“"
123123

124+
quotation_mark_string_match = QuotationMarkStringMatch(
125+
TextSegment.Builder()
126+
.set_text('"उत्पत्ति पुस्तकले')
127+
.set_previous_segment(TextSegment.Builder().set_text("उत्पत्ति पुस्तकले").build())
128+
.build(),
129+
0,
130+
1,
131+
)
132+
assert quotation_mark_string_match.previous_character == "\u0947"
133+
124134

125135
def test_get_next_character() -> None:
126136
quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2)

tests/punctuation_analysis/test_text_segment.py

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
def test_builder_initialization() -> None:
66
builder = TextSegment.Builder()
77

8-
assert builder._text_segment._text == ""
8+
assert builder._text_segment.text == ""
99
assert builder._text_segment.previous_segment is None
1010
assert builder._text_segment.next_segment is None
1111
assert builder._text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER
@@ -20,7 +20,7 @@ def test_builder_set_text() -> None:
2020
text = "Example text"
2121
builder.set_text(text)
2222

23-
assert builder._text_segment._text == text
23+
assert builder._text_segment.text == text
2424

2525

2626
def test_builder_set_previous_segment() -> None:
@@ -62,7 +62,7 @@ def test_builder_set_usfm_token() -> None:
6262
assert builder._text_segment._usfm_token is not None
6363
assert builder._text_segment._usfm_token.type == UsfmTokenType.TEXT
6464
assert builder._text_segment._usfm_token.text == "USFM token text"
65-
assert builder._text_segment._text == ""
65+
assert builder._text_segment.text == ""
6666
assert builder._text_segment.previous_segment is None
6767
assert builder._text_segment.next_segment is None
6868

@@ -161,6 +161,14 @@ def test_length() -> None:
161161
text_segment = TextSegment.Builder().set_text("new example text").build()
162162
assert text_segment.length == len("new example text")
163163

164+
# Combining characters
165+
text_segment = TextSegment.Builder().set_text("उत्पत्ति पुस्तकले").build()
166+
assert text_segment.length == 17
167+
168+
# Surrogate pairs
169+
text_segment = TextSegment.Builder().set_text("𝜺𝜺").build()
170+
assert text_segment.length == 2
171+
164172

165173
def test_substring_before() -> None:
166174
text_segment = TextSegment.Builder().set_text("example text").build()

0 commit comments

Comments
 (0)