diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index dd540f3..0a7c8f1 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -85,6 +85,7 @@ from .usx_file_text_corpus import UsxFileTextCorpus from .usx_memory_text import UsxMemoryText from .usx_zip_text import UsxZipText +from .zip_paratext_project_quote_convention_detector import ZipParatextProjectQuoteConventionDetector from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase from .zip_paratext_project_terms_parser import ZipParatextProjectTermsParser @@ -188,6 +189,7 @@ "UsxFileTextCorpus", "UsxMemoryText", "UsxZipText", + "ZipParatextProjectQuoteConventionDetector", "ZipParatextProjectSettingsParser", "ZipParatextProjectSettingsParserBase", "ZipParatextProjectTermsParser", diff --git a/machine/punctuation_analysis/quotation_mark_finder.py b/machine/punctuation_analysis/quotation_mark_finder.py index 73c9536..6d7303e 100644 --- a/machine/punctuation_analysis/quotation_mark_finder.py +++ b/machine/punctuation_analysis/quotation_mark_finder.py @@ -36,11 +36,15 @@ def find_all_potential_quotation_marks_in_text_segment( self, text_segment: TextSegment ) -> List[QuotationMarkStringMatch]: quotation_matches: List[QuotationMarkStringMatch] = [] - for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(text_segment.text): + for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(str(text_segment.text)): if self._quote_conventions.is_valid_opening_quotation_mark( quotation_mark_match.group() ) or self._quote_conventions.is_valid_closing_quotation_mark(quotation_mark_match.group()): quotation_matches.append( - QuotationMarkStringMatch(text_segment, quotation_mark_match.start(), quotation_mark_match.end()) + QuotationMarkStringMatch( + text_segment, + quotation_mark_match.start(), + quotation_mark_match.end(), + ) ) return quotation_matches diff --git a/machine/punctuation_analysis/quotation_mark_string_match.py b/machine/punctuation_analysis/quotation_mark_string_match.py index 573e37c..dcafa86 100644 --- a/machine/punctuation_analysis/quotation_mark_string_match.py +++ b/machine/punctuation_analysis/quotation_mark_string_match.py @@ -35,7 +35,7 @@ def __eq__(self, value): @property def quotation_mark(self) -> str: - return self._text_segment.text[self._start_index : self._end_index] + return str(self._text_segment.text[self._start_index : self._end_index]) def is_valid_opening_quotation_mark(self, quote_conventions: QuoteConventionSet) -> bool: return quote_conventions.is_valid_opening_quotation_mark(self.quotation_mark) @@ -59,18 +59,18 @@ def previous_character(self) -> Optional[str]: if previous_segment is not None and not self._text_segment.marker_is_in_preceding_context( UsfmMarkerType.PARAGRAPH ): - return previous_segment.text[-1] + return str(previous_segment.text[-1]) return None - return self._text_segment.text[self._start_index - 1] + return str(self._text_segment.text[self._start_index - 1]) @property def next_character(self) -> Optional[str]: if self.is_at_end_of_segment(): next_segment = self._text_segment.next_segment if next_segment is not None and not next_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH): - return next_segment.text[0] + return str(next_segment.text[0]) return None - return self._text_segment.text[self._end_index] + return str(self._text_segment.text[self._end_index]) def leading_substring_matches(self, regex_pattern: regex.Pattern) -> bool: return regex_pattern.search(self._text_segment.substring_before(self._start_index)) is not None @@ -100,9 +100,11 @@ def end_index(self) -> int: # Not used, but a useful method for debugging @property def context(self) -> str: - return self._text_segment.text[ - max(self._start_index - 10, 0) : min(self._end_index + 10, len(self._text_segment.text)) - ] + return str( + self._text_segment.text[ + max(self._start_index - 10, 0) : min(self._end_index + 10, len(self._text_segment.text)) + ] + ) def resolve(self, depth: int, direction: QuotationMarkDirection) -> QuotationMarkMetadata: return QuotationMarkMetadata( diff --git a/machine/punctuation_analysis/text_segment.py b/machine/punctuation_analysis/text_segment.py index 78e63d4..c8d44bd 100644 --- a/machine/punctuation_analysis/text_segment.py +++ b/machine/punctuation_analysis/text_segment.py @@ -39,10 +39,10 @@ def length(self) -> int: return len(self._text) def substring_before(self, index: int) -> str: - return self._text[:index] + return str(self._text[:index]) def substring_after(self, index: int) -> str: - return self._text[index:] + return str(self._text[index:]) def marker_is_in_preceding_context(self, marker: UsfmMarkerType) -> bool: return marker in self._markers_in_preceding_context @@ -56,7 +56,7 @@ def is_last_segment_in_verse(self) -> bool: def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None: self._text = self.substring_before(start_index) + replacement + self.substring_after(end_index) if self._usfm_token is not None: - self._usfm_token.text = self._text + self._usfm_token.text = str(self._text) class Builder: def __init__(self): diff --git a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py index 5aca556..baadf8e 100644 --- a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py +++ b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py @@ -476,10 +476,9 @@ def test_process_scripture_element() -> None: assert quote_convention_changer._quotation_mark_finder.num_times_called == 1 assert mock_quotation_mark_resolver.num_times_called == 1 - assert quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment._text == "this is a ‘test" + assert quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment.text == "this is a ‘test" assert ( - quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment._text - == "the test ends” here" + quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment.text == "the test ends” here" ) @@ -494,7 +493,7 @@ def test_create_text_segments_basic() -> None: text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 1 - assert text_segments[0]._text == "test segment" + assert text_segments[0].text == "test segment" assert text_segments[0]._immediate_preceding_marker is UsfmMarkerType.NO_MARKER assert text_segments[0]._markers_in_preceding_context == set() assert text_segments[0].previous_segment is None @@ -517,7 +516,7 @@ def test_create_text_segments_with_preceding_markers() -> None: text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 1 - assert text_segments[0]._text == "test segment" + assert text_segments[0].text == "test segment" assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH assert text_segments[0]._markers_in_preceding_context == { UsfmMarkerType.VERSE, @@ -547,7 +546,7 @@ def test_create_text_segments_with_multiple_text_tokens() -> None: text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) assert len(text_segments) == 2 - assert text_segments[0]._text == "test segment1" + assert text_segments[0].text == "test segment1" assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH assert text_segments[0]._markers_in_preceding_context == { UsfmMarkerType.VERSE, @@ -555,7 +554,7 @@ def test_create_text_segments_with_multiple_text_tokens() -> None: } assert text_segments[0].previous_segment is None assert text_segments[0].next_segment == text_segments[1] - assert text_segments[1]._text == "test segment2" + assert text_segments[1].text == "test segment2" assert text_segments[1]._immediate_preceding_marker == UsfmMarkerType.CHARACTER assert text_segments[1]._markers_in_preceding_context == { UsfmMarkerType.VERSE, @@ -574,7 +573,7 @@ def test_create_text_segment() -> None: segment: Union[TextSegment, None] = quote_convention_changer._create_text_segment(usfm_token) assert segment is not None - assert segment._text == "test segment" + assert segment.text == "test segment" assert segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER assert segment._markers_in_preceding_context == set() assert segment._usfm_token == usfm_token @@ -765,7 +764,7 @@ def test_start_new_chapter() -> None: segment = quote_convention_changer._next_scripture_text_segment_builder.build() assert quote_convention_changer._current_strategy == QuotationMarkUpdateStrategy.SKIP assert segment._immediate_preceding_marker == UsfmMarkerType.CHAPTER - assert segment._text == "" + assert segment.text == "" assert UsfmMarkerType.EMBED not in segment._markers_in_preceding_context assert quote_convention_changer._verse_text_quotation_mark_resolver._issues == set() diff --git a/tests/corpora/test_usfm_manual.py b/tests/corpora/test_usfm_manual.py index b795997..45166d2 100644 --- a/tests/corpora/test_usfm_manual.py +++ b/tests/corpora/test_usfm_manual.py @@ -5,7 +5,13 @@ from typing import List, Optional import pytest -from testutils.corpora_test_helpers import TEST_DATA_PATH, USFM_SOURCE_PROJECT_PATH, USFM_TARGET_PROJECT_PATH +from testutils.corpora_test_helpers import ( + TEST_DATA_PATH, + USFM_SOURCE_PROJECT_PATH, + USFM_SOURCE_PROJECT_ZIP_PATH, + USFM_TARGET_PROJECT_PATH, + USFM_TARGET_PROJECT_ZIP_PATH, +) from machine.corpora import ( FileParatextProjectSettingsParser, @@ -15,9 +21,11 @@ StandardParallelTextCorpus, UpdateUsfmRow, UpdateUsfmTextBehavior, + ZipParatextProjectQuoteConventionDetector, ZipParatextProjectSettingsParser, ZipParatextProjectTextUpdater, ) +from machine.punctuation_analysis import QuoteConventionDetector @pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.") @@ -124,3 +132,22 @@ def get_usfm(project_path: Path): assert False, f"Failed to process {subdir}: {e}" else: get_usfm(PARATEXT_PROJECT_PATH) + + +@pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.") +def test_analyze_corpora_quote_conventions(): + source_handler = QuoteConventionDetector() + source_archive = zipfile.ZipFile(USFM_SOURCE_PROJECT_ZIP_PATH, "r") + source_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(source_archive) + source_quote_convention_detector.get_quote_convention_analysis(source_handler) + + target_handler = QuoteConventionDetector() + target_archive = zipfile.ZipFile(USFM_TARGET_PROJECT_ZIP_PATH, "r") + target_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(target_archive) + target_quote_convention_detector.get_quote_convention_analysis(target_handler) + + source_analysis = source_handler.detect_quote_convention() + target_analysis = target_handler.detect_quote_convention() + + assert source_analysis is not None + assert target_analysis is not None diff --git a/tests/punctuation_analysis/test_quotation_mark_finder.py b/tests/punctuation_analysis/test_quotation_mark_finder.py index 5d1a709..035f50f 100644 --- a/tests/punctuation_analysis/test_quotation_mark_finder.py +++ b/tests/punctuation_analysis/test_quotation_mark_finder.py @@ -175,6 +175,10 @@ def test_that_all_possible_quotation_marks_are_identified() -> None: ), ] + assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( + TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build() + ) == [QuotationMarkStringMatch(TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build(), 9, 10)] + def test_that_it_uses_the_quote_convention_set() -> None: standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") diff --git a/tests/punctuation_analysis/test_quotation_mark_metadata.py b/tests/punctuation_analysis/test_quotation_mark_metadata.py index 5f2b265..4e4bab1 100644 --- a/tests/punctuation_analysis/test_quotation_mark_metadata.py +++ b/tests/punctuation_analysis/test_quotation_mark_metadata.py @@ -19,7 +19,7 @@ def test_update_quotation_mark() -> None: end_index=23, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english")) - assert quotation_mark_metadata.text_segment._text == "He said to the woman, “Has God really said," + assert quotation_mark_metadata.text_segment.text == "He said to the woman, “Has God really said," quotation_mark_metadata = QuotationMarkMetadata( quotation_mark='"', @@ -30,7 +30,7 @@ def test_update_quotation_mark() -> None: end_index=23, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european")) - assert quotation_mark_metadata.text_segment._text == "He said to the woman, «Has God really said," + assert quotation_mark_metadata.text_segment.text == "He said to the woman, «Has God really said," quotation_mark_metadata = QuotationMarkMetadata( quotation_mark='"', @@ -41,7 +41,7 @@ def test_update_quotation_mark() -> None: end_index=24, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european")) - assert quotation_mark_metadata.text_segment._text == 'He said to the woman, "«as God really said,' + assert quotation_mark_metadata.text_segment.text == 'He said to the woman, "«as God really said,' def test_update_quotation_mark_with_multi_character_quotation_marks() -> None: @@ -54,7 +54,7 @@ def test_update_quotation_mark_with_multi_character_quotation_marks() -> None: end_index=23, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("typewriter_french")) - assert quotation_mark_metadata.text_segment._text == "He said to the woman, < None: end_index=24, ) quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english")) - assert quotation_mark_metadata.text_segment._text == "He said to the woman, “Has God really said," + assert quotation_mark_metadata.text_segment.text == "He said to the woman, “Has God really said," assert quotation_mark_metadata.start_index == 22 assert quotation_mark_metadata.end_index == 23 diff --git a/tests/punctuation_analysis/test_quotation_mark_string_match.py b/tests/punctuation_analysis/test_quotation_mark_string_match.py index 3948549..7f478f7 100644 --- a/tests/punctuation_analysis/test_quotation_mark_string_match.py +++ b/tests/punctuation_analysis/test_quotation_mark_string_match.py @@ -121,6 +121,16 @@ def test_get_previous_character() -> None: quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2) assert quotation_mark_string_match.previous_character == "“" + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder() + .set_text('"उत्पत्ति पुस्तकले') + .set_previous_segment(TextSegment.Builder().set_text("उत्पत्ति पुस्तकले").build()) + .build(), + 0, + 1, + ) + assert quotation_mark_string_match.previous_character == "\u0947" + def test_get_next_character() -> None: quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) diff --git a/tests/punctuation_analysis/test_text_segment.py b/tests/punctuation_analysis/test_text_segment.py index aa215d6..11932f0 100644 --- a/tests/punctuation_analysis/test_text_segment.py +++ b/tests/punctuation_analysis/test_text_segment.py @@ -5,7 +5,7 @@ def test_builder_initialization() -> None: builder = TextSegment.Builder() - assert builder._text_segment._text == "" + assert builder._text_segment.text == "" assert builder._text_segment.previous_segment is None assert builder._text_segment.next_segment is None assert builder._text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER @@ -20,7 +20,7 @@ def test_builder_set_text() -> None: text = "Example text" builder.set_text(text) - assert builder._text_segment._text == text + assert builder._text_segment.text == text def test_builder_set_previous_segment() -> None: @@ -62,7 +62,7 @@ def test_builder_set_usfm_token() -> None: assert builder._text_segment._usfm_token is not None assert builder._text_segment._usfm_token.type == UsfmTokenType.TEXT assert builder._text_segment._usfm_token.text == "USFM token text" - assert builder._text_segment._text == "" + assert builder._text_segment.text == "" assert builder._text_segment.previous_segment is None assert builder._text_segment.next_segment is None @@ -161,6 +161,14 @@ def test_length() -> None: text_segment = TextSegment.Builder().set_text("new example text").build() assert text_segment.length == len("new example text") + # Combining characters + text_segment = TextSegment.Builder().set_text("उत्पत्ति पुस्तकले").build() + assert text_segment.length == 17 + + # Surrogate pairs + text_segment = TextSegment.Builder().set_text("𝜺𝜺").build() + assert text_segment.length == 2 + def test_substring_before() -> None: text_segment = TextSegment.Builder().set_text("example text").build() diff --git a/tests/testutils/corpora_test_helpers.py b/tests/testutils/corpora_test_helpers.py index e287560..6c1e7d8 100644 --- a/tests/testutils/corpora_test_helpers.py +++ b/tests/testutils/corpora_test_helpers.py @@ -8,7 +8,9 @@ USFM_TEST_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "Tes" USFM_TARGET_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "target" +USFM_TARGET_PROJECT_ZIP_PATH = TEST_DATA_PATH / "project" / "target" USFM_SOURCE_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "source" +USFM_SOURCE_PROJECT_ZIP_PATH = TEST_DATA_PATH / "project" / "source" USFM_INVALID_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "invalid_id" USFM_MISMATCH_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "mismatch_id" USX_TEST_PROJECT_PATH = TEST_DATA_PATH / "usx" / "Tes"