Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
from .usx_file_text_corpus import UsxFileTextCorpus
from .usx_memory_text import UsxMemoryText
from .usx_zip_text import UsxZipText
from .zip_paratext_project_quote_convention_detector import ZipParatextProjectQuoteConventionDetector
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase
from .zip_paratext_project_terms_parser import ZipParatextProjectTermsParser
Expand Down Expand Up @@ -188,6 +189,7 @@
"UsxFileTextCorpus",
"UsxMemoryText",
"UsxZipText",
"ZipParatextProjectQuoteConventionDetector",
"ZipParatextProjectSettingsParser",
"ZipParatextProjectSettingsParserBase",
"ZipParatextProjectTermsParser",
Expand Down
8 changes: 6 additions & 2 deletions machine/punctuation_analysis/quotation_mark_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,15 @@ def find_all_potential_quotation_marks_in_text_segment(
self, text_segment: TextSegment
) -> List[QuotationMarkStringMatch]:
quotation_matches: List[QuotationMarkStringMatch] = []
for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(text_segment.text):
for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(str(text_segment.text)):
if self._quote_conventions.is_valid_opening_quotation_mark(
quotation_mark_match.group()
) or self._quote_conventions.is_valid_closing_quotation_mark(quotation_mark_match.group()):
quotation_matches.append(
QuotationMarkStringMatch(text_segment, quotation_mark_match.start(), quotation_mark_match.end())
QuotationMarkStringMatch(
text_segment,
quotation_mark_match.start(),
quotation_mark_match.end(),
)
)
return quotation_matches
18 changes: 10 additions & 8 deletions machine/punctuation_analysis/quotation_mark_string_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def __eq__(self, value):

@property
def quotation_mark(self) -> str:
return self._text_segment.text[self._start_index : self._end_index]
return str(self._text_segment.text[self._start_index : self._end_index])

def is_valid_opening_quotation_mark(self, quote_conventions: QuoteConventionSet) -> bool:
return quote_conventions.is_valid_opening_quotation_mark(self.quotation_mark)
Expand All @@ -59,18 +59,18 @@ def previous_character(self) -> Optional[str]:
if previous_segment is not None and not self._text_segment.marker_is_in_preceding_context(
UsfmMarkerType.PARAGRAPH
):
return previous_segment.text[-1]
return str(previous_segment.text[-1])
return None
return self._text_segment.text[self._start_index - 1]
return str(self._text_segment.text[self._start_index - 1])

@property
def next_character(self) -> Optional[str]:
if self.is_at_end_of_segment():
next_segment = self._text_segment.next_segment
if next_segment is not None and not next_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH):
return next_segment.text[0]
return str(next_segment.text[0])
return None
return self._text_segment.text[self._end_index]
return str(self._text_segment.text[self._end_index])

def leading_substring_matches(self, regex_pattern: regex.Pattern) -> bool:
return regex_pattern.search(self._text_segment.substring_before(self._start_index)) is not None
Expand Down Expand Up @@ -100,9 +100,11 @@ def end_index(self) -> int:
# Not used, but a useful method for debugging
@property
def context(self) -> str:
return self._text_segment.text[
max(self._start_index - 10, 0) : min(self._end_index + 10, len(self._text_segment.text))
]
return str(
self._text_segment.text[
max(self._start_index - 10, 0) : min(self._end_index + 10, len(self._text_segment.text))
]
)

def resolve(self, depth: int, direction: QuotationMarkDirection) -> QuotationMarkMetadata:
return QuotationMarkMetadata(
Expand Down
6 changes: 3 additions & 3 deletions machine/punctuation_analysis/text_segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ def length(self) -> int:
return len(self._text)

def substring_before(self, index: int) -> str:
return self._text[:index]
return str(self._text[:index])

def substring_after(self, index: int) -> str:
return self._text[index:]
return str(self._text[index:])

def marker_is_in_preceding_context(self, marker: UsfmMarkerType) -> bool:
return marker in self._markers_in_preceding_context
Expand All @@ -56,7 +56,7 @@ def is_last_segment_in_verse(self) -> bool:
def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None:
self._text = self.substring_before(start_index) + replacement + self.substring_after(end_index)
if self._usfm_token is not None:
self._usfm_token.text = self._text
self._usfm_token.text = str(self._text)

class Builder:
def __init__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -476,10 +476,9 @@ def test_process_scripture_element() -> None:

assert quote_convention_changer._quotation_mark_finder.num_times_called == 1
assert mock_quotation_mark_resolver.num_times_called == 1
assert quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment._text == "this is a ‘test"
assert quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment.text == "this is a ‘test"
assert (
quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment._text
== "the test ends” here"
quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment.text == "the test ends” here"
)


Expand All @@ -494,7 +493,7 @@ def test_create_text_segments_basic() -> None:
text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element)

assert len(text_segments) == 1
assert text_segments[0]._text == "test segment"
assert text_segments[0].text == "test segment"
assert text_segments[0]._immediate_preceding_marker is UsfmMarkerType.NO_MARKER
assert text_segments[0]._markers_in_preceding_context == set()
assert text_segments[0].previous_segment is None
Expand All @@ -517,7 +516,7 @@ def test_create_text_segments_with_preceding_markers() -> None:
text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element)

assert len(text_segments) == 1
assert text_segments[0]._text == "test segment"
assert text_segments[0].text == "test segment"
assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH
assert text_segments[0]._markers_in_preceding_context == {
UsfmMarkerType.VERSE,
Expand Down Expand Up @@ -547,15 +546,15 @@ def test_create_text_segments_with_multiple_text_tokens() -> None:
text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element)

assert len(text_segments) == 2
assert text_segments[0]._text == "test segment1"
assert text_segments[0].text == "test segment1"
assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH
assert text_segments[0]._markers_in_preceding_context == {
UsfmMarkerType.VERSE,
UsfmMarkerType.PARAGRAPH,
}
assert text_segments[0].previous_segment is None
assert text_segments[0].next_segment == text_segments[1]
assert text_segments[1]._text == "test segment2"
assert text_segments[1].text == "test segment2"
assert text_segments[1]._immediate_preceding_marker == UsfmMarkerType.CHARACTER
assert text_segments[1]._markers_in_preceding_context == {
UsfmMarkerType.VERSE,
Expand All @@ -574,7 +573,7 @@ def test_create_text_segment() -> None:
segment: Union[TextSegment, None] = quote_convention_changer._create_text_segment(usfm_token)

assert segment is not None
assert segment._text == "test segment"
assert segment.text == "test segment"
assert segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER
assert segment._markers_in_preceding_context == set()
assert segment._usfm_token == usfm_token
Expand Down Expand Up @@ -765,7 +764,7 @@ def test_start_new_chapter() -> None:
segment = quote_convention_changer._next_scripture_text_segment_builder.build()
assert quote_convention_changer._current_strategy == QuotationMarkUpdateStrategy.SKIP
assert segment._immediate_preceding_marker == UsfmMarkerType.CHAPTER
assert segment._text == ""
assert segment.text == ""
assert UsfmMarkerType.EMBED not in segment._markers_in_preceding_context
assert quote_convention_changer._verse_text_quotation_mark_resolver._issues == set()

Expand Down
29 changes: 28 additions & 1 deletion tests/corpora/test_usfm_manual.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,13 @@
from typing import List, Optional

import pytest
from testutils.corpora_test_helpers import TEST_DATA_PATH, USFM_SOURCE_PROJECT_PATH, USFM_TARGET_PROJECT_PATH
from testutils.corpora_test_helpers import (
TEST_DATA_PATH,
USFM_SOURCE_PROJECT_PATH,
USFM_SOURCE_PROJECT_ZIP_PATH,
USFM_TARGET_PROJECT_PATH,
USFM_TARGET_PROJECT_ZIP_PATH,
)

from machine.corpora import (
FileParatextProjectSettingsParser,
Expand All @@ -15,9 +21,11 @@
StandardParallelTextCorpus,
UpdateUsfmRow,
UpdateUsfmTextBehavior,
ZipParatextProjectQuoteConventionDetector,
ZipParatextProjectSettingsParser,
ZipParatextProjectTextUpdater,
)
from machine.punctuation_analysis import QuoteConventionDetector


@pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.")
Expand Down Expand Up @@ -124,3 +132,22 @@ def get_usfm(project_path: Path):
assert False, f"Failed to process {subdir}: {e}"
else:
get_usfm(PARATEXT_PROJECT_PATH)


@pytest.mark.skip(reason="This is for manual testing only. Remove this decorator to run the test.")
def test_analyze_corpora_quote_conventions():
    """Manually check that a quote convention is detected for both zipped projects.

    A separate ``QuoteConventionDetector`` handler is used for the source and
    the target project. Each handler is passed to a
    ``ZipParatextProjectQuoteConventionDetector`` built over the corresponding
    archive, and is then asked for its detected convention.
    """
    # Open both archives in a ``with`` statement so the file handles are
    # released even when the analysis raises; the original left them open.
    with zipfile.ZipFile(USFM_SOURCE_PROJECT_ZIP_PATH, "r") as source_archive, zipfile.ZipFile(
        USFM_TARGET_PROJECT_ZIP_PATH, "r"
    ) as target_archive:
        source_handler = QuoteConventionDetector()
        source_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(source_archive)
        source_quote_convention_detector.get_quote_convention_analysis(source_handler)

        target_handler = QuoteConventionDetector()
        target_quote_convention_detector = ZipParatextProjectQuoteConventionDetector(target_archive)
        target_quote_convention_detector.get_quote_convention_analysis(target_handler)

        source_analysis = source_handler.detect_quote_convention()
        target_analysis = target_handler.detect_quote_convention()

    assert source_analysis is not None
    assert target_analysis is not None
4 changes: 4 additions & 0 deletions tests/punctuation_analysis/test_quotation_mark_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,10 @@ def test_that_all_possible_quotation_marks_are_identified() -> None:
),
]

assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment(
TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build()
) == [QuotationMarkStringMatch(TextSegment.Builder().set_text('उत्पत्ति "पुस्तकले').build(), 9, 10)]


def test_that_it_uses_the_quote_convention_set() -> None:
standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english")
Expand Down
10 changes: 5 additions & 5 deletions tests/punctuation_analysis/test_quotation_mark_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def test_update_quotation_mark() -> None:
end_index=23,
)
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english"))
assert quotation_mark_metadata.text_segment._text == "He said to the woman, “Has God really said,"
assert quotation_mark_metadata.text_segment.text == "He said to the woman, “Has God really said,"

quotation_mark_metadata = QuotationMarkMetadata(
quotation_mark='"',
Expand All @@ -30,7 +30,7 @@ def test_update_quotation_mark() -> None:
end_index=23,
)
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european"))
assert quotation_mark_metadata.text_segment._text == "He said to the woman, «Has God really said,"
assert quotation_mark_metadata.text_segment.text == "He said to the woman, «Has God really said,"

quotation_mark_metadata = QuotationMarkMetadata(
quotation_mark='"',
Expand All @@ -41,7 +41,7 @@ def test_update_quotation_mark() -> None:
end_index=24,
)
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european"))
assert quotation_mark_metadata.text_segment._text == 'He said to the woman, "«as God really said,'
assert quotation_mark_metadata.text_segment.text == 'He said to the woman, "«as God really said,'


def test_update_quotation_mark_with_multi_character_quotation_marks() -> None:
Expand All @@ -54,7 +54,7 @@ def test_update_quotation_mark_with_multi_character_quotation_marks() -> None:
end_index=23,
)
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("typewriter_french"))
assert quotation_mark_metadata.text_segment._text == "He said to the woman, <<Has God really said,"
assert quotation_mark_metadata.text_segment.text == "He said to the woman, <<Has God really said,"
assert quotation_mark_metadata.start_index == 22
assert quotation_mark_metadata.end_index == 24

Expand All @@ -67,7 +67,7 @@ def test_update_quotation_mark_with_multi_character_quotation_marks() -> None:
end_index=24,
)
quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english"))
assert quotation_mark_metadata.text_segment._text == "He said to the woman, “Has God really said,"
assert quotation_mark_metadata.text_segment.text == "He said to the woman, “Has God really said,"
assert quotation_mark_metadata.start_index == 22
assert quotation_mark_metadata.end_index == 23

Expand Down
10 changes: 10 additions & 0 deletions tests/punctuation_analysis/test_quotation_mark_string_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,16 @@ def test_get_previous_character() -> None:
quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2)
assert quotation_mark_string_match.previous_character == "“"

quotation_mark_string_match = QuotationMarkStringMatch(
TextSegment.Builder()
.set_text('"उत्पत्ति पुस्तकले')
.set_previous_segment(TextSegment.Builder().set_text("उत्पत्ति पुस्तकले").build())
.build(),
0,
1,
)
assert quotation_mark_string_match.previous_character == "\u0947"


def test_get_next_character() -> None:
quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2)
Expand Down
14 changes: 11 additions & 3 deletions tests/punctuation_analysis/test_text_segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
def test_builder_initialization() -> None:
builder = TextSegment.Builder()

assert builder._text_segment._text == ""
assert builder._text_segment.text == ""
assert builder._text_segment.previous_segment is None
assert builder._text_segment.next_segment is None
assert builder._text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER
Expand All @@ -20,7 +20,7 @@ def test_builder_set_text() -> None:
text = "Example text"
builder.set_text(text)

assert builder._text_segment._text == text
assert builder._text_segment.text == text


def test_builder_set_previous_segment() -> None:
Expand Down Expand Up @@ -62,7 +62,7 @@ def test_builder_set_usfm_token() -> None:
assert builder._text_segment._usfm_token is not None
assert builder._text_segment._usfm_token.type == UsfmTokenType.TEXT
assert builder._text_segment._usfm_token.text == "USFM token text"
assert builder._text_segment._text == ""
assert builder._text_segment.text == ""
assert builder._text_segment.previous_segment is None
assert builder._text_segment.next_segment is None

Expand Down Expand Up @@ -161,6 +161,14 @@ def test_length() -> None:
text_segment = TextSegment.Builder().set_text("new example text").build()
assert text_segment.length == len("new example text")

# Combining characters
text_segment = TextSegment.Builder().set_text("उत्पत्ति पुस्तकले").build()
assert text_segment.length == 17

# Surrogate pairs
text_segment = TextSegment.Builder().set_text("𝜺𝜺").build()
assert text_segment.length == 2


def test_substring_before() -> None:
text_segment = TextSegment.Builder().set_text("example text").build()
Expand Down
2 changes: 2 additions & 0 deletions tests/testutils/corpora_test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@

USFM_TEST_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "Tes"
USFM_TARGET_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "target"
USFM_TARGET_PROJECT_ZIP_PATH = TEST_DATA_PATH / "project" / "target"
USFM_SOURCE_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "source"
USFM_SOURCE_PROJECT_ZIP_PATH = TEST_DATA_PATH / "project" / "source"
USFM_INVALID_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "invalid_id"
USFM_MISMATCH_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "mismatch_id"
USX_TEST_PROJECT_PATH = TEST_DATA_PATH / "usx" / "Tes"
Expand Down
Loading