Skip to content

Commit 76020a8

Browse files
committed
Refactor update block
1 parent 9dba22b commit 76020a8

16 files changed

+674
-664
lines changed

machine/corpora/__init__.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,9 @@
6161
from .usfm_tag import UsfmJustification, UsfmStyleAttribute, UsfmStyleType, UsfmTag, UsfmTextProperties, UsfmTextType
6262
from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType
6363
from .usfm_tokenizer import RtlReferenceOrder, UsfmTokenizer
64+
from .usfm_update_block import UsfmUpdateBlock
65+
from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
66+
from .usfm_update_block_handler import UsfmUpdateBlockHandler
6467
from .usx_file_alignment_collection import UsxFileAlignmentCollection
6568
from .usx_file_alignment_corpus import UsxFileAlignmentCorpus
6669
from .usx_file_text import UsxFileText
@@ -92,8 +95,8 @@
9295
"is_scripture",
9396
"lowercase",
9497
"MemoryAlignmentCollection",
95-
"MemoryText",
9698
"MemoryStreamContainer",
99+
"MemoryText",
97100
"MultiKeyRef",
98101
"nfc_normalize",
99102
"nfd_normalize",
@@ -126,9 +129,9 @@
126129
"TextRow",
127130
"TextRowFlags",
128131
"unescape_spaces",
129-
"UpdateUsfmTextBehavior",
130132
"UpdateUsfmMarkerBehavior",
131133
"UpdateUsfmParserHandler",
134+
"UpdateUsfmTextBehavior",
132135
"UsfmAttribute",
133136
"UsfmElementType",
134137
"UsfmFileText",
@@ -148,6 +151,10 @@
148151
"UsfmToken",
149152
"UsfmTokenizer",
150153
"UsfmTokenType",
154+
"UsfmUpdateBlock",
155+
"UsfmUpdateBlockElement",
156+
"UsfmUpdateBlockElementType",
157+
"UsfmUpdateBlockHandler",
151158
"UsxFileAlignmentCollection",
152159
"UsxFileAlignmentCorpus",
153160
"UsxFileText",

machine/corpora/paratext_project_text_updater_base.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,13 @@
11
from abc import ABC, abstractmethod
2-
from typing import BinaryIO, Optional, Sequence, Tuple, Union
2+
from typing import BinaryIO, Iterable, Optional, Sequence, Tuple, Union
33

44
from ..utils.typeshed import StrPath
55
from .paratext_project_settings import ParatextProjectSettings
66
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
77
from .scripture_ref import ScriptureRef
8-
from .scripture_update_block_handler import ScriptureUpdateBlockHandler
98
from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior
109
from .usfm_parser import parse_usfm
10+
from .usfm_update_block_handler import UsfmUpdateBlockHandler
1111

1212

1313
class ParatextProjectTextUpdaterBase(ABC):
@@ -26,8 +26,8 @@ def update_usfm(
2626
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
2727
embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
2828
style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
29-
preserve_paragraph_styles: Optional[Sequence[str]] = None,
30-
update_block_handlers: Optional[list[ScriptureUpdateBlockHandler]] = None,
29+
preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
30+
update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
3131
) -> Optional[str]:
3232
file_name: str = self._settings.get_book_file_name(book_id)
3333
if not self._exists(file_name):

machine/corpora/scripture_embed.py

Lines changed: 0 additions & 16 deletions
This file was deleted.

machine/corpora/scripture_ref_usfm_parser_handler.py

Lines changed: 40 additions & 76 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55
from ..scripture.verse_ref import VerseRef, are_overlapping_verse_ranges
66
from .corpora_utils import merge_verse_ranges
77
from .scripture_element import ScriptureElement
8-
from .scripture_embed import EMBED_PART_START_CHAR_STYLES, is_embed_part_style, is_embed_style, is_note_text
98
from .scripture_ref import ScriptureRef
109
from .usfm_parser_handler import UsfmParserHandler
1110
from .usfm_parser_state import UsfmParserState
@@ -16,7 +15,14 @@ class ScriptureTextType(Enum):
1615
NONE = auto()
1716
NONVERSE = auto()
1817
VERSE = auto()
19-
NOTE_TEXT = auto()
18+
EMBED = auto()
19+
20+
21+
_EMBED_STYLES = {"f", "fe", "x", "fig"}
22+
23+
24+
def _is_embed_style(marker: Optional[str]) -> bool:
25+
return marker is not None and (marker.strip("*") in _EMBED_STYLES or marker.startswith("z"))
2026

2127

2228
class ScriptureRefUsfmParserHandler(UsfmParserHandler, ABC):
@@ -25,18 +31,11 @@ def __init__(self) -> None:
2531
self._cur_elements_stack: List[ScriptureElement] = []
2632
self._cur_text_type_stack: List[ScriptureTextType] = []
2733
self._duplicate_verse: bool = False
28-
self._in_preserved_paragraph: bool = False
29-
self._in_embed: bool = False
30-
self._in_note_text: bool = False
31-
self._in_nested_embed: bool = False
3234

3335
@property
3436
def _current_text_type(self) -> ScriptureTextType:
3537
return ScriptureTextType.NONE if len(self._cur_text_type_stack) == 0 else self._cur_text_type_stack[-1]
3638

37-
def _is_in_note_text(self) -> bool:
38-
return self._in_note_text
39-
4039
def end_usfm(self, state: UsfmParserState) -> None:
4140
self._end_verse_text_wrapper(state)
4241

@@ -112,32 +111,6 @@ def start_sidebar(self, state: UsfmParserState, marker: str, category: str) -> N
112111
def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None:
113112
self._end_parent_element()
114113

115-
def start_note(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None:
116-
self._in_embed = True
117-
self._start_embed_wrapper(state, marker)
118-
119-
def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None:
120-
self._end_note_text_wrapper(state)
121-
self._end_embed(state, marker, None, closed)
122-
self._in_embed = False
123-
124-
def _start_embed_wrapper(self, state: UsfmParserState, marker: str) -> None:
125-
if self._cur_verse_ref.is_default:
126-
self._update_verse_ref(state.verse_ref, marker)
127-
128-
if not self._duplicate_verse:
129-
self._check_convert_verse_para_to_non_verse(state)
130-
self._next_element(marker)
131-
132-
self._start_embed(state, self._create_non_verse_ref())
133-
134-
def _start_embed(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...
135-
136-
def _end_embed(
137-
self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool
138-
) -> None:
139-
pass
140-
141114
def text(self, state: UsfmParserState, text: str) -> None:
142115
# if we hit text in a verse paragraph and we aren't in a verse, then start a non-verse segment
143116
if text.strip():
@@ -149,29 +122,23 @@ def opt_break(self, state: UsfmParserState) -> None:
149122
def start_char(
150123
self, state: UsfmParserState, marker: str, unknown: bool, attributes: Optional[Sequence[UsfmAttribute]]
151124
) -> None:
152-
if is_embed_part_style(marker) and self._in_note_text:
153-
self._in_nested_embed = True
154125
# if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse segment
155126
self._check_convert_verse_para_to_non_verse(state)
156127

157-
if is_embed_style(marker):
158-
self._in_embed = True
159-
self._start_embed_wrapper(state, marker)
160-
161-
if is_note_text(marker):
162-
self._start_note_text_wrapper(state)
128+
if _is_embed_style(marker):
129+
self._start_embed_text_wrapper(state, marker)
163130

164131
def end_char(
165132
self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool
166133
) -> None:
167-
if is_embed_part_style(marker):
168-
if self._in_nested_embed:
169-
self._in_nested_embed = False
170-
elif self._is_note_text(marker):
171-
self._end_note_text_wrapper(state)
172-
if is_embed_style(marker):
173-
self._end_embed(state, marker, attributes, closed)
174-
self._in_embed = False
134+
if _is_embed_style(marker):
135+
self._end_embed_text_wrapper(state)
136+
137+
def start_note(self, state, marker, caller, category) -> None:
138+
self._start_embed_text_wrapper(state, marker)
139+
140+
def end_note(self, state, marker, closed) -> None:
141+
self._end_embed_text_wrapper(state)
175142

176143
def _start_verse_text(self, state: UsfmParserState, scripture_refs: Optional[Sequence[ScriptureRef]]) -> None: ...
177144

@@ -181,20 +148,9 @@ def _start_non_verse_text(self, state: UsfmParserState, scripture_ref: Scripture
181148

182149
def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...
183150

184-
def _start_note_text_wrapper(self, state: UsfmParserState):
185-
self._in_note_text = True
186-
self._cur_text_type_stack.append(ScriptureTextType.NOTE_TEXT)
187-
self._start_note_text(state)
188-
189-
def _start_note_text(self, state: UsfmParserState) -> None: ...
190-
191-
def _end_note_text_wrapper(self, state: UsfmParserState):
192-
if self._cur_text_type_stack and self._cur_text_type_stack[-1] == ScriptureTextType.NOTE_TEXT:
193-
self._end_note_text(state, self._create_non_verse_ref())
194-
self._cur_text_type_stack.pop()
195-
self._in_note_text = False
151+
def _start_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...
196152

197-
def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...
153+
def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...
198154

199155
def _start_verse_text_wrapper(self, state: UsfmParserState) -> None:
200156
self._duplicate_verse = False
@@ -222,6 +178,25 @@ def _update_verse_ref(self, verse_ref: VerseRef, marker: str) -> None:
222178
self._cur_elements_stack.append(ScriptureElement(0, marker))
223179
self._cur_verse_ref = verse_ref.copy()
224180

181+
def _start_embed_text_wrapper(self, state: UsfmParserState, marker: str) -> None:
182+
if self._cur_verse_ref.is_default:
183+
self._update_verse_ref(state.verse_ref, marker)
184+
185+
if not self._duplicate_verse:
186+
self._check_convert_verse_para_to_non_verse(state)
187+
self._next_element(marker)
188+
self._cur_text_type_stack.append(ScriptureTextType.EMBED)
189+
self._start_embed_text(state, self._create_non_verse_ref())
190+
191+
def _end_embed_text_wrapper(self, state: UsfmParserState) -> None:
192+
if (
193+
not self._duplicate_verse
194+
and self._cur_text_type_stack
195+
and self._cur_text_type_stack[-1] == ScriptureTextType.EMBED
196+
):
197+
self._end_embed_text(state, self._create_non_verse_ref())
198+
self._cur_text_type_stack.pop()
199+
225200
def _next_element(self, marker: str) -> None:
226201
prev_elem: ScriptureElement = self._cur_elements_stack.pop()
227202
self._cur_elements_stack.append(ScriptureElement(prev_elem.position + 1, marker))
@@ -234,7 +209,7 @@ def _end_parent_element(self) -> None:
234209
self._cur_elements_stack.pop()
235210

236211
def _end_embed_elements(self) -> None:
237-
if self._cur_elements_stack and is_embed_style(self._cur_elements_stack[-1].name):
212+
if self._cur_elements_stack and _is_embed_style(self._cur_elements_stack[-1].name):
238213
self._cur_elements_stack.pop()
239214

240215
def _create_verse_refs(self) -> List[ScriptureRef]:
@@ -263,14 +238,3 @@ def _check_convert_verse_para_to_non_verse(self, state: UsfmParserState) -> None
263238
):
264239
self._start_parent_element(para_tag.marker)
265240
self._start_non_verse_text_wrapper(state)
266-
267-
def _is_in_embed(self, marker: Optional[str]) -> bool:
268-
return self._in_embed or is_embed_style(marker)
269-
270-
def _is_in_nested_embed(self, marker: Optional[str]) -> bool:
271-
return self._in_nested_embed or (
272-
marker is not None
273-
and marker.startswith("+")
274-
and marker[1] in EMBED_PART_START_CHAR_STYLES
275-
and marker != "fm"
276-
)

machine/corpora/scripture_update_block.py

Lines changed: 0 additions & 53 deletions
This file was deleted.

machine/corpora/scripture_update_block_handler.py

Lines changed: 0 additions & 11 deletions
This file was deleted.

machine/corpora/scripture_update_element.py

Lines changed: 0 additions & 44 deletions
This file was deleted.

0 commit comments

Comments
 (0)