Skip to content

Scripture Update block #168

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ jobs:
node-version: "14"
- name: Lint with pyright
run: |
npm install -g pyright@1.1.386
npm install -g pyright@1.1.399
poetry run pyright
- name: Test with pytest
run: poetry run pytest --cov --cov-report=xml
Expand Down
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
"source.organizeImports": "explicit"
},
},
"files.associations": {
"*.SFM": "usfm",
},
"black-formatter.path": [
"poetry",
"run",
Expand Down
8 changes: 5 additions & 3 deletions machine/corpora/paratext_project_terms_parser_base.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

import re
from abc import ABC, abstractmethod
from collections import defaultdict
Expand Down Expand Up @@ -45,7 +47,7 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -
else:
term_id_to_category_dict = {}

terms_glosses_doc: Optional[ElementTree.ElementTree] = None
terms_glosses_doc: Optional[ElementTree.ElementTree[ElementTree.Element]] = None
resource_name = None
if self._settings.language_code is not None:
resource_name = _SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS.get(self._settings.language_code)
Expand All @@ -57,7 +59,7 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -
with open_binary(_SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS_PACKAGE, resource_name) as stream:
terms_glosses_doc = ElementTree.parse(stream)

term_renderings_doc: Optional[ElementTree.ElementTree] = None
term_renderings_doc: Optional[ElementTree.ElementTree[ElementTree.Element]] = None
if self._exists("TermRenderings.xml"):
with self._open("TermRenderings.xml") as stream:
term_renderings_doc = ElementTree.parse(stream)
Expand Down Expand Up @@ -136,7 +138,7 @@ def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
return term_string


def _get_category_per_id(biblical_terms_doc: ElementTree.ElementTree) -> Dict[str, str]:
def _get_category_per_id(biblical_terms_doc: ElementTree.ElementTree[ElementTree.Element]) -> Dict[str, str]:
term_id_to_category_dict: Dict[str, str] = {}

for term in biblical_terms_doc.findall(".//Term"):
Expand Down
3 changes: 3 additions & 0 deletions machine/corpora/paratext_project_text_updater_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .scripture_ref import ScriptureRef
from .scripture_update_block_handler import ScriptureUpdateBlockHandler
from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior
from .usfm_parser import parse_usfm

Expand All @@ -26,6 +27,7 @@ def update_usfm(
embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
preserve_paragraph_styles: Optional[Sequence[str]] = None,
update_block_handlers: Optional[list[ScriptureUpdateBlockHandler]] = None,
) -> Optional[str]:
file_name: str = self._settings.get_book_file_name(book_id)
if not self._exists(file_name):
Expand All @@ -40,6 +42,7 @@ def update_usfm(
embed_behavior,
style_behavior,
preserve_paragraph_styles,
update_block_handlers=update_block_handlers,
)
try:
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
Expand Down
16 changes: 16 additions & 0 deletions machine/corpora/scripture_embed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from typing import Optional

# Marker prefixes tested with str.startswith() by is_embed_part_style().
EMBED_PART_START_CHAR_STYLES = ("f", "x", "z")
# Markers accepted by is_embed_style() after "*" is stripped from both ends of the marker.
EMBED_STYLES = ("f", "fe", "fig", "fm", "x")


def is_note_text(marker: Optional[str]) -> bool:
    """Return True only when *marker* is exactly ``"ft"``.

    ``None`` and every other marker string yield False.
    """
    note_text_marker = "ft"
    return marker == note_text_marker


def is_embed_part_style(marker: Optional[str]) -> bool:
    """Return True when *marker* begins with one of EMBED_PART_START_CHAR_STYLES.

    A ``None`` marker is never an embed part style.
    """
    if marker is None:
        return False
    # str.startswith accepts a tuple and matches if any prefix matches.
    return marker.startswith(EMBED_PART_START_CHAR_STYLES)


def is_embed_style(marker: Optional[str]) -> bool:
    """Return True when *marker*, with ``*`` stripped from both ends, is in EMBED_STYLES.

    A ``None`` marker is never an embed style.
    """
    if marker is None:
        return False
    bare_marker = marker.strip("*")
    return bare_marker in EMBED_STYLES
28 changes: 8 additions & 20 deletions machine/corpora/scripture_ref_usfm_parser_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ..scripture.verse_ref import VerseRef, are_overlapping_verse_ranges
from .corpora_utils import merge_verse_ranges
from .scripture_element import ScriptureElement
from .scripture_embed import EMBED_PART_START_CHAR_STYLES, is_embed_part_style, is_embed_style, is_note_text
from .scripture_ref import ScriptureRef
from .usfm_parser_handler import UsfmParserHandler
from .usfm_parser_state import UsfmParserState
Expand All @@ -18,10 +19,6 @@ class ScriptureTextType(Enum):
NOTE_TEXT = auto()


EMBED_PART_START_CHAR_STYLES = ("f", "x", "z")
EMBED_STYLES = ("f", "fe", "fig", "fm", "x")


class ScriptureRefUsfmParserHandler(UsfmParserHandler, ABC):
def __init__(self) -> None:
self._cur_verse_ref: VerseRef = VerseRef()
Expand Down Expand Up @@ -152,27 +149,27 @@ def opt_break(self, state: UsfmParserState) -> None:
def start_char(
self, state: UsfmParserState, marker: str, unknown: bool, attributes: Optional[Sequence[UsfmAttribute]]
) -> None:
if self._is_embed_part_style(marker) and self._in_note_text:
if is_embed_part_style(marker) and self._in_note_text:
self._in_nested_embed = True
# if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse segment
self._check_convert_verse_para_to_non_verse(state)

if self._is_embed_style(marker):
if is_embed_style(marker):
self._in_embed = True
self._start_embed_wrapper(state, marker)

if self._is_note_text(marker):
if is_note_text(marker):
self._start_note_text_wrapper(state)

def end_char(
self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool
) -> None:
if self._is_embed_part_style(marker):
if is_embed_part_style(marker):
if self._in_nested_embed:
self._in_nested_embed = False
else:
self._end_note_text_wrapper(state)
if self._is_embed_style(marker):
if is_embed_style(marker):
self._end_embed(state, marker, attributes, closed)
self._in_embed = False

Expand Down Expand Up @@ -237,7 +234,7 @@ def _end_parent_element(self) -> None:
self._cur_elements_stack.pop()

def _end_embed_elements(self) -> None:
if self._cur_elements_stack and self._is_embed_style(self._cur_elements_stack[-1].name):
if self._cur_elements_stack and is_embed_style(self._cur_elements_stack[-1].name):
self._cur_elements_stack.pop()

def _create_verse_refs(self) -> List[ScriptureRef]:
Expand Down Expand Up @@ -268,18 +265,9 @@ def _check_convert_verse_para_to_non_verse(self, state: UsfmParserState) -> None
self._start_non_verse_text_wrapper(state)

def _is_in_embed(self, marker: Optional[str]) -> bool:
return self._in_embed or self._is_embed_style(marker)
return self._in_embed or is_embed_style(marker)

def _is_in_nested_embed(self, marker: Optional[str]) -> bool:
    """Report whether the handler is inside a nested embed.

    True when the ``_in_nested_embed`` flag is set, or when *marker* starts
    with ``"+"`` and the character right after it is one of
    EMBED_PART_START_CHAR_STYLES.
    """
    return self._in_nested_embed or (
        marker is not None
        and marker.startswith("+")
        # Slice instead of indexing: a bare "+" marker gives "" (no match)
        # rather than raising IndexError.
        and marker[1:2] in EMBED_PART_START_CHAR_STYLES
    )

def _is_note_text(self, marker: Optional[str]) -> bool:
return marker == "ft"

def _is_embed_part_style(self, marker: Optional[str]) -> bool:
return marker is not None and marker.startswith(EMBED_PART_START_CHAR_STYLES)

def _is_embed_style(self, marker: Optional[str]) -> bool:
return marker is not None and marker.strip("*") in EMBED_STYLES
53 changes: 53 additions & 0 deletions machine/corpora/scripture_update_block.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from __future__ import annotations

from .scripture_ref import ScriptureRef
from .scripture_update_element import (
ScriptureUpdateElement,
ScriptureUpdateElementType,
create_non_text_scripture_element,
)
from .usfm_token import UsfmToken, UsfmTokenType


class ScriptureUpdateBlock:
    """An ordered collection of update elements tied to one scripture reference."""

    def __init__(self) -> None:
        self.ref: ScriptureRef = ScriptureRef()
        self._elements: list[ScriptureUpdateElement] = []

    @property
    def elements(self) -> list[ScriptureUpdateElement]:
        """The internal element list itself (not a copy)."""
        return self._elements

    def add_existing_text(self, token: UsfmToken, marked_for_removal: bool = False) -> None:
        """Append *token* as a single-token EXISTING_TEXT element."""
        element = ScriptureUpdateElement(ScriptureUpdateElementType.EXISTING_TEXT, [token], marked_for_removal)
        self._elements.append(element)

    def add_inserted_text(self, tokens: list[UsfmToken]) -> None:
        """Append a copy of *tokens* as an INSERTED_TEXT element."""
        element = ScriptureUpdateElement(ScriptureUpdateElementType.INSERTED_TEXT, tokens.copy())
        self._elements.append(element)

    def add_token(self, token: UsfmToken, marked_for_removal: bool = False) -> None:
        """Append *token*, classifying it by its type.

        TEXT tokens become EXISTING_TEXT elements; every other token is
        classified by create_non_text_scripture_element.
        """
        if token.type == UsfmTokenType.TEXT:
            element = ScriptureUpdateElement(ScriptureUpdateElementType.EXISTING_TEXT, [token], marked_for_removal)
        else:
            element = create_non_text_scripture_element([token], marked_for_removal)
        self._elements.append(element)

    def add_embed(self, tokens: list[UsfmToken], marked_for_removal: bool = False) -> None:
        """Append *tokens* as an EMBED_BLOCK element; an empty list is ignored.

        The list is stored as given (no copy is made).
        """
        if not tokens:
            return
        element = ScriptureUpdateElement(ScriptureUpdateElementType.EMBED_BLOCK, tokens, marked_for_removal)
        self._elements.append(element)

    def update_ref(self, ref: ScriptureRef) -> None:
        """Replace the block's scripture reference with *ref*."""
        self.ref = ref

    def clear(self) -> None:
        """Reset the reference to a fresh ScriptureRef and empty the element list in place."""
        self.ref = ScriptureRef()
        self._elements.clear()

    def get_tokens(self) -> list[UsfmToken]:
        """Return the tokens of all elements, in order, as one flat list.

        Each element contributes whatever its own get_tokens() returns.
        """
        flattened: list[UsfmToken] = []
        for element in self._elements:
            flattened.extend(element.get_tokens())
        return flattened
10 changes: 10 additions & 0 deletions machine/corpora/scripture_update_block_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from __future__ import annotations

from abc import ABC

from .scripture_update_block import ScriptureUpdateBlock


class ScriptureUpdateBlockHandler(ABC):
    """Base class for objects that process a ScriptureUpdateBlock.

    ``process_block`` is not declared abstract here: the base implementation
    has an empty body and therefore returns None when called directly.
    """

    def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock:
        """Process *block*; subclasses are expected to return a ScriptureUpdateBlock."""
        ...
44 changes: 44 additions & 0 deletions machine/corpora/scripture_update_element.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from __future__ import annotations

from dataclasses import dataclass
from enum import Enum, auto

from .scripture_embed import is_embed_style
from .usfm_token import UsfmToken, UsfmTokenType


class ScriptureUpdateElementType(Enum):
    """Kinds of ScriptureUpdateElement."""

    # Text elements.
    EXISTING_TEXT = auto()
    INSERTED_TEXT = auto()

    # Marker-based elements.
    PARAGRAPH = auto()
    EMBED_BLOCK = auto()
    EMBED = auto()
    STYLE = auto()

    # Anything not covered by the kinds above.
    OTHER = auto()


@dataclass
class ScriptureUpdateElement:
    """A typed group of USFM tokens that can be flagged for removal."""

    type: ScriptureUpdateElementType
    tokens: list[UsfmToken]
    marked_for_removal: bool = False

    def get_tokens(self) -> list[UsfmToken]:
        """Return the stored token list, or a new empty list when marked for removal."""
        return [] if self.marked_for_removal else self.tokens


def create_non_text_scripture_element(
    tokens: list[UsfmToken], marked_for_removal: bool = False
) -> ScriptureUpdateElement:
    """Build a ScriptureUpdateElement for non-text tokens, classified by the first token.

    Classification:
      * OTHER     - *tokens* is empty or the first token has no marker
      * PARAGRAPH - the first token's type is UsfmTokenType.PARAGRAPH
      * EMBED     - the first token's marker satisfies is_embed_style
      * STYLE     - anything else

    The element always holds a copy of *tokens*, so the caller's list is not
    shared with it.
    """
    tokens = tokens.copy()
    if not tokens or tokens[0].marker is None:
        # Keep the copied tokens: building the element with an empty list
        # would silently drop marker-less tokens from get_tokens().
        return ScriptureUpdateElement(ScriptureUpdateElementType.OTHER, tokens, marked_for_removal)

    first_token = tokens[0]
    if first_token.type == UsfmTokenType.PARAGRAPH:
        element_type = ScriptureUpdateElementType.PARAGRAPH
    elif is_embed_style(first_token.marker):
        element_type = ScriptureUpdateElementType.EMBED
    else:
        element_type = ScriptureUpdateElementType.STYLE
    return ScriptureUpdateElement(element_type, tokens, marked_for_removal)
Loading
Loading