From 157ec482761d2f12b384db236d5716c374a294eb Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 2 Apr 2025 11:13:04 -0400 Subject: [PATCH 01/11] Some tests pass --- machine/corpora/scripture_update_block.py | 45 +++++++ .../scripture_update_block_handler_base.py | 9 ++ ...date_block_handler_first_elements_first.py | 23 ++++ machine/corpora/scripture_update_element.py | 24 ++++ machine/corpora/update_usfm_parser_handler.py | 112 ++++++++++++------ 5 files changed, 178 insertions(+), 35 deletions(-) create mode 100644 machine/corpora/scripture_update_block.py create mode 100644 machine/corpora/scripture_update_block_handler_base.py create mode 100644 machine/corpora/scripture_update_block_handler_first_elements_first.py create mode 100644 machine/corpora/scripture_update_element.py diff --git a/machine/corpora/scripture_update_block.py b/machine/corpora/scripture_update_block.py new file mode 100644 index 00000000..00787cf2 --- /dev/null +++ b/machine/corpora/scripture_update_block.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from .scripture_ref import ScriptureRef +from .scripture_update_element import ScriptureUpdateElement, ScriptureUpdateElementType +from .usfm_token import UsfmToken, UsfmTokenType + + +class ScriptureUpdateBlock: + + def __init__(self) -> None: + self._ref: ScriptureRef = ScriptureRef() + self._elements: list[ScriptureUpdateElement] = [] + + def add_existing_text(self, token: UsfmToken, marked_for_removal: bool = False) -> None: + self._elements.append( + ScriptureUpdateElement(ScriptureUpdateElementType.EXISTING_TEXT, [token], marked_for_removal) + ) + + def add_inserted_text(self, tokens: list[UsfmToken]) -> None: + self._elements.append(ScriptureUpdateElement(ScriptureUpdateElementType.INSERTED_TEXT, tokens.copy())) + + def add_token(self, token: UsfmToken, marked_for_removal: bool = False) -> None: + if token.type == UsfmTokenType.TEXT: + self._elements.append( + 
ScriptureUpdateElement(ScriptureUpdateElementType.EXISTING_TEXT, [token], marked_for_removal) + ) + else: + self._elements.append(ScriptureUpdateElement(ScriptureUpdateElementType.OTHER, [token], marked_for_removal)) + + def add_tokens(self, tokens: list[UsfmToken], marked_for_removal: bool = False) -> None: + if len(tokens) == 0: + return + self._elements.append( + ScriptureUpdateElement(ScriptureUpdateElementType.OTHER, tokens.copy(), marked_for_removal) + ) + + def update_ref(self, ref: ScriptureRef) -> None: + self._ref = ref + + def clear(self) -> None: + self._elements.clear() + self._ref = ScriptureRef() + + def get_tokens(self) -> list[UsfmToken]: + return [token for element in self._elements for token in element.get_tokens()] diff --git a/machine/corpora/scripture_update_block_handler_base.py b/machine/corpora/scripture_update_block_handler_base.py new file mode 100644 index 00000000..2998a0d9 --- /dev/null +++ b/machine/corpora/scripture_update_block_handler_base.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from .scripture_update_block import ScriptureUpdateBlock + + +class ScriptureUpdateBlockHandlerBase: + + def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: + raise NotImplementedError("Must be implemented in subclass") diff --git a/machine/corpora/scripture_update_block_handler_first_elements_first.py b/machine/corpora/scripture_update_block_handler_first_elements_first.py new file mode 100644 index 00000000..17f44798 --- /dev/null +++ b/machine/corpora/scripture_update_block_handler_first_elements_first.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from .scripture_update_block import ScriptureUpdateBlock +from .scripture_update_block_handler_base import ScriptureUpdateBlockHandlerBase +from .scripture_update_element import ScriptureUpdateElementType + + +class ScriptureUpdateBlockHandlerFirstElementsFirst(ScriptureUpdateBlockHandlerBase): + + def process_block(self, block: ScriptureUpdateBlock) -> 
ScriptureUpdateBlock: + # If a paragraph, embed or style element occurs before existing text, move it before inserted text as well. + current_insert_index = 0 + for current_index in range(len(block._elements)): + element = block._elements[current_index] + if element.type == ScriptureUpdateElementType.EXISTING_TEXT: + # we found existing text, so we stop looking for elements to move + break + if current_index != current_insert_index and element.type != ScriptureUpdateElementType.INSERTED_TEXT: + block._elements.remove(element) + block._elements.insert(current_insert_index, element) + current_insert_index += 1 + + return block diff --git a/machine/corpora/scripture_update_element.py b/machine/corpora/scripture_update_element.py new file mode 100644 index 00000000..fe39d7e5 --- /dev/null +++ b/machine/corpora/scripture_update_element.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum, auto + +from .usfm_token import UsfmToken + + +class ScriptureUpdateElementType(Enum): + EXISTING_TEXT = auto() + INSERTED_TEXT = auto() + OTHER = auto() + + +@dataclass +class ScriptureUpdateElement: + type: ScriptureUpdateElementType + tokens: list[UsfmToken] + marked_for_removal: bool = False + + def get_tokens(self) -> list[UsfmToken]: + if self.marked_for_removal: + return [] + return self.tokens diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index b3ebe2be..c05989d9 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -1,8 +1,12 @@ from enum import Enum, auto from typing import List, Optional, Sequence, Tuple, Union +from ..scripture.verse_ref import VerseRef from .scripture_ref import ScriptureRef from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler +from .scripture_update_block import ScriptureUpdateBlock +from .scripture_update_block_handler_base import 
ScriptureUpdateBlockHandlerBase +from .scripture_update_block_handler_first_elements_first import ScriptureUpdateBlockHandlerFirstElementsFirst from .usfm_parser_state import UsfmParserState from .usfm_stylesheet import UsfmStylesheet from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType @@ -31,13 +35,20 @@ def __init__( embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, preserve_paragraph_styles: Optional[Sequence[str]] = None, + update_block_handlers: Optional[list[ScriptureUpdateBlockHandlerBase]] = None, ) -> None: super().__init__() self._rows = rows or [] self._tokens: List[UsfmToken] = [] - self._new_tokens: List[UsfmToken] = [] - self._new_embed_tokens: List[UsfmToken] = [] + self._updated_text: List[UsfmToken] = [] + self._updated_embed_text: List[UsfmToken] = [] + self._update_block: ScriptureUpdateBlock = ScriptureUpdateBlock() + self._embed_update_block: ScriptureUpdateBlock = ScriptureUpdateBlock() self._id_text = id_text + if update_block_handlers is None: + self._update_block_handlers = [ScriptureUpdateBlockHandlerFirstElementsFirst()] + else: + self._update_block_handlers = update_block_handlers if preserve_paragraph_styles is None: self._preserve_paragraph_styles = set(["r", "rem"]) elif isinstance(preserve_paragraph_styles, str): @@ -60,7 +71,7 @@ def tokens(self) -> List[UsfmToken]: def end_usfm(self, state: UsfmParserState) -> None: self._collect_tokens(state) - + self._process_update_block() super().end_usfm(state) def start_book(self, state: UsfmParserState, marker: str, code: str) -> None: @@ -68,13 +79,12 @@ def start_book(self, state: UsfmParserState, marker: str, code: str) -> None: start_book_tokens: List[UsfmToken] = [] if self._id_text is not None: start_book_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=self._id_text + " ")) - self._push_new_tokens(start_book_tokens) + self._update_block.add_tokens(start_book_tokens) 
super().start_book(state, marker, code) def end_book(self, state: UsfmParserState, marker: str) -> None: - self._pop_new_tokens() - + self._process_update_block() super().end_book(state, marker) def start_para( @@ -99,6 +109,7 @@ def start_para( super().start_para(state, marker, unknown, attributes) def end_para(self, state: UsfmParserState, marker: str) -> None: + self._process_update_block() super().end_para(state, marker) self._in_preserved_paragraph = False @@ -114,7 +125,7 @@ def start_cell(self, state: UsfmParserState, marker: str, align: str, colspan: i def end_cell(self, state: UsfmParserState, marker: str) -> None: self._collect_tokens(state) - + self._process_update_block() super().end_cell(state, marker) def start_sidebar(self, state: UsfmParserState, marker: str, category: str) -> None: @@ -125,6 +136,7 @@ def start_sidebar(self, state: UsfmParserState, marker: str, category: str) -> N def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None: if closed: self._collect_tokens(state) + self._process_update_block() super().end_sidebar(state, marker, closed) @@ -137,6 +149,7 @@ def chapter( pub_number: str, ) -> None: self._collect_tokens(state) + self._process_update_block() super().chapter(state, number, marker, alt_number, pub_number) @@ -148,6 +161,7 @@ def milestone( attributes: Sequence[UsfmAttribute], ) -> None: self._collect_tokens(state) + self._process_update_block() super().milestone(state, marker, start_milestone, attributes) @@ -160,6 +174,7 @@ def verse( pub_number: str, ) -> None: self._collect_tokens(state) + self._process_update_block() super().verse(state, number, marker, alt_number, pub_number) @@ -199,6 +214,7 @@ def _start_embed( state: UsfmParserState, scripture_ref: ScriptureRef, ) -> None: + self._embed_update_block.update_ref(scripture_ref) self._embed_row_texts = self._advance_rows([scripture_ref]) self._embed_updated = any(self._embed_row_texts) @@ -217,6 +233,7 @@ def _end_embed( else: 
self._collect_tokens(state) + self._process_embed_update_block() self._embed_row_texts.clear() self._embed_updated = False @@ -256,20 +273,20 @@ def unmatched(self, state: UsfmParserState, marker: str) -> None: def _start_verse_text(self, state: UsfmParserState, scripture_refs: Sequence[ScriptureRef]) -> None: row_texts: List[str] = self._advance_rows(scripture_refs) - self._push_new_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts]) + self._push_updated_text([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts]) def _end_verse_text(self, state: UsfmParserState, scripture_refs: Sequence[ScriptureRef]) -> None: self._pop_new_tokens() def _start_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: row_texts = self._advance_rows([scripture_ref]) - self._push_new_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts]) + self._push_updated_text([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts]) def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: self._pop_new_tokens() def _start_note_text(self, state: UsfmParserState) -> None: - self._push_new_embed_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in self._embed_row_texts]) + self._push_updated_embed_text([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in self._embed_row_texts]) def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: self._embed_row_texts.clear() @@ -306,13 +323,15 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> List[str]: return row_texts def _collect_tokens(self, state: UsfmParserState) -> None: - self._tokens.extend(self._new_tokens) - self._new_tokens.clear() + self._use_updated_text() while self._token_index <= state.index + state.special_token_count: - self._tokens.append(state.tokens[self._token_index]) + self._update_block.add_token(state.tokens[self._token_index]) self._token_index += 1 def 
_skip_tokens(self, state: UsfmParserState) -> None: + while self._token_index <= state.index + state.special_token_count: + self._update_block.add_token(state.tokens[self._token_index], marked_for_removal=True) + self._token_index += 1 self._token_index = state.index + 1 + state.special_token_count def _replace_with_new_tokens(self, state: UsfmParserState, closed: bool = True) -> bool: @@ -348,24 +367,24 @@ def _replace_with_new_tokens(self, state: UsfmParserState, closed: bool = True) if use_new_tokens: if in_embed: - self._add_new_embed_tokens() + self._use_updated_embed_text() else: - self._add_new_tokens() + self._use_updated_text() if existing_text and ( self._text_behavior == UpdateUsfmTextBehavior.PREFER_EXISTING or self._is_in_preserved_paragraph(marker) ): if in_embed: - self._clear_new_embed_tokens() + self._clear_updated_embed_text() else: - self._clear_new_tokens() + self._clear_updated_text() embed_in_new_verse_text = ( any(self._replace_stack) or self._text_behavior == UpdateUsfmTextBehavior.STRIP_EXISTING ) and in_embed if embed_in_new_verse_text or self._embed_updated: if self._embed_behavior == UpdateUsfmMarkerBehavior.STRIP: - self._clear_new_embed_tokens() + self._clear_updated_embed_text() return True if not self._is_in_note_text() or in_nested_embed: return False @@ -380,33 +399,56 @@ def _replace_with_new_tokens(self, state: UsfmParserState, closed: bool = True) def _has_new_text(self) -> bool: return any(self._replace_stack) and self._replace_stack[-1] - def _push_new_tokens(self, tokens: List[UsfmToken]) -> None: + def _update_verse_ref(self, verse_ref: VerseRef, marker: str) -> None: + super()._update_verse_ref(verse_ref, marker) + self._update_block.update_ref(ScriptureRef(verse_ref.copy())) + + def _create_non_verse_ref(self) -> ScriptureRef: + ref = super()._create_non_verse_ref() + self._update_block.update_ref(ref) + return ref + + def _process_update_block(self) -> None: + self._use_updated_text() + for handler in 
self._update_block_handlers: + self._update_block = handler.process_block(self._update_block) + self._tokens.extend(self._update_block.get_tokens()) + self._update_block.clear() + + def _process_embed_update_block(self) -> None: + self._use_updated_embed_text() + for handler in self._update_block_handlers: + self._embed_update_block = handler.process_block(self._embed_update_block) + self._update_block.add_tokens(self._embed_update_block.get_tokens()) + self._embed_update_block.clear() + + def _push_updated_text(self, tokens: List[UsfmToken]) -> None: self._replace_stack.append(any(tokens)) if tokens: - self._new_tokens.extend(tokens) + self._updated_text.extend(tokens) - def _add_new_tokens(self) -> None: - if self._new_tokens: - self._tokens.extend(self._new_tokens) - self._new_tokens.clear() + def _use_updated_text(self) -> None: + if self._updated_text: + self._update_block.add_inserted_text(self._updated_text) + self._updated_text.clear() - def _clear_new_tokens(self) -> None: - self._new_tokens.clear() + def _clear_updated_text(self) -> None: + self._updated_text.clear() - def _push_new_embed_tokens(self, tokens: List[UsfmToken]) -> None: + def _push_updated_embed_text(self, tokens: List[UsfmToken]) -> None: self._replace_stack.append(any(tokens)) if tokens: - self._new_embed_tokens.extend(tokens) + self._updated_embed_text.extend(tokens) - def _add_new_embed_tokens(self) -> None: - if self._new_embed_tokens: - self._tokens.extend(self._new_embed_tokens) - self._new_embed_tokens.clear() + def _use_updated_embed_text(self) -> None: + if self._updated_embed_text: + self._embed_update_block.add_inserted_text(self._updated_embed_text) + self._updated_embed_text.clear() - def _clear_new_embed_tokens(self) -> None: - self._new_embed_tokens.clear() + def _clear_updated_embed_text(self) -> None: + self._updated_embed_text.clear() - def _push_token_as_previous(self) -> None: + def _push_updated_text_as_previous(self) -> None: 
self._replace_stack.append(self._replace_stack[-1]) def _pop_new_tokens(self) -> None: From 5b073250aabc9e174c1b86823b57b2388a014c7e Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 3 Apr 2025 12:50:46 -0400 Subject: [PATCH 02/11] Fix the tests --- machine/corpora/update_usfm_parser_handler.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index c05989d9..dd80c07a 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -46,7 +46,7 @@ def __init__( self._embed_update_block: ScriptureUpdateBlock = ScriptureUpdateBlock() self._id_text = id_text if update_block_handlers is None: - self._update_block_handlers = [ScriptureUpdateBlockHandlerFirstElementsFirst()] + self._update_block_handlers = [] else: self._update_block_handlers = update_block_handlers if preserve_paragraph_styles is None: @@ -79,7 +79,7 @@ def start_book(self, state: UsfmParserState, marker: str, code: str) -> None: start_book_tokens: List[UsfmToken] = [] if self._id_text is not None: start_book_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=self._id_text + " ")) - self._update_block.add_tokens(start_book_tokens) + self._push_updated_text(start_book_tokens) super().start_book(state, marker, code) @@ -109,7 +109,6 @@ def start_para( super().start_para(state, marker, unknown, attributes) def end_para(self, state: UsfmParserState, marker: str) -> None: - self._process_update_block() super().end_para(state, marker) self._in_preserved_paragraph = False @@ -125,7 +124,6 @@ def start_cell(self, state: UsfmParserState, marker: str, align: str, colspan: i def end_cell(self, state: UsfmParserState, marker: str) -> None: self._collect_tokens(state) - self._process_update_block() super().end_cell(state, marker) def start_sidebar(self, state: UsfmParserState, marker: str, category: str) -> None: @@ -136,7 +134,6 @@ def 
start_sidebar(self, state: UsfmParserState, marker: str, category: str) -> N def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None: if closed: self._collect_tokens(state) - self._process_update_block() super().end_sidebar(state, marker, closed) @@ -148,8 +145,8 @@ def chapter( alt_number: str, pub_number: str, ) -> None: - self._collect_tokens(state) self._process_update_block() + self._collect_tokens(state) super().chapter(state, number, marker, alt_number, pub_number) @@ -160,8 +157,8 @@ def milestone( start_milestone: bool, attributes: Sequence[UsfmAttribute], ) -> None: - self._collect_tokens(state) self._process_update_block() + self._collect_tokens(state) super().milestone(state, marker, start_milestone, attributes) @@ -173,8 +170,8 @@ def verse( alt_number: str, pub_number: str, ) -> None: - self._collect_tokens(state) self._process_update_block() + self._collect_tokens(state) super().verse(state, number, marker, alt_number, pub_number) @@ -325,12 +322,20 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> List[str]: def _collect_tokens(self, state: UsfmParserState) -> None: self._use_updated_text() while self._token_index <= state.index + state.special_token_count: - self._update_block.add_token(state.tokens[self._token_index]) + token = state.tokens[self._token_index] + if self._is_in_embed(token.marker): + self._embed_update_block.add_token(token) + else: + self._update_block.add_token(token) self._token_index += 1 def _skip_tokens(self, state: UsfmParserState) -> None: while self._token_index <= state.index + state.special_token_count: - self._update_block.add_token(state.tokens[self._token_index], marked_for_removal=True) + token = state.tokens[self._token_index] + if self._is_in_embed(token.marker): + self._embed_update_block.add_token(token, marked_for_removal=True) + else: + self._update_block.add_token(token, marked_for_removal=True) self._token_index += 1 self._token_index = state.index + 1 + 
state.special_token_count From d963c7aa0ca0eae893587f2d309159746c9d3fc4 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 3 Apr 2025 13:14:24 -0400 Subject: [PATCH 03/11] I want to process the data in segments that correspond to individual translations. These updates make it happen. --- .vscode/settings.json | 3 +++ machine/corpora/usfm_parser_state.py | 4 ++++ tests/corpora/test_update_usfm_parser_handler.py | 3 +++ 3 files changed, 10 insertions(+) diff --git a/.vscode/settings.json b/.vscode/settings.json index fe6e784e..63beb3c1 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -13,6 +13,9 @@ "source.organizeImports": "explicit" }, }, + "files.associations": { + "*.SFM": "usfm", + }, "black-formatter.path": [ "poetry", "run", diff --git a/machine/corpora/usfm_parser_state.py b/machine/corpora/usfm_parser_state.py index 3f8b40f1..3d0b9e82 100644 --- a/machine/corpora/usfm_parser_state.py +++ b/machine/corpora/usfm_parser_state.py @@ -108,6 +108,10 @@ def is_verse_para(self) -> bool: @property def is_verse_text(self) -> bool: + # anything before verse 1 is not verse text + if self.verse_ref.verse_num == 0: + return False + # Sidebars and notes are not verse text if any(e.type in {UsfmElementType.SIDEBAR, UsfmElementType.NOTE} for e in self._stack): return False diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 431e7e41..c6cf8cea 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -202,6 +202,7 @@ def test_paragraph_in_verse(): ] usfm = r"""\id MAT - Test \c 1 +\p paragraph not in a verse \v 1 verse 1 \p inner verse paragraph \s1 Section Header \v 2 Verse 2 \p inner verse paragraph @@ -211,6 +212,7 @@ def test_paragraph_in_verse(): result = r"""\id MAT - Test \c 1 +\p paragraph not in a verse \v 1 Update 1 \s1 Section Header \v 2 Verse 2 @@ -228,6 +230,7 @@ def test_paragraph_in_verse(): result_strip = 
r"""\id MAT \c 1 +\p \v 1 Update 1 \s1 \v 2 From 747120733cd9fe548fd5de2fd1ba65c5a7216cff Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 3 Apr 2025 13:39:14 -0400 Subject: [PATCH 04/11] Added more test framework --- .../paratext_project_text_updater_base.py | 4 + machine/corpora/update_usfm_parser_handler.py | 2 + .../test_update_scripture_block_updater.py | 119 ++++++++++++++++++ 3 files changed, 125 insertions(+) create mode 100644 tests/corpora/test_update_scripture_block_updater.py diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index a56f2db0..b284cb51 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -1,6 +1,8 @@ from abc import ABC, abstractmethod from typing import BinaryIO, Optional, Sequence, Tuple, Union +from .scripture_update_block_handler_base import ScriptureUpdateBlockHandlerBase + from ..utils.typeshed import StrPath from .paratext_project_settings import ParatextProjectSettings from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase @@ -26,6 +28,7 @@ def update_usfm( embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, preserve_paragraph_styles: Optional[Sequence[str]] = None, + update_block_handlers: Optional[list[ScriptureUpdateBlockHandlerBase]] = None, ) -> Optional[str]: file_name: str = self._settings.get_book_file_name(book_id) if not self._exists(file_name): @@ -40,6 +43,7 @@ def update_usfm( embed_behavior, style_behavior, preserve_paragraph_styles, + update_block_handlers=update_block_handlers, ) try: parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification) diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index dd80c07a..54856a2d 100644 --- 
a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -109,6 +109,8 @@ def start_para( super().start_para(state, marker, unknown, attributes) def end_para(self, state: UsfmParserState, marker: str) -> None: + if not state.is_verse_text: + self._process_update_block() super().end_para(state, marker) self._in_preserved_paragraph = False diff --git a/tests/corpora/test_update_scripture_block_updater.py b/tests/corpora/test_update_scripture_block_updater.py new file mode 100644 index 00000000..32d9057a --- /dev/null +++ b/tests/corpora/test_update_scripture_block_updater.py @@ -0,0 +1,119 @@ +from typing import List, Optional, Sequence, Tuple + +from machine.corpora.scripture_update_block_handler_first_elements_first import ( + ScriptureUpdateBlockHandlerFirstElementsFirst, +) + +from machine.corpora.scripture_update_block_handler_base import ScriptureUpdateBlockHandlerBase +from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH + +from machine.corpora import ( + FileParatextProjectTextUpdater, + ScriptureRef, + UpdateUsfmMarkerBehavior, + UpdateUsfmParserHandler, + UpdateUsfmTextBehavior, + parse_usfm, +) + + +def test_preserve_paragraphs(): + rows = [ + (scr_ref("MAT 1:1"), str("U1")), + ( + scr_ref("MAT 1:1/1:f"), + str("UF1"), + ), + (scr_ref("MAT 1:2"), str("U2")), + ( + scr_ref("MAT 1:2/1:f"), + str("UF2"), + ), + (scr_ref("MAT 1:3"), str("U3")), + ( + scr_ref("MAT 1:3/1:f"), + str("UF3"), + ), + ] + usfm = r"""\id MAT +\c 1 +\v 1 \f \ft \fm ' \fm* hello world \f* it comes first +\v 2 it comes \f \ft hello \fm ' \fm* world \f* middling +\v 3 it comes last \f \ft hello world \fm ' \fm* \f* +""" + + target = update_usfm(rows, usfm) + result = r"""\id MAT +\c 1 +\v 1 U1 \f \ft UF1 \fm ' \fm*\f* +\v 2 U2 \f \ft UF2 \fm ' \fm*\f* +\v 3 U3 \f \ft UF3 \fm ' \fm*\f* +""" + + assess(target, result) + + target_first_element = update_usfm( + rows, usfm, 
update_block_handlers=[ScriptureUpdateBlockHandlerFirstElementsFirst()] + ) + result_first_element = r"""\id MAT +\c 1 +\v 1 \f \ft \fm ' \fm* UF1 \f* U1 +\v 2 U2 \f \ft UF2 \fm ' \fm*\f* +\v 3 U3 \f \ft UF3 \fm ' \fm*\f* +""" + assess(target_first_element, result_first_element) + + +def scr_ref(*refs: str) -> List[ScriptureRef]: + return [ScriptureRef.parse(ref) for ref in refs] + + +def update_usfm( + rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None, + source: Optional[str] = None, + id_text: Optional[str] = None, + text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW, + paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, + embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, + style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, + preserve_paragraph_styles: Optional[Sequence[str]] = None, + update_block_handlers: Optional[list[ScriptureUpdateBlockHandlerBase]] = None, +) -> Optional[str]: + if source is None: + updater = FileParatextProjectTextUpdater(USFM_TEST_PROJECT_PATH) + return updater.update_usfm( + "MAT", + rows, + id_text, + text_behavior, + paragraph_behavior, + embed_behavior, + style_behavior, + preserve_paragraph_styles, + update_block_handlers, + ) + else: + source = source.strip().replace("\r\n", "\n") + "\r\n" + updater = UpdateUsfmParserHandler( + rows, + id_text, + text_behavior, + paragraph_behavior, + embed_behavior, + style_behavior, + preserve_paragraph_styles, + update_block_handlers, + ) + parse_usfm(source, updater) + return updater.get_usfm() + + +def assess(target: Optional[str], truth: str) -> None: + assert target is not None + for target_line, truth_line in zip(target.split("\n"), truth.split("\n")): + assert target_line.strip() == truth_line.strip() + + +def read_usfm() -> str: + with (USFM_TEST_PROJECT_PATH / "41MATTes.SFM").open("r", encoding="utf-8-sig", newline="\r\n") as file: + return file.read() From 
8a7c993c6220c00662a7eaf19358acdd6a55c31c Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 10 Apr 2025 13:33:40 -0400 Subject: [PATCH 05/11] Updates for reviewer comments --- README.md | 8 ++ .../paratext_project_text_updater_base.py | 4 +- machine/corpora/scripture_update_block.py | 4 + ...e.py => scripture_update_block_handler.py} | 3 +- ...date_block_handler_first_elements_first.py | 23 ---- machine/corpora/update_usfm_parser_handler.py | 5 +- .../test_update_scripture_block_updater.py | 119 ------------------ tests/corpora/test_usfm_file_text.py | 32 ++--- 8 files changed, 34 insertions(+), 164 deletions(-) rename machine/corpora/{scripture_update_block_handler_base.py => scripture_update_block_handler.py} (80%) delete mode 100644 machine/corpora/scripture_update_block_handler_first_elements_first.py delete mode 100644 tests/corpora/test_update_scripture_block_updater.py diff --git a/README.md b/README.md index 577e58d3..a11a7ea1 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,14 @@ Machine is available as a pip package: pip install sil-machine ``` +## setup + +You can use the devcontainer (normal process), or you can setup outside of one, especially if you don't have a GPU. 
+ +* Install poetry +* `poetry install` for everything +* `poetry install --without gpu` if you don't have a NVIDA gpu + ## Tutorials If you would like to find out more about how to use Machine, check out the tutorial Jupyter notebooks: diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index b284cb51..02b0566b 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from typing import BinaryIO, Optional, Sequence, Tuple, Union -from .scripture_update_block_handler_base import ScriptureUpdateBlockHandlerBase +from .scripture_update_block_handler import ScriptureUpdateBlockHandler from ..utils.typeshed import StrPath from .paratext_project_settings import ParatextProjectSettings @@ -28,7 +28,7 @@ def update_usfm( embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, preserve_paragraph_styles: Optional[Sequence[str]] = None, - update_block_handlers: Optional[list[ScriptureUpdateBlockHandlerBase]] = None, + update_block_handlers: Optional[list[ScriptureUpdateBlockHandler]] = None, ) -> Optional[str]: file_name: str = self._settings.get_book_file_name(book_id) if not self._exists(file_name): diff --git a/machine/corpora/scripture_update_block.py b/machine/corpora/scripture_update_block.py index 00787cf2..afb9e75a 100644 --- a/machine/corpora/scripture_update_block.py +++ b/machine/corpora/scripture_update_block.py @@ -11,6 +11,10 @@ def __init__(self) -> None: self._ref: ScriptureRef = ScriptureRef() self._elements: list[ScriptureUpdateElement] = [] + @property + def elements(self) -> list[ScriptureUpdateElement]: + return self._elements + def add_existing_text(self, token: UsfmToken, marked_for_removal: bool = False) -> None: self._elements.append( 
ScriptureUpdateElement(ScriptureUpdateElementType.EXISTING_TEXT, [token], marked_for_removal) diff --git a/machine/corpora/scripture_update_block_handler_base.py b/machine/corpora/scripture_update_block_handler.py similarity index 80% rename from machine/corpora/scripture_update_block_handler_base.py rename to machine/corpora/scripture_update_block_handler.py index 2998a0d9..ff1d6f9e 100644 --- a/machine/corpora/scripture_update_block_handler_base.py +++ b/machine/corpora/scripture_update_block_handler.py @@ -1,9 +1,10 @@ from __future__ import annotations +from abc import ABC from .scripture_update_block import ScriptureUpdateBlock -class ScriptureUpdateBlockHandlerBase: +class ScriptureUpdateBlockHandler(ABC): def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: raise NotImplementedError("Must be implemented in subclass") diff --git a/machine/corpora/scripture_update_block_handler_first_elements_first.py b/machine/corpora/scripture_update_block_handler_first_elements_first.py deleted file mode 100644 index 17f44798..00000000 --- a/machine/corpora/scripture_update_block_handler_first_elements_first.py +++ /dev/null @@ -1,23 +0,0 @@ -from __future__ import annotations - -from .scripture_update_block import ScriptureUpdateBlock -from .scripture_update_block_handler_base import ScriptureUpdateBlockHandlerBase -from .scripture_update_element import ScriptureUpdateElementType - - -class ScriptureUpdateBlockHandlerFirstElementsFirst(ScriptureUpdateBlockHandlerBase): - - def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: - # If a paragraph, embed or style element occurs before existing text, move it before inserted text as well. 
- current_insert_index = 0 - for current_index in range(len(block._elements)): - element = block._elements[current_index] - if element.type == ScriptureUpdateElementType.EXISTING_TEXT: - # we found existing text, so we stop looking for elements to move - break - if current_index != current_insert_index and element.type != ScriptureUpdateElementType.INSERTED_TEXT: - block._elements.remove(element) - block._elements.insert(current_insert_index, element) - current_insert_index += 1 - - return block diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index 54856a2d..ecdf0881 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -5,8 +5,7 @@ from .scripture_ref import ScriptureRef from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler from .scripture_update_block import ScriptureUpdateBlock -from .scripture_update_block_handler_base import ScriptureUpdateBlockHandlerBase -from .scripture_update_block_handler_first_elements_first import ScriptureUpdateBlockHandlerFirstElementsFirst +from .scripture_update_block_handler import ScriptureUpdateBlockHandler from .usfm_parser_state import UsfmParserState from .usfm_stylesheet import UsfmStylesheet from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType @@ -35,7 +34,7 @@ def __init__( embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, preserve_paragraph_styles: Optional[Sequence[str]] = None, - update_block_handlers: Optional[list[ScriptureUpdateBlockHandlerBase]] = None, + update_block_handlers: Optional[list[ScriptureUpdateBlockHandler]] = None, ) -> None: super().__init__() self._rows = rows or [] diff --git a/tests/corpora/test_update_scripture_block_updater.py b/tests/corpora/test_update_scripture_block_updater.py deleted file mode 100644 index 32d9057a..00000000 --- 
a/tests/corpora/test_update_scripture_block_updater.py +++ /dev/null @@ -1,119 +0,0 @@ -from typing import List, Optional, Sequence, Tuple - -from machine.corpora.scripture_update_block_handler_first_elements_first import ( - ScriptureUpdateBlockHandlerFirstElementsFirst, -) - -from machine.corpora.scripture_update_block_handler_base import ScriptureUpdateBlockHandlerBase -from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH - -from machine.corpora import ( - FileParatextProjectTextUpdater, - ScriptureRef, - UpdateUsfmMarkerBehavior, - UpdateUsfmParserHandler, - UpdateUsfmTextBehavior, - parse_usfm, -) - - -def test_preserve_paragraphs(): - rows = [ - (scr_ref("MAT 1:1"), str("U1")), - ( - scr_ref("MAT 1:1/1:f"), - str("UF1"), - ), - (scr_ref("MAT 1:2"), str("U2")), - ( - scr_ref("MAT 1:2/1:f"), - str("UF2"), - ), - (scr_ref("MAT 1:3"), str("U3")), - ( - scr_ref("MAT 1:3/1:f"), - str("UF3"), - ), - ] - usfm = r"""\id MAT -\c 1 -\v 1 \f \ft \fm ' \fm* hello world \f* it comes first -\v 2 it comes \f \ft hello \fm ' \fm* world \f* middling -\v 3 it comes last \f \ft hello world \fm ' \fm* \f* -""" - - target = update_usfm(rows, usfm) - result = r"""\id MAT -\c 1 -\v 1 U1 \f \ft UF1 \fm ' \fm*\f* -\v 2 U2 \f \ft UF2 \fm ' \fm*\f* -\v 3 U3 \f \ft UF3 \fm ' \fm*\f* -""" - - assess(target, result) - - target_first_element = update_usfm( - rows, usfm, update_block_handlers=[ScriptureUpdateBlockHandlerFirstElementsFirst()] - ) - result_first_element = r"""\id MAT -\c 1 -\v 1 \f \ft \fm ' \fm* UF1 \f* U1 -\v 2 U2 \f \ft UF2 \fm ' \fm*\f* -\v 3 U3 \f \ft UF3 \fm ' \fm*\f* -""" - assess(target_first_element, result_first_element) - - -def scr_ref(*refs: str) -> List[ScriptureRef]: - return [ScriptureRef.parse(ref) for ref in refs] - - -def update_usfm( - rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None, - source: Optional[str] = None, - id_text: Optional[str] = None, - text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW, - 
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, - embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, - style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, - preserve_paragraph_styles: Optional[Sequence[str]] = None, - update_block_handlers: Optional[list[ScriptureUpdateBlockHandlerBase]] = None, -) -> Optional[str]: - if source is None: - updater = FileParatextProjectTextUpdater(USFM_TEST_PROJECT_PATH) - return updater.update_usfm( - "MAT", - rows, - id_text, - text_behavior, - paragraph_behavior, - embed_behavior, - style_behavior, - preserve_paragraph_styles, - update_block_handlers, - ) - else: - source = source.strip().replace("\r\n", "\n") + "\r\n" - updater = UpdateUsfmParserHandler( - rows, - id_text, - text_behavior, - paragraph_behavior, - embed_behavior, - style_behavior, - preserve_paragraph_styles, - update_block_handlers, - ) - parse_usfm(source, updater) - return updater.get_usfm() - - -def assess(target: Optional[str], truth: str) -> None: - assert target is not None - for target_line, truth_line in zip(target.split("\n"), truth.split("\n")): - assert target_line.strip() == truth_line.strip() - - -def read_usfm() -> str: - with (USFM_TEST_PROJECT_PATH / "41MATTes.SFM").open("r", encoding="utf-8-sig", newline="\r\n") as file: - return file.read() diff --git a/tests/corpora/test_usfm_file_text.py b/tests/corpora/test_usfm_file_text.py index 44451708..e046d71d 100644 --- a/tests/corpora/test_usfm_file_text.py +++ b/tests/corpora/test_usfm_file_text.py @@ -66,7 +66,7 @@ def test_get_rows_nonempty_text_all_text() -> None: assert text is not None rows = list(text) - assert len(rows) == 50 + assert len(rows) == 52 assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:0/1:h", corpus.versification) assert rows[0].text == "Matthew" @@ -113,20 +113,20 @@ def test_get_rows_nonempty_text_all_text() -> None: assert scripture_ref(rows[24]) == ScriptureRef.parse("MAT 2:0/4:p", 
corpus.versification) assert not rows[24].text - assert scripture_ref(rows[26]) == ScriptureRef.parse("MAT 2:1/1:f", corpus.versification) - assert rows[26].text == "This is a footnote." + assert scripture_ref(rows[27]) == ScriptureRef.parse("MAT 2:1/1:f", corpus.versification) + assert rows[27].text == "This is a footnote." - assert scripture_ref(rows[29]) == ScriptureRef.parse("MAT 2:3/2:esb/1:ms", corpus.versification) - assert rows[29].text == "This is a sidebar" + assert scripture_ref(rows[30]) == ScriptureRef.parse("MAT 2:3/2:esb/1:ms", corpus.versification) + assert rows[30].text == "This is a sidebar" - assert scripture_ref(rows[30]) == ScriptureRef.parse("MAT 2:3/2:esb/2:p", corpus.versification) - assert rows[30].text == "Here is some sidebar content." + assert scripture_ref(rows[31]) == ScriptureRef.parse("MAT 2:3/2:esb/2:p", corpus.versification) + assert rows[31].text == "Here is some sidebar content." - assert scripture_ref(rows[36]) == ScriptureRef.parse("MAT 2:7a/1:s", corpus.versification) - assert rows[36].text == "Section header" + assert scripture_ref(rows[37]) == ScriptureRef.parse("MAT 2:7a/1:s", corpus.versification) + assert rows[37].text == "Section header" - assert scripture_ref(rows[43]) == ScriptureRef.parse("MAT 2:12/1:restore", corpus.versification) - assert rows[43].text == "restore information" + assert scripture_ref(rows[44]) == ScriptureRef.parse("MAT 2:12/1:restore", corpus.versification) + assert rows[44].text == "restore information" def test_get_rows_sentence_start() -> None: @@ -220,7 +220,7 @@ def test_get_rows_include_markers_all_text() -> None: assert text is not None rows = list(text) - assert len(rows) == 46 + assert len(rows) == 48 assert scripture_ref(rows[2]) == ScriptureRef.parse("MAT 1:0/3:ip", corpus.versification) assert rows[2].text == "An introduction to Matthew\\fe + \\ft This is an endnote.\\fe*" @@ -240,11 +240,11 @@ def test_get_rows_include_markers_all_text() -> None: assert scripture_ref(rows[20]) == 
ScriptureRef.parse("MAT 2:0/3:s1", corpus.versification) assert rows[20].text == "Chapter \\it Two \\it*" - assert scripture_ref(rows[22]) == ScriptureRef.parse("MAT 2:1", corpus.versification) - assert rows[22].text == "Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one." + assert scripture_ref(rows[23]) == ScriptureRef.parse("MAT 2:1", corpus.versification) + assert rows[23].text == "Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one." - assert scripture_ref(rows[26]) == ScriptureRef.parse("MAT 2:3/2:esb/2:p", corpus.versification) - assert rows[26].text == "Here is some sidebar // content." + assert scripture_ref(rows[27]) == ScriptureRef.parse("MAT 2:3/2:esb/2:p", corpus.versification) + assert rows[27].text == "Here is some sidebar // content." def test_get_rows_invalid_id() -> None: From 92ee88e37a915a70c7e05f4bae336f6c6bf2d934 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 10 Apr 2025 14:24:53 -0400 Subject: [PATCH 06/11] linting --- .github/workflows/ci.yml | 2 +- README.md | 8 - .../paratext_project_terms_parser_base.py | 6 +- .../paratext_project_text_updater_base.py | 3 +- .../corpora/scripture_update_block_handler.py | 1 + .../zip_paratext_project_terms_parser.py | 2 +- poetry.lock | 339 +++++++++++++++--- pyproject.toml | 2 +- .../test_update_usfm_parser_handler.py | 2 +- 9 files changed, 308 insertions(+), 57 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 380c8e10..044a4db5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -55,7 +55,7 @@ jobs: node-version: "14" - name: Lint with pyright run: | - npm install -g pyright@1.1.386 + npm install -g pyright@1.1.399 poetry run pyright - name: Test with pytest run: poetry run pytest --cov --cov-report=xml diff --git a/README.md b/README.md index a11a7ea1..577e58d3 100644 --- a/README.md +++ b/README.md @@ -10,14 +10,6 @@ Machine is available as a pip package: pip install sil-machine ``` 
-## setup - -You can use the devcontainer (normal process), or you can setup outside of one, especially if you don't have a GPU. - -* Install poetry -* `poetry install` for everything -* `poetry install --without gpu` if you don't have a NVIDA gpu - ## Tutorials If you would like to find out more about how to use Machine, check out the tutorial Jupyter notebooks: diff --git a/machine/corpora/paratext_project_terms_parser_base.py b/machine/corpora/paratext_project_terms_parser_base.py index ea71ab80..00496443 100644 --- a/machine/corpora/paratext_project_terms_parser_base.py +++ b/machine/corpora/paratext_project_terms_parser_base.py @@ -45,7 +45,7 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) - else: term_id_to_category_dict = {} - terms_glosses_doc: Optional[ElementTree.ElementTree] = None + terms_glosses_doc: Optional[ElementTree.ElementTree[ElementTree.Element]] = None resource_name = None if self._settings.language_code is not None: resource_name = _SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS.get(self._settings.language_code) @@ -57,7 +57,7 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) - with open_binary(_SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS_PACKAGE, resource_name) as stream: terms_glosses_doc = ElementTree.parse(stream) - term_renderings_doc: Optional[ElementTree.ElementTree] = None + term_renderings_doc: Optional[ElementTree.ElementTree[ElementTree.Element]] = None if self._exists("TermRenderings.xml"): with self._open("TermRenderings.xml") as stream: term_renderings_doc = ElementTree.parse(stream) @@ -136,7 +136,7 @@ def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str: return term_string -def _get_category_per_id(biblical_terms_doc: ElementTree.ElementTree) -> Dict[str, str]: +def _get_category_per_id(biblical_terms_doc: ElementTree.ElementTree[ElementTree.Element]) -> Dict[str, str]: term_id_to_category_dict: Dict[str, str] = {} for term in 
biblical_terms_doc.findall(".//Term"): diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index 02b0566b..8ba806a8 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -1,12 +1,11 @@ from abc import ABC, abstractmethod from typing import BinaryIO, Optional, Sequence, Tuple, Union -from .scripture_update_block_handler import ScriptureUpdateBlockHandler - from ..utils.typeshed import StrPath from .paratext_project_settings import ParatextProjectSettings from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase from .scripture_ref import ScriptureRef +from .scripture_update_block_handler import ScriptureUpdateBlockHandler from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior from .usfm_parser import parse_usfm diff --git a/machine/corpora/scripture_update_block_handler.py b/machine/corpora/scripture_update_block_handler.py index ff1d6f9e..bcbe8fb8 100644 --- a/machine/corpora/scripture_update_block_handler.py +++ b/machine/corpora/scripture_update_block_handler.py @@ -1,4 +1,5 @@ from __future__ import annotations + from abc import ABC from .scripture_update_block import ScriptureUpdateBlock diff --git a/machine/corpora/zip_paratext_project_terms_parser.py b/machine/corpora/zip_paratext_project_terms_parser.py index 3f781b21..ebc208a0 100644 --- a/machine/corpora/zip_paratext_project_terms_parser.py +++ b/machine/corpora/zip_paratext_project_terms_parser.py @@ -19,5 +19,5 @@ def _exists(self, file_name: StrPath) -> bool: def _open(self, file_name: StrPath) -> Optional[BinaryIO]: if file_name in self._archive.namelist(): - return BytesIO(self._archive.read(file_name)) + return BytesIO(self._archive.read(str(file_name))) return None diff --git a/poetry.lock b/poetry.lock index 4d8ded6e..a290b5dd 100644 --- a/poetry.lock +++ b/poetry.lock 
@@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. [[package]] name = "accelerate" @@ -6,6 +6,8 @@ version = "0.26.1" description = "Accelerate" optional = false python-versions = ">=3.8.0" +groups = ["gpu"] +markers = "sys_platform == \"win32\" or sys_platform == \"linux\"" files = [ {file = "accelerate-0.26.1-py3-none-any.whl", hash = "sha256:04df826b84ac7bad8a0a8ab90e6aeacdecb1ea5a2d744d7e94f6735c29183227"}, {file = "accelerate-0.26.1.tar.gz", hash = "sha256:bf63716b6bd9460d87da970cf4d833abb824ca0aa633be36b741e63a1b504f89"}, @@ -36,6 +38,8 @@ version = "2.4.3" description = "Happy Eyeballs for asyncio" optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"huggingface\"" files = [ {file = "aiohappyeyeballs-2.4.3-py3-none-any.whl", hash = "sha256:8a7a83727b2756f394ab2895ea0765a0a8c475e3c71e98d43d76f22b4b435572"}, {file = "aiohappyeyeballs-2.4.3.tar.gz", hash = "sha256:75cf88a15106a5002a8eb1dab212525c00d1f4c0fa96e551c9fbe6f09a621586"}, @@ -47,6 +51,8 @@ version = "3.10.10" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"huggingface\"" files = [ {file = "aiohttp-3.10.10-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:be7443669ae9c016b71f402e43208e13ddf00912f47f623ee5994e12fc7d4b3f"}, {file = "aiohttp-3.10.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7b06b7843929e41a94ea09eb1ce3927865387e3e23ebe108e0d0d09b08d25be9"}, @@ -151,7 +157,7 @@ multidict = ">=4.5,<7.0" yarl = ">=1.12.0,<2.0" [package.extras] -speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] +speedups = ["Brotli ; platform_python_implementation == \"CPython\"", "aiodns (>=3.2.0) ; sys_platform == \"linux\" or sys_platform == \"darwin\"", "brotlicffi ; platform_python_implementation != \"CPython\""] [[package]] 
name = "aiosignal" @@ -159,6 +165,8 @@ version = "1.3.1" description = "aiosignal: a list of registered asynchronous callbacks" optional = false python-versions = ">=3.7" +groups = ["main"] +markers = "extra == \"huggingface\"" files = [ {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, @@ -173,6 +181,7 @@ version = "4.6.2.post1" description = "High level compatibility layer for multiple asynchronous event loop implementations" optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ {file = "anyio-4.6.2.post1-py3-none-any.whl", hash = "sha256:6d170c36fba3bdd840c73d3868c1e777e33676a69c3a72cf0a0d5d6d8009b61d"}, {file = "anyio-4.6.2.post1.tar.gz", hash = "sha256:4c8bc31ccdb51c7f7bd251f51c609e038d63e34219b44aa86e47576389880b4c"}, @@ -186,7 +195,7 @@ typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} [package.extras] doc = ["Sphinx (>=7.4,<8.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] -test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "truststore (>=0.9.1)", "uvloop (>=0.21.0b1)"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "truststore (>=0.9.1) ; python_version >= \"3.10\"", "uvloop (>=0.21.0b1) ; platform_python_implementation == \"CPython\" and platform_system != \"Windows\""] trio = ["trio (>=0.26.1)"] [[package]] @@ -195,6 +204,8 @@ version = "0.1.4" description = "Disable App Nap on macOS >= 10.9" optional = false python-versions = ">=3.6" +groups = ["dev"] +markers = "platform_system == \"Darwin\"" files = [ {file = "appnope-0.1.4-py2.py3-none-any.whl", hash 
= "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c"}, {file = "appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee"}, @@ -206,6 +217,7 @@ version = "23.1.0" description = "Argon2 for Python" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "argon2_cffi-23.1.0-py3-none-any.whl", hash = "sha256:c670642b78ba29641818ab2e68bd4e6a78ba53b7eff7b4c3815ae16abf91c7ea"}, {file = "argon2_cffi-23.1.0.tar.gz", hash = "sha256:879c3e79a2729ce768ebb7d36d4609e3a78a4ca2ec3a9f12286ca057e3d0db08"}, @@ -226,6 +238,7 @@ version = "21.2.0" description = "Low-level CFFI bindings for Argon2" optional = false python-versions = ">=3.6" +groups = ["dev"] files = [ {file = "argon2-cffi-bindings-21.2.0.tar.gz", hash = "sha256:bb89ceffa6c791807d1305ceb77dbfacc5aa499891d2c55661c6459651fc39e3"}, {file = "argon2_cffi_bindings-21.2.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ccb949252cb2ab3a08c02024acb77cfb179492d5701c7cbdbfd776124d4d2367"}, @@ -263,6 +276,7 @@ version = "1.3.0" description = "Better dates & times for Python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "arrow-1.3.0-py3-none-any.whl", hash = "sha256:c728b120ebc00eb84e01882a6f5e7927a53960aa990ce7dd2b10f39005a67f80"}, {file = "arrow-1.3.0.tar.gz", hash = "sha256:d4540617648cb5f895730f1ad8c82a65f2dad0166f57b75f3ca54759c4d67a85"}, @@ -282,6 +296,7 @@ version = "2.4.1" description = "Annotate AST trees with source code positions" optional = false python-versions = "*" +groups = ["dev"] files = [ {file = "asttokens-2.4.1-py2.py3-none-any.whl", hash = "sha256:051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24"}, {file = "asttokens-2.4.1.tar.gz", hash = "sha256:b03869718ba9a6eb027e134bfdf69f38a236d681c83c160d510768af11254ba0"}, @@ -291,8 +306,8 @@ files = [ six = ">=1.12.0" [package.extras] -astroid = ["astroid (>=1,<2)", "astroid (>=2,<4)"] -test = ["astroid (>=1,<2)", "astroid 
(>=2,<4)", "pytest"] +astroid = ["astroid (>=1,<2) ; python_version < \"3\"", "astroid (>=2,<4) ; python_version >= \"3\""] +test = ["astroid (>=1,<2) ; python_version < \"3\"", "astroid (>=2,<4) ; python_version >= \"3\"", "pytest"] [[package]] name = "async-lru" @@ -300,6 +315,7 @@ version = "2.0.4" description = "Simple LRU cache for asyncio" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "async-lru-2.0.4.tar.gz", hash = "sha256:b8a59a5df60805ff63220b2a0c5b5393da5521b113cd5465a44eb037d81a5627"}, {file = "async_lru-2.0.4-py3-none-any.whl", hash = "sha256:ff02944ce3c288c5be660c42dbcca0742b32c3b279d6dceda655190240b99224"}, @@ -314,6 +330,8 @@ version = "4.0.3" description = "Timeout context manager for asyncio programs" optional = false python-versions = ">=3.7" +groups = ["main"] +markers = "extra == \"huggingface\" and python_version < \"3.11\"" files = [ {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, @@ -325,18 +343,20 @@ version = "24.2.0" description = "Classes Without Boilerplate" optional = false python-versions = ">=3.7" +groups = ["main", "dev"] files = [ {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"}, {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"}, ] +markers = {main = "extra == \"huggingface\" or extra == \"jobs\""} [package.extras] -benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -dev = ["cloudpickle", "hypothesis", 
"mypy (>=1.11.1)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +benchmark = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.9\"", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.9\" and python_version < \"3.13\"", "pytest-xdist[psutil]"] +cov = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.9\"", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.9\" and python_version < \"3.13\"", "pytest-xdist[psutil]"] +dev = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.9\"", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.9\" and python_version < \"3.13\"", "pytest-xdist[psutil]"] docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] -tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] +tests = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.9\"", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.9\" and python_version < \"3.13\"", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.11.1) ; 
platform_python_implementation == \"CPython\" and python_version >= \"3.9\"", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.9\" and python_version < \"3.13\""] [[package]] name = "babel" @@ -344,6 +364,7 @@ version = "2.16.0" description = "Internationalization utilities" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b"}, {file = "babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316"}, @@ -358,6 +379,7 @@ version = "4.12.3" description = "Screen-scraping library" optional = false python-versions = ">=3.6.0" +groups = ["dev"] files = [ {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, @@ -379,6 +401,7 @@ version = "24.10.0" description = "The uncompromising code formatter." optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ {file = "black-24.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e6668650ea4b685440857138e5fe40cde4d652633b1bdffc62933d0db4ed9812"}, {file = "black-24.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1c536fcf674217e87b8cc3657b81809d3c085d7bf3ef262ead700da345bfa6ea"}, @@ -425,6 +448,7 @@ version = "6.2.0" description = "An easy safelist-based HTML-sanitizing tool." 
optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ {file = "bleach-6.2.0-py3-none-any.whl", hash = "sha256:117d9c6097a7c3d22fd578fcd8d35ff1e125df6736f554da4e432fdd63f31e5e"}, {file = "bleach-6.2.0.tar.gz", hash = "sha256:123e894118b8a599fd80d3ec1a6d4cc7ce4e5882b1317a7e1ba69b56e95f991f"}, @@ -442,6 +466,7 @@ version = "1.35.51" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "boto3-1.35.51-py3-none-any.whl", hash = "sha256:c922f6a18958af9d8af0489d6d8503b517029d8159b26aa4859a8294561c72e9"}, {file = "boto3-1.35.51.tar.gz", hash = "sha256:a57c6c7012ecb40c43e565a6f7a891f39efa990ff933eab63cd456f7501c2731"}, @@ -461,6 +486,7 @@ version = "1.35.51" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "botocore-1.35.51-py3-none-any.whl", hash = "sha256:4d65b00111bd12b98e9f920ecab602cf619cc6a6d0be6e5dd53f517e4b92901c"}, {file = "botocore-1.35.51.tar.gz", hash = "sha256:a9b3d1da76b3e896ad74605c01d88f596324a3337393d4bfbfa0d6c35822ca9c"}, @@ -483,10 +509,12 @@ version = "2024.8.30" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" +groups = ["main", "dev", "gpu"] files = [ {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"}, {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"}, ] +markers = {main = "extra == \"huggingface\" or extra == \"jobs\"", gpu = "sys_platform == \"win32\" or sys_platform == \"linux\""} [[package]] name = "cffi" @@ -494,6 +522,7 @@ version = "1.17.1" description = "Foreign Function Interface for Python calling C code." 
optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, @@ -573,6 +602,7 @@ version = "2.1.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." optional = false python-versions = ">=3.6.0" +groups = ["main", "dev", "gpu"] files = [ {file = "charset-normalizer-2.1.1.tar.gz", hash = "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845"}, {file = "charset_normalizer-2.1.1-py3-none-any.whl", hash = "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"}, @@ -587,6 +617,8 @@ version = "1.16.5" description = "ClearML - Auto-Magical Experiment Manager, Version Control, and MLOps for AI" optional = false python-versions = "*" +groups = ["main"] +markers = "extra == \"jobs\"" files = [ {file = "clearml-1.16.5-py2.py3-none-any.whl", hash = "sha256:3caa00914e039cb2b62ca90795c3ca17077042ae1edcefc17bf13f695653480f"}, ] @@ -620,10 +652,12 @@ version = "8.1.7" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" +groups = ["main", "dev"] files = [ {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, ] +markers = {main = "extra == \"huggingface\""} [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} @@ -634,10 +668,12 @@ version = "0.4.6" description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["main", "dev", "gpu"] files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +markers = {main = "extra == \"huggingface\" and platform_system == \"Windows\"", dev = "platform_system == \"Windows\" or sys_platform == \"win32\"", gpu = "(sys_platform == \"win32\" or sys_platform == \"linux\") and platform_system == \"Windows\""} [[package]] name = "comm" @@ -645,6 +681,7 @@ version = "0.2.2" description = "Jupyter Python Comm implementation, for usage in ipykernel, xeus-python etc." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "comm-0.2.2-py3-none-any.whl", hash = "sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3"}, {file = "comm-0.2.2.tar.gz", hash = "sha256:3fd7a84065306e07bea1773df6eb8282de51ba82f77c72f9c85716ab11fe980e"}, @@ -662,6 +699,7 @@ version = "7.6.4" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ {file = "coverage-7.6.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5f8ae553cba74085db385d489c7a792ad66f7f9ba2ee85bfa508aeb84cf0ba07"}, {file = "coverage-7.6.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8165b796df0bd42e10527a3f493c592ba494f16ef3c8b531288e3d0d72c1f6f0"}, @@ -731,7 +769,7 @@ files = [ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""} [package.extras] -toml = ["tomli"] +toml = ["tomli ; python_full_version <= \"3.11.0a6\""] [[package]] name = "cython" @@ -812,6 +850,8 @@ version = "2.21.0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.8.0" +groups = ["main"] 
+markers = "extra == \"huggingface\"" files = [ {file = "datasets-2.21.0-py3-none-any.whl", hash = "sha256:25e4e097110ce28824b746a107727ada94024cba11db8bc588d468414692b65a"}, {file = "datasets-2.21.0.tar.gz", hash = "sha256:998f85a8460f1bd982e5bd058f8a0808eef424249e3df1e8cdd594ccd0dc8ba2"}, @@ -835,9 +875,9 @@ xxhash = "*" [package.extras] apache-beam = ["apache-beam (>=2.26.0)"] -audio = ["librosa", "soundfile (>=0.12.1)", "soxr (>=0.4.0)"] +audio = ["librosa", "soundfile (>=0.12.1)", "soxr (>=0.4.0) ; python_version >= \"3.9\""] benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "transformers", "transformers (>=4.42.0)", "typing-extensions (>=4.6.1)", "zstandard"] +dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14) ; sys_platform != \"win32\"", "jaxlib (>=0.3.14) ; sys_platform != \"win32\"", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0) ; python_version >= \"3.9\"", "sqlalchemy", "tensorflow (>=2.16.0) ; python_version >= \"3.10\"", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0) ; python_version < \"3.10\"", "tiktoken", "torch", "torch (>=2.0.0)", 
"transformers", "transformers (>=4.42.0)", "typing-extensions (>=4.6.1)", "zstandard"] docs = ["s3fs", "tensorflow (>=2.6.0)", "torch", "transformers"] jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] metrics-tests = ["Werkzeug (>=1.0.1)", "accelerate", "bert-score (>=0.3.6)", "jiwer", "langdetect", "mauve-text", "nltk (<3.8.2)", "requests-file (>=1.5.1)", "rouge-score", "sacrebleu", "sacremoses", "scikit-learn", "scipy", "sentencepiece", "seqeval", "six (>=1.15.0,<1.16.0)", "spacy (>=3.0.0)", "texttable (>=1.6.3)", "tldextract", "tldextract (>=3.1.0)", "toml (>=0.10.1)", "typer (<0.5.0)"] @@ -845,8 +885,8 @@ quality = ["ruff (>=0.3.0)"] s3 = ["s3fs"] tensorflow = ["tensorflow (>=2.6.0)"] tensorflow-gpu = ["tensorflow (>=2.6.0)"] -tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "transformers (>=4.42.0)", "typing-extensions (>=4.6.1)", "zstandard"] -tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "typing-extensions (>=4.6.1)", "zstandard"] +tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14) ; sys_platform != \"win32\"", "jaxlib (>=0.3.14) ; 
sys_platform != \"win32\"", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0) ; python_version >= \"3.9\"", "sqlalchemy", "tensorflow (>=2.16.0) ; python_version >= \"3.10\"", "tensorflow (>=2.6.0) ; python_version < \"3.10\"", "tiktoken", "torch (>=2.0.0)", "transformers (>=4.42.0)", "typing-extensions (>=4.6.1)", "zstandard"] +tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "jax (>=0.3.14) ; sys_platform != \"win32\"", "jaxlib (>=0.3.14) ; sys_platform != \"win32\"", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0) ; python_version >= \"3.9\"", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "typing-extensions (>=4.6.1)", "zstandard"] torch = ["torch"] vision = ["Pillow (>=9.4.0)"] @@ -856,6 +896,7 @@ version = "1.8.7" description = "An implementation of the Debug Adapter Protocol for Python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "debugpy-1.8.7-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:95fe04a573b8b22896c404365e03f4eda0ce0ba135b7667a1e57bd079793b96b"}, {file = "debugpy-1.8.7-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:628a11f4b295ffb4141d8242a9bb52b77ad4a63a2ad19217a93be0f77f2c28c9"}, @@ -891,6 +932,7 @@ version = "5.1.1" description = "Decorators for Humans" optional = false python-versions = ">=3.5" +groups = ["dev"] files = [ {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, {file = 
"decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, @@ -902,6 +944,7 @@ version = "2.1.1" description = "Opinionated mocking library for Python" optional = false python-versions = ">=3.7,<4.0" +groups = ["dev"] files = [ {file = "decoy-2.1.1-py3-none-any.whl", hash = "sha256:7ddcc08b8ce991f7705cee76fae9061dcb17352e0a1ca2d9a0d4a0306ebd51cd"}, {file = "decoy-2.1.1.tar.gz", hash = "sha256:575bdbe81afb4c152cd99a34568a9aa4369461f79d6172c678279c5d5585befe"}, @@ -913,6 +956,7 @@ version = "0.7.1" description = "XML bomb protection for Python stdlib modules" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["dev"] files = [ {file = "defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61"}, {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, @@ -924,6 +968,8 @@ version = "0.3.8" description = "serialize all of Python" optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"huggingface\"" files = [ {file = "dill-0.3.8-py3-none-any.whl", hash = "sha256:c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7"}, {file = "dill-0.3.8.tar.gz", hash = "sha256:3ebe3c479ad625c4553aca177444d89b486b1d84982eeacded644afc0cf797ca"}, @@ -939,6 +985,8 @@ version = "3.2.6" description = "The dynamic configurator for your Python Project" optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"jobs\"" files = [ {file = "dynaconf-3.2.6-py2.py3-none-any.whl", hash = "sha256:3911c740d717df4576ed55f616c7cbad6e06bc8ef23ffca444b6e2a12fb1c34c"}, {file = "dynaconf-3.2.6.tar.gz", hash = "sha256:74cc1897396380bb957730eb341cc0976ee9c38bbcb53d3307c50caed0aedfb8"}, @@ -977,6 +1025,8 @@ version = "1.2.2" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" +groups 
= ["dev"] +markers = "python_version < \"3.11\"" files = [ {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, @@ -991,13 +1041,14 @@ version = "2.1.0" description = "Get the currently executing AST node of a frame, and other information" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "executing-2.1.0-py2.py3-none-any.whl", hash = "sha256:8d63781349375b5ebccc3142f4b30350c0cd9c79f921cde38be2be4637e98eaf"}, {file = "executing-2.1.0.tar.gz", hash = "sha256:8ea27ddd260da8150fa5a708269c4a10e76161e2496ec3e587da9e3c0fe4b9ab"}, ] [package.extras] -tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"] +tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich ; python_version >= \"3.11\""] [[package]] name = "fastjsonschema" @@ -1005,6 +1056,7 @@ version = "2.20.0" description = "Fastest Python implementation of JSON schema" optional = false python-versions = "*" +groups = ["dev"] files = [ {file = "fastjsonschema-2.20.0-py3-none-any.whl", hash = "sha256:5875f0b0fa7a0043a91e93a9b8f793bcbbba9691e7fd83dca95c28ba26d21f0a"}, {file = "fastjsonschema-2.20.0.tar.gz", hash = "sha256:3d48fc5300ee96f5d116f10fe6f28d938e6008f59a6a025c2649475b87f76a23"}, @@ -1019,15 +1071,17 @@ version = "3.16.1" description = "A platform independent file lock." 
optional = false python-versions = ">=3.8" +groups = ["main", "gpu"] files = [ {file = "filelock-3.16.1-py3-none-any.whl", hash = "sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0"}, {file = "filelock-3.16.1.tar.gz", hash = "sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435"}, ] +markers = {main = "extra == \"huggingface\"", gpu = "sys_platform == \"win32\" or sys_platform == \"linux\""} [package.extras] docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4.1)"] testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.2)", "pytest (>=8.3.3)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.4)"] -typing = ["typing-extensions (>=4.12.2)"] +typing = ["typing-extensions (>=4.12.2) ; python_version < \"3.11\""] [[package]] name = "flake8" @@ -1035,6 +1089,7 @@ version = "7.1.1" description = "the modular source code checker: pep8 pyflakes and co" optional = false python-versions = ">=3.8.1" +groups = ["dev"] files = [ {file = "flake8-7.1.1-py2.py3-none-any.whl", hash = "sha256:597477df7860daa5aa0fdd84bf5208a043ab96b8e96ab708770ae0364dd03213"}, {file = "flake8-7.1.1.tar.gz", hash = "sha256:049d058491e228e03e67b390f311bbf88fce2dbaa8fa673e7aea87b7198b8d38"}, @@ -1051,6 +1106,7 @@ version = "1.5.1" description = "Validates fully-qualified domain names against RFC 1123, so that they are acceptable to modern bowsers" optional = false python-versions = ">=2.7, !=3.0, !=3.1, !=3.2, !=3.3, !=3.4, <4" +groups = ["dev"] files = [ {file = "fqdn-1.5.1-py3-none-any.whl", hash = "sha256:3a179af3761e4df6eb2e026ff9e1a3033d3587bf980a0b1b2e1e5d08d7358014"}, {file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"}, @@ -1062,6 +1118,8 @@ version = "1.5.0" description = "A list-like structure which implements collections.abc.MutableSequence" optional = false python-versions = 
">=3.8" +groups = ["main"] +markers = "extra == \"huggingface\"" files = [ {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a"}, {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb"}, @@ -1163,10 +1221,12 @@ version = "2024.6.1" description = "File-system specification" optional = false python-versions = ">=3.8" +groups = ["main", "gpu"] files = [ {file = "fsspec-2024.6.1-py3-none-any.whl", hash = "sha256:3cb443f8bcd2efb31295a5b9fdb02aee81d8452c80d28f97a6d0959e6cee101e"}, {file = "fsspec-2024.6.1.tar.gz", hash = "sha256:fad7d7e209dd4c1208e3bbfda706620e0da5142bebbd9c384afb95b07e798e49"}, ] +markers = {main = "extra == \"huggingface\"", gpu = "sys_platform == \"win32\" or sys_platform == \"linux\""} [package.dependencies] aiohttp = {version = "<4.0.0a0 || >4.0.0a0,<4.0.0a1 || >4.0.0a1", optional = true, markers = "extra == \"http\""} @@ -1205,6 +1265,8 @@ version = "2.1.3" description = "URL manipulation made simple." optional = false python-versions = "*" +groups = ["main"] +markers = "extra == \"jobs\"" files = [ {file = "furl-2.1.3-py2.py3-none-any.whl", hash = "sha256:9ab425062c4217f9802508e45feb4a83e54324273ac4b202f1850363309666c0"}, {file = "furl-2.1.3.tar.gz", hash = "sha256:5a6188fe2666c484a12159c18be97a1977a71d632ef5bb867ef15f54af39cc4e"}, @@ -1220,6 +1282,7 @@ version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, @@ -1231,6 +1294,7 @@ version = "1.0.6" description = "A minimal low-level HTTP client." 
optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "httpcore-1.0.6-py3-none-any.whl", hash = "sha256:27b59625743b85577a8c0e10e55b50b5368a4f2cfe8cc7bcfa9cf00829c2682f"}, {file = "httpcore-1.0.6.tar.gz", hash = "sha256:73f6dbd6eb8c21bbf7ef8efad555481853f5f6acdeaff1edb0694289269ee17f"}, @@ -1252,6 +1316,7 @@ version = "0.27.2" description = "The next generation HTTP client." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0"}, {file = "httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2"}, @@ -1265,7 +1330,7 @@ idna = "*" sniffio = "*" [package.extras] -brotli = ["brotli", "brotlicffi"] +brotli = ["brotli ; platform_python_implementation == \"CPython\"", "brotlicffi ; platform_python_implementation != \"CPython\""] cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] @@ -1277,10 +1342,12 @@ version = "0.26.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.8.0" +groups = ["main", "gpu"] files = [ {file = "huggingface_hub-0.26.2-py3-none-any.whl", hash = "sha256:98c2a5a8e786c7b2cb6fdeb2740893cba4d53e312572ed3d8afafda65b128c46"}, {file = "huggingface_hub-0.26.2.tar.gz", hash = "sha256:b100d853465d965733964d123939ba287da60a547087783ddff8a323f340332b"}, ] +markers = {main = "extra == \"huggingface\"", gpu = "sys_platform == \"win32\" or sys_platform == \"linux\""} [package.dependencies] filelock = "*" @@ -1311,10 +1378,12 @@ version = "3.10" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.6" +groups = ["main", "dev", "gpu"] files = [ {file = "idna-3.10-py3-none-any.whl", hash = 
"sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, ] +markers = {main = "extra == \"huggingface\" or extra == \"jobs\"", gpu = "sys_platform == \"win32\" or sys_platform == \"linux\""} [package.extras] all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] @@ -1325,6 +1394,8 @@ version = "8.5.0" description = "Read metadata from Python packages" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version == \"3.9\"" files = [ {file = "importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b"}, {file = "importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7"}, @@ -1334,12 +1405,12 @@ files = [ zipp = ">=3.20" [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] enabler = ["pytest-enabler (>=2.2)"] perf = ["ipython"] -test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] +test = ["flufl.flake8", "importlib-resources (>=1.3) ; python_version < \"3.9\"", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] type = ["pytest-mypy"] [[package]] @@ -1348,6 +1419,7 @@ version = "2.0.0" description = "brain-dead simple config-ini parsing" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, {file = 
"iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, @@ -1359,6 +1431,7 @@ version = "6.29.5" description = "IPython Kernel for Jupyter" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "ipykernel-6.29.5-py3-none-any.whl", hash = "sha256:afdb66ba5aa354b09b91379bac28ae4afebbb30e8b39510c9690afb7a10421b5"}, {file = "ipykernel-6.29.5.tar.gz", hash = "sha256:f093a22c4a40f8828f8e330a9c297cb93dcab13bd9678ded6de8e5cf81c56215"}, @@ -1392,6 +1465,7 @@ version = "8.18.1" description = "IPython: Productive Interactive Computing" optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ {file = "ipython-8.18.1-py3-none-any.whl", hash = "sha256:e8267419d72d81955ec1177f8a29aaa90ac80ad647499201119e2f05e99aa397"}, {file = "ipython-8.18.1.tar.gz", hash = "sha256:ca6f079bb33457c66e233e4580ebfc4128855b4cf6370dddd73842a9563e8a27"}, @@ -1429,6 +1503,7 @@ version = "8.1.5" description = "Jupyter interactive widgets" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "ipywidgets-8.1.5-py3-none-any.whl", hash = "sha256:3290f526f87ae6e77655555baba4f36681c555b8bdbbff430b70e52c34c86245"}, {file = "ipywidgets-8.1.5.tar.gz", hash = "sha256:870e43b1a35656a80c18c9503bbf2d16802db1cb487eec6fab27d683381dde17"}, @@ -1450,6 +1525,7 @@ version = "20.11.0" description = "Operations with ISO 8601 durations" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "isoduration-20.11.0-py3-none-any.whl", hash = "sha256:b2904c2a4228c3d44f409c8ae8e2370eb21a26f7ac2ec5446df141dde3452042"}, {file = "isoduration-20.11.0.tar.gz", hash = "sha256:ac2f9015137935279eac671f94f89eb00584f940f5dc49462a0c4ee692ba1bd9"}, @@ -1464,6 +1540,7 @@ version = "5.13.2" description = "A Python utility / library to sort Python imports." 
optional = false python-versions = ">=3.8.0" +groups = ["dev"] files = [ {file = "isort-5.13.2-py3-none-any.whl", hash = "sha256:8ca5e72a8d85860d5a3fa69b8745237f2939afe12dbf656afbcb47fe72d947a6"}, {file = "isort-5.13.2.tar.gz", hash = "sha256:48fdfcb9face5d58a4f6dde2e72a1fb8dcaf8ab26f95ab49fab84c2ddefb0109"}, @@ -1478,6 +1555,7 @@ version = "0.19.1" description = "An autocompletion tool for Python that can be used for text editors." optional = false python-versions = ">=3.6" +groups = ["dev"] files = [ {file = "jedi-0.19.1-py2.py3-none-any.whl", hash = "sha256:e983c654fe5c02867aef4cdfce5a2fbb4a50adc0af145f70504238f18ef5e7e0"}, {file = "jedi-0.19.1.tar.gz", hash = "sha256:cf0496f3651bc65d7174ac1b7d043eff454892c708a87d1b683e57b569927ffd"}, @@ -1497,10 +1575,12 @@ version = "3.1.4" description = "A very fast and expressive template engine." optional = false python-versions = ">=3.7" +groups = ["dev", "gpu"] files = [ {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, ] +markers = {gpu = "sys_platform == \"win32\" or sys_platform == \"linux\""} [package.dependencies] MarkupSafe = ">=2.0" @@ -1514,6 +1594,7 @@ version = "1.0.1" description = "JSON Matching Expressions" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, @@ -1525,6 +1606,8 @@ version = "1.4.2" description = "Lightweight pipelining with Python functions" optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"huggingface\"" files = [ {file = "joblib-1.4.2-py3-none-any.whl", hash = 
"sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"}, {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"}, @@ -1536,6 +1619,8 @@ version = "1.5.2" description = "Streaming JSON encoder and decoder" optional = false python-versions = "<4,>=3.5" +groups = ["main"] +markers = "extra == \"jobs\"" files = [ {file = "json-stream-1.5.2.tar.gz", hash = "sha256:e6f895d48190b539c431e3d8623ed868bee6d0005d5b213be6ee26256ef20ebc"}, {file = "json_stream-1.5.2-py3-none-any.whl", hash = "sha256:e0363e887770e879f438c151c56f2d12fda674e92bbf1b5c184d84723deee631"}, @@ -1550,6 +1635,7 @@ version = "0.9.25" description = "A Python implementation of the JSON5 data format." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "json5-0.9.25-py3-none-any.whl", hash = "sha256:34ed7d834b1341a86987ed52f3f76cd8ee184394906b6e22a1e0deb9ab294e8f"}, {file = "json5-0.9.25.tar.gz", hash = "sha256:548e41b9be043f9426776f05df8635a00fe06104ea51ed24b67f908856e151ae"}, @@ -1561,6 +1647,7 @@ version = "3.0.0" description = "Identify specific nodes in a JSON document (RFC 6901)" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "jsonpointer-3.0.0-py2.py3-none-any.whl", hash = "sha256:13e088adc14fca8b6aa8177c044e12701e6ad4b28ff10e65f2267a90109c9942"}, {file = "jsonpointer-3.0.0.tar.gz", hash = "sha256:2b2d729f2091522d61c3b31f82e11870f60b68f43fbc705cb76bf4b832af59ef"}, @@ -1572,10 +1659,12 @@ version = "4.23.0" description = "An implementation of JSON Schema validation for Python" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "jsonschema-4.23.0-py3-none-any.whl", hash = "sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566"}, {file = "jsonschema-4.23.0.tar.gz", hash = "sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4"}, ] +markers = {main = "extra == \"jobs\""} [package.dependencies] attrs = 
">=22.2.0" @@ -1601,10 +1690,12 @@ version = "2024.10.1" description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "jsonschema_specifications-2024.10.1-py3-none-any.whl", hash = "sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf"}, {file = "jsonschema_specifications-2024.10.1.tar.gz", hash = "sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272"}, ] +markers = {main = "extra == \"jobs\""} [package.dependencies] referencing = ">=0.31.0" @@ -1615,6 +1706,7 @@ version = "1.1.1" description = "Jupyter metapackage. Install all the Jupyter components in one go." optional = false python-versions = "*" +groups = ["dev"] files = [ {file = "jupyter-1.1.1-py2.py3-none-any.whl", hash = "sha256:7a59533c22af65439b24bbe60373a4e95af8f16ac65a6c00820ad378e3f7cc83"}, {file = "jupyter-1.1.1.tar.gz", hash = "sha256:d55467bceabdea49d7e3624af7e33d59c37fff53ed3a350e1ac957bed731de7a"}, @@ -1634,6 +1726,7 @@ version = "8.6.3" description = "Jupyter protocol implementation and client libraries" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "jupyter_client-8.6.3-py3-none-any.whl", hash = "sha256:e8a19cc986cc45905ac3362915f410f3af85424b4c0905e94fa5f2cb08e8f23f"}, {file = "jupyter_client-8.6.3.tar.gz", hash = "sha256:35b3a0947c4a6e9d589eb97d7d4cd5e90f910ee73101611f01283732bd6d9419"}, @@ -1649,7 +1742,7 @@ traitlets = ">=5.3" [package.extras] docs = ["ipykernel", "myst-parser", "pydata-sphinx-theme", "sphinx (>=4)", "sphinx-autodoc-typehints", "sphinxcontrib-github-alt", "sphinxcontrib-spelling"] -test = ["coverage", "ipykernel (>=6.14)", "mypy", "paramiko", "pre-commit", "pytest (<8.2.0)", "pytest-cov", "pytest-jupyter[client] (>=0.4.1)", "pytest-timeout"] +test = ["coverage", "ipykernel (>=6.14)", "mypy", "paramiko ; sys_platform == \"win32\"", "pre-commit", "pytest (<8.2.0)", "pytest-cov", 
"pytest-jupyter[client] (>=0.4.1)", "pytest-timeout"] [[package]] name = "jupyter-console" @@ -1657,6 +1750,7 @@ version = "6.6.3" description = "Jupyter terminal console" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "jupyter_console-6.6.3-py3-none-any.whl", hash = "sha256:309d33409fcc92ffdad25f0bcdf9a4a9daa61b6f341177570fdac03de5352485"}, {file = "jupyter_console-6.6.3.tar.gz", hash = "sha256:566a4bf31c87adbfadf22cdf846e3069b59a71ed5da71d6ba4d8aaad14a53539"}, @@ -1681,6 +1775,7 @@ version = "5.7.2" description = "Jupyter core package. A base package on which Jupyter projects rely." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "jupyter_core-5.7.2-py3-none-any.whl", hash = "sha256:4f7315d2f6b4bcf2e3e7cb6e46772eba760ae459cd1f59d29eb57b0a01bd7409"}, {file = "jupyter_core-5.7.2.tar.gz", hash = "sha256:aa5f8d32bbf6b431ac830496da7392035d6f61b4f54872f15c4bd2a9c3f536d9"}, @@ -1701,6 +1796,7 @@ version = "0.10.0" description = "Jupyter Event System library" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "jupyter_events-0.10.0-py3-none-any.whl", hash = "sha256:4b72130875e59d57716d327ea70d3ebc3af1944d3717e5a498b8a06c6c159960"}, {file = "jupyter_events-0.10.0.tar.gz", hash = "sha256:670b8229d3cc882ec782144ed22e0d29e1c2d639263f92ca8383e66682845e22"}, @@ -1726,6 +1822,7 @@ version = "2.2.5" description = "Multi-Language Server WebSocket proxy for Jupyter Notebook/Lab server" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "jupyter-lsp-2.2.5.tar.gz", hash = "sha256:793147a05ad446f809fd53ef1cd19a9f5256fd0a2d6b7ce943a982cb4f545001"}, {file = "jupyter_lsp-2.2.5-py3-none-any.whl", hash = "sha256:45fbddbd505f3fbfb0b6cb2f1bc5e15e83ab7c79cd6e89416b248cb3c00c11da"}, @@ -1741,6 +1838,7 @@ version = "2.14.2" description = "The backend—i.e. core services, APIs, and REST endpoints—to Jupyter web applications." 
optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "jupyter_server-2.14.2-py3-none-any.whl", hash = "sha256:47ff506127c2f7851a17bf4713434208fc490955d0e8632e95014a9a9afbeefd"}, {file = "jupyter_server-2.14.2.tar.gz", hash = "sha256:66095021aa9638ced276c248b1d81862e4c50f292d575920bbe960de1c56b12b"}, @@ -1777,6 +1875,7 @@ version = "0.5.3" description = "A Jupyter Server Extension Providing Terminals." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "jupyter_server_terminals-0.5.3-py3-none-any.whl", hash = "sha256:41ee0d7dc0ebf2809c668e0fc726dfaf258fcd3e769568996ca731b6194ae9aa"}, {file = "jupyter_server_terminals-0.5.3.tar.gz", hash = "sha256:5ae0295167220e9ace0edcfdb212afd2b01ee8d179fe6f23c899590e9b8a5269"}, @@ -1796,6 +1895,7 @@ version = "4.2.5" description = "JupyterLab computational environment" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "jupyterlab-4.2.5-py3-none-any.whl", hash = "sha256:73b6e0775d41a9fee7ee756c80f58a6bed4040869ccc21411dc559818874d321"}, {file = "jupyterlab-4.2.5.tar.gz", hash = "sha256:ae7f3a1b8cb88b4f55009ce79fa7c06f99d70cd63601ee4aa91815d054f46f75"}, @@ -1831,6 +1931,7 @@ version = "0.3.0" description = "Pygments theme using JupyterLab CSS variables" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "jupyterlab_pygments-0.3.0-py3-none-any.whl", hash = "sha256:841a89020971da1d8693f1a99997aefc5dc424bb1b251fd6322462a1b8842780"}, {file = "jupyterlab_pygments-0.3.0.tar.gz", hash = "sha256:721aca4d9029252b11cfa9d185e5b5af4d54772bb8072f9b7036f4170054d35d"}, @@ -1842,6 +1943,7 @@ version = "2.27.3" description = "A set of server components for JupyterLab and JupyterLab like applications." 
optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "jupyterlab_server-2.27.3-py3-none-any.whl", hash = "sha256:e697488f66c3db49df675158a77b3b017520d772c6e1548c7d9bcc5df7944ee4"}, {file = "jupyterlab_server-2.27.3.tar.gz", hash = "sha256:eb36caca59e74471988f0ae25c77945610b887f777255aa21f8065def9e51ed4"}, @@ -1868,6 +1970,7 @@ version = "3.0.13" description = "Jupyter interactive widgets for JupyterLab" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "jupyterlab_widgets-3.0.13-py3-none-any.whl", hash = "sha256:e3cda2c233ce144192f1e29914ad522b2f4c40e77214b0cc97377ca3d323db54"}, {file = "jupyterlab_widgets-3.0.13.tar.gz", hash = "sha256:a2966d385328c1942b683a8cd96b89b8dd82c8b8f81dda902bb2bc06d46f5bed"}, @@ -1879,6 +1982,7 @@ version = "3.0.2" description = "Safely add untrusted strings to HTML/XML markup." optional = false python-versions = ">=3.9" +groups = ["dev", "gpu"] files = [ {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8"}, {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158"}, @@ -1942,6 +2046,7 @@ files = [ {file = "MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a"}, {file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"}, ] +markers = {gpu = "sys_platform == \"win32\" or sys_platform == \"linux\""} [[package]] name = "matplotlib-inline" @@ -1949,6 +2054,7 @@ version = "0.1.7" description = "Inline Matplotlib backend for Jupyter" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca"}, {file = "matplotlib_inline-0.1.7.tar.gz", hash = 
"sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90"}, @@ -1963,6 +2069,7 @@ version = "0.7.0" description = "McCabe checker, plugin for flake8" optional = false python-versions = ">=3.6" +groups = ["dev"] files = [ {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, @@ -1974,6 +2081,7 @@ version = "3.0.2" description = "A sane and fast Markdown parser with useful plugins and renderers" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "mistune-3.0.2-py3-none-any.whl", hash = "sha256:71481854c30fdbc938963d3605b72501f5c10a9320ecd412c121c163a1c7d205"}, {file = "mistune-3.0.2.tar.gz", hash = "sha256:fc7f93ded930c92394ef2cb6f04a8aabab4117a91449e72dcc8dfa646a508be8"}, @@ -1985,6 +2093,8 @@ version = "1.3.0" description = "Python library for arbitrary-precision floating-point arithmetic" optional = false python-versions = "*" +groups = ["gpu"] +markers = "sys_platform == \"win32\" or sys_platform == \"linux\"" files = [ {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, @@ -1993,7 +2103,7 @@ files = [ [package.extras] develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] docs = ["sphinx"] -gmpy = ["gmpy2 (>=2.1.0a4)"] +gmpy = ["gmpy2 (>=2.1.0a4) ; platform_python_implementation != \"PyPy\""] tests = ["pytest (>=4.6)"] [[package]] @@ -2002,6 +2112,8 @@ version = "6.1.0" description = "multidict implementation" optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"huggingface\"" files = [ {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = 
"sha256:3380252550e372e8511d49481bd836264c009adb826b23fefcc5dd3c69692f60"}, {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:99f826cbf970077383d7de805c0681799491cb939c25450b9b5b3ced03ca99f1"}, @@ -2106,6 +2218,8 @@ version = "0.70.16" description = "better multiprocessing and multithreading in Python" optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"huggingface\"" files = [ {file = "multiprocess-0.70.16-pp310-pypy310_pp73-macosx_10_13_x86_64.whl", hash = "sha256:476887be10e2f59ff183c006af746cb6f1fd0eadcfd4ef49e605cbe2659920ee"}, {file = "multiprocess-0.70.16-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d951bed82c8f73929ac82c61f01a7b5ce8f3e5ef40f5b52553b4f547ce2b08ec"}, @@ -2130,6 +2244,7 @@ version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." optional = false python-versions = ">=3.5" +groups = ["dev"] files = [ {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, @@ -2141,6 +2256,7 @@ version = "0.10.0" description = "A client library for executing notebooks. Formerly nbconvert's ExecutePreprocessor." optional = false python-versions = ">=3.8.0" +groups = ["dev"] files = [ {file = "nbclient-0.10.0-py3-none-any.whl", hash = "sha256:f13e3529332a1f1f81d82a53210322476a168bb7090a0289c795fe9cc11c9d3f"}, {file = "nbclient-0.10.0.tar.gz", hash = "sha256:4b3f1b7dba531e498449c4db4f53da339c91d449dc11e9af3a43b4eb5c5abb09"}, @@ -2163,6 +2279,7 @@ version = "7.16.4" description = "Converting Jupyter Notebooks (.ipynb files) to other formats. Output formats include asciidoc, html, latex, markdown, pdf, py, rst, script. 
nbconvert can be used both as a Python library (`import nbconvert`) or as a command line tool (invoked as `jupyter nbconvert ...`)." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "nbconvert-7.16.4-py3-none-any.whl", hash = "sha256:05873c620fe520b6322bf8a5ad562692343fe3452abda5765c7a34b7d1aa3eb3"}, {file = "nbconvert-7.16.4.tar.gz", hash = "sha256:86ca91ba266b0a448dc96fa6c5b9d98affabde2867b363258703536807f9f7f4"}, @@ -2201,6 +2318,7 @@ version = "5.10.4" description = "The Jupyter Notebook format" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "nbformat-5.10.4-py3-none-any.whl", hash = "sha256:3b48d6c8fbca4b299bf3982ea7db1af21580e4fec269ad087b9e81588891200b"}, {file = "nbformat-5.10.4.tar.gz", hash = "sha256:322168b14f937a5d11362988ecac2a4952d3d8e3a2cbeb2319584631226d5b3a"}, @@ -2222,6 +2340,7 @@ version = "1.6.0" description = "Patch asyncio to allow nested event loops" optional = false python-versions = ">=3.5" +groups = ["dev"] files = [ {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, @@ -2233,6 +2352,7 @@ version = "3.2.1" description = "Python package for creating and manipulating graphs and networks" optional = false python-versions = ">=3.9" +groups = ["main", "gpu"] files = [ {file = "networkx-3.2.1-py3-none-any.whl", hash = "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2"}, {file = "networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6"}, @@ -2251,6 +2371,7 @@ version = "1.9.1" description = "Node.js virtual environment builder" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["dev"] files = [ {file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = 
"sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"}, {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"}, @@ -2262,6 +2383,7 @@ version = "22.13.1" description = "unoffical Node.js package" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "nodejs_wheel_binaries-22.13.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:e4f64d0e26600d51cbdd98a6718a19c2d1b8c7538e9e353e95a634a06a8e1a58"}, {file = "nodejs_wheel_binaries-22.13.1-py2.py3-none-macosx_11_0_x86_64.whl", hash = "sha256:afcb40484bb02f23137f838014724604ae183fd767b30da95b0be1510a40c06d"}, @@ -2280,6 +2402,7 @@ version = "7.2.2" description = "Jupyter Notebook - A web-based notebook environment for interactive computing" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "notebook-7.2.2-py3-none-any.whl", hash = "sha256:c89264081f671bc02eec0ed470a627ed791b9156cad9285226b31611d3e9fe1c"}, {file = "notebook-7.2.2.tar.gz", hash = "sha256:2ef07d4220421623ad3fe88118d687bc0450055570cdd160814a59cf3a1c516e"}, @@ -2295,7 +2418,7 @@ tornado = ">=6.2.0" [package.extras] dev = ["hatch", "pre-commit"] docs = ["myst-parser", "nbsphinx", "pydata-sphinx-theme", "sphinx (>=1.3.6)", "sphinxcontrib-github-alt", "sphinxcontrib-spelling"] -test = ["importlib-resources (>=5.0)", "ipykernel", "jupyter-server[test] (>=2.4.0,<3)", "jupyterlab-server[test] (>=2.27.1,<3)", "nbval", "pytest (>=7.0)", "pytest-console-scripts", "pytest-timeout", "pytest-tornasync", "requests"] +test = ["importlib-resources (>=5.0) ; python_version < \"3.10\"", "ipykernel", "jupyter-server[test] (>=2.4.0,<3)", "jupyterlab-server[test] (>=2.27.1,<3)", "nbval", "pytest (>=7.0)", "pytest-console-scripts", "pytest-timeout", "pytest-tornasync", "requests"] [[package]] name = "notebook-shim" @@ -2303,6 +2426,7 @@ version = "0.2.4" description = "A shim layer for notebook traits and config" optional = false python-versions = 
">=3.7" +groups = ["dev"] files = [ {file = "notebook_shim-0.2.4-py3-none-any.whl", hash = "sha256:411a5be4e9dc882a074ccbcae671eda64cceb068767e9a3419096986560e1cef"}, {file = "notebook_shim-0.2.4.tar.gz", hash = "sha256:b4b2cfa1b65d98307ca24361f5b30fe785b53c3fd07b7a47e89acb5e6ac638cb"}, @@ -2320,6 +2444,7 @@ version = "1.26.4" description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.9" +groups = ["main", "dev", "gpu"] files = [ {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, @@ -2365,6 +2490,8 @@ version = "12.1.3.1" description = "CUBLAS native runtime libraries" optional = false python-versions = ">=3" +groups = ["gpu"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (sys_platform == \"win32\" or sys_platform == \"linux\")" files = [ {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:ee53ccca76a6fc08fb9701aa95b6ceb242cdaab118c3bb152af4e579af792728"}, {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-win_amd64.whl", hash = "sha256:2b964d60e8cf11b5e1073d179d85fa340c120e99b3067558f3cf98dd69d02906"}, @@ -2376,6 +2503,8 @@ version = "12.1.105" description = "CUDA profiling tools runtime libs." 
optional = false python-versions = ">=3" +groups = ["gpu"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (sys_platform == \"win32\" or sys_platform == \"linux\")" files = [ {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:e54fde3983165c624cb79254ae9818a456eb6e87a7fd4d56a2352c24ee542d7e"}, {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:bea8236d13a0ac7190bd2919c3e8e6ce1e402104276e6f9694479e48bb0eb2a4"}, @@ -2387,6 +2516,8 @@ version = "12.1.105" description = "NVRTC native runtime libraries" optional = false python-versions = ">=3" +groups = ["gpu"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (sys_platform == \"win32\" or sys_platform == \"linux\")" files = [ {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:339b385f50c309763ca65456ec75e17bbefcbbf2893f462cb8b90584cd27a1c2"}, {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:0a98a522d9ff138b96c010a65e145dc1b4850e9ecb75a0172371793752fd46ed"}, @@ -2398,6 +2529,8 @@ version = "12.1.105" description = "CUDA Runtime native Libraries" optional = false python-versions = ">=3" +groups = ["gpu"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (sys_platform == \"win32\" or sys_platform == \"linux\")" files = [ {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:6e258468ddf5796e25f1dc591a31029fa317d97a0a94ed93468fc86301d61e40"}, {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:dfb46ef84d73fababab44cf03e3b83f80700d27ca300e537f85f636fac474344"}, @@ -2409,6 +2542,8 @@ version = "9.1.0.70" description = "cuDNN runtime libraries" optional = false python-versions = ">=3" +groups = ["gpu"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (sys_platform == \"win32\" or sys_platform == 
\"linux\")" files = [ {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f"}, {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-win_amd64.whl", hash = "sha256:6278562929433d68365a07a4a1546c237ba2849852c0d4b2262a486e805b977a"}, @@ -2423,6 +2558,8 @@ version = "11.0.2.54" description = "CUFFT native runtime libraries" optional = false python-versions = ">=3" +groups = ["gpu"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (sys_platform == \"win32\" or sys_platform == \"linux\")" files = [ {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl", hash = "sha256:794e3948a1aa71fd817c3775866943936774d1c14e7628c74f6f7417224cdf56"}, {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-win_amd64.whl", hash = "sha256:d9ac353f78ff89951da4af698f80870b1534ed69993f10a4cf1d96f21357e253"}, @@ -2434,6 +2571,8 @@ version = "10.3.2.106" description = "CURAND native runtime libraries" optional = false python-versions = ">=3" +groups = ["gpu"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (sys_platform == \"win32\" or sys_platform == \"linux\")" files = [ {file = "nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:9d264c5036dde4e64f1de8c50ae753237c12e0b1348738169cd0f8a536c0e1e0"}, {file = "nvidia_curand_cu12-10.3.2.106-py3-none-win_amd64.whl", hash = "sha256:75b6b0c574c0037839121317e17fd01f8a69fd2ef8e25853d826fec30bdba74a"}, @@ -2445,6 +2584,8 @@ version = "11.4.5.107" description = "CUDA solver native runtime libraries" optional = false python-versions = ">=3" +groups = ["gpu"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (sys_platform == \"win32\" or sys_platform == \"linux\")" files = [ {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl", hash = "sha256:8a7ec542f0412294b15072fa7dab71d31334014a69f953004ea7a118206fe0dd"}, {file = 
"nvidia_cusolver_cu12-11.4.5.107-py3-none-win_amd64.whl", hash = "sha256:74e0c3a24c78612192a74fcd90dd117f1cf21dea4822e66d89e8ea80e3cd2da5"}, @@ -2461,6 +2602,8 @@ version = "12.1.0.106" description = "CUSPARSE native runtime libraries" optional = false python-versions = ">=3" +groups = ["gpu"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (sys_platform == \"win32\" or sys_platform == \"linux\")" files = [ {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:f3b50f42cf363f86ab21f720998517a659a48131e8d538dc02f8768237bd884c"}, {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-win_amd64.whl", hash = "sha256:b798237e81b9719373e8fae8d4f091b70a0cf09d9d85c95a557e11df2d8e9a5a"}, @@ -2475,6 +2618,8 @@ version = "2.20.5" description = "NVIDIA Collective Communication Library (NCCL) Runtime" optional = false python-versions = ">=3" +groups = ["gpu"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (sys_platform == \"win32\" or sys_platform == \"linux\")" files = [ {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01"}, {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:057f6bf9685f75215d0c53bf3ac4a10b3e6578351de307abad9e18a99182af56"}, @@ -2486,6 +2631,8 @@ version = "12.6.77" description = "Nvidia JIT LTO Library" optional = false python-versions = ">=3" +groups = ["gpu"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (sys_platform == \"win32\" or sys_platform == \"linux\")" files = [ {file = "nvidia_nvjitlink_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:3bf10d85bb1801e9c894c6e197e44dd137d2a0a9e43f8450e9ad13f2df0dd52d"}, {file = "nvidia_nvjitlink_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9ae346d16203ae4ea513be416495167a0101d33d2d14935aa9c1829a3fb45142"}, @@ -2498,6 +2645,8 @@ 
version = "12.1.105" description = "NVIDIA Tools Extension" optional = false python-versions = ">=3" +groups = ["gpu"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (sys_platform == \"win32\" or sys_platform == \"linux\")" files = [ {file = "nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:dc21cf308ca5691e7c04d962e213f8a4aa9bbfa23d95412f452254c2caeb09e5"}, {file = "nvidia_nvtx_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:65f4d98982b31b60026e0e6de73fbdfc09d08a96f4656dd3665ca616a11e1e82"}, @@ -2509,6 +2658,8 @@ version = "1.0.1" description = "Ordered Multivalue Dictionary" optional = false python-versions = "*" +groups = ["main"] +markers = "extra == \"jobs\"" files = [ {file = "orderedmultidict-1.0.1-py2.py3-none-any.whl", hash = "sha256:43c839a17ee3cdd62234c47deca1a8508a3f2ca1d0678a3bf791c87cf84adbf3"}, {file = "orderedmultidict-1.0.1.tar.gz", hash = "sha256:04070bbb5e87291cc9bfa51df413677faf2141c73c61d2a5f7b26bea3cd882ad"}, @@ -2523,6 +2674,7 @@ version = "7.7.0" description = "A decorator to automatically detect mismatch when overriding a method." 
optional = false python-versions = ">=3.6" +groups = ["dev"] files = [ {file = "overrides-7.7.0-py3-none-any.whl", hash = "sha256:c7ed9d062f78b8e4c1a7b70bd8796b35ead4d9f510227ef9c5dc7626c60d7e49"}, {file = "overrides-7.7.0.tar.gz", hash = "sha256:55158fa3d93b98cc75299b1e67078ad9003ca27945c76162c1c0766d6f91820a"}, @@ -2534,10 +2686,12 @@ version = "24.1" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" +groups = ["main", "dev", "gpu"] files = [ {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, ] +markers = {main = "extra == \"huggingface\"", gpu = "sys_platform == \"win32\" or sys_platform == \"linux\""} [[package]] name = "pandas" @@ -2545,6 +2699,7 @@ version = "2.2.3" description = "Powerful data structures for data analysis, time series, and statistics" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5"}, {file = "pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348"}, @@ -2631,6 +2786,7 @@ version = "1.5.1" description = "Utilities for writing pandoc filters in python" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["dev"] files = [ {file = "pandocfilters-1.5.1-py2.py3-none-any.whl", hash = "sha256:93be382804a9cdb0a7267585f157e5d1731bbe5545a85b268d6f5fe6232de2bc"}, {file = "pandocfilters-1.5.1.tar.gz", hash = "sha256:002b4a555ee4ebc03f8b66307e287fa492e4a77b4ea14d3f934328297bb4939e"}, @@ -2642,6 +2798,7 @@ version = "0.8.4" description = "A Python Parser" optional = false python-versions = ">=3.6" +groups = ["dev"] files = [ {file = 
"parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18"}, {file = "parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d"}, @@ -2657,6 +2814,8 @@ version = "2.3.7.post1" description = "Object-oriented filesystem paths" optional = false python-versions = "*" +groups = ["main"] +markers = "extra == \"jobs\"" files = [ {file = "pathlib2-2.3.7.post1-py2.py3-none-any.whl", hash = "sha256:5266a0fd000452f1b3467d782f079a4343c63aaa119221fbdc4e39577489ca5b"}, {file = "pathlib2-2.3.7.post1.tar.gz", hash = "sha256:9fe0edad898b83c0c3e199c842b27ed216645d2e177757b2dd67384d4113c641"}, @@ -2671,6 +2830,7 @@ version = "0.12.1" description = "Utility library for gitignore style pattern matching of file paths." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, @@ -2682,6 +2842,7 @@ version = "0.14.1" description = "Check PEP-8 naming conventions, plugin for flake8" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pep8-naming-0.14.1.tar.gz", hash = "sha256:1ef228ae80875557eb6c1549deafed4dabbf3261cfcafa12f773fe0db9be8a36"}, {file = "pep8_naming-0.14.1-py3-none-any.whl", hash = "sha256:63f514fc777d715f935faf185dedd679ab99526a7f2f503abb61587877f7b1c5"}, @@ -2696,6 +2857,8 @@ version = "4.9.0" description = "Pexpect allows easy control of interactive console applications." 
optional = false python-versions = "*" +groups = ["dev"] +markers = "sys_platform != \"win32\"" files = [ {file = "pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523"}, {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"}, @@ -2710,6 +2873,8 @@ version = "11.0.0" description = "Python Imaging Library (Fork)" optional = false python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"jobs\"" files = [ {file = "pillow-11.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:6619654954dc4936fcff82db8eb6401d3159ec6be81e33c6000dfd76ae189947"}, {file = "pillow-11.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b3c5ac4bed7519088103d9450a1107f76308ecf91d6dabc8a33a2fcfb18d0fba"}, @@ -2793,7 +2958,7 @@ docs = ["furo", "olefile", "sphinx (>=8.1)", "sphinx-copybutton", "sphinx-inline fpx = ["olefile"] mic = ["olefile"] tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] -typing = ["typing-extensions"] +typing = ["typing-extensions ; python_version < \"3.10\""] xmp = ["defusedxml"] [[package]] @@ -2802,6 +2967,7 @@ version = "4.3.6" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"}, {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"}, @@ -2818,6 +2984,7 @@ version = "1.5.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, @@ -2833,6 +3000,7 @@ version = "0.21.0" description = "Python client for the Prometheus monitoring system." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "prometheus_client-0.21.0-py3-none-any.whl", hash = "sha256:4fa6b4dd0ac16d58bb587c04b1caae65b8c5043e85f778f42f5f632f6af2e166"}, {file = "prometheus_client-0.21.0.tar.gz", hash = "sha256:96c83c606b71ff2b0a433c98889d275f51ffec6c5e267de37c7a2b5c9aa9233e"}, @@ -2847,6 +3015,7 @@ version = "3.0.48" description = "Library for building powerful interactive command lines in Python" optional = false python-versions = ">=3.7.0" +groups = ["dev"] files = [ {file = "prompt_toolkit-3.0.48-py3-none-any.whl", hash = "sha256:f49a827f90062e411f1ce1f854f2aedb3c23353244f8108b89283587397ac10e"}, {file = "prompt_toolkit-3.0.48.tar.gz", hash = "sha256:d6623ab0477a80df74e646bdbc93621143f5caf104206aa29294d53de1a03d90"}, @@ -2861,6 +3030,8 @@ version = "0.2.0" description = "Accelerated property cache" optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"huggingface\"" files = [ {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c5869b8fd70b81835a6f187c5fdbe67917a04d7e52b6e7cc4e5fe39d55c39d58"}, {file = 
"propcache-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:952e0d9d07609d9c5be361f33b0d6d650cd2bae393aabb11d9b719364521984b"}, @@ -2968,6 +3139,7 @@ version = "6.1.0" description = "Cross-platform lib for process and system monitoring in Python." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +groups = ["main", "dev", "gpu"] files = [ {file = "psutil-6.1.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff34df86226c0227c52f38b919213157588a678d049688eded74c76c8ba4a5d0"}, {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c0e0c00aa18ca2d3b2b991643b799a15fc8f0563d2ebb6040f64ce8dc027b942"}, @@ -2987,6 +3159,7 @@ files = [ {file = "psutil-6.1.0-cp37-abi3-win_amd64.whl", hash = "sha256:a8fb3752b491d246034fa4d279ff076501588ce8cbcdbb62c32fd7a377d996be"}, {file = "psutil-6.1.0.tar.gz", hash = "sha256:353815f59a7f64cdaca1c0307ee13558a0512f6db064e92fe833784f08539c7a"}, ] +markers = {main = "extra == \"jobs\"", gpu = "sys_platform == \"win32\" or sys_platform == \"linux\""} [package.extras] dev = ["black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "wheel"] @@ -2998,6 +3171,8 @@ version = "0.7.0" description = "Run a subprocess in a pseudo terminal" optional = false python-versions = "*" +groups = ["dev"] +markers = "sys_platform != \"win32\" or os_name != \"nt\"" files = [ {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, @@ -3009,6 +3184,7 @@ version = "0.2.3" description = "Safely evaluate AST nodes without side effects" optional = false python-versions = "*" +groups = ["dev"] files = [ {file = "pure_eval-0.2.3-py3-none-any.whl", hash = 
"sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0"}, {file = "pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42"}, @@ -3023,6 +3199,8 @@ version = "18.0.0" description = "Python library for Apache Arrow" optional = false python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"huggingface\"" files = [ {file = "pyarrow-18.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2333f93260674e185cfbf208d2da3007132572e56871f451ba1a556b45dae6e2"}, {file = "pyarrow-18.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:4c381857754da44326f3a49b8b199f7f87a51c2faacd5114352fc78de30d3aba"}, @@ -3077,6 +3255,7 @@ version = "2.12.1" description = "Python style guide checker" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pycodestyle-2.12.1-py2.py3-none-any.whl", hash = "sha256:46f0fb92069a7c28ab7bb558f05bfc0110dac69a0cd23c61ea0040283a9d78b3"}, {file = "pycodestyle-2.12.1.tar.gz", hash = "sha256:6838eae08bbce4f6accd5d5572075c63626a15ee3e6f842df996bf62f6d73521"}, @@ -3088,6 +3267,7 @@ version = "2.22" description = "C parser in Python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, @@ -3099,6 +3279,7 @@ version = "3.2.0" description = "passive checker of Python programs" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pyflakes-3.2.0-py2.py3-none-any.whl", hash = "sha256:84b5be138a2dfbb40689ca07e2152deb896a65c3a3e24c251c5c62489568074a"}, {file = "pyflakes-3.2.0.tar.gz", hash = "sha256:1c61603ff154621fb2a9172037d84dca3500def8c8b630657d1701f026f8af3f"}, @@ -3110,6 +3291,7 @@ version = "2.18.0" description = "Pygments is a syntax highlighting package written in 
Python." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pygments-2.18.0-py3-none-any.whl", hash = "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a"}, {file = "pygments-2.18.0.tar.gz", hash = "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199"}, @@ -3124,6 +3306,8 @@ version = "2.8.0" description = "JSON Web Token implementation in Python" optional = false python-versions = ">=3.7" +groups = ["main"] +markers = "extra == \"jobs\"" files = [ {file = "PyJWT-2.8.0-py3-none-any.whl", hash = "sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320"}, {file = "PyJWT-2.8.0.tar.gz", hash = "sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de"}, @@ -3141,6 +3325,8 @@ version = "3.2.0" description = "pyparsing module - Classes and methods to define and execute parsing grammars" optional = false python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"jobs\"" files = [ {file = "pyparsing-3.2.0-py3-none-any.whl", hash = "sha256:93d9577b88da0bbea8cc8334ee8b918ed014968fd2ec383e868fb8afb1ccef84"}, {file = "pyparsing-3.2.0.tar.gz", hash = "sha256:cbf74e27246d595d9a74b186b810f6fbb86726dbf3b9532efb343f6d7294fe9c"}, @@ -3151,13 +3337,14 @@ diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pyright" -version = "1.1.386" +version = "1.1.399" description = "Command line wrapper for pyright" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ - {file = "pyright-1.1.386-py3-none-any.whl", hash = "sha256:7071ac495593b2258ccdbbf495f1a5c0e5f27951f6b429bed4e8b296eb5cd21d"}, - {file = "pyright-1.1.386.tar.gz", hash = "sha256:8e9975e34948ba5f8e07792a9c9d2bdceb2c6c0b61742b068d2229ca2bc4a9d9"}, + {file = "pyright-1.1.399-py3-none-any.whl", hash = "sha256:55f9a875ddf23c9698f24208c764465ffdfd38be6265f7faf9a176e1dc549f3b"}, + {file = "pyright-1.1.399.tar.gz", hash = "sha256:439035d707a36c3d1b443aec980bc37053fbda88158eded24b8eedcf1c7b7a1b"}, ] 
[package.dependencies] @@ -3176,6 +3363,7 @@ version = "8.3.3" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pytest-8.3.3-py3-none-any.whl", hash = "sha256:a6853c7375b2663155079443d2e45de913a911a11d669df02a50814944db57b2"}, {file = "pytest-8.3.3.tar.gz", hash = "sha256:70b98107bd648308a7952b06e6ca9a50bc660be218d53c257cc1fc94fda10181"}, @@ -3198,6 +3386,7 @@ version = "4.1.0" description = "Pytest plugin for measuring coverage." optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "pytest-cov-4.1.0.tar.gz", hash = "sha256:3904b13dfbfec47f003b8e77fd5b589cd11904a21ddf1ab38a64f204d6a10ef6"}, {file = "pytest_cov-4.1.0-py3-none-any.whl", hash = "sha256:6ba70b9e97e69fcc3fb45bfeab2d0a138fb65c4d0d6a41ef33983ad114be8c3a"}, @@ -3216,6 +3405,7 @@ version = "2.9.0.post0" description = "Extensions to the standard Python datetime module" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main", "dev"] files = [ {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, @@ -3230,6 +3420,7 @@ version = "2.0.7" description = "A python library adding a json log formatter" optional = false python-versions = ">=3.6" +groups = ["dev"] files = [ {file = "python-json-logger-2.0.7.tar.gz", hash = "sha256:23e7ec02d34237c5aa1e29a070193a4ea87583bb4e7f8fd06d3de8264c4b2e1c"}, {file = "python_json_logger-2.0.7-py3-none-any.whl", hash = "sha256:f380b826a991ebbe3de4d897aeec42760035ac760345e57b812938dc8b35e2bd"}, @@ -3241,6 +3432,7 @@ version = "2024.2" description = "World timezone definitions, modern and historical" optional = false python-versions = "*" +groups = ["main", "dev"] files = [ {file = "pytz-2024.2-py2.py3-none-any.whl", hash = 
"sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725"}, {file = "pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a"}, @@ -3252,6 +3444,8 @@ version = "308" description = "Python for Window Extensions" optional = false python-versions = "*" +groups = ["dev"] +markers = "sys_platform == \"win32\" and platform_python_implementation != \"PyPy\"" files = [ {file = "pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e"}, {file = "pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e"}, @@ -3279,6 +3473,8 @@ version = "2.0.14" description = "Pseudo terminal support for Windows from Python." optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "os_name == \"nt\"" files = [ {file = "pywinpty-2.0.14-cp310-none-win_amd64.whl", hash = "sha256:0b149c2918c7974f575ba79f5a4aad58bd859a52fa9eb1296cc22aa412aa411f"}, {file = "pywinpty-2.0.14-cp311-none-win_amd64.whl", hash = "sha256:cf2a43ac7065b3e0dc8510f8c1f13a75fb8fde805efa3b8cff7599a1ef497bc7"}, @@ -3294,6 +3490,7 @@ version = "6.0.2" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.8" +groups = ["main", "dev", "gpu"] files = [ {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, @@ -3349,6 +3546,7 @@ files = [ {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, ] +markers = {main = "extra == \"huggingface\" or extra == \"jobs\"", gpu = "sys_platform == 
\"win32\" or sys_platform == \"linux\""} [[package]] name = "pyzmq" @@ -3356,6 +3554,7 @@ version = "26.2.0" description = "Python bindings for 0MQ" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "pyzmq-26.2.0-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:ddf33d97d2f52d89f6e6e7ae66ee35a4d9ca6f36eda89c24591b0c40205a3629"}, {file = "pyzmq-26.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dacd995031a01d16eec825bf30802fceb2c3791ef24bcce48fa98ce40918c27b"}, @@ -3477,10 +3676,12 @@ version = "0.35.1" description = "JSON Referencing + Python" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "referencing-0.35.1-py3-none-any.whl", hash = "sha256:eda6d3234d62814d1c64e305c1331c9a3a6132da475ab6382eaa997b21ee75de"}, {file = "referencing-0.35.1.tar.gz", hash = "sha256:25b42124a6c8b632a425174f24087783efb348a6f1e0008e63cd4466fedf703c"}, ] +markers = {main = "extra == \"jobs\""} [package.dependencies] attrs = ">=22.2.0" @@ -3492,6 +3693,7 @@ version = "2024.9.11" description = "Alternative regular expression module, to replace re." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "regex-2024.9.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1494fa8725c285a81d01dc8c06b55287a1ee5e0e382d8413adc0a9197aac6408"}, {file = "regex-2024.9.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0e12c481ad92d129c78f13a2a3662317e46ee7ef96c94fd332e1c29131875b7d"}, @@ -3595,10 +3797,12 @@ version = "2.32.3" description = "Python HTTP for Humans." 
optional = false python-versions = ">=3.8" +groups = ["main", "dev", "gpu"] files = [ {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, ] +markers = {main = "extra == \"huggingface\" or extra == \"jobs\"", gpu = "sys_platform == \"win32\" or sys_platform == \"linux\""} [package.dependencies] certifi = ">=2017.4.17" @@ -3616,6 +3820,7 @@ version = "0.1.4" description = "A pure python RFC3339 validator" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["dev"] files = [ {file = "rfc3339_validator-0.1.4-py2.py3-none-any.whl", hash = "sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa"}, {file = "rfc3339_validator-0.1.4.tar.gz", hash = "sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b"}, @@ -3630,6 +3835,7 @@ version = "0.1.1" description = "Pure python rfc3986 validator" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["dev"] files = [ {file = "rfc3986_validator-0.1.1-py2.py3-none-any.whl", hash = "sha256:2f235c432ef459970b4306369336b9d5dbdda31b510ca1e327636e01f528bfa9"}, {file = "rfc3986_validator-0.1.1.tar.gz", hash = "sha256:3d44bde7921b3b9ec3ae4e3adca370438eccebc676456449b145d533b240d055"}, @@ -3641,6 +3847,7 @@ version = "0.20.0" description = "Python bindings to Rust's persistent data structures (rpds)" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "rpds_py-0.20.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3ad0fda1635f8439cde85c700f964b23ed5fc2d28016b32b9ee5fe30da5c84e2"}, {file = "rpds_py-0.20.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9bb4a0d90fdb03437c109a17eade42dfbf6190408f29b2744114d11586611d6f"}, @@ -3746,6 +3953,7 @@ files = [ {file = 
"rpds_py-0.20.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdfc3a892927458d98f3d55428ae46b921d1f7543b89382fdb483f5640daaec8"}, {file = "rpds_py-0.20.0.tar.gz", hash = "sha256:d72a210824facfdaf8768cf2d7ca25a042c30320b3020de2fa04640920d4e121"}, ] +markers = {main = "extra == \"jobs\""} [[package]] name = "s3transfer" @@ -3753,6 +3961,7 @@ version = "0.10.3" description = "An Amazon S3 Transfer Manager" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "s3transfer-0.10.3-py3-none-any.whl", hash = "sha256:263ed587a5803c6c708d3ce44dc4dfedaab4c1a32e8329bab818933d79ddcf5d"}, {file = "s3transfer-0.10.3.tar.gz", hash = "sha256:4f50ed74ab84d474ce614475e0b8d5047ff080810aac5d01ea25231cfc944b0c"}, @@ -3770,6 +3979,8 @@ version = "0.0.53" description = "SacreMoses" optional = false python-versions = "*" +groups = ["main"] +markers = "extra == \"huggingface\"" files = [ {file = "sacremoses-0.0.53.tar.gz", hash = "sha256:43715868766c643b35de4b8046cce236bfe59a7fa88b25eaf6ddf02bacf53a7a"}, ] @@ -3787,6 +3998,7 @@ version = "0.4.5" description = "" optional = false python-versions = ">=3.7" +groups = ["main", "gpu"] files = [ {file = "safetensors-0.4.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:a63eaccd22243c67e4f2b1c3e258b257effc4acd78f3b9d397edc8cf8f1298a7"}, {file = "safetensors-0.4.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:23fc9b4ec7b602915cbb4ec1a7c1ad96d2743c322f20ab709e2c35d1b66dad27"}, @@ -3899,6 +4111,7 @@ files = [ {file = "safetensors-0.4.5-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:834001bed193e4440c4a3950a31059523ee5090605c907c66808664c932b549c"}, {file = "safetensors-0.4.5.tar.gz", hash = "sha256:d73de19682deabb02524b3d5d1f8b3aaba94c72f1bbfc7911b9b9d5d391c0310"}, ] +markers = {main = "extra == \"huggingface\"", gpu = "sys_platform == \"win32\" or sys_platform == \"linux\""} [package.extras] all = ["safetensors[jax]", "safetensors[numpy]", "safetensors[paddlepaddle]", "safetensors[pinned-tf]", 
"safetensors[quality]", "safetensors[testing]", "safetensors[torch]"] @@ -3919,15 +4132,16 @@ version = "1.8.3" description = "Send file to trash natively under Mac OS X, Windows and Linux" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +groups = ["dev"] files = [ {file = "Send2Trash-1.8.3-py3-none-any.whl", hash = "sha256:0c31227e0bd08961c7665474a3d1ef7193929fedda4233843689baa056be46c9"}, {file = "Send2Trash-1.8.3.tar.gz", hash = "sha256:b18e7a3966d99871aefeb00cfbcfdced55ce4871194810fc71f4aa484b953abf"}, ] [package.extras] -nativelib = ["pyobjc-framework-Cocoa", "pywin32"] -objc = ["pyobjc-framework-Cocoa"] -win32 = ["pywin32"] +nativelib = ["pyobjc-framework-Cocoa ; sys_platform == \"darwin\"", "pywin32 ; sys_platform == \"win32\""] +objc = ["pyobjc-framework-Cocoa ; sys_platform == \"darwin\""] +win32 = ["pywin32 ; sys_platform == \"win32\""] [[package]] name = "sentencepiece" @@ -3935,6 +4149,8 @@ version = "0.2.0" description = "SentencePiece python wrapper" optional = false python-versions = "*" +groups = ["main"] +markers = "extra == \"sentencepiece\"" files = [ {file = "sentencepiece-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:188779e1298a1c8b8253c7d3ad729cb0a9891e5cef5e5d07ce4592c54869e227"}, {file = "sentencepiece-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bed9cf85b296fa2b76fc2547b9cbb691a523864cebaee86304c43a7b4cb1b452"}, @@ -3997,19 +4213,20 @@ version = "75.3.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "setuptools-75.3.0-py3-none-any.whl", hash = "sha256:f2504966861356aa38616760c0f66568e535562374995367b4e69c7143cf6bcd"}, {file = "setuptools-75.3.0.tar.gz", hash = "sha256:fba5dd4d766e97be1b1681d98712680ae8f2f26d7881245f2ce9e40714f1a686"}, ] [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.5.2)"] -core = 
["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.collections", "jaraco.functools", "jaraco.text (>=3.7)", "more-itertools", "more-itertools (>=8.8)", "packaging", "packaging (>=24)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.5.2) ; sys_platform != \"cygwin\""] +core = ["importlib-metadata (>=6) ; python_version < \"3.10\"", "importlib-resources (>=5.10.2) ; python_version < \"3.9\"", "jaraco.collections", "jaraco.functools", "jaraco.text (>=3.7)", "more-itertools", "more-itertools (>=8.8)", "packaging", "packaging (>=24)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] enabler = ["pytest-enabler (>=2.2)"] -test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test (>=5.5)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] -type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.12.*)", "pytest-mypy"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test (>=5.5)", "packaging (>=23.2)", "pip (>=19.1)", 
"pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib-metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.12.*)", "pytest-mypy"] [[package]] name = "sil-thot" @@ -4017,6 +4234,8 @@ version = "3.4.6" description = "A toolkit for statistical word alignment and machine translation" optional = false python-versions = "<4.0,>=3.7" +groups = ["main"] +markers = "extra == \"thot\"" files = [ {file = "sil_thot-3.4.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ba4a3ac7310dc4e51f81483d21e4cc461ef803968647c05e7daad7dc6d973504"}, {file = "sil_thot-3.4.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6a90640851f9b93c92f94c588814258503b5c06f7dfc7d8578b4b0222fa5be87"}, @@ -4054,6 +4273,7 @@ version = "1.16.0" description = "Python 2 and 3 compatibility utilities" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +groups = ["main", "dev"] files = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, @@ -4065,6 +4285,7 @@ version = "1.3.1" description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, @@ -4076,6 +4297,7 @@ version = "2.4.0" description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" optional = false python-versions = "*" +groups 
= ["main"] files = [ {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, @@ -4087,6 +4309,7 @@ version = "2.6" description = "A modern CSS selector implementation for Beautiful Soup." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"}, {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"}, @@ -4098,6 +4321,7 @@ version = "0.6.3" description = "Extract data from python stack frames and tracebacks for informative displays" optional = false python-versions = "*" +groups = ["dev"] files = [ {file = "stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695"}, {file = "stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9"}, @@ -4117,6 +4341,8 @@ version = "1.13.3" description = "Computer algebra system (CAS) in Python" optional = false python-versions = ">=3.8" +groups = ["gpu"] +markers = "sys_platform == \"win32\" or sys_platform == \"linux\"" files = [ {file = "sympy-1.13.3-py3-none-any.whl", hash = "sha256:54612cf55a62755ee71824ce692986f23c88ffa77207b30c1368eda4a7060f73"}, {file = "sympy-1.13.3.tar.gz", hash = "sha256:b27fd2c6530e0ab39e275fc9b683895367e51d5da91baa8d3d64db2565fec4d9"}, @@ -4134,6 +4360,7 @@ version = "0.18.1" description = "Tornado websocket backend for the Xterm.js Javascript terminal emulator library." 
optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "terminado-0.18.1-py3-none-any.whl", hash = "sha256:a4468e1b37bb318f8a86514f65814e1afc977cf29b3992a4500d9dd305dcceb0"}, {file = "terminado-0.18.1.tar.gz", hash = "sha256:de09f2c4b85de4765f7714688fff57d3e75bad1f909b589fde880460c753fd2e"}, @@ -4155,6 +4382,7 @@ version = "1.4.0" description = "A tiny CSS parser" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "tinycss2-1.4.0-py3-none-any.whl", hash = "sha256:3a49cf47b7675da0b15d0c6e1df8df4ebd96e9394bb905a5775adb0d884c5289"}, {file = "tinycss2-1.4.0.tar.gz", hash = "sha256:10c0972f6fc0fbee87c3edb76549357415e94548c1ae10ebccdea16fb404a9b7"}, @@ -4173,6 +4401,8 @@ version = "0.20.1" description = "" optional = false python-versions = ">=3.7" +groups = ["main"] +markers = "extra == \"huggingface\"" files = [ {file = "tokenizers-0.20.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:439261da7c0a5c88bda97acb284d49fbdaf67e9d3b623c0bfd107512d22787a9"}, {file = "tokenizers-0.20.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:03dae629d99068b1ea5416d50de0fea13008f04129cc79af77a2a6392792d93c"}, @@ -4290,6 +4520,8 @@ version = "2.0.2" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version < \"3.11\"" files = [ {file = "tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38"}, {file = "tomli-2.0.2.tar.gz", hash = "sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed"}, @@ -4301,6 +4533,8 @@ version = "2.4.0" description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" optional = false python-versions = ">=3.8.0" +groups = ["gpu"] +markers = "sys_platform == \"win32\" or sys_platform == \"linux\"" files = [ {file = "torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:4ed94583e244af51d6a8d28701ca5a9e02d1219e782f5a01dd401f90af17d8ac"}, 
{file = "torch-2.4.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:c4ca297b7bd58b506bfd6e78ffd14eb97c0e7797dcd7965df62f50bb575d8954"}, @@ -4354,6 +4588,7 @@ version = "6.4.1" description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "tornado-6.4.1-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:163b0aafc8e23d8cdc3c9dfb24c5368af84a81e3364745ccb4427669bf84aec8"}, {file = "tornado-6.4.1-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:6d5ce3437e18a2b66fbadb183c1d3364fb03f2be71299e7d10dbeeb69f4b2a14"}, @@ -4374,10 +4609,12 @@ version = "4.66.6" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" +groups = ["main", "gpu"] files = [ {file = "tqdm-4.66.6-py3-none-any.whl", hash = "sha256:223e8b5359c2efc4b30555531f09e9f2f3589bcd7fdd389271191031b49b7a63"}, {file = "tqdm-4.66.6.tar.gz", hash = "sha256:4bdd694238bef1485ce839d67967ab50af8f9272aab687c0d7702a01da0be090"}, ] +markers = {main = "extra == \"huggingface\"", gpu = "sys_platform == \"win32\" or sys_platform == \"linux\""} [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} @@ -4394,6 +4631,7 @@ version = "5.14.3" description = "Traitlets Python configuration system" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f"}, {file = "traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7"}, @@ -4409,6 +4647,8 @@ version = "4.45.2" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = false python-versions = ">=3.8.0" +groups = ["main"] +markers = "extra == \"huggingface\"" files = [ {file = "transformers-4.45.2-py3-none-any.whl", hash = 
"sha256:c551b33660cfc815bae1f9f097ecfd1e65be623f13c6ee0dda372bd881460210"}, {file = "transformers-4.45.2.tar.gz", hash = "sha256:72bc390f6b203892561f05f86bbfaa0e234aab8e927a83e62b9d92ea7e3ae101"}, @@ -4478,6 +4718,8 @@ version = "3.0.0" description = "A language and compiler for custom Deep Learning operations" optional = false python-versions = "*" +groups = ["gpu"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and (sys_platform == \"win32\" or sys_platform == \"linux\")" files = [ {file = "triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e1efef76935b2febc365bfadf74bcb65a6f959a9872e5bddf44cc9e0adce1e1a"}, {file = "triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5ce8520437c602fb633f1324cc3871c47bee3b67acf9756c1a66309b60e3216c"}, @@ -4500,6 +4742,7 @@ version = "2.9.0.20241003" description = "Typing stubs for python-dateutil" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "types-python-dateutil-2.9.0.20241003.tar.gz", hash = "sha256:58cb85449b2a56d6684e41aeefb4c4280631246a0da1a719bdbe6f3fb0317446"}, {file = "types_python_dateutil-2.9.0.20241003-py3-none-any.whl", hash = "sha256:250e1d8e80e7bbc3a6c99b907762711d1a1cdd00e978ad39cb5940f6f0a87f3d"}, @@ -4511,10 +4754,12 @@ version = "4.12.2" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" +groups = ["main", "dev", "gpu"] files = [ {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, ] +markers = {main = "extra == \"huggingface\"", gpu = "sys_platform == \"win32\" or sys_platform == \"linux\""} [[package]] name = "tzdata" @@ -4522,6 +4767,7 @@ version = "2024.2" description = "Provider of IANA time zone 
data" optional = false python-versions = ">=2" +groups = ["main", "dev"] files = [ {file = "tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd"}, {file = "tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc"}, @@ -4533,6 +4779,7 @@ version = "1.3.0" description = "RFC 6570 URI Template Processor" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "uri-template-1.3.0.tar.gz", hash = "sha256:0e00f8eb65e18c7de20d595a14336e9f337ead580c70934141624b6d1ffdacc7"}, {file = "uri_template-1.3.0-py3-none-any.whl", hash = "sha256:a44a133ea12d44a0c0f06d7d42a52d71282e77e2f937d8abd5655b8d56fc1363"}, @@ -4547,14 +4794,15 @@ version = "1.26.20" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +groups = ["main", "dev", "gpu"] files = [ {file = "urllib3-1.26.20-py2.py3-none-any.whl", hash = "sha256:0ed14ccfbf1c30a9072c7ca157e4319b70d65f623e91e7b32fadb2853431016e"}, {file = "urllib3-1.26.20.tar.gz", hash = "sha256:40c2dc0c681e47eb8f90e7e27bf6ff7df2e677421fd46756da1161c39ca70d32"}, ] [package.extras] -brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] -secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] +brotli = ["brotli (==1.0.9) ; os_name != \"nt\" and python_version < \"3\" and platform_python_implementation == \"CPython\"", "brotli (>=1.0.9) ; python_version >= \"3\" and platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; (os_name != \"nt\" or python_version >= \"3\") and platform_python_implementation != \"CPython\"", "brotlipy (>=0.6.0) ; os_name == \"nt\" and python_version < \"3\""] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress ; python_version 
== \"2.7\"", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] @@ -4563,6 +4811,7 @@ version = "0.2.13" description = "Measures the displayed width of unicode strings in a terminal" optional = false python-versions = "*" +groups = ["dev"] files = [ {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"}, {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, @@ -4574,6 +4823,7 @@ version = "24.8.0" description = "A library for working with the color formats defined by HTML and CSS." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "webcolors-24.8.0-py3-none-any.whl", hash = "sha256:fc4c3b59358ada164552084a8ebee637c221e4059267d0f8325b3b560f6c7f0a"}, {file = "webcolors-24.8.0.tar.gz", hash = "sha256:08b07af286a01bcd30d583a7acadf629583d1f79bfef27dd2c2c5c263817277d"}, @@ -4589,6 +4839,7 @@ version = "0.5.1" description = "Character encoding aliases for legacy web content" optional = false python-versions = "*" +groups = ["dev"] files = [ {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, @@ -4600,6 +4851,7 @@ version = "1.8.0" description = "WebSocket client for Python with low level API options" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "websocket_client-1.8.0-py3-none-any.whl", hash = "sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526"}, {file = "websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da"}, @@ -4616,6 +4868,7 @@ version = "4.0.13" description = "Jupyter interactive widgets for Jupyter Notebook" optional = false python-versions = 
">=3.7" +groups = ["dev"] files = [ {file = "widgetsnbextension-4.0.13-py3-none-any.whl", hash = "sha256:74b2692e8500525cc38c2b877236ba51d34541e6385eeed5aec15a70f88a6c71"}, {file = "widgetsnbextension-4.0.13.tar.gz", hash = "sha256:ffcb67bc9febd10234a362795f643927f4e0c05d9342c727b65d2384f8feacb6"}, @@ -4627,6 +4880,8 @@ version = "3.5.0" description = "Python binding for xxHash" optional = false python-versions = ">=3.7" +groups = ["main"] +markers = "extra == \"huggingface\"" files = [ {file = "xxhash-3.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ece616532c499ee9afbb83078b1b952beffef121d989841f7f4b3dc5ac0fd212"}, {file = "xxhash-3.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3171f693dbc2cef6477054a665dc255d996646b4023fe56cb4db80e26f4cc520"}, @@ -4759,6 +5014,8 @@ version = "1.17.0" description = "Yet another URL library" optional = false python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"huggingface\"" files = [ {file = "yarl-1.17.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2d8715edfe12eee6f27f32a3655f38d6c7410deb482158c0b7d4b7fad5d07628"}, {file = "yarl-1.17.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1803bf2a7a782e02db746d8bd18f2384801bc1d108723840b25e065b116ad726"}, @@ -4855,17 +5112,19 @@ version = "3.20.2" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version == \"3.9\"" files = [ {file = "zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350"}, {file = "zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29"}, ] [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker 
(>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] enabler = ["pytest-enabler (>=2.2)"] -test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] +test = ["big-O", "importlib-resources ; python_version < \"3.9\"", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] type = ["pytest-mypy"] [extras] @@ -4875,6 +5134,6 @@ sentencepiece = ["sentencepiece"] thot = ["sil-thot"] [metadata] -lock-version = "2.0" +lock-version = "2.1" python-versions = ">=3.9,<3.13" -content-hash = "b650f3e8499b348a527c5e5f0e89ba90e55fb7df93bb907cc8d8e5fdd6b63cb0" +content-hash = "d292103e26b41fd440528597df80a64661ef21afd6be8fd07a8c34521729ad65" diff --git a/pyproject.toml b/pyproject.toml index 822c5ee5..853dc368 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,7 +84,7 @@ pytest-cov = "^4.1.0" ipykernel = "^6.7.0" jupyter = "^1.0.0" pandas = "^2.0.3" -pyright = { extras = ["nodejs"], version = "^1.1.362" } +pyright = { extras = ["nodejs"], version = "^1.1.399" } decoy = "^2.1.0" pep8-naming = "^0.14.1" diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index c6cf8cea..c9ee6ba8 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -230,7 +230,7 @@ def test_paragraph_in_verse(): result_strip = r"""\id MAT \c 1 -\p +\p \v 1 Update 1 \s1 \v 2 From 1fa8e82dd82d0da0e3139f0944f02d945057f19d Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 11 Apr 2025 16:21:50 -0400 Subject: [PATCH 07/11] Respond to reviewer comments Pass marker type (embed, style) to update block --- .../paratext_project_terms_parser_base.py | 2 ++ machine/corpora/scripture_embed.py | 16 +++++++++++ .../scripture_ref_usfm_parser_handler.py | 28 ++++++------------- machine/corpora/scripture_update_block.py | 18 ++++++------ 
.../corpora/scripture_update_block_handler.py | 3 +- machine/corpora/scripture_update_element.py | 21 +++++++++++++- machine/corpora/update_usfm_parser_handler.py | 3 +- 7 files changed, 59 insertions(+), 32 deletions(-) create mode 100644 machine/corpora/scripture_embed.py diff --git a/machine/corpora/paratext_project_terms_parser_base.py b/machine/corpora/paratext_project_terms_parser_base.py index 00496443..3245c953 100644 --- a/machine/corpora/paratext_project_terms_parser_base.py +++ b/machine/corpora/paratext_project_terms_parser_base.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import re from abc import ABC, abstractmethod from collections import defaultdict diff --git a/machine/corpora/scripture_embed.py b/machine/corpora/scripture_embed.py new file mode 100644 index 00000000..cc4a64f6 --- /dev/null +++ b/machine/corpora/scripture_embed.py @@ -0,0 +1,16 @@ +from typing import Optional + +EMBED_PART_START_CHAR_STYLES = ("f", "x", "z") +EMBED_STYLES = ("f", "fe", "fig", "fm", "x") + + +def is_note_text(marker: Optional[str]) -> bool: + return marker == "ft" + + +def is_embed_part_style(marker: Optional[str]) -> bool: + return marker is not None and marker.startswith(EMBED_PART_START_CHAR_STYLES) + + +def is_embed_style(marker: Optional[str]) -> bool: + return marker is not None and marker.strip("*") in EMBED_STYLES diff --git a/machine/corpora/scripture_ref_usfm_parser_handler.py b/machine/corpora/scripture_ref_usfm_parser_handler.py index f9bd263d..5dc6783a 100644 --- a/machine/corpora/scripture_ref_usfm_parser_handler.py +++ b/machine/corpora/scripture_ref_usfm_parser_handler.py @@ -5,6 +5,7 @@ from ..scripture.verse_ref import VerseRef, are_overlapping_verse_ranges from .corpora_utils import merge_verse_ranges from .scripture_element import ScriptureElement +from .scripture_embed import EMBED_PART_START_CHAR_STYLES, is_embed_part_style, is_embed_style, is_note_text from .scripture_ref import ScriptureRef from .usfm_parser_handler import 
UsfmParserHandler from .usfm_parser_state import UsfmParserState @@ -18,10 +19,6 @@ class ScriptureTextType(Enum): NOTE_TEXT = auto() -EMBED_PART_START_CHAR_STYLES = ("f", "x", "z") -EMBED_STYLES = ("f", "fe", "fig", "fm", "x") - - class ScriptureRefUsfmParserHandler(UsfmParserHandler, ABC): def __init__(self) -> None: self._cur_verse_ref: VerseRef = VerseRef() @@ -152,27 +149,27 @@ def opt_break(self, state: UsfmParserState) -> None: def start_char( self, state: UsfmParserState, marker: str, unknown: bool, attributes: Optional[Sequence[UsfmAttribute]] ) -> None: - if self._is_embed_part_style(marker) and self._in_note_text: + if is_embed_part_style(marker) and self._in_note_text: self._in_nested_embed = True # if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse segment self._check_convert_verse_para_to_non_verse(state) - if self._is_embed_style(marker): + if is_embed_style(marker): self._in_embed = True self._start_embed_wrapper(state, marker) - if self._is_note_text(marker): + if is_note_text(marker): self._start_note_text_wrapper(state) def end_char( self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool ) -> None: - if self._is_embed_part_style(marker): + if is_embed_part_style(marker): if self._in_nested_embed: self._in_nested_embed = False elif self._is_note_text(marker): self._end_note_text_wrapper(state) - if self._is_embed_style(marker): + if is_embed_style(marker): self._end_embed(state, marker, attributes, closed) self._in_embed = False @@ -237,7 +234,7 @@ def _end_parent_element(self) -> None: self._cur_elements_stack.pop() def _end_embed_elements(self) -> None: - if self._cur_elements_stack and self._is_embed_style(self._cur_elements_stack[-1].name): + if self._cur_elements_stack and is_embed_style(self._cur_elements_stack[-1].name): self._cur_elements_stack.pop() def _create_verse_refs(self) -> List[ScriptureRef]: @@ -268,7 +265,7 @@ def 
_check_convert_verse_para_to_non_verse(self, state: UsfmParserState) -> None self._start_non_verse_text_wrapper(state) def _is_in_embed(self, marker: Optional[str]) -> bool: - return self._in_embed or self._is_embed_style(marker) + return self._in_embed or is_embed_style(marker) def _is_in_nested_embed(self, marker: Optional[str]) -> bool: return self._in_nested_embed or ( @@ -277,12 +274,3 @@ def _is_in_nested_embed(self, marker: Optional[str]) -> bool: and marker[1] in EMBED_PART_START_CHAR_STYLES and marker != "fm" ) - - def _is_note_text(self, marker: Optional[str]) -> bool: - return marker == "ft" - - def _is_embed_part_style(self, marker: Optional[str]) -> bool: - return marker is not None and marker.startswith(EMBED_PART_START_CHAR_STYLES) and marker != "fm" - - def _is_embed_style(self, marker: Optional[str]) -> bool: - return marker is not None and marker.strip("*") in EMBED_STYLES diff --git a/machine/corpora/scripture_update_block.py b/machine/corpora/scripture_update_block.py index afb9e75a..b4c7e290 100644 --- a/machine/corpora/scripture_update_block.py +++ b/machine/corpora/scripture_update_block.py @@ -1,14 +1,18 @@ from __future__ import annotations from .scripture_ref import ScriptureRef -from .scripture_update_element import ScriptureUpdateElement, ScriptureUpdateElementType +from .scripture_update_element import ( + ScriptureUpdateElement, + ScriptureUpdateElementType, + create_non_text_scripture_element, +) from .usfm_token import UsfmToken, UsfmTokenType class ScriptureUpdateBlock: def __init__(self) -> None: - self._ref: ScriptureRef = ScriptureRef() + self.ref: ScriptureRef = ScriptureRef() self._elements: list[ScriptureUpdateElement] = [] @property @@ -29,21 +33,19 @@ def add_token(self, token: UsfmToken, marked_for_removal: bool = False) -> None: ScriptureUpdateElement(ScriptureUpdateElementType.EXISTING_TEXT, [token], marked_for_removal) ) else: - self._elements.append(ScriptureUpdateElement(ScriptureUpdateElementType.OTHER, [token], 
marked_for_removal)) + self._elements.append(create_non_text_scripture_element([token], marked_for_removal)) def add_tokens(self, tokens: list[UsfmToken], marked_for_removal: bool = False) -> None: if len(tokens) == 0: return - self._elements.append( - ScriptureUpdateElement(ScriptureUpdateElementType.OTHER, tokens.copy(), marked_for_removal) - ) + self._elements.append(create_non_text_scripture_element(tokens, marked_for_removal)) def update_ref(self, ref: ScriptureRef) -> None: - self._ref = ref + self.ref = ref def clear(self) -> None: self._elements.clear() - self._ref = ScriptureRef() + self.ref = ScriptureRef() def get_tokens(self) -> list[UsfmToken]: return [token for element in self._elements for token in element.get_tokens()] diff --git a/machine/corpora/scripture_update_block_handler.py b/machine/corpora/scripture_update_block_handler.py index bcbe8fb8..c520f50b 100644 --- a/machine/corpora/scripture_update_block_handler.py +++ b/machine/corpora/scripture_update_block_handler.py @@ -7,5 +7,4 @@ class ScriptureUpdateBlockHandler(ABC): - def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: - raise NotImplementedError("Must be implemented in subclass") + def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: ... 
diff --git a/machine/corpora/scripture_update_element.py b/machine/corpora/scripture_update_element.py index fe39d7e5..7296bd0a 100644 --- a/machine/corpora/scripture_update_element.py +++ b/machine/corpora/scripture_update_element.py @@ -3,12 +3,16 @@ from dataclasses import dataclass from enum import Enum, auto -from .usfm_token import UsfmToken +from .scripture_embed import is_embed_style +from .usfm_token import UsfmToken, UsfmTokenType class ScriptureUpdateElementType(Enum): EXISTING_TEXT = auto() INSERTED_TEXT = auto() + PARAGRAPH = auto() + EMBED = auto() + STYLE = auto() OTHER = auto() @@ -22,3 +26,18 @@ def get_tokens(self) -> list[UsfmToken]: if self.marked_for_removal: return [] return self.tokens + + +def create_non_text_scripture_element( + tokens: list[UsfmToken], marked_for_removal: bool = False +) -> ScriptureUpdateElement: + tokens = tokens.copy() + # Determine if it is a Paragraph, style, embed or other + if len(tokens) == 0 or tokens[0].marker is None: + return ScriptureUpdateElement(ScriptureUpdateElementType.OTHER, [], marked_for_removal) + if tokens[0].type == UsfmTokenType.PARAGRAPH: + return ScriptureUpdateElement(ScriptureUpdateElementType.PARAGRAPH, tokens, marked_for_removal) + if is_embed_style(tokens[0].marker): + return ScriptureUpdateElement(ScriptureUpdateElementType.EMBED, tokens, marked_for_removal) + else: + return ScriptureUpdateElement(ScriptureUpdateElementType.STYLE, tokens, marked_for_removal) diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index ecdf0881..6ab7bf08 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -2,6 +2,7 @@ from typing import List, Optional, Sequence, Tuple, Union from ..scripture.verse_ref import VerseRef +from .scripture_embed import is_embed_part_style from .scripture_ref import ScriptureRef from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler from 
.scripture_update_block import ScriptureUpdateBlock @@ -345,7 +346,7 @@ def _replace_with_new_tokens(self, state: UsfmParserState, closed: bool = True) in_embed: bool = self._is_in_embed(marker) in_nested_embed: bool = self._is_in_nested_embed(marker) - is_style_tag: bool = marker is not None and not self._is_embed_part_style(marker) + is_style_tag: bool = marker is not None and not is_embed_part_style(marker) existing_text = any( t.type == UsfmTokenType.TEXT and t.text From bb730bf36b3a41f8364dbdc9a4fd1a8fe3dc90fc Mon Sep 17 00:00:00 2001 From: John Lambert Date: Mon, 14 Apr 2025 12:53:27 -0400 Subject: [PATCH 08/11] Make last type - EMBED_BLOCK --- machine/corpora/scripture_update_block.py | 6 ++++-- machine/corpora/scripture_update_element.py | 1 + machine/corpora/update_usfm_parser_handler.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/machine/corpora/scripture_update_block.py b/machine/corpora/scripture_update_block.py index b4c7e290..72031e9f 100644 --- a/machine/corpora/scripture_update_block.py +++ b/machine/corpora/scripture_update_block.py @@ -35,10 +35,12 @@ def add_token(self, token: UsfmToken, marked_for_removal: bool = False) -> None: else: self._elements.append(create_non_text_scripture_element([token], marked_for_removal)) - def add_tokens(self, tokens: list[UsfmToken], marked_for_removal: bool = False) -> None: + def add_embed(self, tokens: list[UsfmToken], marked_for_removal: bool = False) -> None: if len(tokens) == 0: return - self._elements.append(create_non_text_scripture_element(tokens, marked_for_removal)) + self._elements.append( + ScriptureUpdateElement(ScriptureUpdateElementType.EMBED_BLOCK, tokens, marked_for_removal) + ) def update_ref(self, ref: ScriptureRef) -> None: self.ref = ref diff --git a/machine/corpora/scripture_update_element.py b/machine/corpora/scripture_update_element.py index 7296bd0a..754e48f6 100644 --- a/machine/corpora/scripture_update_element.py +++ 
b/machine/corpora/scripture_update_element.py @@ -11,6 +11,7 @@ class ScriptureUpdateElementType(Enum): EXISTING_TEXT = auto() INSERTED_TEXT = auto() PARAGRAPH = auto() + EMBED_BLOCK = auto() EMBED = auto() STYLE = auto() OTHER = auto() diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index 6ab7bf08..42a3b56c 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -426,7 +426,7 @@ def _process_embed_update_block(self) -> None: self._use_updated_embed_text() for handler in self._update_block_handlers: self._embed_update_block = handler.process_block(self._embed_update_block) - self._update_block.add_tokens(self._embed_update_block.get_tokens()) + self._update_block.add_embed(self._embed_update_block.get_tokens()) self._embed_update_block.clear() def _push_updated_text(self, tokens: List[UsfmToken]) -> None: From 5ff5e0752241c113b503e77d48bff9a17f4092f1 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Mon, 14 Apr 2025 15:15:20 -0400 Subject: [PATCH 09/11] linting --- machine/corpora/zip_paratext_project_text_updater.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine/corpora/zip_paratext_project_text_updater.py b/machine/corpora/zip_paratext_project_text_updater.py index 75e8ff02..b4dbd8bd 100644 --- a/machine/corpora/zip_paratext_project_text_updater.py +++ b/machine/corpora/zip_paratext_project_text_updater.py @@ -18,5 +18,5 @@ def _exists(self, file_name: StrPath) -> bool: def _open(self, file_name: StrPath) -> Optional[BinaryIO]: if file_name in self._archive.namelist(): - return BytesIO(self._archive.read(file_name)) + return BytesIO(self._archive.read(str(file_name))) return None From 9dba22bb0a57153341873b69528edc997e772e39 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Tue, 22 Apr 2025 16:57:33 -0400 Subject: [PATCH 10/11] Reviewer updates --- machine/corpora/scripture_update_block_handler.py | 3 ++- 1 file changed, 
2 insertions(+), 1 deletion(-) diff --git a/machine/corpora/scripture_update_block_handler.py b/machine/corpora/scripture_update_block_handler.py index c520f50b..b3b1d654 100644 --- a/machine/corpora/scripture_update_block_handler.py +++ b/machine/corpora/scripture_update_block_handler.py @@ -1,10 +1,11 @@ from __future__ import annotations -from abc import ABC +from abc import ABC, abstractmethod from .scripture_update_block import ScriptureUpdateBlock class ScriptureUpdateBlockHandler(ABC): + @abstractmethod def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: ... From 402a03fcd17268d7669b618e72d911120c606505 Mon Sep 17 00:00:00 2001 From: Damien Daspit Date: Fri, 2 May 2025 16:48:38 -0500 Subject: [PATCH 11/11] Refactor update block --- .github/workflows/ci.yml | 2 +- machine/corpora/__init__.py | 11 +- .../paratext_project_text_updater_base.py | 8 +- machine/corpora/scripture_embed.py | 16 - .../scripture_ref_usfm_parser_handler.py | 116 ++-- machine/corpora/scripture_update_block.py | 53 -- .../corpora/scripture_update_block_handler.py | 11 - machine/corpora/scripture_update_element.py | 44 -- machine/corpora/update_usfm_parser_handler.py | 347 +++++------- machine/corpora/usfm_text_base.py | 20 +- machine/corpora/usfm_update_block.py | 55 ++ machine/corpora/usfm_update_block_element.py | 24 + machine/corpora/usfm_update_block_handler.py | 8 + poetry.lock | 14 +- pyproject.toml | 2 +- .../test_update_usfm_parser_handler.py | 533 ++++++++++++------ tests/corpora/test_usfm_file_text.py | 68 +-- tests/corpora/test_usfm_memory_text.py | 16 +- 18 files changed, 679 insertions(+), 669 deletions(-) delete mode 100644 machine/corpora/scripture_embed.py delete mode 100644 machine/corpora/scripture_update_block.py delete mode 100644 machine/corpora/scripture_update_block_handler.py delete mode 100644 machine/corpora/scripture_update_element.py create mode 100644 machine/corpora/usfm_update_block.py create mode 100644 
machine/corpora/usfm_update_block_element.py create mode 100644 machine/corpora/usfm_update_block_handler.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 044a4db5..860cd8dc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -55,7 +55,7 @@ jobs: node-version: "14" - name: Lint with pyright run: | - npm install -g pyright@1.1.399 + npm install -g pyright@1.1.400 poetry run pyright - name: Test with pytest run: poetry run pytest --cov --cov-report=xml diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index 98773317..523604c0 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -61,6 +61,9 @@ from .usfm_tag import UsfmJustification, UsfmStyleAttribute, UsfmStyleType, UsfmTag, UsfmTextProperties, UsfmTextType from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType from .usfm_tokenizer import RtlReferenceOrder, UsfmTokenizer +from .usfm_update_block import UsfmUpdateBlock +from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType +from .usfm_update_block_handler import UsfmUpdateBlockHandler from .usx_file_alignment_collection import UsxFileAlignmentCollection from .usx_file_alignment_corpus import UsxFileAlignmentCorpus from .usx_file_text import UsxFileText @@ -92,8 +95,8 @@ "is_scripture", "lowercase", "MemoryAlignmentCollection", - "MemoryText", "MemoryStreamContainer", + "MemoryText", "MultiKeyRef", "nfc_normalize", "nfd_normalize", @@ -126,9 +129,9 @@ "TextRow", "TextRowFlags", "unescape_spaces", - "UpdateUsfmTextBehavior", "UpdateUsfmMarkerBehavior", "UpdateUsfmParserHandler", + "UpdateUsfmTextBehavior", "UsfmAttribute", "UsfmElementType", "UsfmFileText", @@ -148,6 +151,10 @@ "UsfmToken", "UsfmTokenizer", "UsfmTokenType", + "UsfmUpdateBlock", + "UsfmUpdateBlockElement", + "UsfmUpdateBlockElementType", + "UsfmUpdateBlockHandler", "UsxFileAlignmentCollection", "UsxFileAlignmentCorpus", "UsxFileText", diff --git 
a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index 8ba806a8..6ae04394 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -1,13 +1,13 @@ from abc import ABC, abstractmethod -from typing import BinaryIO, Optional, Sequence, Tuple, Union +from typing import BinaryIO, Iterable, Optional, Sequence, Tuple, Union from ..utils.typeshed import StrPath from .paratext_project_settings import ParatextProjectSettings from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase from .scripture_ref import ScriptureRef -from .scripture_update_block_handler import ScriptureUpdateBlockHandler from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior from .usfm_parser import parse_usfm +from .usfm_update_block_handler import UsfmUpdateBlockHandler class ParatextProjectTextUpdaterBase(ABC): @@ -26,8 +26,8 @@ def update_usfm( paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, - preserve_paragraph_styles: Optional[Sequence[str]] = None, - update_block_handlers: Optional[list[ScriptureUpdateBlockHandler]] = None, + preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None, + update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None, ) -> Optional[str]: file_name: str = self._settings.get_book_file_name(book_id) if not self._exists(file_name): diff --git a/machine/corpora/scripture_embed.py b/machine/corpora/scripture_embed.py deleted file mode 100644 index cc4a64f6..00000000 --- a/machine/corpora/scripture_embed.py +++ /dev/null @@ -1,16 +0,0 @@ -from typing import Optional - -EMBED_PART_START_CHAR_STYLES = ("f", "x", "z") -EMBED_STYLES = ("f", "fe", "fig", 
"fm", "x") - - -def is_note_text(marker: Optional[str]) -> bool: - return marker == "ft" - - -def is_embed_part_style(marker: Optional[str]) -> bool: - return marker is not None and marker.startswith(EMBED_PART_START_CHAR_STYLES) - - -def is_embed_style(marker: Optional[str]) -> bool: - return marker is not None and marker.strip("*") in EMBED_STYLES diff --git a/machine/corpora/scripture_ref_usfm_parser_handler.py b/machine/corpora/scripture_ref_usfm_parser_handler.py index 5dc6783a..db9081b7 100644 --- a/machine/corpora/scripture_ref_usfm_parser_handler.py +++ b/machine/corpora/scripture_ref_usfm_parser_handler.py @@ -5,7 +5,6 @@ from ..scripture.verse_ref import VerseRef, are_overlapping_verse_ranges from .corpora_utils import merge_verse_ranges from .scripture_element import ScriptureElement -from .scripture_embed import EMBED_PART_START_CHAR_STYLES, is_embed_part_style, is_embed_style, is_note_text from .scripture_ref import ScriptureRef from .usfm_parser_handler import UsfmParserHandler from .usfm_parser_state import UsfmParserState @@ -16,7 +15,14 @@ class ScriptureTextType(Enum): NONE = auto() NONVERSE = auto() VERSE = auto() - NOTE_TEXT = auto() + EMBED = auto() + + +_EMBED_STYLES = {"f", "fe", "x", "fig"} + + +def _is_embed_style(marker: Optional[str]) -> bool: + return marker is not None and (marker.strip("*") in _EMBED_STYLES or marker.startswith("z")) class ScriptureRefUsfmParserHandler(UsfmParserHandler, ABC): @@ -25,18 +31,11 @@ def __init__(self) -> None: self._cur_elements_stack: List[ScriptureElement] = [] self._cur_text_type_stack: List[ScriptureTextType] = [] self._duplicate_verse: bool = False - self._in_preserved_paragraph: bool = False - self._in_embed: bool = False - self._in_note_text: bool = False - self._in_nested_embed: bool = False @property def _current_text_type(self) -> ScriptureTextType: return ScriptureTextType.NONE if len(self._cur_text_type_stack) == 0 else self._cur_text_type_stack[-1] - def _is_in_note_text(self) -> bool: - 
return self._in_note_text - def end_usfm(self, state: UsfmParserState) -> None: self._end_verse_text_wrapper(state) @@ -112,32 +111,6 @@ def start_sidebar(self, state: UsfmParserState, marker: str, category: str) -> N def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None: self._end_parent_element() - def start_note(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None: - self._in_embed = True - self._start_embed_wrapper(state, marker) - - def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None: - self._end_note_text_wrapper(state) - self._end_embed(state, marker, None, closed) - self._in_embed = False - - def _start_embed_wrapper(self, state: UsfmParserState, marker: str) -> None: - if self._cur_verse_ref.is_default: - self._update_verse_ref(state.verse_ref, marker) - - if not self._duplicate_verse: - self._check_convert_verse_para_to_non_verse(state) - self._next_element(marker) - - self._start_embed(state, self._create_non_verse_ref()) - - def _start_embed(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ... 
- - def _end_embed( - self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool - ) -> None: - pass - def text(self, state: UsfmParserState, text: str) -> None: # if we hit text in a verse paragraph and we aren't in a verse, then start a non-verse segment if text.strip(): @@ -149,29 +122,23 @@ def opt_break(self, state: UsfmParserState) -> None: def start_char( self, state: UsfmParserState, marker: str, unknown: bool, attributes: Optional[Sequence[UsfmAttribute]] ) -> None: - if is_embed_part_style(marker) and self._in_note_text: - self._in_nested_embed = True # if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse segment self._check_convert_verse_para_to_non_verse(state) - if is_embed_style(marker): - self._in_embed = True - self._start_embed_wrapper(state, marker) - - if is_note_text(marker): - self._start_note_text_wrapper(state) + if _is_embed_style(marker): + self._start_embed_text_wrapper(state, marker) def end_char( self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool ) -> None: - if is_embed_part_style(marker): - if self._in_nested_embed: - self._in_nested_embed = False - elif self._is_note_text(marker): - self._end_note_text_wrapper(state) - if is_embed_style(marker): - self._end_embed(state, marker, attributes, closed) - self._in_embed = False + if _is_embed_style(marker): + self._end_embed_text_wrapper(state) + + def start_note(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None: + self._start_embed_text_wrapper(state, marker) + + def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None: + self._end_embed_text_wrapper(state) def _start_verse_text(self, state: UsfmParserState, scripture_refs: Optional[Sequence[ScriptureRef]]) -> None: ... 
@@ -181,20 +148,9 @@ def _start_non_verse_text(self, state: UsfmParserState, scripture_ref: Scripture def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ... - def _start_note_text_wrapper(self, state: UsfmParserState): - self._in_note_text = True - self._cur_text_type_stack.append(ScriptureTextType.NOTE_TEXT) - self._start_note_text(state) - - def _start_note_text(self, state: UsfmParserState) -> None: ... + def _start_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ... - def _end_note_text_wrapper(self, state: UsfmParserState): - if self._cur_text_type_stack and self._cur_text_type_stack[-1] == ScriptureTextType.NOTE_TEXT: - self._end_note_text(state, self._create_non_verse_ref()) - self._cur_text_type_stack.pop() - self._in_note_text = False - - def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ... + def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ... 
def _start_verse_text_wrapper(self, state: UsfmParserState) -> None: self._duplicate_verse = False @@ -222,6 +178,25 @@ def _update_verse_ref(self, verse_ref: VerseRef, marker: str) -> None: self._cur_elements_stack.append(ScriptureElement(0, marker)) self._cur_verse_ref = verse_ref.copy() + def _start_embed_text_wrapper(self, state: UsfmParserState, marker: str) -> None: + if self._cur_verse_ref.is_default: + self._update_verse_ref(state.verse_ref, marker) + + if not self._duplicate_verse: + self._check_convert_verse_para_to_non_verse(state) + self._next_element(marker) + self._cur_text_type_stack.append(ScriptureTextType.EMBED) + self._start_embed_text(state, self._create_non_verse_ref()) + + def _end_embed_text_wrapper(self, state: UsfmParserState) -> None: + if ( + not self._duplicate_verse + and self._cur_text_type_stack + and self._cur_text_type_stack[-1] == ScriptureTextType.EMBED + ): + self._end_embed_text(state, self._create_non_verse_ref()) + self._cur_text_type_stack.pop() + def _next_element(self, marker: str) -> None: prev_elem: ScriptureElement = self._cur_elements_stack.pop() self._cur_elements_stack.append(ScriptureElement(prev_elem.position + 1, marker)) @@ -234,7 +209,7 @@ def _end_parent_element(self) -> None: self._cur_elements_stack.pop() def _end_embed_elements(self) -> None: - if self._cur_elements_stack and is_embed_style(self._cur_elements_stack[-1].name): + if self._cur_elements_stack and _is_embed_style(self._cur_elements_stack[-1].name): self._cur_elements_stack.pop() def _create_verse_refs(self) -> List[ScriptureRef]: @@ -263,14 +238,3 @@ def _check_convert_verse_para_to_non_verse(self, state: UsfmParserState) -> None ): self._start_parent_element(para_tag.marker) self._start_non_verse_text_wrapper(state) - - def _is_in_embed(self, marker: Optional[str]) -> bool: - return self._in_embed or is_embed_style(marker) - - def _is_in_nested_embed(self, marker: Optional[str]) -> bool: - return self._in_nested_embed or ( - marker is not None - 
and marker.startswith("+") - and marker[1] in EMBED_PART_START_CHAR_STYLES - and marker != "fm" - ) diff --git a/machine/corpora/scripture_update_block.py b/machine/corpora/scripture_update_block.py deleted file mode 100644 index 72031e9f..00000000 --- a/machine/corpora/scripture_update_block.py +++ /dev/null @@ -1,53 +0,0 @@ -from __future__ import annotations - -from .scripture_ref import ScriptureRef -from .scripture_update_element import ( - ScriptureUpdateElement, - ScriptureUpdateElementType, - create_non_text_scripture_element, -) -from .usfm_token import UsfmToken, UsfmTokenType - - -class ScriptureUpdateBlock: - - def __init__(self) -> None: - self.ref: ScriptureRef = ScriptureRef() - self._elements: list[ScriptureUpdateElement] = [] - - @property - def elements(self) -> list[ScriptureUpdateElement]: - return self._elements - - def add_existing_text(self, token: UsfmToken, marked_for_removal: bool = False) -> None: - self._elements.append( - ScriptureUpdateElement(ScriptureUpdateElementType.EXISTING_TEXT, [token], marked_for_removal) - ) - - def add_inserted_text(self, tokens: list[UsfmToken]) -> None: - self._elements.append(ScriptureUpdateElement(ScriptureUpdateElementType.INSERTED_TEXT, tokens.copy())) - - def add_token(self, token: UsfmToken, marked_for_removal: bool = False) -> None: - if token.type == UsfmTokenType.TEXT: - self._elements.append( - ScriptureUpdateElement(ScriptureUpdateElementType.EXISTING_TEXT, [token], marked_for_removal) - ) - else: - self._elements.append(create_non_text_scripture_element([token], marked_for_removal)) - - def add_embed(self, tokens: list[UsfmToken], marked_for_removal: bool = False) -> None: - if len(tokens) == 0: - return - self._elements.append( - ScriptureUpdateElement(ScriptureUpdateElementType.EMBED_BLOCK, tokens, marked_for_removal) - ) - - def update_ref(self, ref: ScriptureRef) -> None: - self.ref = ref - - def clear(self) -> None: - self._elements.clear() - self.ref = ScriptureRef() - - def 
get_tokens(self) -> list[UsfmToken]: - return [token for element in self._elements for token in element.get_tokens()] diff --git a/machine/corpora/scripture_update_block_handler.py b/machine/corpora/scripture_update_block_handler.py deleted file mode 100644 index b3b1d654..00000000 --- a/machine/corpora/scripture_update_block_handler.py +++ /dev/null @@ -1,11 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod - -from .scripture_update_block import ScriptureUpdateBlock - - -class ScriptureUpdateBlockHandler(ABC): - - @abstractmethod - def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock: ... diff --git a/machine/corpora/scripture_update_element.py b/machine/corpora/scripture_update_element.py deleted file mode 100644 index 754e48f6..00000000 --- a/machine/corpora/scripture_update_element.py +++ /dev/null @@ -1,44 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from enum import Enum, auto - -from .scripture_embed import is_embed_style -from .usfm_token import UsfmToken, UsfmTokenType - - -class ScriptureUpdateElementType(Enum): - EXISTING_TEXT = auto() - INSERTED_TEXT = auto() - PARAGRAPH = auto() - EMBED_BLOCK = auto() - EMBED = auto() - STYLE = auto() - OTHER = auto() - - -@dataclass -class ScriptureUpdateElement: - type: ScriptureUpdateElementType - tokens: list[UsfmToken] - marked_for_removal: bool = False - - def get_tokens(self) -> list[UsfmToken]: - if self.marked_for_removal: - return [] - return self.tokens - - -def create_non_text_scripture_element( - tokens: list[UsfmToken], marked_for_removal: bool = False -) -> ScriptureUpdateElement: - tokens = tokens.copy() - # Determine if it is a Paragraph, style, embed or other - if len(tokens) == 0 or tokens[0].marker is None: - return ScriptureUpdateElement(ScriptureUpdateElementType.OTHER, [], marked_for_removal) - if tokens[0].type == UsfmTokenType.PARAGRAPH: - return 
ScriptureUpdateElement(ScriptureUpdateElementType.PARAGRAPH, tokens, marked_for_removal) - if is_embed_style(tokens[0].marker): - return ScriptureUpdateElement(ScriptureUpdateElementType.EMBED, tokens, marked_for_removal) - else: - return ScriptureUpdateElement(ScriptureUpdateElementType.STYLE, tokens, marked_for_removal) diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index 42a3b56c..8df9db91 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -1,16 +1,15 @@ from enum import Enum, auto -from typing import List, Optional, Sequence, Tuple, Union +from typing import Iterable, List, Optional, Sequence, Tuple, Union -from ..scripture.verse_ref import VerseRef -from .scripture_embed import is_embed_part_style from .scripture_ref import ScriptureRef -from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler -from .scripture_update_block import ScriptureUpdateBlock -from .scripture_update_block_handler import ScriptureUpdateBlockHandler +from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType from .usfm_parser_state import UsfmParserState from .usfm_stylesheet import UsfmStylesheet from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType from .usfm_tokenizer import UsfmTokenizer +from .usfm_update_block import UsfmUpdateBlock +from .usfm_update_block_element import UsfmUpdateBlockElementType +from .usfm_update_block_handler import UsfmUpdateBlockHandler class UpdateUsfmTextBehavior(Enum): @@ -25,7 +24,6 @@ class UpdateUsfmMarkerBehavior(Enum): class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandler): - def __init__( self, rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None, @@ -34,21 +32,20 @@ def __init__( paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, 
style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, - preserve_paragraph_styles: Optional[Sequence[str]] = None, - update_block_handlers: Optional[list[ScriptureUpdateBlockHandler]] = None, + preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None, + update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None, ) -> None: super().__init__() self._rows = rows or [] self._tokens: List[UsfmToken] = [] self._updated_text: List[UsfmToken] = [] - self._updated_embed_text: List[UsfmToken] = [] - self._update_block: ScriptureUpdateBlock = ScriptureUpdateBlock() - self._embed_update_block: ScriptureUpdateBlock = ScriptureUpdateBlock() + self._update_block_stack: list[UsfmUpdateBlock] = [] + self._embed_tokens: List[UsfmToken] = [] self._id_text = id_text if update_block_handlers is None: self._update_block_handlers = [] else: - self._update_block_handlers = update_block_handlers + self._update_block_handlers = list(update_block_handlers) if preserve_paragraph_styles is None: self._preserve_paragraph_styles = set(["r", "rem"]) elif isinstance(preserve_paragraph_styles, str): @@ -62,20 +59,18 @@ def __init__( self._replace_stack: List[bool] = [] self._row_index: int = 0 self._token_index: int = 0 - self._embed_updated: bool = False - self._embed_row_texts: List[str] = [] @property def tokens(self) -> List[UsfmToken]: return self._tokens def end_usfm(self, state: UsfmParserState) -> None: - self._collect_tokens(state) - self._process_update_block() + self._collect_updatable_tokens(state) super().end_usfm(state) def start_book(self, state: UsfmParserState, marker: str, code: str) -> None: - self._collect_tokens(state) + self._collect_readonly_tokens(state) + self._update_block_stack.append(UsfmUpdateBlock()) start_book_tokens: List[UsfmToken] = [] if self._id_text is not None: start_book_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=self._id_text + " ")) @@ -84,7 +79,11 @@ def start_book(self, state: UsfmParserState, marker: 
str, code: str) -> None: super().start_book(state, marker, code) def end_book(self, state: UsfmParserState, marker: str) -> None: - self._process_update_block() + self._use_updated_text() + self._pop_new_tokens() + update_block = self._update_block_stack.pop() + self._tokens.extend(update_block.get_tokens()) + super().end_book(state, marker) def start_para( @@ -94,48 +93,35 @@ def start_para( unknown: bool, attributes: Optional[Sequence[UsfmAttribute]], ) -> None: - if marker in self._preserve_paragraph_styles: - self._in_preserved_paragraph = True - if ( state.is_verse_text and (self._has_new_text() or self._text_behavior == UpdateUsfmTextBehavior.STRIP_EXISTING) and self._paragraph_behavior == UpdateUsfmMarkerBehavior.STRIP ): - self._skip_tokens(state) + self._skip_updatable_tokens(state) else: - self._collect_tokens(state) + self._collect_updatable_tokens(state) super().start_para(state, marker, unknown, attributes) - def end_para(self, state: UsfmParserState, marker: str) -> None: - if not state.is_verse_text: - self._process_update_block() - super().end_para(state, marker) - self._in_preserved_paragraph = False - def start_row(self, state: UsfmParserState, marker: str) -> None: - self._collect_tokens(state) + self._collect_updatable_tokens(state) super().start_row(state, marker) def start_cell(self, state: UsfmParserState, marker: str, align: str, colspan: int) -> None: - self._collect_tokens(state) + self._collect_updatable_tokens(state) super().start_cell(state, marker, align, colspan) - def end_cell(self, state: UsfmParserState, marker: str) -> None: - self._collect_tokens(state) - super().end_cell(state, marker) - def start_sidebar(self, state: UsfmParserState, marker: str, category: str) -> None: - self._collect_tokens(state) + self._collect_updatable_tokens(state) super().start_sidebar(state, marker, category) def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None: if closed: - self._collect_tokens(state) + 
self._collect_updatable_tokens(state) super().end_sidebar(state, marker, closed) @@ -147,11 +133,12 @@ def chapter( alt_number: str, pub_number: str, ) -> None: - self._process_update_block() - self._collect_tokens(state) + self._use_updated_text() super().chapter(state, number, marker, alt_number, pub_number) + self._collect_readonly_tokens(state) + def milestone( self, state: UsfmParserState, @@ -159,8 +146,7 @@ def milestone( start_milestone: bool, attributes: Sequence[UsfmAttribute], ) -> None: - self._process_update_block() - self._collect_tokens(state) + self._collect_updatable_tokens(state) super().milestone(state, marker, start_milestone, attributes) @@ -172,11 +158,23 @@ def verse( alt_number: str, pub_number: str, ) -> None: - self._process_update_block() - self._collect_tokens(state) + self._use_updated_text() super().verse(state, number, marker, alt_number, pub_number) + self._collect_readonly_tokens(state) + + def start_note(self, state: UsfmParserState, marker: str, caller: str, category: str) -> None: + super().start_note(state, marker, caller, category) + + self._collect_updatable_tokens(state) + + def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None: + if closed: + self._collect_updatable_tokens(state) + + super().end_note(state, marker, closed) + def start_char( self, state: UsfmParserState, @@ -184,13 +182,17 @@ def start_char( unknown: bool, attributes: Sequence[UsfmAttribute], ) -> None: - if self._replace_with_new_tokens(state): - self._skip_tokens(state) - else: - self._collect_tokens(state) - super().start_char(state, marker_without_plus, unknown, attributes) + if self._current_text_type == ScriptureTextType.EMBED: + self._collect_updatable_tokens(state) + else: + self._replace_with_new_tokens(state) + if self._style_behavior == UpdateUsfmMarkerBehavior.STRIP: + self._skip_updatable_tokens(state) + else: + self._collect_updatable_tokens(state) + def end_char( self, state: UsfmParserState, @@ -198,98 +200,67 @@ def 
end_char( attributes: Sequence[UsfmAttribute], closed: bool, ) -> None: - - skip_tokens = self._replace_with_new_tokens(state, closed) if closed: - if skip_tokens: - self._skip_tokens(state) + if self._current_text_type == ScriptureTextType.EMBED: + self._collect_updatable_tokens(state) else: - self._collect_tokens(state) + self._replace_with_new_tokens(state) + if self._style_behavior == UpdateUsfmMarkerBehavior.STRIP: + self._skip_updatable_tokens(state) + else: + self._collect_updatable_tokens(state) super().end_char(state, marker, attributes, closed) - def _start_embed( - self, - state: UsfmParserState, - scripture_ref: ScriptureRef, - ) -> None: - self._embed_update_block.update_ref(scripture_ref) - self._embed_row_texts = self._advance_rows([scripture_ref]) - self._embed_updated = any(self._embed_row_texts) - - if self._replace_with_new_tokens(state): - self._skip_tokens(state) - else: - self._collect_tokens(state) - - def _end_embed( - self, state: UsfmParserState, marker: str, attributes: Sequence[UsfmAttribute], closed: bool - ) -> None: - skip_tokens = self._replace_with_new_tokens(state, closed) - if closed: - if skip_tokens: - self._skip_tokens(state) - else: - self._collect_tokens(state) - - self._process_embed_update_block() - self._embed_row_texts.clear() - self._embed_updated = False - - super()._end_embed(state, marker, attributes, closed) - def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> None: + super().ref(state, marker, display, target) + if self._replace_with_new_tokens(state): - self._skip_tokens(state) + self._skip_updatable_tokens(state) else: - self._collect_tokens(state) - - super().ref(state, marker, display, target) + self._collect_updatable_tokens(state) def text(self, state: UsfmParserState, text: str) -> None: super().text(state, text) if self._replace_with_new_tokens(state): - self._skip_tokens(state) + self._skip_updatable_tokens(state) else: - self._collect_tokens(state) + 
self._collect_updatable_tokens(state) def opt_break(self, state: UsfmParserState) -> None: - if self._replace_with_new_tokens(state): - self._skip_tokens(state) - else: - self._collect_tokens(state) - super().opt_break(state) - def unmatched(self, state: UsfmParserState, marker: str) -> None: if self._replace_with_new_tokens(state): - self._skip_tokens(state) + self._skip_updatable_tokens(state) else: - self._collect_tokens(state) + self._collect_updatable_tokens(state) + def unmatched(self, state: UsfmParserState, marker: str) -> None: super().unmatched(state, marker) + if self._replace_with_new_tokens(state): + self._skip_updatable_tokens(state) + else: + self._collect_updatable_tokens(state) + def _start_verse_text(self, state: UsfmParserState, scripture_refs: Sequence[ScriptureRef]) -> None: - row_texts: List[str] = self._advance_rows(scripture_refs) - self._push_updated_text([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts]) + self._start_update_block(scripture_refs) def _end_verse_text(self, state: UsfmParserState, scripture_refs: Sequence[ScriptureRef]) -> None: - self._pop_new_tokens() + self._end_update_block(scripture_refs) def _start_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: - row_texts = self._advance_rows([scripture_ref]) - self._push_updated_text([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts]) + self._start_update_block([scripture_ref]) def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: - self._pop_new_tokens() + self._end_update_block([scripture_ref]) - def _start_note_text(self, state: UsfmParserState) -> None: - self._push_updated_embed_text([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in self._embed_row_texts]) - - def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: - self._embed_row_texts.clear() - self._pop_new_tokens() + def _end_embed_text(self, state: UsfmParserState, scripture_ref: 
ScriptureRef) -> None: + self._update_block_stack[-1].add_embed( + self._embed_tokens, marked_for_removal=self._embed_behavior == UpdateUsfmMarkerBehavior.STRIP + ) + self._embed_tokens.clear() def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str: if isinstance(stylesheet, str): @@ -321,113 +292,93 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> List[str]: self._row_index += 1 return row_texts - def _collect_tokens(self, state: UsfmParserState) -> None: + def _collect_updatable_tokens(self, state: UsfmParserState) -> None: self._use_updated_text() while self._token_index <= state.index + state.special_token_count: token = state.tokens[self._token_index] - if self._is_in_embed(token.marker): - self._embed_update_block.add_token(token) + if self._current_text_type == ScriptureTextType.EMBED: + self._embed_tokens.append(token) + elif ( + self._current_text_type != ScriptureTextType.NONE + or (state.para_tag is not None and state.para_tag.marker == "id") + ) and len(self._update_block_stack) > 0: + self._update_block_stack[-1].add_token(token) else: - self._update_block.add_token(token) + self._tokens.append(token) self._token_index += 1 - def _skip_tokens(self, state: UsfmParserState) -> None: + def _collect_readonly_tokens(self, state: UsfmParserState) -> None: while self._token_index <= state.index + state.special_token_count: token = state.tokens[self._token_index] - if self._is_in_embed(token.marker): - self._embed_update_block.add_token(token, marked_for_removal=True) + if len(self._update_block_stack) > 0: + self._update_block_stack[-1].add_token(token) else: - self._update_block.add_token(token, marked_for_removal=True) + self._tokens.append(token) self._token_index += 1 - self._token_index = state.index + 1 + state.special_token_count - def _replace_with_new_tokens(self, state: UsfmParserState, closed: bool = True) -> bool: - marker: Optional[str] = state.token if state.token is None else state.token.marker - 
in_embed: bool = self._is_in_embed(marker) + def _skip_updatable_tokens(self, state: UsfmParserState) -> None: + while self._token_index <= state.index + state.special_token_count: + token = state.tokens[self._token_index] + if self._current_text_type != ScriptureTextType.NONE or ( + state.para_tag is not None and state.para_tag.marker == "id" + ): + if len(self._update_block_stack) > 0: + self._update_block_stack[-1].add_token(token, marked_for_removal=True) + self._token_index += 1 + self._token_index = state.index + 1 + state.special_token_count - in_nested_embed: bool = self._is_in_nested_embed(marker) - is_style_tag: bool = marker is not None and not is_embed_part_style(marker) + def _replace_with_new_tokens(self, state: UsfmParserState) -> bool: + if self._current_text_type == ScriptureTextType.EMBED: + return False existing_text = any( t.type == UsfmTokenType.TEXT and t.text for t in state.tokens[self._token_index : state.index + 1 + state.special_token_count] ) - use_new_tokens = ( - not self._is_in_preserved_paragraph(marker) - and ( - self._text_behavior == UpdateUsfmTextBehavior.STRIP_EXISTING - or ( - self._has_new_text() - and (not existing_text or self._text_behavior != UpdateUsfmTextBehavior.PREFER_EXISTING) - ) - ) - and ( - not in_embed - or ( - self._is_in_note_text() - and not in_nested_embed - and self._embed_behavior == UpdateUsfmMarkerBehavior.PRESERVE - ) - ) - ) - - if use_new_tokens: - if in_embed: - self._use_updated_embed_text() - else: - self._use_updated_text() - - if existing_text and ( - self._text_behavior == UpdateUsfmTextBehavior.PREFER_EXISTING or self._is_in_preserved_paragraph(marker) + use_new_tokens = True + if self._is_in_preserved_paragraph(state): + use_new_tokens = False + elif self._text_behavior != UpdateUsfmTextBehavior.STRIP_EXISTING and ( + not self._has_new_text() + or (existing_text and self._text_behavior == UpdateUsfmTextBehavior.PREFER_EXISTING) ): - if in_embed: - self._clear_updated_embed_text() - else: - 
self._clear_updated_text() + use_new_tokens = False - embed_in_new_verse_text = ( - any(self._replace_stack) or self._text_behavior == UpdateUsfmTextBehavior.STRIP_EXISTING - ) and in_embed - if embed_in_new_verse_text or self._embed_updated: - if self._embed_behavior == UpdateUsfmMarkerBehavior.STRIP: - self._clear_updated_embed_text() - return True - if not self._is_in_note_text() or in_nested_embed: - return False + if use_new_tokens: + self._use_updated_text() - skip_tokens = use_new_tokens and closed + clear_new_tokens = existing_text and ( + self._text_behavior == UpdateUsfmTextBehavior.PREFER_EXISTING or self._is_in_preserved_paragraph(state) + ) - if use_new_tokens and is_style_tag: - skip_tokens = self._style_behavior == UpdateUsfmMarkerBehavior.STRIP + if clear_new_tokens: + self._clear_updated_text() - return skip_tokens + return use_new_tokens def _has_new_text(self) -> bool: return any(self._replace_stack) and self._replace_stack[-1] - def _update_verse_ref(self, verse_ref: VerseRef, marker: str) -> None: - super()._update_verse_ref(verse_ref, marker) - self._update_block.update_ref(ScriptureRef(verse_ref.copy())) - - def _create_non_verse_ref(self) -> ScriptureRef: - ref = super()._create_non_verse_ref() - self._update_block.update_ref(ref) - return ref + def _start_update_block(self, scripture_refs: Sequence[ScriptureRef]) -> None: + self._update_block_stack.append(UsfmUpdateBlock(scripture_refs)) + row_texts: List[str] = self._advance_rows(scripture_refs) + self._push_updated_text([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts]) - def _process_update_block(self) -> None: + def _end_update_block(self, scripture_refs: Sequence[ScriptureRef]) -> None: self._use_updated_text() + self._pop_new_tokens() + update_block = self._update_block_stack.pop() + update_block.update_refs(scripture_refs) for handler in self._update_block_handlers: - self._update_block = handler.process_block(self._update_block) - 
self._tokens.extend(self._update_block.get_tokens()) - self._update_block.clear() - - def _process_embed_update_block(self) -> None: - self._use_updated_embed_text() - for handler in self._update_block_handlers: - self._embed_update_block = handler.process_block(self._embed_update_block) - self._update_block.add_embed(self._embed_update_block.get_tokens()) - self._embed_update_block.clear() + update_block = handler.process_block(update_block) + if ( + len(self._update_block_stack) > 0 + and self._update_block_stack[-1].elements[-1].type == UsfmUpdateBlockElementType.PARAGRAPH + ): + self._update_block_stack[-1].extend_last_element(update_block.get_tokens()) + else: + self._tokens.extend(update_block.get_tokens()) def _push_updated_text(self, tokens: List[UsfmToken]) -> None: self._replace_stack.append(any(tokens)) @@ -436,30 +387,14 @@ def _push_updated_text(self, tokens: List[UsfmToken]) -> None: def _use_updated_text(self) -> None: if self._updated_text: - self._update_block.add_inserted_text(self._updated_text) + self._update_block_stack[-1].add_text(self._updated_text) self._updated_text.clear() def _clear_updated_text(self) -> None: self._updated_text.clear() - def _push_updated_embed_text(self, tokens: List[UsfmToken]) -> None: - self._replace_stack.append(any(tokens)) - if tokens: - self._updated_embed_text.extend(tokens) - - def _use_updated_embed_text(self) -> None: - if self._updated_embed_text: - self._embed_update_block.add_inserted_text(self._updated_embed_text) - self._updated_embed_text.clear() - - def _clear_updated_embed_text(self) -> None: - self._updated_embed_text.clear() - - def _push_updated_text_as_previous(self) -> None: - self._replace_stack.append(self._replace_stack[-1]) - def _pop_new_tokens(self) -> None: self._replace_stack.pop() - def _is_in_preserved_paragraph(self, marker: Optional[str]) -> bool: - return self._in_preserved_paragraph or marker in self._preserve_paragraph_styles + def _is_in_preserved_paragraph(self, state: 
UsfmParserState) -> bool: + return state.para_tag is not None and state.para_tag.marker in self._preserve_paragraph_styles diff --git a/machine/corpora/usfm_text_base.py b/machine/corpora/usfm_text_base.py index 4556144d..c286c001 100644 --- a/machine/corpora/usfm_text_base.py +++ b/machine/corpora/usfm_text_base.py @@ -202,12 +202,8 @@ def text(self, state: UsfmParserState, text: str) -> None: text = text.lstrip() row_text += text elif len(text) > 0 and (self._current_text_type != ScriptureTextType.VERSE or state.is_verse_text): - is_embed_or_nested_dont_update = ( - state.token is not None - and self._is_in_embed(state.token.marker) - and (not self._is_in_note_text() or self._is_in_nested_embed(state.token.marker)) - ) - if is_embed_or_nested_dont_update: + # ignore embed text + if self._current_text_type == ScriptureTextType.EMBED: return if ( @@ -235,18 +231,6 @@ def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRe if self._text._include_all_text: self._rows.append(self._text._create_scripture_row(scripture_ref, text, self._sentence_start)) - def _start_note_text(self, state: UsfmParserState) -> None: - if self._text._include_markers: - return - self._row_texts_stack.append("") - - def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: - if self._text._include_markers: - return - text = self._row_texts_stack.pop() - if self._text._include_all_text: - self._rows.append(self._text._create_scripture_row(scripture_ref, text, self._sentence_start)) - def _output_marker(self, state: UsfmParserState) -> None: if not self._text._include_markers or len(self._row_texts_stack) == 0: return diff --git a/machine/corpora/usfm_update_block.py b/machine/corpora/usfm_update_block.py new file mode 100644 index 00000000..3d612d5f --- /dev/null +++ b/machine/corpora/usfm_update_block.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +from typing import Iterable, Sequence + +from .scripture_ref import ScriptureRef 
from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum, auto
from typing import Iterable, Sequence

# NOTE(review): in the project these types come from sibling modules and must be
# imported there; they are only referenced lazily (annotations / method bodies)
# in this reconstruction:
#   from .scripture_ref import ScriptureRef
#   from .usfm_token import UsfmToken, UsfmTokenType


class UsfmUpdateBlockElementType(Enum):
    """Classification of a run of USFM tokens inside an update block."""

    TEXT = auto()
    PARAGRAPH = auto()
    EMBED = auto()
    STYLE = auto()
    OTHER = auto()


@dataclass(frozen=True)
class UsfmUpdateBlockElement:
    """A run of USFM tokens plus a flag saying whether the run is to be dropped.

    ``get_tokens`` is the single point that honors ``marked_for_removal``:
    removed elements contribute no tokens to the rendered output.
    """

    # type: what kind of run this is (text / paragraph marker / embed / style / other)
    type: UsfmUpdateBlockElementType
    # tokens: the raw USFM tokens of the run, in document order
    tokens: list[UsfmToken]
    # marked_for_removal: when True the run is suppressed by get_tokens()
    marked_for_removal: bool = False

    def get_tokens(self) -> list[UsfmToken]:
        """Return a copy of the tokens, or an empty list when marked for removal."""
        return [] if self.marked_for_removal else self.tokens.copy()


class UsfmUpdateBlockHandler(ABC):
    """Hook invoked on each completed update block before it is emitted."""

    @abstractmethod
    def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: ...


class UsfmUpdateBlock:
    """Ordered collection of update elements covering one span of scripture refs.

    The block accumulates existing/inserted token runs while a verse (or
    non-verse) text span is parsed; handlers may rewrite it before the final
    token stream is produced via :meth:`get_tokens`.
    """

    def __init__(
        self,
        refs: Iterable[ScriptureRef] = (),
        elements: Iterable[UsfmUpdateBlockElement] = (),
    ) -> None:
        # Defaults are immutable tuples: the original used `= []`, which is a
        # shared mutable default (same list object across all calls). Harmless
        # only because list() copies it, but `()` removes the hazard outright.
        self._refs: list[ScriptureRef] = list(refs)
        self._elements: list[UsfmUpdateBlockElement] = list(elements)

    @property
    def refs(self) -> Sequence[ScriptureRef]:
        """Scripture references this block covers (e.g. each verse of a range)."""
        return self._refs

    @property
    def elements(self) -> Sequence[UsfmUpdateBlockElement]:
        """The accumulated elements, in document order."""
        return self._elements

    def add_text(self, tokens: Iterable[UsfmToken]) -> None:
        """Append a run of (typically newly inserted) text tokens."""
        self._elements.append(UsfmUpdateBlockElement(UsfmUpdateBlockElementType.TEXT, list(tokens)))

    def add_token(self, token: UsfmToken, marked_for_removal: bool = False) -> None:
        """Append a single existing token, classified by its USFM token type."""
        if token.type == UsfmTokenType.TEXT:
            element_type = UsfmUpdateBlockElementType.TEXT
        elif token.type == UsfmTokenType.PARAGRAPH:
            element_type = UsfmUpdateBlockElementType.PARAGRAPH
        elif token.type in (UsfmTokenType.CHARACTER, UsfmTokenType.END):
            element_type = UsfmUpdateBlockElementType.STYLE
        else:
            element_type = UsfmUpdateBlockElementType.OTHER
        self._elements.append(UsfmUpdateBlockElement(element_type, [token], marked_for_removal))

    def add_embed(self, tokens: Iterable[UsfmToken], marked_for_removal: bool = False) -> None:
        """Append a complete embed (footnote/cross-reference) as one element."""
        self._elements.append(
            UsfmUpdateBlockElement(UsfmUpdateBlockElementType.EMBED, list(tokens), marked_for_removal)
        )

    def extend_last_element(self, tokens: Iterable[UsfmToken]) -> None:
        """Append tokens to the most recently added element.

        Raises IndexError if the block is empty (same as the original behavior).
        """
        self._elements[-1].tokens.extend(tokens)

    def update_refs(self, refs: Iterable[ScriptureRef]) -> None:
        """Replace the block's scripture references."""
        self._refs = list(refs)

    def get_tokens(self) -> list[UsfmToken]:
        """Flatten all non-removed elements into a single token list."""
        return [token for element in self._elements for token in element.get_tokens()]

    def __eq__(self, other: object) -> bool:
        # Fix: the original accessed other._refs unconditionally, so comparing
        # against any non-UsfmUpdateBlock raised AttributeError instead of
        # returning NotImplemented (the data-model contract for __eq__).
        if not isinstance(other, UsfmUpdateBlock):
            return NotImplemented
        return self._refs == other._refs and self._elements == other._elements

    def copy(self) -> UsfmUpdateBlock:
        """Return a shallow copy.

        The refs/elements *lists* are copied (by __init__), but the element
        objects themselves are shared — mutating an element's token list via
        extend_last_element is visible through both blocks.
        """
        return UsfmUpdateBlock(self._refs, self._elements)
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" +groups = ["main"] +markers = "sys_platform == \"linux\" and extra == \"jobs\"" files = [ {file = "Cython-3.0.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ba67eee9413b66dd9fbacd33f0bc2e028a2a120991d77b5fd4b19d0b1e4039b9"}, {file = "Cython-3.0.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bee2717e5b5f7d966d0c6e27d2efe3698c357aa4d61bb3201997c7a4f9fe485a"}, @@ -1008,6 +1010,8 @@ version = "2.0.0" description = "pip installable eflomal" optional = false python-versions = "*" +groups = ["main"] +markers = "sys_platform == \"linux\" and extra == \"jobs\"" files = [ {file = "eflomal-2.0.0.tar.gz", hash = "sha256:b71183dcf85bf4f59f44ef7a59f5268df1c17c0c8d8093f77b220025ffdba100"}, ] @@ -3337,14 +3341,14 @@ diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pyright" -version = "1.1.399" +version = "1.1.400" description = "Command line wrapper for pyright" optional = false python-versions = ">=3.7" groups = ["dev"] files = [ - {file = "pyright-1.1.399-py3-none-any.whl", hash = "sha256:55f9a875ddf23c9698f24208c764465ffdfd38be6265f7faf9a176e1dc549f3b"}, - {file = "pyright-1.1.399.tar.gz", hash = "sha256:439035d707a36c3d1b443aec980bc37053fbda88158eded24b8eedcf1c7b7a1b"}, + {file = "pyright-1.1.400-py3-none-any.whl", hash = "sha256:c80d04f98b5a4358ad3a35e241dbf2a408eee33a40779df365644f8054d2517e"}, + {file = "pyright-1.1.400.tar.gz", hash = "sha256:b8a3ba40481aa47ba08ffb3228e821d22f7d391f83609211335858bf05686bdb"}, ] [package.dependencies] @@ -5136,4 +5140,4 @@ thot = ["sil-thot"] [metadata] lock-version = "2.1" python-versions = ">=3.9,<3.13" -content-hash = "d292103e26b41fd440528597df80a64661ef21afd6be8fd07a8c34521729ad65" +content-hash = "f56942f52a117fba35a5f43ee631c386ff95dd270301805558064d0228253624" diff --git a/pyproject.toml b/pyproject.toml index 853dc368..0498016f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,7 +84,7 @@ pytest-cov 
= "^4.1.0" ipykernel = "^6.7.0" jupyter = "^1.0.0" pandas = "^2.0.3" -pyright = { extras = ["nodejs"], version = "^1.1.399" } +pyright = { extras = ["nodejs"], version = "^1.1.400" } decoy = "^2.1.0" pep8-naming = "^0.14.1" diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index c9ee6ba8..086f2551 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Sequence, Tuple +from typing import Iterable, List, Optional, Sequence, Tuple, Union from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH, ignore_line_endings @@ -8,6 +8,9 @@ UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior, + UsfmUpdateBlock, + UsfmUpdateBlockElementType, + UsfmUpdateBlockHandler, parse_usfm, ) @@ -50,8 +53,8 @@ def test_get_usfm_strip_all_text() -> None: \r keep this reference \rem and this reference too \ip but remove this text -\v 1 Chapter \add one\add*, \p verse \f + \fr 2:1: \ft This is a \fm ∆\fm* footnote.\f*one. -\v 2 Chapter \add one\add*, \p verse \f + \fr 2:1: \ft This is a \fm ∆\fm* footnote.\f*two. +\v 1 Chapter \add one\add*, \p verse \f + \fr 1:1: \ft This is a \+bd ∆\+bd* footnote.\f*one. +\v 2 Chapter \add one\add*, \p verse \f + \fr 1:2: \ft This is a \+bd ∆\+bd* footnote.\f*two. 
\v 3 Verse 3 \v 4 Verse 4 """ @@ -66,18 +69,18 @@ def test_get_usfm_strip_all_text() -> None: ) result = r"""\id MAT -\c 1 -\r keep this reference -\rem and this reference too -\ip -\v 1 Update 1 \add \add* -\p \f + \fr 2:1: \ft \fm ∆\fm*\f* -\v 2 \add \add* -\p \f + \fr 2:1: \ft \fm ∆\fm*\f* -\v 3 Update 3 -\v 4 -""" - assess(target, result) + \c 1 + \r keep this reference + \rem and this reference too + \ip + \v 1 Update 1 \add \add* + \p \f + \fr 1:1: \ft This is a \+bd ∆\+bd* footnote.\f* + \v 2 \add \add* + \p \f + \fr 1:2: \ft This is a \+bd ∆\+bd* footnote.\f* + \v 3 Update 3 + \v 4 + """ + assert_usfm_equals(target, result) target = update_usfm( rows, @@ -98,7 +101,7 @@ def test_get_usfm_strip_all_text() -> None: \v 3 Update 3 \v 4 """ - assess(target, result) + assert_usfm_equals(target, result) def test_get_usfm_strip_paragraphs_preserve_paragraph_styles(): @@ -129,7 +132,7 @@ def test_get_usfm_strip_paragraphs_preserve_paragraph_styles(): \v 1 Update 1 """ - assess(target, result) + assert_usfm_equals(target, result) target_diff_paragraph = update_usfm( rows, @@ -146,7 +149,7 @@ def test_get_usfm_strip_paragraphs_preserve_paragraph_styles(): \v 1 Update 1 """ - assess(target_diff_paragraph, result_diff_paragraph) + assert_usfm_equals(target_diff_paragraph, result_diff_paragraph) def test_preserve_paragraphs(): @@ -177,7 +180,7 @@ def test_preserve_paragraphs(): \v 1 Update 1 """ - assess(target, result) + assert_usfm_equals(target, result) target_diff_paragraph = update_usfm( rows, usfm, text_behavior=UpdateUsfmTextBehavior.STRIP_EXISTING, preserve_paragraph_styles=("ip") @@ -190,7 +193,7 @@ def test_preserve_paragraphs(): \v 1 Update 1 """ - assess(target_diff_paragraph, result_diff_paragraph) + assert_usfm_equals(target_diff_paragraph, result_diff_paragraph) def test_paragraph_in_verse(): @@ -219,7 +222,7 @@ def test_paragraph_in_verse(): \p inner verse paragraph """ - assess(target, result) + assert_usfm_equals(target, result) target_strip = 
update_usfm( rows, @@ -236,7 +239,7 @@ def test_paragraph_in_verse(): \v 2 """ - assess(target_strip, result_strip) + assert_usfm_equals(target_strip, result_strip) def test_get_usfm_prefer_existing(): @@ -265,7 +268,7 @@ def test_get_usfm_prefer_existing(): \v 2 Update 2 \v 3 Other text """ - assess(target, result) + assert_usfm_equals(target, result) def test_get_usfm_prefer_rows(): @@ -298,25 +301,23 @@ def test_get_usfm_verse_strip_note() -> None: assert "\\v 1 First verse of the second chapter.\r\n" in target -def test_get_usfm_verse_replace_note() -> None: +def test_get_usfm_verse_replace_with_note() -> None: rows = [ ( scr_ref("MAT 1:1"), str("updated text"), ), - (scr_ref("MAT 1:1/1:f"), str("This is a new footnote.")), ] usfm = r"""\id MAT - Test \c 1 -\v 1 Chapter \add one\add*, verse \f + \fr 2:1: \ft This is a \fq quotation \ft and an \fqa alternative quotation\f*one. +\v 1 Chapter \add one\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one. """ target = update_usfm(rows, usfm) - # Only the first \ft marker is updated result = r"""\id MAT - Test \c 1 -\v 1 updated text \f + \fr 2:1: \ft This is a new footnote. \fq quotation \ft and an \fqa alternative quotation\f* +\v 1 updated text \f + \fr 2:1: \ft This is a footnote.\f* """ - assess(target, result) + assert_usfm_equals(target, result) def test_get_usfm_row_verse_segment() -> None: @@ -427,7 +428,7 @@ def test_get_usfm_merge_verse_segments() -> None: ] target = update_usfm(rows) assert target is not None - assert "\\v 2-3 Verse 2. Verse 2a. Verse 2b. \\fm ∆\\fm*\r\n" in target + assert "\\v 2-3 Verse 2. Verse 2a. 
Verse 2b.\r\n" in target def test_get_usfm_verse_opt_break() -> None: @@ -528,7 +529,7 @@ def test_get_usfm_nonverse_relaxed() -> None: def test_get_usfm_nonverse_sidebar() -> None: rows = [ ( - scr_ref("MAT 2:3/2:esb/1:ms"), + scr_ref("MAT 2:3/1:esb/1:ms"), str("The first paragraph of the sidebar."), ) ] @@ -556,7 +557,7 @@ def test_get_usfm_nonverse_table() -> None: def test_get_usfm_nonverse_optbreak() -> None: rows = [ ( - scr_ref("MAT 2:3/2:esb/2:p"), + scr_ref("MAT 2:3/1:esb/2:p"), str("The second paragraph of the sidebar."), ) ] @@ -589,20 +590,16 @@ def test_get_usfm_nonverse_skip_note() -> None: assert "\\ip The introductory paragraph.\r\n" in target -def test_get_usfm_nonverse_replace_note() -> None: +def test_get_usfm_nonverse_replace_with_note() -> None: rows = [ ( scr_ref("MAT 1:0/3:ip"), str("The introductory paragraph."), ), - ( - scr_ref("MAT 1:0/3:ip/1:fe"), - str("This is a new endnote."), - ), ] target = update_usfm(rows) assert target is not None - assert "\\ip The introductory paragraph. \\fe + \\ft This is a new endnote. \\fe*\r\n" in target + assert "\\ip The introductory paragraph. \\fe + \\ft This is an endnote.\\fe*\r\n" in target def test_get_usfm_verse_double_va_vp() -> None: @@ -673,77 +670,6 @@ def test_get_usfm_verse_pretranslations_before_text() -> None: assert "\\ip The introductory paragraph. 
\\fe + \\ft This is an endnote.\\fe*\r\n" in target -def test_embed_style_preservation() -> None: - rows = [ - ( - scr_ref("MAT 1:1"), - str("Update the greeting"), - ), - ( - scr_ref("MAT 1:1/1:f"), - str("Update the comment"), - ), - ( - scr_ref("MAT 1:2"), - str("Update the greeting only"), - ), - ( - scr_ref("MAT 1:3/1:f"), - str("Update the comment only"), - ), - ] - usfm = r"""\id MAT - Test -\c 1 -\v 1 Hello \f \fr 1.1 \ft Some \+bd note\+bd* \f*\bd World \bd* -\v 2 Good \f \fr 1.2 \ft Some other \+bd note\+bd* \f*\bd Morning \bd* -\v 3 Pleasant \f \fr 1.3 \ft A third \+bd note\+bd* \f*\bd Evening \bd* -""" - - target = update_usfm( - rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, style_behavior=UpdateUsfmMarkerBehavior.PRESERVE - ) - result_pp = r"""\id MAT - Test -\c 1 -\v 1 Update the greeting \f \fr 1.1 \ft Update the comment \+bd \+bd*\f*\bd \bd* -\v 2 Update the greeting only \f \fr 1.2 \ft Some other \+bd note\+bd* \f*\bd \bd* -\v 3 Pleasant \f \fr 1.3 \ft Update the comment only \+bd \+bd*\f*\bd Evening \bd* -""" - assess(target, result_pp) - - target = update_usfm( - rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, style_behavior=UpdateUsfmMarkerBehavior.STRIP - ) - result_ps = r"""\id MAT - Test -\c 1 -\v 1 Update the greeting \f \fr 1.1 \ft Update the comment \f* -\v 2 Update the greeting only \f \fr 1.2 \ft Some other \+bd note\+bd* \f* -\v 3 Pleasant \f \fr 1.3 \ft Update the comment only \f*\bd Evening \bd* -""" - assess(target, result_ps) - - target = update_usfm( - rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.STRIP, style_behavior=UpdateUsfmMarkerBehavior.PRESERVE - ) - result_sp = r"""\id MAT - Test -\c 1 -\v 1 Update the greeting \bd \bd* -\v 2 Update the greeting only \bd \bd* -\v 3 Pleasant \bd Evening \bd* -""" - assess(target, result_sp) - - target = update_usfm( - rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.STRIP, style_behavior=UpdateUsfmMarkerBehavior.STRIP - ) - result_ss = r"""\id MAT - 
Test -\c 1 -\v 1 Update the greeting -\v 2 Update the greeting only -\v 3 Pleasant \bd Evening \bd* -""" - assess(target, result_ss) - - def test_strip_paragraphs() -> None: rows = [ ( @@ -776,7 +702,7 @@ def test_strip_paragraphs() -> None: \p World """ - assess(target, result_p) + assert_usfm_equals(target, result_p) target = update_usfm(rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP) result_s = r"""\id MAT - Test @@ -787,7 +713,7 @@ def test_strip_paragraphs() -> None: \v 2 Hello \p World """ - assess(target, result_s) + assert_usfm_equals(target, result_s) def test_preservation_raw_strings() -> None: @@ -807,7 +733,7 @@ def test_preservation_raw_strings() -> None: \c 1 \v 1 Update all in one row \f \fr 1.1 \ft Some note \f* """ - assess(target, result) + assert_usfm_equals(target, result) def test_beginning_of_verse_embed() -> None: @@ -827,27 +753,7 @@ def test_beginning_of_verse_embed() -> None: \c 1 \v 1 Updated text """ - assess(target, result) - - -def test_empty_note() -> None: - rows = [ - ( - scr_ref("MAT 1:1/1:f"), - str("Update the note"), - ) - ] - usfm = r"""\id MAT - Test -\c 1 -\v 1 Empty Note \f \fr 1.1 \ft \f* -""" - - target = update_usfm(rows, usfm) - result = r"""\id MAT - Test -\c 1 -\v 1 Empty Note \f \fr 1.1 \ft Update the note \f* -""" - assess(target, result) + assert_usfm_equals(target, result) def test_cross_reference_dont_update() -> None: @@ -867,10 +773,10 @@ def test_cross_reference_dont_update() -> None: \c 1 \v 1 Cross reference verse \x - \xo 2:3-4 \xt Cool Book 3:24 \xta The annotation \x* and more content. """ - assess(target, result) + assert_usfm_equals(target, result) -def test_preserve_fig_and_fm() -> None: +def test_preserve_fig() -> None: rows = [ ( scr_ref("MAT 1:1"), @@ -879,18 +785,18 @@ def test_preserve_fig_and_fm() -> None: ] usfm = r"""\id MAT - Test \c 1 -\v 1 initial text \fig stuff\fig* more text \fm * \fm* and more. +\v 1 initial text \fig stuff\fig* more text and more. 
""" target = update_usfm(rows, usfm) result = r"""\id MAT - Test \c 1 -\v 1 Update \fig stuff\fig*\fm * \fm* +\v 1 Update \fig stuff\fig* """ - assess(target, result) + assert_usfm_equals(target, result) -def test_nested_xt() -> None: +def test_note_explicit_end_markers() -> None: rows = [ ( scr_ref("MAT 1:1"), @@ -903,108 +809,335 @@ def test_nested_xt() -> None: ] usfm = r"""\id MAT - Test \c 1 -\v 1 initial text \f + \fr 15.8 \ft Text (\+xt reference\+xt*). And more.\f* and the end. +\v 1 initial text \f + \fr 2.4\fr* \fk The \+nd Lord\+nd*:\fk* \ft See \+nd Lord\+nd* in Word List.\ft*\f* and the end. """ target = update_usfm(rows, usfm) result = r"""\id MAT - Test \c 1 -\v 1 Update text \f + \fr 15.8 \ft Update note \+xt reference\+xt*\f* +\v 1 Update text \f + \fr 2.4\fr* \fk The \+nd Lord\+nd*:\fk* \ft See \+nd Lord\+nd* in Word List.\ft*\f* """ - assess(target, result) + assert_usfm_equals(target, result) target = update_usfm(rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.STRIP) result = r"""\id MAT - Test \c 1 \v 1 Update text """ - assess(target, result) + assert_usfm_equals(target, result) -def test_non_nested_xt() -> None: +def test_update_block_verse_preserve_paras() -> None: rows = [ ( scr_ref("MAT 1:1"), - str("Update text"), + str("Update 1"), ), + ] + usfm = r"""\id MAT - Test +\c 1 +\v 1 verse 1 \p inner verse paragraph +""" + + update_block_handler = TestUsfmUpdateBlockHandler() + update_usfm( + rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, update_block_handlers=[update_block_handler] + ) + + assert len(update_block_handler.blocks) == 1 + update_block = update_block_handler.blocks[0] + assert_update_block_equals( + update_block, + "MAT 1:1", + (UsfmUpdateBlockElementType.OTHER, "\\v 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "Update 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "verse 1 ", True), + (UsfmUpdateBlockElementType.PARAGRAPH, "\\p ", False), + (UsfmUpdateBlockElementType.TEXT, "inner verse paragraph ", 
True), + ) + + +def test_update_block_verse_strip_paras() -> None: + rows = [ ( - scr_ref("MAT 1:1/1:f"), - str("Update note"), + scr_ref("MAT 1:1"), + str("Update 1"), ), ] usfm = r"""\id MAT - Test \c 1 -\v 1 initial text \f + \fr 15.8 \ft Text \xt reference\f* and the end. +\v 1 verse 1 \p inner verse paragraph """ - target = update_usfm(rows, usfm) - result = r"""\id MAT - Test + update_block_handler = TestUsfmUpdateBlockHandler() + update_usfm( + rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP, update_block_handlers=[update_block_handler] + ) + + assert len(update_block_handler.blocks) == 1 + update_block = update_block_handler.blocks[0] + assert_update_block_equals( + update_block, + "MAT 1:1", + (UsfmUpdateBlockElementType.OTHER, "\\v 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "Update 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "verse 1 ", True), + (UsfmUpdateBlockElementType.PARAGRAPH, "\\p ", True), + (UsfmUpdateBlockElementType.TEXT, "inner verse paragraph ", True), + ) + + +def test_update_block_verse_range() -> None: + rows = [ + ( + scr_ref("MAT 1:1"), + str("Update 1"), + ), + ] + usfm = r"""\id MAT - Test \c 1 -\v 1 Update text \f + \fr 15.8 \ft Update note \xt reference\f* +\v 1-3 verse 1 through 3 """ - assess(target, result) - target = update_usfm(rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.STRIP) - result = r"""\id MAT - Test + update_block_handler = TestUsfmUpdateBlockHandler() + update_usfm( + rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, update_block_handlers=[update_block_handler] + ) + + assert len(update_block_handler.blocks) == 1 + update_block = update_block_handler.blocks[0] + assert_update_block_equals( + update_block, + ["MAT 1:1", "MAT 1:2", "MAT 1:3"], + (UsfmUpdateBlockElementType.OTHER, "\\v 1-3 ", False), + (UsfmUpdateBlockElementType.TEXT, "Update 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "verse 1 through 3 ", True), + ) + + +def test_update_block_footnote_preserve_embeds() 
-> None: + rows = [ + ( + scr_ref("MAT 1:1"), + str("Update 1"), + ), + ] + usfm = r"""\id MAT - Test \c 1 -\v 1 Update text +\v 1 verse\f \fr 1.1 \ft Some note \f* 1 """ - assess(target, result) + + update_block_handler = TestUsfmUpdateBlockHandler() + update_usfm( + rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE, update_block_handlers=[update_block_handler] + ) + + assert len(update_block_handler.blocks) == 1 + update_block = update_block_handler.blocks[0] + assert_update_block_equals( + update_block, + "MAT 1:1", + (UsfmUpdateBlockElementType.OTHER, "\\v 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "Update 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "verse", True), + (UsfmUpdateBlockElementType.EMBED, "\\f \\fr 1.1 \\ft Some note \\f*", False), + (UsfmUpdateBlockElementType.TEXT, " 1 ", True), + ) -def test_multiple_ft_only_update_first() -> None: +def test_update_block_footnote_strip_embeds() -> None: rows = [ ( scr_ref("MAT 1:1"), - str("Update text"), + str("Update 1"), ), + ] + usfm = r"""\id MAT - Test +\c 1 +\v 1 verse\f \fr 1.1 \ft Some note \f* 1 +""" + + update_block_handler = TestUsfmUpdateBlockHandler() + update_usfm(rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.STRIP, update_block_handlers=[update_block_handler]) + + assert len(update_block_handler.blocks) == 1 + update_block = update_block_handler.blocks[0] + assert_update_block_equals( + update_block, + "MAT 1:1", + (UsfmUpdateBlockElementType.OTHER, "\\v 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "Update 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "verse", True), + (UsfmUpdateBlockElementType.EMBED, "\\f \\fr 1.1 \\ft Some note \\f*", True), + (UsfmUpdateBlockElementType.TEXT, " 1 ", True), + ) + + +def test_update_block_nonverse() -> None: + rows = [ ( - scr_ref("MAT 1:1/1:f"), - str("Update note"), + scr_ref("MAT 1:0/1:s"), + str("Updated section Header"), ), ] usfm = r"""\id MAT - Test +\s Section header \c 1 -\v 1 initial text \f + \fr 15.8 \ft first 
note \ft second note\f* and the end. +\v 1 verse 1 """ - target = update_usfm(rows, usfm) - result = r"""\id MAT - Test + update_block_handler = TestUsfmUpdateBlockHandler() + update_usfm(rows, usfm, update_block_handlers=[update_block_handler]) + + assert len(update_block_handler.blocks) == 2 + update_block = update_block_handler.blocks[0] + assert_update_block_equals( + update_block, + "MAT 1:0/1:s", + (UsfmUpdateBlockElementType.TEXT, "Updated section Header ", False), + (UsfmUpdateBlockElementType.TEXT, "Section header ", True), + ) + + +def test_update_block_verse_preserve_styles() -> None: + rows = [ + ( + scr_ref("MAT 1:1"), + str("Update 1"), + ), + ] + usfm = r"""\id MAT - Test \c 1 -\v 1 Update text \f + \fr 15.8 \ft Update note \ft second note\f* +\v 1 verse \bd 1\bd* """ - assess(target, result) - target = update_usfm(rows, usfm, embed_behavior=UpdateUsfmMarkerBehavior.STRIP) - result = r"""\id MAT - Test + update_block_handler = TestUsfmUpdateBlockHandler() + update_usfm( + rows, usfm, style_behavior=UpdateUsfmMarkerBehavior.PRESERVE, update_block_handlers=[update_block_handler] + ) + + assert len(update_block_handler.blocks) == 1 + update_block = update_block_handler.blocks[0] + assert_update_block_equals( + update_block, + "MAT 1:1", + (UsfmUpdateBlockElementType.OTHER, "\\v 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "Update 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "verse ", True), + (UsfmUpdateBlockElementType.STYLE, "\\bd ", False), + (UsfmUpdateBlockElementType.TEXT, "1", True), + (UsfmUpdateBlockElementType.STYLE, "\\bd*", False), + (UsfmUpdateBlockElementType.TEXT, " ", True), + ) + + +def test_update_block_verse_strip_styles() -> None: + rows = [ + ( + scr_ref("MAT 1:1"), + str("Update 1"), + ), + ] + usfm = r"""\id MAT - Test \c 1 -\v 1 Update text +\v 1 verse \bd 1\bd* """ - assess(target, result) + + update_block_handler = TestUsfmUpdateBlockHandler() + update_usfm(rows, usfm, style_behavior=UpdateUsfmMarkerBehavior.STRIP, 
update_block_handlers=[update_block_handler]) + + assert len(update_block_handler.blocks) == 1 + update_block = update_block_handler.blocks[0] + assert_update_block_equals( + update_block, + "MAT 1:1", + (UsfmUpdateBlockElementType.OTHER, "\\v 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "Update 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "verse ", True), + (UsfmUpdateBlockElementType.STYLE, "\\bd ", True), + (UsfmUpdateBlockElementType.TEXT, "1", True), + (UsfmUpdateBlockElementType.STYLE, "\\bd*", True), + (UsfmUpdateBlockElementType.TEXT, " ", True), + ) -def test_implicitly_closed_char_style() -> None: +def test_update_block_verse_section_header() -> None: rows = [ ( scr_ref("MAT 1:1"), - str("Update text"), - ) + str("Update 1"), + ), ] usfm = r"""\id MAT - Test \c 1 -\v 1 Verse \bd one. -\c 2 -\v 1 Verse one. +\p +\v 1 Verse 1 +\s Section header +\p +\v 2 Verse 2 """ - target = update_usfm(rows, usfm) - result = r"""\id MAT - Test + update_block_handler = TestUsfmUpdateBlockHandler() + update_usfm(rows, usfm, update_block_handlers=[update_block_handler]) + + assert len(update_block_handler.blocks) == 4 + update_block = update_block_handler.blocks[0] + assert_update_block_equals(update_block, "MAT 1:0/1:p") + update_block = update_block_handler.blocks[1] + assert_update_block_equals(update_block, "MAT 1:1/1:s", (UsfmUpdateBlockElementType.TEXT, "Section header ", False)) + update_block = update_block_handler.blocks[2] + assert_update_block_equals( + update_block, + "MAT 1:1", + (UsfmUpdateBlockElementType.OTHER, "\\v 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "Update 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "Verse 1 ", True), + (UsfmUpdateBlockElementType.PARAGRAPH, "\\s Section header ", False), + (UsfmUpdateBlockElementType.PARAGRAPH, "\\p ", False), + ) + update_block = update_block_handler.blocks[3] + assert_update_block_equals( + update_block, + "MAT 1:2", + (UsfmUpdateBlockElementType.OTHER, "\\v 2 ", False), + 
(UsfmUpdateBlockElementType.TEXT, "Verse 2 ", False), + ) + + +def test_update_block_verse_section_header_in_verse() -> None: + rows = [ + ( + scr_ref("MAT 1:1"), + str("Update 1"), + ), + ] + usfm = r"""\id MAT - Test \c 1 -\v 1 Update text -\c 2 -\v 1 Verse one. +\p +\v 1 Beginning of verse +\s Section header +\p end of verse """ - assess(target, result) + + update_block_handler = TestUsfmUpdateBlockHandler() + update_usfm(rows, usfm, update_block_handlers=[update_block_handler]) + + assert len(update_block_handler.blocks) == 3 + update_block = update_block_handler.blocks[0] + assert_update_block_equals(update_block, "MAT 1:0/1:p") + update_block = update_block_handler.blocks[1] + assert_update_block_equals(update_block, "MAT 1:1/1:s", (UsfmUpdateBlockElementType.TEXT, "Section header ", False)) + update_block = update_block_handler.blocks[2] + assert_update_block_equals( + update_block, + "MAT 1:1", + (UsfmUpdateBlockElementType.OTHER, "\\v 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "Update 1 ", False), + (UsfmUpdateBlockElementType.TEXT, "Beginning of verse ", True), + (UsfmUpdateBlockElementType.PARAGRAPH, "\\s Section header ", False), + (UsfmUpdateBlockElementType.PARAGRAPH, "\\p ", False), + (UsfmUpdateBlockElementType.TEXT, "end of verse ", True), + ) def scr_ref(*refs: str) -> List[ScriptureRef]: @@ -1019,7 +1152,8 @@ def update_usfm( paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, - preserve_paragraph_styles: Optional[Sequence[str]] = None, + preserve_paragraph_styles: Optional[Iterable[str]] = None, + update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None, ) -> Optional[str]: if source is None: updater = FileParatextProjectTextUpdater(USFM_TEST_PROJECT_PATH) @@ -1032,17 +1166,25 @@ def update_usfm( embed_behavior, style_behavior, 
preserve_paragraph_styles, + update_block_handlers, ) else: source = source.strip().replace("\r\n", "\n") + "\r\n" updater = UpdateUsfmParserHandler( - rows, id_text, text_behavior, paragraph_behavior, embed_behavior, style_behavior, preserve_paragraph_styles + rows, + id_text, + text_behavior, + paragraph_behavior, + embed_behavior, + style_behavior, + preserve_paragraph_styles, + update_block_handlers, ) parse_usfm(source, updater) return updater.get_usfm() -def assess(target: Optional[str], truth: str) -> None: +def assert_usfm_equals(target: Optional[str], truth: str) -> None: assert target is not None for target_line, truth_line in zip(target.split("\n"), truth.split("\n")): assert target_line.strip() == truth_line.strip() @@ -1051,3 +1193,26 @@ def assess(target: Optional[str], truth: str) -> None: def read_usfm() -> str: with (USFM_TEST_PROJECT_PATH / "41MATTes.SFM").open("r", encoding="utf-8-sig", newline="\r\n") as file: return file.read() + + +def assert_update_block_equals( + block: UsfmUpdateBlock, + expected_ref: Union[str, Iterable[str]], + *expected_elements: tuple[UsfmUpdateBlockElementType, str, bool], +) -> None: + assert block.refs == [ScriptureRef.parse(expected_ref)] if isinstance(expected_ref, str) else list(expected_ref) + assert len(block.elements) == len(expected_elements) + for element, [expected_type, expected_usfm, expected_marked_for_removal] in zip(block.elements, expected_elements): + assert element.type == expected_type + assert "".join(token.to_usfm() for token in element.tokens) == expected_usfm + assert element.marked_for_removal == expected_marked_for_removal + + +class TestUsfmUpdateBlockHandler(UsfmUpdateBlockHandler): + def __init__(self): + self.blocks: list[UsfmUpdateBlock] = [] + + def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: + new_block = block.copy() + self.blocks.append(new_block) + return new_block diff --git a/tests/corpora/test_usfm_file_text.py b/tests/corpora/test_usfm_file_text.py index 
e046d71d..7c40597f 100644 --- a/tests/corpora/test_usfm_file_text.py +++ b/tests/corpora/test_usfm_file_text.py @@ -66,7 +66,7 @@ def test_get_rows_nonempty_text_all_text() -> None: assert text is not None rows = list(text) - assert len(rows) == 52 + assert len(rows) == 48 assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:0/1:h", corpus.versification) assert rows[0].text == "Matthew" @@ -77,56 +77,44 @@ def test_get_rows_nonempty_text_all_text() -> None: assert scripture_ref(rows[2]) == ScriptureRef.parse("MAT 1:0/3:ip", corpus.versification) assert rows[2].text == "An introduction to Matthew" - assert scripture_ref(rows[3]) == ScriptureRef.parse("MAT 1:0/3:ip/1:fe", corpus.versification) - assert rows[3].text == "This is an endnote." - - assert scripture_ref(rows[4]) == ScriptureRef.parse("Mat 1:0/4:p", corpus.versification) - assert rows[4].text == "MAT 1 Here is another paragraph." - - assert scripture_ref(rows[7]) == ScriptureRef.parse("MAT 1:0/7:weirdtaglookingthing", corpus.versification) - assert rows[7].text == "that is not an actual tag." + assert scripture_ref(rows[3]) == ScriptureRef.parse("Mat 1:0/4:p", corpus.versification) + assert rows[3].text == "MAT 1 Here is another paragraph." - assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 1:0/8:s", corpus.versification) - assert rows[8].text == "Chapter One" + assert scripture_ref(rows[6]) == ScriptureRef.parse("MAT 1:0/7:weirdtaglookingthing", corpus.versification) + assert rows[6].text == "that is not an actual tag." - assert scripture_ref(rows[10]) == ScriptureRef.parse("MAT 1:1/1:f", corpus.versification) - assert rows[10].text == "This is a footnote for v1." + assert scripture_ref(rows[7]) == ScriptureRef.parse("MAT 1:0/8:s", corpus.versification) + assert rows[7].text == "Chapter One" - assert scripture_ref(rows[12]) == ScriptureRef.parse("MAT 1:2/1:f", corpus.versification) - assert rows[12].text == "This is a footnote for v2." 
+ assert scripture_ref(rows[16]) == ScriptureRef.parse("MAT 2:0/1:tr/1:tc1", corpus.versification) + assert rows[16].text == "Row one, column one." - assert scripture_ref(rows[19]) == ScriptureRef.parse("MAT 2:0/1:tr/1:tc1", corpus.versification) - assert rows[19].text == "Row one, column one." + assert scripture_ref(rows[17]) == ScriptureRef.parse("MAT 2:0/1:tr/2:tc2", corpus.versification) + assert rows[17].text == "Row one, column two." - assert scripture_ref(rows[20]) == ScriptureRef.parse("MAT 2:0/1:tr/2:tc2", corpus.versification) - assert rows[20].text == "Row one, column two." + assert scripture_ref(rows[18]) == ScriptureRef.parse("MAT 2:0/2:tr/1:tc1", corpus.versification) + assert rows[18].text == "Row two, column one." - assert scripture_ref(rows[21]) == ScriptureRef.parse("MAT 2:0/2:tr/1:tc1", corpus.versification) - assert rows[21].text == "Row two, column one." + assert scripture_ref(rows[19]) == ScriptureRef.parse("MAT 2:0/2:tr/2:tc2", corpus.versification) + assert rows[19].text == "Row two, column two." - assert scripture_ref(rows[22]) == ScriptureRef.parse("MAT 2:0/2:tr/2:tc2", corpus.versification) - assert rows[22].text == "Row two, column two." - - assert scripture_ref(rows[23]) == ScriptureRef.parse("MAT 2:0/3:s1", corpus.versification) - assert rows[23].text == "Chapter Two" - - assert scripture_ref(rows[24]) == ScriptureRef.parse("MAT 2:0/4:p", corpus.versification) - assert not rows[24].text + assert scripture_ref(rows[20]) == ScriptureRef.parse("MAT 2:0/3:s1", corpus.versification) + assert rows[20].text == "Chapter Two" - assert scripture_ref(rows[27]) == ScriptureRef.parse("MAT 2:1/1:f", corpus.versification) - assert rows[27].text == "This is a footnote." 
+ assert scripture_ref(rows[21]) == ScriptureRef.parse("MAT 2:0/4:p", corpus.versification) + assert not rows[21].text - assert scripture_ref(rows[30]) == ScriptureRef.parse("MAT 2:3/2:esb/1:ms", corpus.versification) - assert rows[30].text == "This is a sidebar" + assert scripture_ref(rows[26]) == ScriptureRef.parse("MAT 2:3/1:esb/1:ms", corpus.versification) + assert rows[26].text == "This is a sidebar" - assert scripture_ref(rows[31]) == ScriptureRef.parse("MAT 2:3/2:esb/2:p", corpus.versification) - assert rows[31].text == "Here is some sidebar content." + assert scripture_ref(rows[27]) == ScriptureRef.parse("MAT 2:3/1:esb/2:p", corpus.versification) + assert rows[27].text == "Here is some sidebar content." - assert scripture_ref(rows[37]) == ScriptureRef.parse("MAT 2:7a/1:s", corpus.versification) - assert rows[37].text == "Section header" + assert scripture_ref(rows[33]) == ScriptureRef.parse("MAT 2:7a/1:s", corpus.versification) + assert rows[33].text == "Section header" - assert scripture_ref(rows[44]) == ScriptureRef.parse("MAT 2:12/1:restore", corpus.versification) - assert rows[44].text == "restore information" + assert scripture_ref(rows[40]) == ScriptureRef.parse("MAT 2:12/1:restore", corpus.versification) + assert rows[40].text == "restore information" def test_get_rows_sentence_start() -> None: @@ -243,7 +231,7 @@ def test_get_rows_include_markers_all_text() -> None: assert scripture_ref(rows[23]) == ScriptureRef.parse("MAT 2:1", corpus.versification) assert rows[23].text == "Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one." - assert scripture_ref(rows[27]) == ScriptureRef.parse("MAT 2:3/2:esb/2:p", corpus.versification) + assert scripture_ref(rows[27]) == ScriptureRef.parse("MAT 2:3/1:esb/2:p", corpus.versification) assert rows[27].text == "Here is some sidebar // content." 
diff --git a/tests/corpora/test_usfm_memory_text.py b/tests/corpora/test_usfm_memory_text.py index 46288b59..37b87563 100644 --- a/tests/corpora/test_usfm_memory_text.py +++ b/tests/corpora/test_usfm_memory_text.py @@ -111,11 +111,11 @@ def test_get_rows_verse_para_beginning_non_verse_segment() -> None: """, include_all_text=True, ) - assert len(rows) == 5, str.join(",", [tr.text for tr in rows]) + assert len(rows) == 4, str.join(",", [tr.text for tr in rows]) assert rows[0].text == "" assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:0/1:q1") - assert rows[1].text == "World" - assert scripture_ref(rows[1]) == ScriptureRef.parse("MAT 1:0/1:q1/1:f") + assert rows[1].text == "First verse in line!?!" + assert scripture_ref(rows[1]) == ScriptureRef.parse("MAT 1:1") def test_get_rows_verse_para_comment_first() -> None: @@ -129,11 +129,11 @@ def test_get_rows_verse_para_comment_first() -> None: """, include_all_text=True, ) - assert rows[0].text == "World" - assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:0/1:f") - assert rows[1].text == "This is a comment" - assert scripture_ref(rows[1]) == ScriptureRef.parse("MAT 1:0/2:ip") - assert len(rows) == 3, str.join(",", [tr.text for tr in rows]) + assert rows[0].text == "This is a comment" + assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:0/2:ip") + assert rows[1].text == "First verse in line!?!" + assert scripture_ref(rows[1]) == ScriptureRef.parse("MAT 1:1") + assert len(rows) == 2, str.join(",", [tr.text for tr in rows]) def get_rows(usfm: str, include_markers: bool = False, include_all_text: bool = False) -> List[TextRow]: