Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 60c78d6

Browse files
committed Nov 14, 2024 ·
raise error on invalid and mismatched book ids, take 2
1 parent d91eeeb commit 60c78d6

File tree

10 files changed

+178
-2
lines changed

10 files changed

+178
-2
lines changed
 

‎machine/corpora/usfm_text_base.py

+8
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from io import TextIOWrapper
33
from typing import Generator, Iterable, List, Optional, Sequence
44

5+
from ..scripture.canon import ALL_BOOK_IDS
56
from ..scripture.verse_ref import Versification
67
from ..utils.string_utils import has_sentence_ending
78
from .corpora_utils import gen
@@ -90,6 +91,13 @@ def __init__(self, text: UsfmTextBase) -> None:
9091
def rows(self) -> Iterable[TextRow]:
9192
return self._rows
9293

94+
def start_book(self, state: UsfmParserState, marker: str, code: str) -> None:
    """Run the base book-start handling, then reject unusable book codes.

    Args:
        state: Parser state, forwarded unchanged to the base handler.
        marker: Marker that opened the book, forwarded unchanged.
        code: Book code read from the marker.

    Raises:
        ValueError: If ``code`` is not in ``ALL_BOOK_IDS``, or if it is a
            known id that differs from ``self._text.id``.
    """
    super().start_book(state, marker, code)

    # Membership is tested first, so an unknown code is always reported as
    # invalid rather than as a mismatch with the text id.
    if code in ALL_BOOK_IDS:
        if code == self._text.id:
            return
        raise ValueError(f"The \\id marker {code} does not match the text id {self._text.id}.")
    raise ValueError(f"The book {code} is not a valid book id.")
100+
93101
def verse(
94102
self,
95103
state: UsfmParserState,

‎tests/corpora/test_scripture_text_corpus.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH
1+
from pytest import raises
2+
from testutils.corpora_test_helpers import USFM_MISMATCH_ID_PROJECT_PATH, USFM_TEST_PROJECT_PATH
23

34
from machine.corpora import ParatextTextCorpus, extract_scripture_corpus
45
from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef
@@ -59,3 +60,14 @@ def test_extract_scripture_corpus() -> None:
5960
assert text == ""
6061
assert orig_vref.exact_equals(VerseRef.from_string("MAT 2:12", ORIGINAL_VERSIFICATION))
6162
assert corpus_vref is not None and corpus_vref.exact_equals(VerseRef.from_string("MAT 2:12", corpus.versification))
63+
64+
65+
def test_extract_scripture_corpus_mismatch_id() -> None:
    """Extracting the mismatch_id project must fail with the id-mismatch error."""
    # The pattern is handed to pytest's ``match``; it is kept as the same two
    # raw literals so the expected message is unchanged.
    expected_error = (
        r"An error occurred while parsing the text 'JDG' in project mismatch_id. "
        r"Verse: JUD 1:0, line: 1, character: 1, error: 'The \\id marker JUD does not match the text id JDG.'"
    )
    mismatched_corpus = ParatextTextCorpus(USFM_MISMATCH_ID_PROJECT_PATH, include_all_text=True)

    # list() drives the extraction to completion inside the raises block.
    with raises(RuntimeError, match=expected_error):
        list(extract_scripture_corpus(mismatched_corpus))

‎tests/corpora/test_usfm_file_text.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH, scripture_ref
1+
from pytest import raises
2+
from testutils.corpora_test_helpers import USFM_INVALID_ID_PROJECT_PATH, USFM_TEST_PROJECT_PATH, scripture_ref
23

34
from machine.corpora import ScriptureRef, UsfmFileTextCorpus
45

@@ -244,6 +245,19 @@ def test_get_rows_include_markers_all_text() -> None:
244245
assert rows[26].text == "Here is some sidebar // content."
245246

246247

248+
def test_get_rows_invalid_id() -> None:
    """Reading the 'JGS' text of the invalid_id project must fail with the invalid-book-id error."""
    # Same two literals as before, only hoisted into a name for readability.
    expected_error = (
        "An error occurred while parsing the text 'JGS'."
        " Verse: 1:0, line: 1, character: 1, error: 'The book JGS is not a valid book id."
    )
    invalid_id_corpus = UsfmFileTextCorpus(USFM_INVALID_ID_PROJECT_PATH)

    text = invalid_id_corpus.get_text("JGS")
    # The text itself must be found; the failure is expected only once its rows are read.
    assert text is not None

    with raises(RuntimeError, match=expected_error):
        list(text)
259+
260+
247261
def test_usfm_file_text_corpus_lowercase_usfm_id() -> None:
248262
corpus = UsfmFileTextCorpus(USFM_TEST_PROJECT_PATH)
249263

‎tests/testutils/corpora_test_helpers.py

+2
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
USFM_TEST_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "Tes"
1010
USFM_TARGET_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "target"
1111
USFM_SOURCE_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "source"
12+
# Fixture project for the invalid-book-id test: its text is loaded as "JGS",
# which the parser is expected to reject as not a valid book id.
USFM_INVALID_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "invalid_id"
13+
# Fixture project for the id-mismatch test: the text is loaded as "JDG" while
# the \id marker reported in the expected error is "JUD".
USFM_MISMATCH_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "mismatch_id"
1214
USX_TEST_PROJECT_PATH = TEST_DATA_PATH / "usx" / "Tes"
1315
TEXT_TEST_PROJECT_PATH = TEST_DATA_PATH / "txt"
1416
CUSTOM_VERS_PATH = TEST_DATA_PATH / "custom.vrs"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
\id JGS - Test
2+
\h Judges
3+
\mt Judges
4+
\c 1
5+
\v 1 Chapter one, verse one.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
<ScriptureText>
2+
<StyleSheet>usfm.sty</StyleSheet>
3+
<Versification>4</Versification>
4+
<LanguageIsoCode>en:::</LanguageIsoCode>
5+
<Language>English</Language>
6+
<MinParatextVersion>8.0.100.76</MinParatextVersion>
7+
<FullName>Test</FullName>
8+
<Encoding>65001</Encoding>
9+
<Editable>T</Editable>
10+
<Copyright />
11+
<NormalizationForm>NFC</NormalizationForm>
12+
<Name>invalid_id</Name>
13+
<Guid>a7e0b3ce0200736062f9f810a444dbfbe64aca35</Guid>
14+
<DefaultFont>Charis SIL</DefaultFont>
15+
<DefaultFontSize>12</DefaultFontSize>
16+
<FontFeatures />
17+
<HtmlLanguage />
18+
<AssociatedLexicalProject />
19+
<FileNameBookNameForm>41MAT</FileNameBookNameForm>
20+
<FileNamePrePart />
21+
<FileNamePostPart>.SFM</FileNamePostPart>
22+
<BiblicalTermsListSetting>Major::BiblicalTerms.xml</BiblicalTermsListSetting>
23+
<MatchBasedOnStems>F</MatchBasedOnStems>
24+
<AllowReadAccess>F</AllowReadAccess>
25+
<AllowSharingWithSLDR>F</AllowSharingWithSLDR>
26+
<Visibility>Public</Visibility>
27+
<TranslationInfo>Standard::</TranslationInfo>
28+
<EncodingConverter />
29+
<UsfmVersion>3</UsfmVersion>
30+
<ParallelPassagesBooks>000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000</ParallelPassagesBooks>
31+
<BooksPresent>000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000</BooksPresent>
32+
<BibleModuleAssociations />
33+
<Naming PrePart="" PostPart=".SFM" BookNameForm="41MAT" />
34+
</ScriptureText>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# custom.vrs
2+
3+
LEV 14:56
4+
ROM 14:26
5+
REV 12:17
6+
TOB 5:22
7+
TOB 10:12
8+
SIR 23:28
9+
ESG 1:22
10+
ESG 3:15
11+
ESG 5:14
12+
ESG 8:17
13+
ESG 10:14
14+
SIR 33:33
15+
SIR 41:24
16+
BAR 1:22
17+
4MA 7:25
18+
4MA 12:20
19+
20+
# deliberately missing verses
21+
-ROM 16:26
22+
-ROM 16:27
23+
-3JN 1:15
24+
-S3Y 1:49
25+
-ESG 4:6
26+
-ESG 9:5
27+
-ESG 9:30
28+
29+
LEV 14:55 = LEV 14:55
30+
LEV 14:55 = LEV 14:56
31+
LEV 14:56 = LEV 14:57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
\id JUD - Test
2+
\h Judges
3+
\mt Judges
4+
\c 1
5+
\v 1 Chapter one, verse one.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
<ScriptureText>
2+
<StyleSheet>usfm.sty</StyleSheet>
3+
<Versification>4</Versification>
4+
<LanguageIsoCode>en:::</LanguageIsoCode>
5+
<Language>English</Language>
6+
<MinParatextVersion>8.0.100.76</MinParatextVersion>
7+
<FullName>Test</FullName>
8+
<Encoding>65001</Encoding>
9+
<Editable>T</Editable>
10+
<Copyright />
11+
<NormalizationForm>NFC</NormalizationForm>
12+
<Name>mismatch_id</Name>
13+
<Guid>a7e0b3ce0200736062f9f810a444dbfbe64aca35</Guid>
14+
<DefaultFont>Charis SIL</DefaultFont>
15+
<DefaultFontSize>12</DefaultFontSize>
16+
<FontFeatures />
17+
<HtmlLanguage />
18+
<AssociatedLexicalProject />
19+
<FileNameBookNameForm>41MAT</FileNameBookNameForm>
20+
<FileNamePrePart />
21+
<FileNamePostPart>.SFM</FileNamePostPart>
22+
<BiblicalTermsListSetting>Major::BiblicalTerms.xml</BiblicalTermsListSetting>
23+
<MatchBasedOnStems>F</MatchBasedOnStems>
24+
<AllowReadAccess>F</AllowReadAccess>
25+
<AllowSharingWithSLDR>F</AllowSharingWithSLDR>
26+
<Visibility>Public</Visibility>
27+
<TranslationInfo>Standard::</TranslationInfo>
28+
<EncodingConverter />
29+
<UsfmVersion>3</UsfmVersion>
30+
<ParallelPassagesBooks>000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000</ParallelPassagesBooks>
31+
<BooksPresent>000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000</BooksPresent>
32+
<BibleModuleAssociations />
33+
<Naming PrePart="" PostPart=".SFM" BookNameForm="41MAT" />
34+
</ScriptureText>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# custom.vrs
2+
3+
LEV 14:56
4+
ROM 14:26
5+
REV 12:17
6+
TOB 5:22
7+
TOB 10:12
8+
SIR 23:28
9+
ESG 1:22
10+
ESG 3:15
11+
ESG 5:14
12+
ESG 8:17
13+
ESG 10:14
14+
SIR 33:33
15+
SIR 41:24
16+
BAR 1:22
17+
4MA 7:25
18+
4MA 12:20
19+
20+
# deliberately missing verses
21+
-ROM 16:26
22+
-ROM 16:27
23+
-3JN 1:15
24+
-S3Y 1:49
25+
-ESG 4:6
26+
-ESG 9:5
27+
-ESG 9:30
28+
29+
LEV 14:55 = LEV 14:55
30+
LEV 14:55 = LEV 14:56
31+
LEV 14:56 = LEV 14:57

0 commit comments

Comments
 (0)
Please sign in to comment.