From 84a6a3464406673ebc0087888167786f20940820 Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Mon, 11 Nov 2024 18:27:31 -0500 Subject: [PATCH 1/3] raise error when id tag doesn't match filename book id --- .../corpora/paratext_backup_text_corpus.py | 32 ++++++++++------- machine/corpora/paratext_text_corpus.py | 29 ++++++++++------ .../test_paratext_backup_text_corpus.py | 30 +++++++++++++--- tests/corpora/test_paratext_text_corpus.py | 14 ++++++++ tests/testutils/corpora_test_helpers.py | 12 +++++++ .../testutils/data/usfm/invalid_id/07JDG.SFM | 5 +++ .../data/usfm/invalid_id/Settings.xml | 34 +++++++++++++++++++ .../testutils/data/usfm/invalid_id/custom.vrs | 31 +++++++++++++++++ .../testutils/data/usfm/mismatch_id/07JDG.SFM | 5 +++ .../data/usfm/mismatch_id/Settings.xml | 34 +++++++++++++++++++ .../data/usfm/mismatch_id/custom.vrs | 31 +++++++++++++++++ 11 files changed, 230 insertions(+), 27 deletions(-) create mode 100644 tests/corpora/test_paratext_text_corpus.py create mode 100644 tests/testutils/data/usfm/invalid_id/07JDG.SFM create mode 100644 tests/testutils/data/usfm/invalid_id/Settings.xml create mode 100644 tests/testutils/data/usfm/invalid_id/custom.vrs create mode 100644 tests/testutils/data/usfm/mismatch_id/07JDG.SFM create mode 100644 tests/testutils/data/usfm/mismatch_id/Settings.xml create mode 100644 tests/testutils/data/usfm/mismatch_id/custom.vrs diff --git a/machine/corpora/paratext_backup_text_corpus.py b/machine/corpora/paratext_backup_text_corpus.py index 77d70654..34bf8f9f 100644 --- a/machine/corpora/paratext_backup_text_corpus.py +++ b/machine/corpora/paratext_backup_text_corpus.py @@ -19,18 +19,26 @@ def __init__(self, filename: StrPath, include_markers: bool = False, include_all for sfm_entry in archive.filelist: book_id = settings.get_book_id(sfm_entry.filename) if book_id: - texts.append( - UsfmZipText( - settings.stylesheet, - settings.encoding, - book_id, - filename, - sfm_entry.filename, - versification, - include_markers, - include_all_text, - settings.name, - ) + text = UsfmZipText( + settings.stylesheet, + settings.encoding, + book_id, + filename, + sfm_entry.filename, + versification, + include_markers, + include_all_text, + settings.name, ) + with text.get_rows() as rows: + row = next(rows, None) + if row and row.ref.book != book_id: + if row.ref.book == "": + raise ValueError(f"The \\id tag in {sfm_entry.filename} is invalid.") + raise ValueError( + f"The \\id tag {row.ref.book} in {sfm_entry.filename}" + f" does not match filename book id {book_id}." + ) + texts.append(text) super().__init__(versification, texts) diff --git a/machine/corpora/paratext_text_corpus.py b/machine/corpora/paratext_text_corpus.py index 24c24dd3..0831ae6a 100644 --- a/machine/corpora/paratext_text_corpus.py +++ b/machine/corpora/paratext_text_corpus.py @@ -18,17 +18,24 @@ def __init__(self, project_dir: StrPath, include_markers: bool = False, include_ for sfm_filename in Path(project_dir).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"): book_id = settings.get_book_id(sfm_filename.name) if book_id: - texts.append( - UsfmFileText( - settings.stylesheet, - settings.encoding, - book_id, - sfm_filename, - versification, - include_markers, - include_all_text, - settings.name, - ) + text = UsfmFileText( + settings.stylesheet, + settings.encoding, + book_id, + sfm_filename, + versification, + include_markers, + include_all_text, + settings.name, ) + with text.get_rows() as rows: + row = next(rows, None) + if row and row.ref.book != book_id: + if row.ref.book == "": + raise ValueError(f"The \\id tag in {sfm_filename} is invalid.") + raise ValueError( + f"The \\id tag {row.ref.book} in {sfm_filename} does not match filename book id {book_id}." + ) + texts.append(text) super().__init__(versification, texts) diff --git a/tests/corpora/test_paratext_backup_text_corpus.py b/tests/corpora/test_paratext_backup_text_corpus.py index 57907c33..31d85bff 100644 --- a/tests/corpora/test_paratext_backup_text_corpus.py +++ b/tests/corpora/test_paratext_backup_text_corpus.py @@ -2,9 +2,14 @@ from pathlib import Path from tempfile import TemporaryDirectory -from typing import Any, ContextManager +from typing import Any, ContextManager, Optional -from testutils.corpora_test_helpers import create_test_paratext_backup +from pytest import raises +from testutils.corpora_test_helpers import ( + create_test_paratext_backup, + create_test_paratext_backup_invalid_id, + create_test_paratext_backup_mismatch_id, +) from machine.corpora import ParatextBackupTextCorpus @@ -28,10 +33,27 @@ def test_get_text() -> None: assert not any(jhn.get_rows()) +def test_invalid_id() -> None: + with raises(ValueError, match=r"The \\id tag in .* is invalid."): + with _TestEnvironment("invalid_id") as env: + env.corpus.get_text("JDG") + + +def test_mismatch_id() -> None: + with raises(ValueError, match=r"The \\id tag .* in .* does not match filename book id .*"): + with _TestEnvironment("mismatch_id") as env: + env.corpus.get_text("JDG") + + class _TestEnvironment(ContextManager["_TestEnvironment"]): - def __init__(self) -> None: + def __init__(self, project_folder_name: Optional[str] = None) -> None: self._temp_dir = TemporaryDirectory() - archive_filename = create_test_paratext_backup(Path(self._temp_dir.name)) + if project_folder_name == "invalid_id": + archive_filename = create_test_paratext_backup_invalid_id(Path(self._temp_dir.name)) + elif project_folder_name == "mismatch_id": + archive_filename = create_test_paratext_backup_mismatch_id(Path(self._temp_dir.name)) + else: + archive_filename = create_test_paratext_backup(Path(self._temp_dir.name)) self._corpus = ParatextBackupTextCorpus(archive_filename) @property diff --git a/tests/corpora/test_paratext_text_corpus.py b/tests/corpora/test_paratext_text_corpus.py new file mode 100644 index 00000000..ee3906cb --- /dev/null +++ b/tests/corpora/test_paratext_text_corpus.py @@ -0,0 +1,14 @@ +from pytest import raises +from testutils.corpora_test_helpers import USFM_INVALID_ID_PROJECT_PATH, USFM_MISMATCH_ID_PROJECT_PATH + +from machine.corpora import ParatextTextCorpus + + +def test_paratext_text_corpus_invalid_id() -> None: + with raises(ValueError, match=r"The \\id tag in .* is invalid."): + ParatextTextCorpus(USFM_INVALID_ID_PROJECT_PATH, include_all_text=True) + + +def test_paratext_text_corpus_mismatch_id() -> None: + with raises(ValueError, match=r"The \\id tag .* in .* does not match filename book id .*"): + ParatextTextCorpus(USFM_MISMATCH_ID_PROJECT_PATH, include_all_text=True) diff --git a/tests/testutils/corpora_test_helpers.py b/tests/testutils/corpora_test_helpers.py index 4fd93416..2a2fc502 100644 --- a/tests/testutils/corpora_test_helpers.py +++ b/tests/testutils/corpora_test_helpers.py @@ -9,6 +9,8 @@ USFM_TEST_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "Tes" USFM_TARGET_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "target" USFM_SOURCE_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "source" +USFM_MISMATCH_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "mismatch_id" +USFM_INVALID_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "invalid_id" USX_TEST_PROJECT_PATH = TEST_DATA_PATH / "usx" / "Tes" TEXT_TEST_PROJECT_PATH = TEST_DATA_PATH / "txt" CUSTOM_VERS_PATH = TEST_DATA_PATH / "custom.vrs" @@ -24,6 +26,16 @@ def create_test_paratext_backup(temp_dir: Path) -> Path: return temp_dir / "Tes.zip" +def create_test_paratext_backup_invalid_id(temp_dir: Path) -> Path: + shutil.make_archive(str(temp_dir / "invalid_id"), "zip", USFM_INVALID_ID_PROJECT_PATH) + return temp_dir / "invalid_id.zip" + + +def create_test_paratext_backup_mismatch_id(temp_dir: Path) -> Path: + shutil.make_archive(str(temp_dir / "mismatch_id"), "zip", USFM_MISMATCH_ID_PROJECT_PATH) + return temp_dir / "mismatch_id.zip" + + def verse_ref(segment: TextRow) -> VerseRef: assert isinstance(segment.ref, VerseRef) return segment.ref diff --git a/tests/testutils/data/usfm/invalid_id/07JDG.SFM b/tests/testutils/data/usfm/invalid_id/07JDG.SFM new file mode 100644 index 00000000..40d866f3 --- /dev/null +++ b/tests/testutils/data/usfm/invalid_id/07JDG.SFM @@ -0,0 +1,5 @@ +\id JGS - Test +\h Judges +\mt Judges +\c 1 +\v 1 Chapter one, verse one. diff --git a/tests/testutils/data/usfm/invalid_id/Settings.xml b/tests/testutils/data/usfm/invalid_id/Settings.xml new file mode 100644 index 00000000..45cf3eab --- /dev/null +++ b/tests/testutils/data/usfm/invalid_id/Settings.xml @@ -0,0 +1,34 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test + 65001 + T + + NFC + invalid_id + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + .SFM + Major::BiblicalTerms.xml + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/tests/testutils/data/usfm/invalid_id/custom.vrs b/tests/testutils/data/usfm/invalid_id/custom.vrs new file mode 100644 index 00000000..9c1cd387 --- /dev/null +++ b/tests/testutils/data/usfm/invalid_id/custom.vrs @@ -0,0 +1,31 @@ +# custom.vrs + +LEV 14:56 +ROM 14:26 +REV 12:17 +TOB 5:22 +TOB 10:12 +SIR 23:28 +ESG 1:22 +ESG 3:15 +ESG 5:14 +ESG 8:17 +ESG 10:14 +SIR 33:33 +SIR 41:24 +BAR 1:22 +4MA 7:25 +4MA 12:20 + +# deliberately missing verses +-ROM 16:26 +-ROM 16:27 +-3JN 1:15 +-S3Y 1:49 +-ESG 4:6 +-ESG 9:5 +-ESG 9:30 + +LEV 14:55 = LEV 14:55 +LEV 14:55 = LEV 14:56 +LEV 14:56 = LEV 14:57 diff --git a/tests/testutils/data/usfm/mismatch_id/07JDG.SFM b/tests/testutils/data/usfm/mismatch_id/07JDG.SFM new file mode 100644 index 00000000..bc7c876f --- /dev/null +++ b/tests/testutils/data/usfm/mismatch_id/07JDG.SFM @@ -0,0 +1,5 @@ +\id JUD - Test +\h Judges +\mt Judges +\c 1 +\v 1 Chapter one, verse one. diff --git a/tests/testutils/data/usfm/mismatch_id/Settings.xml b/tests/testutils/data/usfm/mismatch_id/Settings.xml new file mode 100644 index 00000000..a068c35c --- /dev/null +++ b/tests/testutils/data/usfm/mismatch_id/Settings.xml @@ -0,0 +1,34 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test + 65001 + T + + NFC + mismatch_id + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + .SFM + Major::BiblicalTerms.xml + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/tests/testutils/data/usfm/mismatch_id/custom.vrs b/tests/testutils/data/usfm/mismatch_id/custom.vrs new file mode 100644 index 00000000..9c1cd387 --- /dev/null +++ b/tests/testutils/data/usfm/mismatch_id/custom.vrs @@ -0,0 +1,31 @@ +# custom.vrs + +LEV 14:56 +ROM 14:26 +REV 12:17 +TOB 5:22 +TOB 10:12 +SIR 23:28 +ESG 1:22 +ESG 3:15 +ESG 5:14 +ESG 8:17 +ESG 10:14 +SIR 33:33 +SIR 41:24 +BAR 1:22 +4MA 7:25 +4MA 12:20 + +# deliberately missing verses +-ROM 16:26 +-ROM 16:27 +-3JN 1:15 +-S3Y 1:49 +-ESG 4:6 +-ESG 9:5 +-ESG 9:30 + +LEV 14:55 = LEV 14:55 +LEV 14:55 = LEV 14:56 +LEV 14:56 = LEV 14:57 From bc41e28e27b17e9512a1a99f367cd943eaafffa5 Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Tue, 12 Nov 2024 15:42:33 -0500 Subject: [PATCH 2/3] Revert "raise error when id tag doesn't match filename book id" This reverts commit 8679b785cdecb2a427726baf55b023a80496498e. --- .../corpora/paratext_backup_text_corpus.py | 32 +++++++---------- machine/corpora/paratext_text_corpus.py | 29 ++++++---------- .../test_paratext_backup_text_corpus.py | 30 +++------------- tests/corpora/test_paratext_text_corpus.py | 14 -------- tests/testutils/corpora_test_helpers.py | 12 ------- .../testutils/data/usfm/invalid_id/07JDG.SFM | 5 --- .../data/usfm/invalid_id/Settings.xml | 34 ------------------- .../testutils/data/usfm/invalid_id/custom.vrs | 31 ----------------- .../testutils/data/usfm/mismatch_id/07JDG.SFM | 5 --- .../data/usfm/mismatch_id/Settings.xml | 34 ------------------- .../data/usfm/mismatch_id/custom.vrs | 31 ----------------- 11 files changed, 27 insertions(+), 230 deletions(-) delete mode 100644 tests/corpora/test_paratext_text_corpus.py delete mode 100644 tests/testutils/data/usfm/invalid_id/07JDG.SFM delete mode 100644 tests/testutils/data/usfm/invalid_id/Settings.xml delete mode 100644 tests/testutils/data/usfm/invalid_id/custom.vrs delete mode 100644 tests/testutils/data/usfm/mismatch_id/07JDG.SFM delete mode 100644 tests/testutils/data/usfm/mismatch_id/Settings.xml delete mode 100644 tests/testutils/data/usfm/mismatch_id/custom.vrs diff --git a/machine/corpora/paratext_backup_text_corpus.py b/machine/corpora/paratext_backup_text_corpus.py index 34bf8f9f..77d70654 100644 --- a/machine/corpora/paratext_backup_text_corpus.py +++ b/machine/corpora/paratext_backup_text_corpus.py @@ -19,26 +19,18 @@ def __init__(self, filename: StrPath, include_markers: bool = False, include_all for sfm_entry in archive.filelist: book_id = settings.get_book_id(sfm_entry.filename) if book_id: - text = UsfmZipText( - settings.stylesheet, - settings.encoding, - book_id, - filename, - sfm_entry.filename, - versification, - include_markers, - include_all_text, - settings.name, + texts.append( + UsfmZipText( + settings.stylesheet, + settings.encoding, + book_id, + filename, + sfm_entry.filename, + versification, + include_markers, + include_all_text, + settings.name, + ) ) - with text.get_rows() as rows: - row = next(rows, None) - if row and row.ref.book != book_id: - if row.ref.book == "": - raise ValueError(f"The \\id tag in {sfm_entry.filename} is invalid.") - raise ValueError( - f"The \\id tag {row.ref.book} in {sfm_entry.filename}" - f" does not match filename book id {book_id}." - ) - texts.append(text) super().__init__(versification, texts) diff --git a/machine/corpora/paratext_text_corpus.py b/machine/corpora/paratext_text_corpus.py index 0831ae6a..24c24dd3 100644 --- a/machine/corpora/paratext_text_corpus.py +++ b/machine/corpora/paratext_text_corpus.py @@ -18,24 +18,17 @@ def __init__(self, project_dir: StrPath, include_markers: bool = False, include_ for sfm_filename in Path(project_dir).glob(f"{settings.file_name_prefix}*{settings.file_name_suffix}"): book_id = settings.get_book_id(sfm_filename.name) if book_id: - text = UsfmFileText( - settings.stylesheet, - settings.encoding, - book_id, - sfm_filename, - versification, - include_markers, - include_all_text, - settings.name, + texts.append( + UsfmFileText( + settings.stylesheet, + settings.encoding, + book_id, + sfm_filename, + versification, + include_markers, + include_all_text, + settings.name, + ) ) - with text.get_rows() as rows: - row = next(rows, None) - if row and row.ref.book != book_id: - if row.ref.book == "": - raise ValueError(f"The \\id tag in {sfm_filename} is invalid.") - raise ValueError( - f"The \\id tag {row.ref.book} in {sfm_filename} does not match filename book id {book_id}." - ) - texts.append(text) super().__init__(versification, texts) diff --git a/tests/corpora/test_paratext_backup_text_corpus.py b/tests/corpora/test_paratext_backup_text_corpus.py index 31d85bff..57907c33 100644 --- a/tests/corpora/test_paratext_backup_text_corpus.py +++ b/tests/corpora/test_paratext_backup_text_corpus.py @@ -2,14 +2,9 @@ from pathlib import Path from tempfile import TemporaryDirectory -from typing import Any, ContextManager, Optional +from typing import Any, ContextManager -from pytest import raises -from testutils.corpora_test_helpers import ( - create_test_paratext_backup, - create_test_paratext_backup_invalid_id, - create_test_paratext_backup_mismatch_id, -) +from testutils.corpora_test_helpers import create_test_paratext_backup from machine.corpora import ParatextBackupTextCorpus @@ -33,27 +28,10 @@ def test_get_text() -> None: assert not any(jhn.get_rows()) -def test_invalid_id() -> None: - with raises(ValueError, match=r"The \\id tag in .* is invalid."): - with _TestEnvironment("invalid_id") as env: - env.corpus.get_text("JDG") - - -def test_mismatch_id() -> None: - with raises(ValueError, match=r"The \\id tag .* in .* does not match filename book id .*"): - with _TestEnvironment("mismatch_id") as env: - env.corpus.get_text("JDG") - - class _TestEnvironment(ContextManager["_TestEnvironment"]): - def __init__(self, project_folder_name: Optional[str] = None) -> None: + def __init__(self) -> None: self._temp_dir = TemporaryDirectory() - if project_folder_name == "invalid_id": - archive_filename = create_test_paratext_backup_invalid_id(Path(self._temp_dir.name)) - elif project_folder_name == "mismatch_id": - archive_filename = create_test_paratext_backup_mismatch_id(Path(self._temp_dir.name)) - else: - archive_filename = create_test_paratext_backup(Path(self._temp_dir.name)) + archive_filename = create_test_paratext_backup(Path(self._temp_dir.name)) self._corpus = ParatextBackupTextCorpus(archive_filename) @property diff --git a/tests/corpora/test_paratext_text_corpus.py b/tests/corpora/test_paratext_text_corpus.py deleted file mode 100644 index ee3906cb..00000000 --- a/tests/corpora/test_paratext_text_corpus.py +++ /dev/null @@ -1,14 +0,0 @@ -from pytest import raises -from testutils.corpora_test_helpers import USFM_INVALID_ID_PROJECT_PATH, USFM_MISMATCH_ID_PROJECT_PATH - -from machine.corpora import ParatextTextCorpus - - -def test_paratext_text_corpus_invalid_id() -> None: - with raises(ValueError, match=r"The \\id tag in .* is invalid."): - ParatextTextCorpus(USFM_INVALID_ID_PROJECT_PATH, include_all_text=True) - - -def test_paratext_text_corpus_mismatch_id() -> None: - with raises(ValueError, match=r"The \\id tag .* in .* does not match filename book id .*"): - ParatextTextCorpus(USFM_MISMATCH_ID_PROJECT_PATH, include_all_text=True) diff --git a/tests/testutils/corpora_test_helpers.py b/tests/testutils/corpora_test_helpers.py index 2a2fc502..4fd93416 100644 --- a/tests/testutils/corpora_test_helpers.py +++ b/tests/testutils/corpora_test_helpers.py @@ -9,8 +9,6 @@ USFM_TEST_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "Tes" USFM_TARGET_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "target" USFM_SOURCE_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "source" -USFM_MISMATCH_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "mismatch_id" -USFM_INVALID_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "invalid_id" USX_TEST_PROJECT_PATH = TEST_DATA_PATH / "usx" / "Tes" TEXT_TEST_PROJECT_PATH = TEST_DATA_PATH / "txt" CUSTOM_VERS_PATH = TEST_DATA_PATH / "custom.vrs" @@ -26,16 +24,6 @@ def create_test_paratext_backup(temp_dir: Path) -> Path: return temp_dir / "Tes.zip" -def create_test_paratext_backup_invalid_id(temp_dir: Path) -> Path: - shutil.make_archive(str(temp_dir / "invalid_id"), "zip", USFM_INVALID_ID_PROJECT_PATH) - return temp_dir / "invalid_id.zip" - - -def create_test_paratext_backup_mismatch_id(temp_dir: Path) -> Path: - shutil.make_archive(str(temp_dir / "mismatch_id"), "zip", USFM_MISMATCH_ID_PROJECT_PATH) - return temp_dir / "mismatch_id.zip" - - def verse_ref(segment: TextRow) -> VerseRef: assert isinstance(segment.ref, VerseRef) return segment.ref diff --git a/tests/testutils/data/usfm/invalid_id/07JDG.SFM b/tests/testutils/data/usfm/invalid_id/07JDG.SFM deleted file mode 100644 index 40d866f3..00000000 --- a/tests/testutils/data/usfm/invalid_id/07JDG.SFM +++ /dev/null @@ -1,5 +0,0 @@ -\id JGS - Test -\h Judges -\mt Judges -\c 1 -\v 1 Chapter one, verse one. diff --git a/tests/testutils/data/usfm/invalid_id/Settings.xml b/tests/testutils/data/usfm/invalid_id/Settings.xml deleted file mode 100644 index 45cf3eab..00000000 --- a/tests/testutils/data/usfm/invalid_id/Settings.xml +++ /dev/null @@ -1,34 +0,0 @@ - - usfm.sty - 4 - en::: - English - 8.0.100.76 - Test - 65001 - T - - NFC - invalid_id - a7e0b3ce0200736062f9f810a444dbfbe64aca35 - Charis SIL - 12 - - - - 41MAT - - .SFM - Major::BiblicalTerms.xml - F - F - F - Public - Standard:: - - 3 - 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 - 000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000 - - - \ No newline at end of file diff --git a/tests/testutils/data/usfm/invalid_id/custom.vrs b/tests/testutils/data/usfm/invalid_id/custom.vrs deleted file mode 100644 index 9c1cd387..00000000 --- a/tests/testutils/data/usfm/invalid_id/custom.vrs +++ /dev/null @@ -1,31 +0,0 @@ -# custom.vrs - -LEV 14:56 -ROM 14:26 -REV 12:17 -TOB 5:22 -TOB 10:12 -SIR 23:28 -ESG 1:22 -ESG 3:15 -ESG 5:14 -ESG 8:17 -ESG 10:14 -SIR 33:33 -SIR 41:24 -BAR 1:22 -4MA 7:25 -4MA 12:20 - -# deliberately missing verses --ROM 16:26 --ROM 16:27 --3JN 1:15 --S3Y 1:49 --ESG 4:6 --ESG 9:5 --ESG 9:30 - -LEV 14:55 = LEV 14:55 -LEV 14:55 = LEV 14:56 -LEV 14:56 = LEV 14:57 diff --git a/tests/testutils/data/usfm/mismatch_id/07JDG.SFM b/tests/testutils/data/usfm/mismatch_id/07JDG.SFM deleted file mode 100644 index bc7c876f..00000000 --- a/tests/testutils/data/usfm/mismatch_id/07JDG.SFM +++ /dev/null @@ -1,5 +0,0 @@ -\id JUD - Test -\h Judges -\mt Judges -\c 1 -\v 1 Chapter one, verse one. diff --git a/tests/testutils/data/usfm/mismatch_id/Settings.xml b/tests/testutils/data/usfm/mismatch_id/Settings.xml deleted file mode 100644 index a068c35c..00000000 --- a/tests/testutils/data/usfm/mismatch_id/Settings.xml +++ /dev/null @@ -1,34 +0,0 @@ - - usfm.sty - 4 - en::: - English - 8.0.100.76 - Test - 65001 - T - - NFC - mismatch_id - a7e0b3ce0200736062f9f810a444dbfbe64aca35 - Charis SIL - 12 - - - - 41MAT - - .SFM - Major::BiblicalTerms.xml - F - F - F - Public - Standard:: - - 3 - 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 - 000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000 - - - \ No newline at end of file diff --git a/tests/testutils/data/usfm/mismatch_id/custom.vrs b/tests/testutils/data/usfm/mismatch_id/custom.vrs deleted file mode 100644 index 9c1cd387..00000000 --- a/tests/testutils/data/usfm/mismatch_id/custom.vrs +++ /dev/null @@ -1,31 +0,0 @@ -# custom.vrs - -LEV 14:56 -ROM 14:26 -REV 12:17 -TOB 5:22 -TOB 10:12 -SIR 23:28 -ESG 1:22 -ESG 3:15 -ESG 5:14 -ESG 8:17 -ESG 10:14 -SIR 33:33 -SIR 41:24 -BAR 1:22 -4MA 7:25 -4MA 12:20 - -# deliberately missing verses --ROM 16:26 --ROM 16:27 --3JN 1:15 --S3Y 1:49 --ESG 4:6 --ESG 9:5 --ESG 9:30 - -LEV 14:55 = LEV 14:55 -LEV 14:55 = LEV 14:56 -LEV 14:56 = LEV 14:57 From 894e2baf0103311bde9a2af9bb7cb10d1902c5ae Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Thu, 14 Nov 2024 16:28:29 -0500 Subject: [PATCH 3/3] raise error on invalid and mismatched book ids, take 2 --- machine/corpora/usfm_text_base.py | 8 +++++ tests/corpora/test_scripture_text_corpus.py | 14 +++++++- tests/corpora/test_usfm_file_text.py | 16 ++++++++- tests/testutils/corpora_test_helpers.py | 2 ++ .../testutils/data/usfm/invalid_id/07JDG.SFM | 5 +++ .../data/usfm/invalid_id/Settings.xml | 34 +++++++++++++++++++ .../testutils/data/usfm/invalid_id/custom.vrs | 31 +++++++++++++++++ .../testutils/data/usfm/mismatch_id/07JDG.SFM | 5 +++ .../data/usfm/mismatch_id/Settings.xml | 34 +++++++++++++++++++ .../data/usfm/mismatch_id/custom.vrs | 31 +++++++++++++++++ 10 files changed, 178 insertions(+), 2 deletions(-) create mode 100644 tests/testutils/data/usfm/invalid_id/07JDG.SFM create mode 100644 tests/testutils/data/usfm/invalid_id/Settings.xml create mode 100644 tests/testutils/data/usfm/invalid_id/custom.vrs create mode 100644 tests/testutils/data/usfm/mismatch_id/07JDG.SFM create mode 100644 tests/testutils/data/usfm/mismatch_id/Settings.xml create mode 100644 tests/testutils/data/usfm/mismatch_id/custom.vrs diff --git a/machine/corpora/usfm_text_base.py b/machine/corpora/usfm_text_base.py index 1fd96f3c..c150779f 100644 --- a/machine/corpora/usfm_text_base.py +++ b/machine/corpora/usfm_text_base.py @@ -2,6 +2,7 @@ from io import TextIOWrapper from typing import Generator, Iterable, List, Optional, Sequence +from ..scripture.canon import ALL_BOOK_IDS from ..scripture.verse_ref import Versification from ..utils.string_utils import has_sentence_ending from .corpora_utils import gen @@ -90,6 +91,13 @@ def __init__(self, text: UsfmTextBase) -> None: def rows(self) -> Iterable[TextRow]: return self._rows + def start_book(self, state: UsfmParserState, marker: str, code: str) -> None: + super().start_book(state, marker, code) + if code not in ALL_BOOK_IDS: + raise ValueError(f"The book {code} is not a valid book id.") + if code != self._text.id: + raise ValueError(f"The \\id marker {code} does not match the text id {self._text.id}.") + def verse( self, state: UsfmParserState, diff --git a/tests/corpora/test_scripture_text_corpus.py b/tests/corpora/test_scripture_text_corpus.py index 925c9ca7..5bbbfca6 100644 --- a/tests/corpora/test_scripture_text_corpus.py +++ b/tests/corpora/test_scripture_text_corpus.py @@ -1,4 +1,5 @@ -from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH +from pytest import raises +from testutils.corpora_test_helpers import USFM_MISMATCH_ID_PROJECT_PATH, USFM_TEST_PROJECT_PATH from machine.corpora import ParatextTextCorpus, extract_scripture_corpus from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef @@ -59,3 +60,14 @@ def test_extract_scripture_corpus() -> None: assert text == "" assert orig_vref.exact_equals(VerseRef.from_string("MAT 2:12", ORIGINAL_VERSIFICATION)) assert corpus_vref is not None and corpus_vref.exact_equals(VerseRef.from_string("MAT 2:12", corpus.versification)) + + +def test_extract_scripture_corpus_mismatch_id() -> None: + corpus = ParatextTextCorpus(USFM_MISMATCH_ID_PROJECT_PATH, include_all_text=True) + + with raises( + RuntimeError, + match=r"An error occurred while parsing the text 'JDG' in project mismatch_id. " + r"Verse: JUD 1:0, line: 1, character: 1, error: 'The \\id marker JUD does not match the text id JDG.'", + ): + list(extract_scripture_corpus(corpus)) diff --git a/tests/corpora/test_usfm_file_text.py b/tests/corpora/test_usfm_file_text.py index 383b95a4..3f87fd38 100644 --- a/tests/corpora/test_usfm_file_text.py +++ b/tests/corpora/test_usfm_file_text.py @@ -1,4 +1,5 @@ -from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH, scripture_ref +from pytest import raises +from testutils.corpora_test_helpers import USFM_INVALID_ID_PROJECT_PATH, USFM_TEST_PROJECT_PATH, scripture_ref from machine.corpora import ScriptureRef, UsfmFileTextCorpus @@ -244,6 +245,19 @@ def test_get_rows_include_markers_all_text() -> None: assert rows[26].text == "Here is some sidebar // content." +def test_get_rows_invalid_id() -> None: + corpus = UsfmFileTextCorpus(USFM_INVALID_ID_PROJECT_PATH) + + text = corpus.get_text("JGS") + assert text is not None + with raises( + RuntimeError, + match="An error occurred while parsing the text 'JGS'." + " Verse: 1:0, line: 1, character: 1, error: 'The book JGS is not a valid book id.", + ): + list(text) + + def test_usfm_file_text_corpus_lowercase_usfm_id() -> None: corpus = UsfmFileTextCorpus(USFM_TEST_PROJECT_PATH) diff --git a/tests/testutils/corpora_test_helpers.py b/tests/testutils/corpora_test_helpers.py index 4fd93416..e2875605 100644 --- a/tests/testutils/corpora_test_helpers.py +++ b/tests/testutils/corpora_test_helpers.py @@ -9,6 +9,8 @@ USFM_TEST_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "Tes" USFM_TARGET_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "target" USFM_SOURCE_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "source" +USFM_INVALID_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "invalid_id" +USFM_MISMATCH_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "mismatch_id" USX_TEST_PROJECT_PATH = TEST_DATA_PATH / "usx" / "Tes" TEXT_TEST_PROJECT_PATH = TEST_DATA_PATH / "txt" CUSTOM_VERS_PATH = TEST_DATA_PATH / "custom.vrs" diff --git a/tests/testutils/data/usfm/invalid_id/07JDG.SFM b/tests/testutils/data/usfm/invalid_id/07JDG.SFM new file mode 100644 index 00000000..6d754977 --- /dev/null +++ b/tests/testutils/data/usfm/invalid_id/07JDG.SFM @@ -0,0 +1,5 @@ +\id JGS - Test +\h Judges +\mt Judges +\c 1 +\v 1 Chapter one, verse one. \ No newline at end of file diff --git a/tests/testutils/data/usfm/invalid_id/Settings.xml b/tests/testutils/data/usfm/invalid_id/Settings.xml new file mode 100644 index 00000000..aa24e29b --- /dev/null +++ b/tests/testutils/data/usfm/invalid_id/Settings.xml @@ -0,0 +1,34 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test + 65001 + T + + NFC + invalid_id + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + .SFM + Major::BiblicalTerms.xml + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/tests/testutils/data/usfm/invalid_id/custom.vrs b/tests/testutils/data/usfm/invalid_id/custom.vrs new file mode 100644 index 00000000..fb315afb --- /dev/null +++ b/tests/testutils/data/usfm/invalid_id/custom.vrs @@ -0,0 +1,31 @@ +# custom.vrs + +LEV 14:56 +ROM 14:26 +REV 12:17 +TOB 5:22 +TOB 10:12 +SIR 23:28 +ESG 1:22 +ESG 3:15 +ESG 5:14 +ESG 8:17 +ESG 10:14 +SIR 33:33 +SIR 41:24 +BAR 1:22 +4MA 7:25 +4MA 12:20 + +# deliberately missing verses +-ROM 16:26 +-ROM 16:27 +-3JN 1:15 +-S3Y 1:49 +-ESG 4:6 +-ESG 9:5 +-ESG 9:30 + +LEV 14:55 = LEV 14:55 +LEV 14:55 = LEV 14:56 +LEV 14:56 = LEV 14:57 \ No newline at end of file diff --git a/tests/testutils/data/usfm/mismatch_id/07JDG.SFM b/tests/testutils/data/usfm/mismatch_id/07JDG.SFM new file mode 100644 index 00000000..19591779 --- /dev/null +++ b/tests/testutils/data/usfm/mismatch_id/07JDG.SFM @@ -0,0 +1,5 @@ +\id JUD - Test +\h Judges +\mt Judges +\c 1 +\v 1 Chapter one, verse one. \ No newline at end of file diff --git a/tests/testutils/data/usfm/mismatch_id/Settings.xml b/tests/testutils/data/usfm/mismatch_id/Settings.xml new file mode 100644 index 00000000..5e09b680 --- /dev/null +++ b/tests/testutils/data/usfm/mismatch_id/Settings.xml @@ -0,0 +1,34 @@ + + usfm.sty + 4 + en::: + English + 8.0.100.76 + Test + 65001 + T + + NFC + mismatch_id + a7e0b3ce0200736062f9f810a444dbfbe64aca35 + Charis SIL + 12 + + + + 41MAT + + .SFM + Major::BiblicalTerms.xml + F + F + F + Public + Standard:: + + 3 + 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000 + + + \ No newline at end of file diff --git a/tests/testutils/data/usfm/mismatch_id/custom.vrs b/tests/testutils/data/usfm/mismatch_id/custom.vrs new file mode 100644 index 00000000..fb315afb --- /dev/null +++ b/tests/testutils/data/usfm/mismatch_id/custom.vrs @@ -0,0 +1,31 @@ +# custom.vrs + +LEV 14:56 +ROM 14:26 +REV 12:17 +TOB 5:22 +TOB 10:12 +SIR 23:28 +ESG 1:22 +ESG 3:15 +ESG 5:14 +ESG 8:17 +ESG 10:14 +SIR 33:33 +SIR 41:24 +BAR 1:22 +4MA 7:25 +4MA 12:20 + +# deliberately missing verses +-ROM 16:26 +-ROM 16:27 +-3JN 1:15 +-S3Y 1:49 +-ESG 4:6 +-ESG 9:5 +-ESG 9:30 + +LEV 14:55 = LEV 14:55 +LEV 14:55 = LEV 14:56 +LEV 14:56 = LEV 14:57 \ No newline at end of file