Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Paratext/USFM processing tutorial #130

Merged
merged 1 commit into from
Oct 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ If you would like to find out more about how to use Machine, check out the tutor
- [Tokenization](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/tokenization.ipynb)
- [Text Corpora](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/corpora.ipynb)
- [Word Alignment](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/word_alignment.ipynb)
- [Paratext/USFM Processing](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/paratext_usfm.ipynb)
3 changes: 2 additions & 1 deletion machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
normalize,
unescape_spaces,
)
from .update_usfm_parser_handler import UpdateUsfmParserHandler
from .update_usfm_parser_handler import UpdateUsfmBehavior, UpdateUsfmParserHandler
from .usfm_file_text import UsfmFileText
from .usfm_file_text_corpus import UsfmFileTextCorpus
from .usfm_memory_text import UsfmMemoryText
Expand Down Expand Up @@ -125,6 +125,7 @@
"TextRow",
"TextRowFlags",
"unescape_spaces",
"UpdateUsfmBehavior",
"UpdateUsfmParserHandler",
"UsfmAttribute",
"UsfmElementType",
Expand Down
9 changes: 3 additions & 6 deletions machine/corpora/paratext_project_text_updater_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .scripture_ref import ScriptureRef
from .update_usfm_parser_handler import UpdateUsfmParserHandler
from .update_usfm_parser_handler import UpdateUsfmBehavior, UpdateUsfmParserHandler
from .usfm_parser import parse_usfm


Expand All @@ -21,17 +21,14 @@ def update_usfm(
book_id: str,
rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
full_name: Optional[str] = None,
strip_all_text: bool = False,
prefer_existing_text: bool = True,
behavior: UpdateUsfmBehavior = UpdateUsfmBehavior.PREFER_EXISTING,
) -> Optional[str]:
file_name: str = self._settings.get_book_file_name(book_id)
if not self._exists(file_name):
return None
with self._open(file_name) as sfm_file:
usfm: str = sfm_file.read().decode(self._settings.encoding)
handler = UpdateUsfmParserHandler(
rows, None if full_name is None else f"- {full_name}", strip_all_text, prefer_existing_text
)
handler = UpdateUsfmParserHandler(rows, None if full_name is None else f"- {full_name}", behavior)
try:
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
return handler.get_usfm(self._settings.stylesheet)
Expand Down
17 changes: 12 additions & 5 deletions machine/corpora/update_usfm_parser_handler.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from enum import Enum, auto
from typing import List, Optional, Sequence, Tuple, Union

from .scripture_ref import ScriptureRef
Expand All @@ -8,21 +9,25 @@
from .usfm_tokenizer import UsfmTokenizer


# Selects how UpdateUsfmParserHandler chooses between the tokens already in the
# USFM and the new tokens built from the supplied rows. It replaces the former
# strip_all_text / prefer_existing_text boolean pair with one explicit choice.
class UpdateUsfmBehavior(Enum):
# New tokens are used only when there is new text and the segment has no
# existing text. This is the default for the handler and for update_usfm.
PREFER_EXISTING = auto()
# New tokens are used whenever there is new text, whether or not the segment
# already has existing text.
PREFER_NEW = auto()
# New tokens are always used, even when no new text was supplied.
# NOTE(review): presumably this discards the existing text, as the name
# suggests - confirm against the rest of the handler.
STRIP_EXISTING = auto()


class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandler):
def __init__(
self,
rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
id_text: Optional[str] = None,
strip_all_text: bool = False,
prefer_existing_text: bool = False,
behavior: UpdateUsfmBehavior = UpdateUsfmBehavior.PREFER_EXISTING,
) -> None:
super().__init__()
self._rows = rows or []
self._tokens: List[UsfmToken] = []
self._new_tokens: List[UsfmToken] = []
self._id_text = id_text
self._strip_all_text = strip_all_text
self._prefer_existing_text = prefer_existing_text
self._behavior = behavior
self._replace_stack: List[bool] = []
self._row_index: int = 0
self._token_index: int = 0
Expand Down Expand Up @@ -283,7 +288,9 @@ def _replace_with_new_tokens(self, state: UsfmParserState) -> bool:
existing_text = True
break
use_new_tokens: bool = (
self._strip_all_text or (new_text and not existing_text) or (new_text and not self._prefer_existing_text)
self._behavior is UpdateUsfmBehavior.STRIP_EXISTING
or (new_text and not existing_text)
or (new_text and self._behavior is UpdateUsfmBehavior.PREFER_NEW)
)
if use_new_tokens:
self._tokens.extend(self._new_tokens)
Expand Down
Loading
Loading