From 714db8713598150a743c031447efb6d38ec61853 Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Sat, 17 Aug 2024 16:32:14 -0400 Subject: [PATCH] port commit 40a55fd, control pretranslation of existing text --- machine/corpora/usfm_text_updater.py | 84 +++++++---- tests/corpora/test_usfm_file_text.py | 162 ++++++++++----------- tests/corpora/test_usfm_text_updater.py | 37 ++++- tests/corpora/test_usfm_tokenizer.py | 2 +- tests/testutils/data/usfm/Tes/41MATTes.SFM | 3 + 5 files changed, 176 insertions(+), 112 deletions(-) diff --git a/machine/corpora/usfm_text_updater.py b/machine/corpora/usfm_text_updater.py index ba62eea..a293178 100644 --- a/machine/corpora/usfm_text_updater.py +++ b/machine/corpora/usfm_text_updater.py @@ -15,36 +15,35 @@ def __init__( id_text: Optional[str] = None, strip_all_text: bool = False, strict_comparison: bool = True, + prefer_existing_text: bool = False, ) -> None: super().__init__() self._rows = rows or [] self._tokens: List[UsfmToken] = [] + self._new_tokens: List[UsfmToken] = [] self._id_text = id_text self._strip_all_text = strip_all_text self._strict_comparison = strict_comparison + self._prefer_existing_text = prefer_existing_text self._replace_stack: List[bool] = [] self._row_index: int = 0 self._token_index: int = 0 - self._replace_text: bool = False @property def tokens(self) -> List[UsfmToken]: return self._tokens - @property - def replace_text(self) -> bool: - return self._strip_all_text or (len(self._replace_stack) > 0 and self._replace_stack[-1]) - def start_book(self, state: UsfmParserState, marker: str, code: str) -> None: self._collect_tokens(state) + start_book_tokens: List[UsfmToken] = [] if self._id_text is not None: - self._tokens.append(UsfmToken(UsfmTokenType.TEXT, text=self._id_text + " ")) - self._replace_stack.append(self._id_text is not None) + start_book_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=self._id_text + " ")) + self._push_new_tokens(start_book_tokens) super().start_book(state, marker, code) def end_book(self, state: UsfmParserState, marker: str) -> None: - self._replace_stack.pop() + self._pop_new_tokens() super().end_book(state, marker) @@ -127,7 +126,7 @@ def start_char( unknown: bool, attributes: List[UsfmAttribute], ) -> None: - if self.replace_text: + if self._replace_with_new_tokens(state): self._skip_tokens(state) else: self._collect_tokens(state) @@ -141,7 +140,7 @@ def end_char( attributes: List[UsfmAttribute], closed: bool, ) -> None: - if closed and self.replace_text: + if closed and self._replace_with_new_tokens(state): self._skip_tokens(state) super().end_char(state, marker, attributes, closed) @@ -153,7 +152,7 @@ def start_note( caller: str, category: str, ) -> None: - if self.replace_text: + if self._replace_with_new_tokens(state): self._skip_tokens(state) else: self._collect_tokens(state) @@ -161,13 +160,13 @@ def start_note( super().start_note(state, marker, caller, category) def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None: - if closed and self.replace_text: + if closed and self._replace_with_new_tokens(state): self._skip_tokens(state) super().end_note(state, marker, closed) def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> None: - if self.replace_text: + if self._replace_with_new_tokens(state): self._skip_tokens(state) else: self._collect_tokens(state) @@ -175,7 +174,7 @@ def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> super().ref(state, marker, display, target) def text(self, state: UsfmParserState, text: str) -> None: - if self.replace_text: + if self._replace_with_new_tokens(state): self._skip_tokens(state) else: self._collect_tokens(state) @@ -183,7 +182,7 @@ def text(self, state: UsfmParserState, text: str) -> None: super().text(state, text) def opt_break(self, state: UsfmParserState) -> None: - if self.replace_text: + if self._replace_with_new_tokens(state): self._skip_tokens(state) else: self._collect_tokens(state) @@ -191,7 +190,7 @@ def opt_break(self, state: UsfmParserState) -> None: super().opt_break(state) def unmatched(self, state: UsfmParserState, marker: str) -> None: - if self.replace_text: + if self._replace_with_new_tokens(state): self._skip_tokens(state) else: self._collect_tokens(state) @@ -200,38 +199,37 @@ def unmatched(self, state: UsfmParserState, marker: str) -> None: def _start_verse_text(self, state: UsfmParserState, scripture_refs: List[ScriptureRef]) -> None: row_texts: List[str] = self._advance_rows(scripture_refs) - self._tokens.extend(UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts) - self._replace_stack.append(len(row_texts) > 0) + self._push_new_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts]) def _end_verse_text(self, state: UsfmParserState, scripture_refs: List[ScriptureRef]) -> None: - self._replace_stack.pop() + self._pop_new_tokens() def _start_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: row_texts = self._advance_rows([scripture_ref]) - self._tokens.extend(UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts) - self._replace_stack.append(len(row_texts) > 0) + self._push_new_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts]) def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: - self._replace_stack.pop() + self._pop_new_tokens() def _start_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: row_texts = self._advance_rows([scripture_ref]) + new_tokens: List[UsfmToken] = [] if len(row_texts) > 0: if state.token is None: raise ValueError("Invalid parser state.") - self._tokens.append(state.token) - self._tokens.append(UsfmToken(UsfmTokenType.CHARACTER, "ft", None, "ft*")) + new_tokens.append(state.token) + new_tokens.append(UsfmToken(UsfmTokenType.CHARACTER, "ft", None, "ft*")) for i, text in enumerate(row_texts): if i < len(row_texts) - 1: text += " " - self._tokens.append(UsfmToken(UsfmTokenType.TEXT, text=text)) - self._tokens.append(UsfmToken(UsfmTokenType.END, state.token.end_marker, None, None)) - self._replace_stack.append(True) + new_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=text)) + new_tokens.append(UsfmToken(UsfmTokenType.END, state.token.end_marker, None, None)) + self._push_new_tokens(new_tokens) else: - self._replace_stack.append(self._replace_stack[-1]) + self._push_token_as_previous() def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: - self._replace_stack.pop() + self._pop_new_tokens() def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str: if isinstance(stylesheet, str): @@ -268,9 +266,37 @@ def _advance_rows(self, seg_scr_refs: List[ScriptureRef]) -> List[str]: return row_texts def _collect_tokens(self, state: UsfmParserState) -> None: + self._tokens.extend(self._new_tokens) + self._new_tokens.clear() while self._token_index <= state.index + state.special_token_count: self._tokens.append(state.tokens[self._token_index]) self._token_index += 1 def _skip_tokens(self, state: UsfmParserState) -> None: self._token_index = state.index + 1 + state.special_token_count + + def _replace_with_new_tokens(self, state: UsfmParserState) -> bool: + new_text: bool = len(self._replace_stack) > 0 and self._replace_stack[-1] + token_end: int = state.index + state.special_token_count + 1 + existing_text: bool = False + for index in range(self._token_index, token_end + 1): + if state.tokens[index].type == UsfmTokenType.TEXT and state.tokens[index].text: + existing_text = True + break + use_new_tokens: bool = ( + self._strip_all_text or (new_text and not existing_text) or (new_text and not self._prefer_existing_text) + ) + if use_new_tokens: + self._tokens.extend(self._new_tokens) + self._new_tokens.clear() + return use_new_tokens + + def _push_new_tokens(self, tokens: List[UsfmToken]) -> None: + self._replace_stack.append(any(tokens)) + self._new_tokens.extend(tokens) + + def _push_token_as_previous(self) -> None: + self._replace_stack.append(self._replace_stack[-1]) + + def _pop_new_tokens(self) -> None: + self._replace_stack.pop() diff --git a/tests/corpora/test_usfm_file_text.py b/tests/corpora/test_usfm_file_text.py index 33679ae..ad920f1 100644 --- a/tests/corpora/test_usfm_file_text.py +++ b/tests/corpora/test_usfm_file_text.py @@ -10,7 +10,7 @@ def test_get_rows_nonempty_text() -> None: assert text is not None rows = list(text) - assert len(rows) == 19 + assert len(rows) == 22 assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:1", corpus.versification) assert rows[0].text == "Chapter one, verse one." @@ -21,41 +21,41 @@ def test_get_rows_nonempty_text() -> None: assert scripture_ref(rows[4]) == ScriptureRef.parse("MAT 1:5", corpus.versification) assert rows[4].text == "Chapter one, verse five." - assert scripture_ref(rows[5]) == ScriptureRef.parse("MAT 2:1", corpus.versification) - assert rows[5].text == "Chapter two, verse one." + assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 2:1", corpus.versification) + assert rows[8].text == "Chapter two, verse one." - assert scripture_ref(rows[6]) == ScriptureRef.parse("MAT 2:2", corpus.versification) - assert rows[6].text == "Chapter two, verse two. Chapter two, verse three." - assert rows[6].is_in_range - assert rows[6].is_range_start + assert scripture_ref(rows[9]) == ScriptureRef.parse("MAT 2:2", corpus.versification) + assert rows[9].text == "Chapter two, verse two. Chapter two, verse three." + assert rows[9].is_in_range + assert rows[9].is_range_start - assert scripture_ref(rows[7]) == ScriptureRef.parse("MAT 2:3", corpus.versification) - assert len(rows[7].segment) == 0 - assert rows[7].is_in_range - assert not rows[7].is_range_start + assert scripture_ref(rows[10]) == ScriptureRef.parse("MAT 2:3", corpus.versification) + assert len(rows[10].segment) == 0 + assert rows[10].is_in_range + assert not rows[10].is_range_start - assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 2:4a", corpus.versification) - assert len(rows[8].segment) == 0 - assert rows[8].is_in_range - assert not rows[8].is_range_start + assert scripture_ref(rows[11]) == ScriptureRef.parse("MAT 2:4a", corpus.versification) + assert len(rows[11].segment) == 0 + assert rows[11].is_in_range + assert not rows[11].is_range_start - assert scripture_ref(rows[9]) == ScriptureRef.parse("MAT 2:4b", corpus.versification) - assert rows[9].text == "Chapter two, verse four." + assert scripture_ref(rows[12]) == ScriptureRef.parse("MAT 2:4b", corpus.versification) + assert rows[12].text == "Chapter two, verse four." - assert scripture_ref(rows[10]) == ScriptureRef.parse("MAT 2:5", corpus.versification) - assert rows[10].text == "Chapter two, verse five." + assert scripture_ref(rows[13]) == ScriptureRef.parse("MAT 2:5", corpus.versification) + assert rows[13].text == "Chapter two, verse five." - assert scripture_ref(rows[11]) == ScriptureRef.parse("MAT 2:6", corpus.versification) - assert rows[11].text == "Chapter two, verse six." + assert scripture_ref(rows[14]) == ScriptureRef.parse("MAT 2:6", corpus.versification) + assert rows[14].text == "Chapter two, verse six." - assert scripture_ref(rows[15]) == ScriptureRef.parse("MAT 2:9", corpus.versification) - assert rows[15].text == "Chapter 2 verse 9" + assert scripture_ref(rows[18]) == ScriptureRef.parse("MAT 2:9", corpus.versification) + assert rows[18].text == "Chapter 2 verse 9" - assert scripture_ref(rows[16]) == ScriptureRef.parse("MAT 2:10", corpus.versification) - assert rows[16].text == "Chapter 2 verse 10" + assert scripture_ref(rows[19]) == ScriptureRef.parse("MAT 2:10", corpus.versification) + assert rows[19].text == "Chapter 2 verse 10" - assert scripture_ref(rows[17]) == ScriptureRef.parse("MAT 2:11", corpus.versification) - assert not rows[17].text + assert scripture_ref(rows[20]) == ScriptureRef.parse("MAT 2:11", corpus.versification) + assert not rows[20].text def test_get_rows_nonempty_text_all_text() -> None: @@ -65,7 +65,7 @@ def test_get_rows_nonempty_text_all_text() -> None: assert text is not None rows = list(text) - assert len(rows) == 36 + assert len(rows) == 39 assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:0/1:h", corpus.versification) assert rows[0].text == "Matthew" @@ -88,35 +88,35 @@ def test_get_rows_nonempty_text_all_text() -> None: assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 1:2/1:f", corpus.versification) assert rows[8].text == "1:2: This is a footnote." - assert scripture_ref(rows[12]) == ScriptureRef.parse("MAT 2:0/1:tr/1:tc1", corpus.versification) - assert rows[12].text == "Row one, column one." + assert scripture_ref(rows[15]) == ScriptureRef.parse("MAT 2:0/1:tr/1:tc1", corpus.versification) + assert rows[15].text == "Row one, column one." - assert scripture_ref(rows[13]) == ScriptureRef.parse("MAT 2:0/1:tr/2:tc2", corpus.versification) - assert rows[13].text == "Row one, column two." + assert scripture_ref(rows[16]) == ScriptureRef.parse("MAT 2:0/1:tr/2:tc2", corpus.versification) + assert rows[16].text == "Row one, column two." - assert scripture_ref(rows[14]) == ScriptureRef.parse("MAT 2:0/2:tr/1:tc1", corpus.versification) - assert rows[14].text == "Row two, column one." + assert scripture_ref(rows[17]) == ScriptureRef.parse("MAT 2:0/2:tr/1:tc1", corpus.versification) + assert rows[17].text == "Row two, column one." - assert scripture_ref(rows[15]) == ScriptureRef.parse("MAT 2:0/2:tr/2:tc2", corpus.versification) - assert rows[15].text == "Row two, column two." + assert scripture_ref(rows[18]) == ScriptureRef.parse("MAT 2:0/2:tr/2:tc2", corpus.versification) + assert rows[18].text == "Row two, column two." - assert scripture_ref(rows[16]) == ScriptureRef.parse("MAT 2:0/3:s1", corpus.versification) - assert rows[16].text == "Chapter Two" + assert scripture_ref(rows[19]) == ScriptureRef.parse("MAT 2:0/3:s1", corpus.versification) + assert rows[19].text == "Chapter Two" - assert scripture_ref(rows[18]) == ScriptureRef.parse("MAT 2:1/1:f", corpus.versification) - assert rows[18].text == "2:1: This is a footnote." + assert scripture_ref(rows[21]) == ScriptureRef.parse("MAT 2:1/1:f", corpus.versification) + assert rows[21].text == "2:1: This is a footnote." - assert scripture_ref(rows[21]) == ScriptureRef.parse("MAT 2:3/1:esb/1:ms", corpus.versification) - assert rows[21].text == "This is a sidebar" + assert scripture_ref(rows[24]) == ScriptureRef.parse("MAT 2:3/1:esb/1:ms", corpus.versification) + assert rows[24].text == "This is a sidebar" - assert scripture_ref(rows[22]) == ScriptureRef.parse("MAT 2:3/1:esb/2:p", corpus.versification) - assert rows[22].text == "Here is some sidebar content." + assert scripture_ref(rows[25]) == ScriptureRef.parse("MAT 2:3/1:esb/2:p", corpus.versification) + assert rows[25].text == "Here is some sidebar content." - assert scripture_ref(rows[28]) == ScriptureRef.parse("MAT 2:7a/1:s", corpus.versification) - assert rows[28].text == "Section header" + assert scripture_ref(rows[31]) == ScriptureRef.parse("MAT 2:7a/1:s", corpus.versification) + assert rows[31].text == "Section header" - assert scripture_ref(rows[35]) == ScriptureRef.parse("MAT 2:12/1:restore", corpus.versification) - assert rows[35].text == "restore information" + assert scripture_ref(rows[38]) == ScriptureRef.parse("MAT 2:12/1:restore", corpus.versification) + assert rows[38].text == "restore information" def test_get_rows_sentence_start() -> None: @@ -126,7 +126,7 @@ def test_get_rows_sentence_start() -> None: assert text is not None rows = list(text) - assert len(rows) == 19 + assert len(rows) == 22 assert scripture_ref(rows[3]) == ScriptureRef.parse("MAT 1:4", corpus.versification) assert rows[3].text == "Chapter one, verse four," @@ -154,7 +154,7 @@ def test_get_rows_include_markers() -> None: assert text is not None rows = list(text) - assert len(rows) == 19 + assert len(rows) == 22 assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:1", corpus.versification) assert ( @@ -167,38 +167,38 @@ def test_get_rows_include_markers() -> None: assert scripture_ref(rows[4]) == ScriptureRef.parse("MAT 1:5", corpus.versification) assert rows[4].text == 'Chapter one, \\li2 verse \\fig Figure 1|src="image1.png" size="col" ref="1:5"\\fig* five.' - assert scripture_ref(rows[5]) == ScriptureRef.parse("MAT 2:1", corpus.versification) - assert rows[5].text == "Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one." + assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 2:1", corpus.versification) + assert rows[8].text == "Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one." - assert scripture_ref(rows[6]) == ScriptureRef.parse("MAT 2:2", corpus.versification) - assert rows[6].text == "Chapter two, // verse \\fm ∆\\fm*two. Chapter two, verse \\w three|lemma\\w*." - assert rows[6].is_in_range - assert rows[6].is_range_start + assert scripture_ref(rows[9]) == ScriptureRef.parse("MAT 2:2", corpus.versification) + assert rows[9].text == "Chapter two, // verse \\fm ∆\\fm*two. Chapter two, verse \\w three|lemma\\w*." + assert rows[9].is_in_range + assert rows[9].is_range_start - assert scripture_ref(rows[7]) == ScriptureRef.parse("MAT 2:3", corpus.versification) - assert len(rows[7].segment) == 0 - assert rows[7].is_in_range - assert not rows[7].is_range_start + assert scripture_ref(rows[10]) == ScriptureRef.parse("MAT 2:3", corpus.versification) + assert len(rows[10].segment) == 0 + assert rows[10].is_in_range + assert not rows[10].is_range_start - assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 2:4a", corpus.versification) - assert len(rows[8].segment) == 0 - assert rows[8].is_in_range - assert not rows[8].is_range_start + assert scripture_ref(rows[11]) == ScriptureRef.parse("MAT 2:4a", corpus.versification) + assert len(rows[11].segment) == 0 + assert rows[11].is_in_range + assert not rows[11].is_range_start - assert scripture_ref(rows[9]) == ScriptureRef.parse("MAT 2:4b", corpus.versification) - assert rows[9].text == "Chapter two, verse four." + assert scripture_ref(rows[12]) == ScriptureRef.parse("MAT 2:4b", corpus.versification) + assert rows[12].text == "Chapter two, verse four." - assert scripture_ref(rows[10]) == ScriptureRef.parse("MAT 2:5", corpus.versification) - assert rows[10].text == "Chapter two, verse five \\rq (MAT 3:1)\\rq*." + assert scripture_ref(rows[13]) == ScriptureRef.parse("MAT 2:5", corpus.versification) + assert rows[13].text == "Chapter two, verse five \\rq (MAT 3:1)\\rq*." - assert scripture_ref(rows[11]) == ScriptureRef.parse("MAT 2:6", corpus.versification) - assert rows[11].text == 'Chapter two, verse \\w six|strong="12345" \\w*.' + assert scripture_ref(rows[14]) == ScriptureRef.parse("MAT 2:6", corpus.versification) + assert rows[14].text == 'Chapter two, verse \\w six|strong="12345" \\w*.' - assert scripture_ref(rows[15]) == ScriptureRef.parse("MAT 2:9", corpus.versification) - assert rows[15].text == "Chapter\\tcr2 2\\tc3 verse\\tcr4 9" + assert scripture_ref(rows[18]) == ScriptureRef.parse("MAT 2:9", corpus.versification) + assert rows[18].text == "Chapter\\tcr2 2\\tc3 verse\\tcr4 9" - assert scripture_ref(rows[16]) == ScriptureRef.parse("MAT 2:10", corpus.versification) - assert rows[16].text == "\\tc3-4 Chapter 2 verse 10" + assert scripture_ref(rows[19]) == ScriptureRef.parse("MAT 2:10", corpus.versification) + assert rows[19].text == "\\tc3-4 Chapter 2 verse 10" def test_get_rows_include_markers_all_text() -> None: @@ -209,7 +209,7 @@ def test_get_rows_include_markers_all_text() -> None: assert text is not None rows = list(text) - assert len(rows) == 32 + assert len(rows) == 35 assert scripture_ref(rows[2]) == ScriptureRef.parse("MAT 1:0/3:ip", corpus.versification) assert rows[2].text == "An introduction to Matthew\\fe + \\ft This is an endnote.\\fe*" @@ -225,14 +225,14 @@ def test_get_rows_include_markers_all_text() -> None: assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 1:5", corpus.versification) assert rows[8].text == 'Chapter one, \\li2 verse \\fig Figure 1|src="image1.png" size="col" ref="1:5"\\fig* five.' - assert scripture_ref(rows[13]) == ScriptureRef.parse("MAT 2:0/3:s1", corpus.versification) - assert rows[13].text == "Chapter \\it Two \\it*" + assert scripture_ref(rows[16]) == ScriptureRef.parse("MAT 2:0/3:s1", corpus.versification) + assert rows[16].text == "Chapter \\it Two \\it*" - assert scripture_ref(rows[14]) == ScriptureRef.parse("MAT 2:1", corpus.versification) - assert rows[14].text == "Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one." + assert scripture_ref(rows[17]) == ScriptureRef.parse("MAT 2:1", corpus.versification) + assert rows[17].text == "Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one." - assert scripture_ref(rows[18]) == ScriptureRef.parse("MAT 2:3/1:esb/2:p", corpus.versification) - assert rows[18].text == "Here is some sidebar // content." + assert scripture_ref(rows[21]) == ScriptureRef.parse("MAT 2:3/1:esb/2:p", corpus.versification) + assert rows[21].text == "Here is some sidebar // content." def test_usfm_file_text_corpus_lowercase_usfm_id() -> None: diff --git a/tests/corpora/test_usfm_text_updater.py b/tests/corpora/test_usfm_text_updater.py index 80693ab..8ef31f8 100644 --- a/tests/corpora/test_usfm_text_updater.py +++ b/tests/corpora/test_usfm_text_updater.py @@ -30,6 +30,40 @@ def test_get_usfm_strip_all_text() -> None: assert "\\s\r\n" in target +def test_get_usfm_prefer_existing(): + rows = [ + ( + scr_ref("MAT 1:6"), + str("Text 6"), + ), + ( + scr_ref("MAT 1:7"), + str("Text 7"), + ), + ] + target = update_usfm(rows, prefer_existing_text=True) + assert "\\id MAT - Test\r\n" in target + assert "\\v 6 Verse 6 content.\r\n" in target + assert "\\v 7 Text 7\r\n" in target + + +def test_get_usfm_prefer_rows(): + rows = [ + ( + scr_ref("MAT 1:6"), + str("Text 6"), + ), + ( + scr_ref("MAT 1:7"), + str("Text 7"), + ), + ] + target = update_usfm(rows, prefer_existing_text=False) + assert "\\id MAT - Test\r\n" in target + assert "\\v 6 Text 6\r\n" in target + assert "\\v 7 Text 7\r\n" in target + + def test_get_usfm_verse_skip_note() -> None: rows = [ ( @@ -306,9 +340,10 @@ def update_usfm( id_text: Optional[str] = None, strip_all_text: bool = False, strict_comparison: bool = True, + prefer_existing_text: bool = False, ) -> str: source = read_usfm() - updater = UsfmTextUpdater(rows, id_text, strip_all_text, strict_comparison) + updater = UsfmTextUpdater(rows, id_text, strip_all_text, strict_comparison, prefer_existing_text) parse_usfm(source, updater) return updater.get_usfm() diff --git a/tests/corpora/test_usfm_tokenizer.py b/tests/corpora/test_usfm_tokenizer.py index 74c30ac..e91abe8 100644 --- a/tests/corpora/test_usfm_tokenizer.py +++ b/tests/corpora/test_usfm_tokenizer.py @@ -7,7 +7,7 @@ def test_tokenize() -> None: usfm = _read_usfm() usfm_tokenizer = UsfmTokenizer() tokens = usfm_tokenizer.tokenize(usfm) - assert len(tokens) == 170 + assert len(tokens) == 174 assert tokens[0].type is UsfmTokenType.BOOK assert tokens[0].marker == "id" diff --git a/tests/testutils/data/usfm/Tes/41MATTes.SFM b/tests/testutils/data/usfm/Tes/41MATTes.SFM index af634ba..3224e09 100644 --- a/tests/testutils/data/usfm/Tes/41MATTes.SFM +++ b/tests/testutils/data/usfm/Tes/41MATTes.SFM @@ -14,6 +14,9 @@ \li2 verse four, \v 5 Chapter one, \li2 verse \fig Figure 1|src="image1.png" size="col" ref="1:5"\fig* five. +\v 6 Verse 6 content. +\v 7 +\v 8 \c 2 \tr \tc1 Row one, column one. \tc2 Row one, column two. \tr \tc1 Row two, column one. \tc2 Row two, column two.