From 714db8713598150a743c031447efb6d38ec61853 Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Sat, 17 Aug 2024 16:32:14 -0400
Subject: [PATCH] port commit 40a55fd, control pretranslation of existing text

---
 machine/corpora/usfm_text_updater.py       |  84 +++++++----
 tests/corpora/test_usfm_file_text.py       | 162 ++++++++++-----------
 tests/corpora/test_usfm_text_updater.py    |  37 ++++-
 tests/corpora/test_usfm_tokenizer.py       |   2 +-
 tests/testutils/data/usfm/Tes/41MATTes.SFM |   3 +
 5 files changed, 176 insertions(+), 112 deletions(-)

diff --git a/machine/corpora/usfm_text_updater.py b/machine/corpora/usfm_text_updater.py
index ba62eea..a293178 100644
--- a/machine/corpora/usfm_text_updater.py
+++ b/machine/corpora/usfm_text_updater.py
@@ -15,36 +15,35 @@ def __init__(
         id_text: Optional[str] = None,
         strip_all_text: bool = False,
         strict_comparison: bool = True,
+        prefer_existing_text: bool = False,
     ) -> None:
         super().__init__()
         self._rows = rows or []
         self._tokens: List[UsfmToken] = []
+        self._new_tokens: List[UsfmToken] = []
         self._id_text = id_text
         self._strip_all_text = strip_all_text
         self._strict_comparison = strict_comparison
+        self._prefer_existing_text = prefer_existing_text
         self._replace_stack: List[bool] = []
         self._row_index: int = 0
         self._token_index: int = 0
-        self._replace_text: bool = False
 
     @property
     def tokens(self) -> List[UsfmToken]:
         return self._tokens
 
-    @property
-    def replace_text(self) -> bool:
-        return self._strip_all_text or (len(self._replace_stack) > 0 and self._replace_stack[-1])
-
     def start_book(self, state: UsfmParserState, marker: str, code: str) -> None:
         self._collect_tokens(state)
+        start_book_tokens: List[UsfmToken] = []
         if self._id_text is not None:
-            self._tokens.append(UsfmToken(UsfmTokenType.TEXT, text=self._id_text + " "))
-        self._replace_stack.append(self._id_text is not None)
+            start_book_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=self._id_text + " "))
+        self._push_new_tokens(start_book_tokens)
 
         super().start_book(state, marker, code)
 
     def end_book(self, state: UsfmParserState, marker: str) -> None:
-        self._replace_stack.pop()
+        self._pop_new_tokens()
 
         super().end_book(state, marker)
 
@@ -127,7 +126,7 @@ def start_char(
         unknown: bool,
         attributes: List[UsfmAttribute],
     ) -> None:
-        if self.replace_text:
+        if self._replace_with_new_tokens(state):
             self._skip_tokens(state)
         else:
             self._collect_tokens(state)
@@ -141,7 +140,7 @@ def end_char(
         attributes: List[UsfmAttribute],
         closed: bool,
     ) -> None:
-        if closed and self.replace_text:
+        if closed and self._replace_with_new_tokens(state):
             self._skip_tokens(state)
 
         super().end_char(state, marker, attributes, closed)
@@ -153,7 +152,7 @@ def start_note(
         caller: str,
         category: str,
     ) -> None:
-        if self.replace_text:
+        if self._replace_with_new_tokens(state):
             self._skip_tokens(state)
         else:
             self._collect_tokens(state)
@@ -161,13 +160,13 @@ def start_note(
         super().start_note(state, marker, caller, category)
 
     def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None:
-        if closed and self.replace_text:
+        if closed and self._replace_with_new_tokens(state):
             self._skip_tokens(state)
 
         super().end_note(state, marker, closed)
 
     def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> None:
-        if self.replace_text:
+        if self._replace_with_new_tokens(state):
             self._skip_tokens(state)
         else:
             self._collect_tokens(state)
@@ -175,7 +174,7 @@ def ref(self, state: UsfmParserState, marker: str, display: str, target: str) ->
         super().ref(state, marker, display, target)
 
     def text(self, state: UsfmParserState, text: str) -> None:
-        if self.replace_text:
+        if self._replace_with_new_tokens(state):
             self._skip_tokens(state)
         else:
             self._collect_tokens(state)
@@ -183,7 +182,7 @@ def text(self, state: UsfmParserState, text: str) -> None:
         super().text(state, text)
 
     def opt_break(self, state: UsfmParserState) -> None:
-        if self.replace_text:
+        if self._replace_with_new_tokens(state):
             self._skip_tokens(state)
         else:
             self._collect_tokens(state)
@@ -191,7 +190,7 @@ def opt_break(self, state: UsfmParserState) -> None:
         super().opt_break(state)
 
     def unmatched(self, state: UsfmParserState, marker: str) -> None:
-        if self.replace_text:
+        if self._replace_with_new_tokens(state):
             self._skip_tokens(state)
         else:
             self._collect_tokens(state)
@@ -200,38 +199,37 @@ def unmatched(self, state: UsfmParserState, marker: str) -> None:
 
     def _start_verse_text(self, state: UsfmParserState, scripture_refs: List[ScriptureRef]) -> None:
         row_texts: List[str] = self._advance_rows(scripture_refs)
-        self._tokens.extend(UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts)
-        self._replace_stack.append(len(row_texts) > 0)
+        self._push_new_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts])
 
     def _end_verse_text(self, state: UsfmParserState, scripture_refs: List[ScriptureRef]) -> None:
-        self._replace_stack.pop()
+        self._pop_new_tokens()
 
     def _start_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
         row_texts = self._advance_rows([scripture_ref])
-        self._tokens.extend(UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts)
-        self._replace_stack.append(len(row_texts) > 0)
+        self._push_new_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts])
 
     def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
-        self._replace_stack.pop()
+        self._pop_new_tokens()
 
     def _start_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
         row_texts = self._advance_rows([scripture_ref])
+        new_tokens: List[UsfmToken] = []
         if len(row_texts) > 0:
             if state.token is None:
                 raise ValueError("Invalid parser state.")
-            self._tokens.append(state.token)
-            self._tokens.append(UsfmToken(UsfmTokenType.CHARACTER, "ft", None, "ft*"))
+            new_tokens.append(state.token)
+            new_tokens.append(UsfmToken(UsfmTokenType.CHARACTER, "ft", None, "ft*"))
             for i, text in enumerate(row_texts):
                 if i < len(row_texts) - 1:
                     text += " "
-                self._tokens.append(UsfmToken(UsfmTokenType.TEXT, text=text))
-            self._tokens.append(UsfmToken(UsfmTokenType.END, state.token.end_marker, None, None))
-            self._replace_stack.append(True)
+                new_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=text))
+            new_tokens.append(UsfmToken(UsfmTokenType.END, state.token.end_marker, None, None))
+            self._push_new_tokens(new_tokens)
         else:
-            self._replace_stack.append(self._replace_stack[-1])
+            self._push_token_as_previous()
 
     def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
-        self._replace_stack.pop()
+        self._pop_new_tokens()
 
     def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
         if isinstance(stylesheet, str):
@@ -268,9 +266,37 @@ def _advance_rows(self, seg_scr_refs: List[ScriptureRef]) -> List[str]:
         return row_texts
 
     def _collect_tokens(self, state: UsfmParserState) -> None:
+        self._tokens.extend(self._new_tokens)
+        self._new_tokens.clear()
         while self._token_index <= state.index + state.special_token_count:
             self._tokens.append(state.tokens[self._token_index])
             self._token_index += 1
 
     def _skip_tokens(self, state: UsfmParserState) -> None:
         self._token_index = state.index + 1 + state.special_token_count
+
+    def _replace_with_new_tokens(self, state: UsfmParserState) -> bool:
+        """Decide whether row tokens replace the existing ones; emit them if so and always clear the buffer."""
+        new_text: bool = len(self._replace_stack) > 0 and self._replace_stack[-1]
+        # Inclusive index of the current token, the same bound that _collect_tokens and _skip_tokens use.
+        token_end: int = state.index + state.special_token_count
+        existing_text: bool = any(
+            tok.type == UsfmTokenType.TEXT and tok.text for tok in state.tokens[self._token_index : token_end + 1]
+        )
+        use_new_tokens: bool = (
+            self._strip_all_text or (new_text and not existing_text) or (new_text and not self._prefer_existing_text)
+        )
+        if use_new_tokens:
+            self._tokens.extend(self._new_tokens)
+        self._new_tokens.clear()
+        return use_new_tokens
+
+    def _push_new_tokens(self, tokens: List[UsfmToken]) -> None:
+        self._replace_stack.append(any(tokens))  # truthiness-based; presumably True for any non-empty list (confirm)
+        self._new_tokens.extend(tokens)  # buffered here; _collect_tokens/_replace_with_new_tokens emit or drop them
+
+    def _push_token_as_previous(self) -> None:
+        self._replace_stack.append(self._replace_stack[-1])  # repeat the top flag; IndexError if the stack is empty
+
+    def _pop_new_tokens(self) -> None:
+        self._replace_stack.pop()  # only the replace flag is popped; self._new_tokens is not touched here
diff --git a/tests/corpora/test_usfm_file_text.py b/tests/corpora/test_usfm_file_text.py
index 33679ae..ad920f1 100644
--- a/tests/corpora/test_usfm_file_text.py
+++ b/tests/corpora/test_usfm_file_text.py
@@ -10,7 +10,7 @@ def test_get_rows_nonempty_text() -> None:
     assert text is not None
     rows = list(text)
 
-    assert len(rows) == 19
+    assert len(rows) == 22
 
     assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:1", corpus.versification)
     assert rows[0].text == "Chapter one, verse one."
@@ -21,41 +21,41 @@ def test_get_rows_nonempty_text() -> None:
     assert scripture_ref(rows[4]) == ScriptureRef.parse("MAT 1:5", corpus.versification)
     assert rows[4].text == "Chapter one, verse five."
 
-    assert scripture_ref(rows[5]) == ScriptureRef.parse("MAT 2:1", corpus.versification)
-    assert rows[5].text == "Chapter two, verse one."
+    assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 2:1", corpus.versification)
+    assert rows[8].text == "Chapter two, verse one."
 
-    assert scripture_ref(rows[6]) == ScriptureRef.parse("MAT 2:2", corpus.versification)
-    assert rows[6].text == "Chapter two, verse two. Chapter two, verse three."
-    assert rows[6].is_in_range
-    assert rows[6].is_range_start
+    assert scripture_ref(rows[9]) == ScriptureRef.parse("MAT 2:2", corpus.versification)
+    assert rows[9].text == "Chapter two, verse two. Chapter two, verse three."
+    assert rows[9].is_in_range
+    assert rows[9].is_range_start
 
-    assert scripture_ref(rows[7]) == ScriptureRef.parse("MAT 2:3", corpus.versification)
-    assert len(rows[7].segment) == 0
-    assert rows[7].is_in_range
-    assert not rows[7].is_range_start
+    assert scripture_ref(rows[10]) == ScriptureRef.parse("MAT 2:3", corpus.versification)
+    assert len(rows[10].segment) == 0
+    assert rows[10].is_in_range
+    assert not rows[10].is_range_start
 
-    assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 2:4a", corpus.versification)
-    assert len(rows[8].segment) == 0
-    assert rows[8].is_in_range
-    assert not rows[8].is_range_start
+    assert scripture_ref(rows[11]) == ScriptureRef.parse("MAT 2:4a", corpus.versification)
+    assert len(rows[11].segment) == 0
+    assert rows[11].is_in_range
+    assert not rows[11].is_range_start
 
-    assert scripture_ref(rows[9]) == ScriptureRef.parse("MAT 2:4b", corpus.versification)
-    assert rows[9].text == "Chapter two, verse four."
+    assert scripture_ref(rows[12]) == ScriptureRef.parse("MAT 2:4b", corpus.versification)
+    assert rows[12].text == "Chapter two, verse four."
 
-    assert scripture_ref(rows[10]) == ScriptureRef.parse("MAT 2:5", corpus.versification)
-    assert rows[10].text == "Chapter two, verse five."
+    assert scripture_ref(rows[13]) == ScriptureRef.parse("MAT 2:5", corpus.versification)
+    assert rows[13].text == "Chapter two, verse five."
 
-    assert scripture_ref(rows[11]) == ScriptureRef.parse("MAT 2:6", corpus.versification)
-    assert rows[11].text == "Chapter two, verse six."
+    assert scripture_ref(rows[14]) == ScriptureRef.parse("MAT 2:6", corpus.versification)
+    assert rows[14].text == "Chapter two, verse six."
 
-    assert scripture_ref(rows[15]) == ScriptureRef.parse("MAT 2:9", corpus.versification)
-    assert rows[15].text == "Chapter 2 verse 9"
+    assert scripture_ref(rows[18]) == ScriptureRef.parse("MAT 2:9", corpus.versification)
+    assert rows[18].text == "Chapter 2 verse 9"
 
-    assert scripture_ref(rows[16]) == ScriptureRef.parse("MAT 2:10", corpus.versification)
-    assert rows[16].text == "Chapter 2 verse 10"
+    assert scripture_ref(rows[19]) == ScriptureRef.parse("MAT 2:10", corpus.versification)
+    assert rows[19].text == "Chapter 2 verse 10"
 
-    assert scripture_ref(rows[17]) == ScriptureRef.parse("MAT 2:11", corpus.versification)
-    assert not rows[17].text
+    assert scripture_ref(rows[20]) == ScriptureRef.parse("MAT 2:11", corpus.versification)
+    assert not rows[20].text
 
 
 def test_get_rows_nonempty_text_all_text() -> None:
@@ -65,7 +65,7 @@ def test_get_rows_nonempty_text_all_text() -> None:
     assert text is not None
     rows = list(text)
 
-    assert len(rows) == 36
+    assert len(rows) == 39
 
     assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:0/1:h", corpus.versification)
     assert rows[0].text == "Matthew"
@@ -88,35 +88,35 @@ def test_get_rows_nonempty_text_all_text() -> None:
     assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 1:2/1:f", corpus.versification)
     assert rows[8].text == "1:2: This is a footnote."
 
-    assert scripture_ref(rows[12]) == ScriptureRef.parse("MAT 2:0/1:tr/1:tc1", corpus.versification)
-    assert rows[12].text == "Row one, column one."
+    assert scripture_ref(rows[15]) == ScriptureRef.parse("MAT 2:0/1:tr/1:tc1", corpus.versification)
+    assert rows[15].text == "Row one, column one."
 
-    assert scripture_ref(rows[13]) == ScriptureRef.parse("MAT 2:0/1:tr/2:tc2", corpus.versification)
-    assert rows[13].text == "Row one, column two."
+    assert scripture_ref(rows[16]) == ScriptureRef.parse("MAT 2:0/1:tr/2:tc2", corpus.versification)
+    assert rows[16].text == "Row one, column two."
 
-    assert scripture_ref(rows[14]) == ScriptureRef.parse("MAT 2:0/2:tr/1:tc1", corpus.versification)
-    assert rows[14].text == "Row two, column one."
+    assert scripture_ref(rows[17]) == ScriptureRef.parse("MAT 2:0/2:tr/1:tc1", corpus.versification)
+    assert rows[17].text == "Row two, column one."
 
-    assert scripture_ref(rows[15]) == ScriptureRef.parse("MAT 2:0/2:tr/2:tc2", corpus.versification)
-    assert rows[15].text == "Row two, column two."
+    assert scripture_ref(rows[18]) == ScriptureRef.parse("MAT 2:0/2:tr/2:tc2", corpus.versification)
+    assert rows[18].text == "Row two, column two."
 
-    assert scripture_ref(rows[16]) == ScriptureRef.parse("MAT 2:0/3:s1", corpus.versification)
-    assert rows[16].text == "Chapter Two"
+    assert scripture_ref(rows[19]) == ScriptureRef.parse("MAT 2:0/3:s1", corpus.versification)
+    assert rows[19].text == "Chapter Two"
 
-    assert scripture_ref(rows[18]) == ScriptureRef.parse("MAT 2:1/1:f", corpus.versification)
-    assert rows[18].text == "2:1: This is a footnote."
+    assert scripture_ref(rows[21]) == ScriptureRef.parse("MAT 2:1/1:f", corpus.versification)
+    assert rows[21].text == "2:1: This is a footnote."
 
-    assert scripture_ref(rows[21]) == ScriptureRef.parse("MAT 2:3/1:esb/1:ms", corpus.versification)
-    assert rows[21].text == "This is a sidebar"
+    assert scripture_ref(rows[24]) == ScriptureRef.parse("MAT 2:3/1:esb/1:ms", corpus.versification)
+    assert rows[24].text == "This is a sidebar"
 
-    assert scripture_ref(rows[22]) == ScriptureRef.parse("MAT 2:3/1:esb/2:p", corpus.versification)
-    assert rows[22].text == "Here is some sidebar content."
+    assert scripture_ref(rows[25]) == ScriptureRef.parse("MAT 2:3/1:esb/2:p", corpus.versification)
+    assert rows[25].text == "Here is some sidebar content."
 
-    assert scripture_ref(rows[28]) == ScriptureRef.parse("MAT 2:7a/1:s", corpus.versification)
-    assert rows[28].text == "Section header"
+    assert scripture_ref(rows[31]) == ScriptureRef.parse("MAT 2:7a/1:s", corpus.versification)
+    assert rows[31].text == "Section header"
 
-    assert scripture_ref(rows[35]) == ScriptureRef.parse("MAT 2:12/1:restore", corpus.versification)
-    assert rows[35].text == "restore information"
+    assert scripture_ref(rows[38]) == ScriptureRef.parse("MAT 2:12/1:restore", corpus.versification)
+    assert rows[38].text == "restore information"
 
 
 def test_get_rows_sentence_start() -> None:
@@ -126,7 +126,7 @@ def test_get_rows_sentence_start() -> None:
     assert text is not None
     rows = list(text)
 
-    assert len(rows) == 19
+    assert len(rows) == 22
 
     assert scripture_ref(rows[3]) == ScriptureRef.parse("MAT 1:4", corpus.versification)
     assert rows[3].text == "Chapter one, verse four,"
@@ -154,7 +154,7 @@ def test_get_rows_include_markers() -> None:
     assert text is not None
     rows = list(text)
 
-    assert len(rows) == 19
+    assert len(rows) == 22
 
     assert scripture_ref(rows[0]) == ScriptureRef.parse("MAT 1:1", corpus.versification)
     assert (
@@ -167,38 +167,38 @@ def test_get_rows_include_markers() -> None:
     assert scripture_ref(rows[4]) == ScriptureRef.parse("MAT 1:5", corpus.versification)
     assert rows[4].text == 'Chapter one, \\li2 verse \\fig Figure 1|src="image1.png" size="col" ref="1:5"\\fig* five.'
 
-    assert scripture_ref(rows[5]) == ScriptureRef.parse("MAT 2:1", corpus.versification)
-    assert rows[5].text == "Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one."
+    assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 2:1", corpus.versification)
+    assert rows[8].text == "Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one."
 
-    assert scripture_ref(rows[6]) == ScriptureRef.parse("MAT 2:2", corpus.versification)
-    assert rows[6].text == "Chapter two, // verse \\fm ∆\\fm*two. Chapter two, verse \\w three|lemma\\w*."
-    assert rows[6].is_in_range
-    assert rows[6].is_range_start
+    assert scripture_ref(rows[9]) == ScriptureRef.parse("MAT 2:2", corpus.versification)
+    assert rows[9].text == "Chapter two, // verse \\fm ∆\\fm*two. Chapter two, verse \\w three|lemma\\w*."
+    assert rows[9].is_in_range
+    assert rows[9].is_range_start
 
-    assert scripture_ref(rows[7]) == ScriptureRef.parse("MAT 2:3", corpus.versification)
-    assert len(rows[7].segment) == 0
-    assert rows[7].is_in_range
-    assert not rows[7].is_range_start
+    assert scripture_ref(rows[10]) == ScriptureRef.parse("MAT 2:3", corpus.versification)
+    assert len(rows[10].segment) == 0
+    assert rows[10].is_in_range
+    assert not rows[10].is_range_start
 
-    assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 2:4a", corpus.versification)
-    assert len(rows[8].segment) == 0
-    assert rows[8].is_in_range
-    assert not rows[8].is_range_start
+    assert scripture_ref(rows[11]) == ScriptureRef.parse("MAT 2:4a", corpus.versification)
+    assert len(rows[11].segment) == 0
+    assert rows[11].is_in_range
+    assert not rows[11].is_range_start
 
-    assert scripture_ref(rows[9]) == ScriptureRef.parse("MAT 2:4b", corpus.versification)
-    assert rows[9].text == "Chapter two, verse four."
+    assert scripture_ref(rows[12]) == ScriptureRef.parse("MAT 2:4b", corpus.versification)
+    assert rows[12].text == "Chapter two, verse four."
 
-    assert scripture_ref(rows[10]) == ScriptureRef.parse("MAT 2:5", corpus.versification)
-    assert rows[10].text == "Chapter two, verse five \\rq (MAT 3:1)\\rq*."
+    assert scripture_ref(rows[13]) == ScriptureRef.parse("MAT 2:5", corpus.versification)
+    assert rows[13].text == "Chapter two, verse five \\rq (MAT 3:1)\\rq*."
 
-    assert scripture_ref(rows[11]) == ScriptureRef.parse("MAT 2:6", corpus.versification)
-    assert rows[11].text == 'Chapter two, verse \\w six|strong="12345" \\w*.'
+    assert scripture_ref(rows[14]) == ScriptureRef.parse("MAT 2:6", corpus.versification)
+    assert rows[14].text == 'Chapter two, verse \\w six|strong="12345" \\w*.'
 
-    assert scripture_ref(rows[15]) == ScriptureRef.parse("MAT 2:9", corpus.versification)
-    assert rows[15].text == "Chapter\\tcr2 2\\tc3 verse\\tcr4 9"
+    assert scripture_ref(rows[18]) == ScriptureRef.parse("MAT 2:9", corpus.versification)
+    assert rows[18].text == "Chapter\\tcr2 2\\tc3 verse\\tcr4 9"
 
-    assert scripture_ref(rows[16]) == ScriptureRef.parse("MAT 2:10", corpus.versification)
-    assert rows[16].text == "\\tc3-4 Chapter 2 verse 10"
+    assert scripture_ref(rows[19]) == ScriptureRef.parse("MAT 2:10", corpus.versification)
+    assert rows[19].text == "\\tc3-4 Chapter 2 verse 10"
 
 
 def test_get_rows_include_markers_all_text() -> None:
@@ -209,7 +209,7 @@ def test_get_rows_include_markers_all_text() -> None:
     assert text is not None
     rows = list(text)
 
-    assert len(rows) == 32
+    assert len(rows) == 35
 
     assert scripture_ref(rows[2]) == ScriptureRef.parse("MAT 1:0/3:ip", corpus.versification)
     assert rows[2].text == "An introduction to Matthew\\fe + \\ft This is an endnote.\\fe*"
@@ -225,14 +225,14 @@ def test_get_rows_include_markers_all_text() -> None:
     assert scripture_ref(rows[8]) == ScriptureRef.parse("MAT 1:5", corpus.versification)
     assert rows[8].text == 'Chapter one, \\li2 verse \\fig Figure 1|src="image1.png" size="col" ref="1:5"\\fig* five.'
 
-    assert scripture_ref(rows[13]) == ScriptureRef.parse("MAT 2:0/3:s1", corpus.versification)
-    assert rows[13].text == "Chapter \\it Two \\it*"
+    assert scripture_ref(rows[16]) == ScriptureRef.parse("MAT 2:0/3:s1", corpus.versification)
+    assert rows[16].text == "Chapter \\it Two \\it*"
 
-    assert scripture_ref(rows[14]) == ScriptureRef.parse("MAT 2:1", corpus.versification)
-    assert rows[14].text == "Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one."
+    assert scripture_ref(rows[17]) == ScriptureRef.parse("MAT 2:1", corpus.versification)
+    assert rows[17].text == "Chapter \\add two\\add*, verse \\f + \\fr 2:1: \\ft This is a footnote.\\f*one."
 
-    assert scripture_ref(rows[18]) == ScriptureRef.parse("MAT 2:3/1:esb/2:p", corpus.versification)
-    assert rows[18].text == "Here is some sidebar // content."
+    assert scripture_ref(rows[21]) == ScriptureRef.parse("MAT 2:3/1:esb/2:p", corpus.versification)
+    assert rows[21].text == "Here is some sidebar // content."
 
 
 def test_usfm_file_text_corpus_lowercase_usfm_id() -> None:
diff --git a/tests/corpora/test_usfm_text_updater.py b/tests/corpora/test_usfm_text_updater.py
index 80693ab..8ef31f8 100644
--- a/tests/corpora/test_usfm_text_updater.py
+++ b/tests/corpora/test_usfm_text_updater.py
@@ -30,6 +30,40 @@ def test_get_usfm_strip_all_text() -> None:
     assert "\\s\r\n" in target
 
 
+def test_get_usfm_prefer_existing() -> None:
+    rows = [
+        (
+            scr_ref("MAT 1:6"),
+            str("Text 6"),
+        ),
+        (
+            scr_ref("MAT 1:7"),
+            str("Text 7"),
+        ),
+    ]
+    target = update_usfm(rows, prefer_existing_text=True)
+    assert "\\id MAT - Test\r\n" in target
+    assert "\\v 6 Verse 6 content.\r\n" in target  # the fixture's verse 6 already has text, so it is kept
+    assert "\\v 7 Text 7\r\n" in target  # the fixture's verse 7 is empty, so the row text is inserted
+
+
+def test_get_usfm_prefer_rows() -> None:
+    rows = [
+        (
+            scr_ref("MAT 1:6"),
+            str("Text 6"),
+        ),
+        (
+            scr_ref("MAT 1:7"),
+            str("Text 7"),
+        ),
+    ]
+    target = update_usfm(rows, prefer_existing_text=False)
+    assert "\\id MAT - Test\r\n" in target
+    assert "\\v 6 Text 6\r\n" in target  # the fixture's existing verse 6 text is overwritten by the row text
+    assert "\\v 7 Text 7\r\n" in target  # the fixture's verse 7 is empty, so the row text is inserted
+
+
 def test_get_usfm_verse_skip_note() -> None:
     rows = [
         (
@@ -306,9 +340,10 @@ def update_usfm(
     id_text: Optional[str] = None,
     strip_all_text: bool = False,
     strict_comparison: bool = True,
+    prefer_existing_text: bool = False,
 ) -> str:
     source = read_usfm()
-    updater = UsfmTextUpdater(rows, id_text, strip_all_text, strict_comparison)
+    updater = UsfmTextUpdater(rows, id_text, strip_all_text, strict_comparison, prefer_existing_text)
     parse_usfm(source, updater)
     return updater.get_usfm()
 
diff --git a/tests/corpora/test_usfm_tokenizer.py b/tests/corpora/test_usfm_tokenizer.py
index 74c30ac..e91abe8 100644
--- a/tests/corpora/test_usfm_tokenizer.py
+++ b/tests/corpora/test_usfm_tokenizer.py
@@ -7,7 +7,7 @@ def test_tokenize() -> None:
     usfm = _read_usfm()
     usfm_tokenizer = UsfmTokenizer()
     tokens = usfm_tokenizer.tokenize(usfm)
-    assert len(tokens) == 170
+    assert len(tokens) == 174
 
     assert tokens[0].type is UsfmTokenType.BOOK
     assert tokens[0].marker == "id"
diff --git a/tests/testutils/data/usfm/Tes/41MATTes.SFM b/tests/testutils/data/usfm/Tes/41MATTes.SFM
index af634ba..3224e09 100644
--- a/tests/testutils/data/usfm/Tes/41MATTes.SFM
+++ b/tests/testutils/data/usfm/Tes/41MATTes.SFM
@@ -14,6 +14,9 @@
 \li2 verse four,
 \v 5 Chapter one,
 \li2 verse \fig Figure 1|src="image1.png" size="col" ref="1:5"\fig* five.
+\v 6 Verse 6 content.
+\v 7
+\v 8
 \c 2
 \tr \tc1 Row one, column one. \tc2 Row one, column two.
 \tr \tc1 Row two, column one. \tc2 Row two, column two.