From 694c180876bccf1d7206cd8f536ddc410d7ba4aa Mon Sep 17 00:00:00 2001
From: Jeff Zohrab
Date: Mon, 3 Jun 2024 19:48:21 -0700
Subject: [PATCH 1/7] Use renderer for language export.

---
 lute/cli/language_term_export.py            | 77 +++++++++++++++++++--
 tests/unit/cli/test_language_term_export.py | 71 +++++++++++++++++++
 2 files changed, 142 insertions(+), 6 deletions(-)

diff --git a/lute/cli/language_term_export.py b/lute/cli/language_term_export.py
index e104972f..4d5b307c 100644
--- a/lute/cli/language_term_export.py
+++ b/lute/cli/language_term_export.py
@@ -21,6 +21,7 @@ from lute.db import db
 from lute.models.book import Book
 from lute.term.model import Repository
+from lute.read.render.service import get_paragraphs
 
 
 def get_dist(book, collector, termrepo, language_id):  # pylint: disable=too-many-locals
@@ -159,24 +160,88 @@ def generate_file(language_name, outfile_name):
     """
     Generate the datafile for the language.
     """
+    # pylint: disable=too-many-locals
+
     books = db.session.query(Book).all()
     books = [b for b in books if b.language.name == language_name]
     if len(books) == 0:
         print(f"No books for given language {language_name}, quitting.")
         sys.exit(0)
 
-    langid = books[0].language.id
+    lang = books[0].language
+    langid = lang.id
 
     repo = Repository(db)
     terms = {}
+
+    def _add_term_if_needed(t):
+        if t.text_lc in terms:
+            return
+        tag_list = ", ".join([tg.text for tg in t.term_tags])
+        if tag_list == "":
+            tag_list = "-"
+
+        zws = "\u200B"
+        terms[t.text_lc] = {
+            "sourceterm": t,
+            "term": t.text.replace(zws, ""),
+            "count": 0,
+            "familycount": 0,
+            "books": [],
+            "definition": t.translation or "-",
+            "status": t.status,
+            "children": [],
+            "tags": tag_list,
+        }
+
     for b in books:
-        get_dist(b, terms, repo, langid)
+        print(f"Loading data for book {b.title} ...")
+        i = 0
+        for text in b.texts:
+            i += 1
+            if i % 10 == 0:
+                print(f"  page {i} of {b.page_count()}", end="\r")
+            paragraphs = get_paragraphs(text.text, lang)
+            displayed_terms = [
+                ti.term
+                for para in paragraphs
+                for sentence in para
+                for ti in sentence.textitems
+                if ti.is_word and ti.term is not None
+            ]
+            for t in displayed_terms:
+                _add_term_if_needed(t)
+                e = terms[t.text_lc]
+                e["count"] += 1
+                e["familycount"] += 1
+                if b.title not in e["books"]:
+                    e["books"].append(b.title)
+
+                for parent in t.parents:
+                    _add_term_if_needed(parent)
+                    p = terms[parent.text_lc]
+                    p["familycount"] += 1
+                    if b.title not in p["books"]:
+                        p["books"].append(b.title)
+                    if t.text_lc not in p["children"]:
+                        p["children"].append(t.text_lc)
+
+    for _, hsh in terms.items():
+        hsh["books"] = ", ".join(list(set(hsh["books"])))
+        # children to child (count)
+        children = []
+        for key in hsh["children"]:
+            t = terms[key]
+            children.append({"count": t["count"], "term": t["sourceterm"].text})
+        csorted = sorted(children, key=lambda c: c["count"], reverse=True)
+        children_string = "; ".join([f"{c['term']} ({c['count']})" for c in csorted])
+        if children_string == "":
+            children_string = "-"
+        hsh["children"] = children_string
 
-    load_term_data(langid, terms, repo)
-    load_parent_data(langid, terms, repo)
-    outdata = get_output_data(terms)
+    outdata = terms.values()
 
-    ptsorted = sorted(outdata, key=lambda c: c["familycount"], reverse=True)
+    ptsorted = sorted(outdata, key=lambda x: (-x["familycount"], x["term"]))
     keys = [
         "term",
         "count",
diff --git a/tests/unit/cli/test_language_term_export.py b/tests/unit/cli/test_language_term_export.py
index d71f609f..876f456f 100644
--- a/tests/unit/cli/test_language_term_export.py
+++ b/tests/unit/cli/test_language_term_export.py
@@ -1,9 +1,12 @@
 "Smoke test only."
 
+import textwrap
 from lute.cli.language_term_export import generate_file
 from lute.models.term import Term, TermTag
+from lute.models.book import Book
 from lute.db import db
+from tests.dbasserts import assert_sql_result
 
 
 def test_smoke_test(app_context, tmp_path, english):
@@ -27,3 +30,71 @@ def test_smoke_test(app_context, tmp_path, english):
     firstline = lines[1]
     assert firstline.startswith("the,"), "the is most common"
     assert firstline.endswith('article,1,-,"a, b"'), "ending data"
+
+
+def test_single_book_export(app_context, empty_db, tmp_path, english):
+    "dump data for english."
+
+    assert_sql_result("select * from books", [], "no books")
+    assert_sql_result("select * from words", [], "no terms")
+
+    fulltext = "a b c d e A B C\n---\nG H I c d e d"
+    b = Book.create_book("hi", english, fulltext)
+    db.session.add(b)
+    db.session.commit()
+
+    for c in ["a", "d", "c d"]:
+        t = Term(english, c)
+        t.status = 1
+        db.session.add(t)
+    for c in ["e", "g", "h"]:
+        t = Term(english, c)
+        t.status = 0
+        db.session.add(t)
+    db.session.commit()
+
+    def _find(term_string):
+        "Find term with the text."
+        spec = Term(english, term_string)
+        ret = Term.find_by_spec(spec)
+        assert ret is not None, f"Have {term_string}"
+        return ret
+
+    a = _find("a")
+    for c in ["e", "h"]:
+        t = _find(c)
+        t.add_parent(a)
+        db.session.add(t)
+    db.session.commit()
+
+    outfile = tmp_path / "outfile.csv"
+    generate_file("English", outfile)
+    with open(outfile, "r", encoding="utf-8") as ofhandle:
+        text = ofhandle.read()
+    print(text)
+
+    expected = [
+        # Headings
+        "term,count,familycount,books,definition,status,children,tags",
+        # a has two children, e and h
+        "a,2,5,hi,-,1,e (2); h (1),-",
+        # b occurs twice.
+        "b,2,2,hi,-,0,-,-",
+        # 'c d' occurs twice
+        "c d,2,2,hi,-,1,-,-",
+        # e is a new term
+        "e,2,2,hi,-,0,-,-",
+        # c is a new term, status 0.
+        # Occurs once as c, once as C.
+        "C,1,1,hi,-,0,-,-",
+        "I,1,1,hi,-,0,-,-",
+        "d,1,1,hi,-,1,-,-",
+        # g and h are new
+        "g,1,1,hi,-,0,-,-",
+        "h,1,1,hi,-,0,-,-",
+        "",
+    ]
+
+    # .lower() because sometimes the text file returned B, and
+    # sometimes b ... which is _very_ odd, but don't really care.
+    assert text.lower() == "\n".join(expected).lower(), "content"

From 1b849ea65881ea9415989bec13f3179e7c9ddfee Mon Sep 17 00:00:00 2001
From: Jeff Zohrab
Date: Mon, 3 Jun 2024 19:50:50 -0700
Subject: [PATCH 2/7] Remove unused methods.

---
 lute/cli/language_term_export.py            | 137 --------------------
 tests/unit/cli/test_language_term_export.py |   1 -
 2 files changed, 138 deletions(-)

diff --git a/lute/cli/language_term_export.py b/lute/cli/language_term_export.py
index 4d5b307c..dafdefca 100644
--- a/lute/cli/language_term_export.py
+++ b/lute/cli/language_term_export.py
@@ -17,145 +17,11 @@
 import sys
 import csv
-from collections import Counter
 
 from lute.db import db
 from lute.models.book import Book
-from lute.term.model import Repository
 from lute.read.render.service import get_paragraphs
 
 
-def get_dist(book, collector, termrepo, language_id):  # pylint: disable=too-many-locals
-    """
-    Get word distribution in book.
-
-    The data is added to the collector dictionary.
-    """
-
-    # Get all terms and counts.
-    fulltext = "\n".join([t.text for t in book.texts])
-    pts = book.language.get_parsed_tokens(fulltext)
-    words = [pt.token for pt in pts if pt.is_word]
-
-    # distrib = { 'term1': count1, 'term2': count2, ... }
-    distrib = dict(Counter(words))
-
-    # The distribution doesn't handle capitalization, so it will
-    # contain things like { 'There': 10, 'there': 20 }. Do a lookup
-    # for each term ('There', 'there') in the repository to see if a
-    # matching term with a standardized term.text is found.
-    normalized = {}
-
-    totcount = len(distrib.keys())
-    i = 0
-    print(f"Loading data for book {book.title} ...")
-    for k, v in distrib.items():
-        i += 1
-        if i % 100 == 0:
-            print(f"  {i} of {totcount}", end="\r")
-        norm_word = termrepo.find_or_new(language_id, k)
-        norm_entry = normalized.get(norm_word.text, {"count": 0, "parents": []})
-        norm_entry["count"] += v
-        norm_entry["parents"] = norm_word.parents
-        normalized[norm_word.text] = norm_entry
-
-    # normalized = { 'there': { 'count': 30, 'parents': [...] }, ... }.
-    #
-    # The collector may already have the term ('there') from prior
-    # books, so combine those.
-    for t, n in normalized.items():  # pylint: disable=redefined-outer-name
-        entry = collector.get(t, {"term": t, "count": 0, "books": []})
-        entry["count"] += n["count"]
-        entry["books"].append(book.title)
-        collector[t] = entry
-
-        # The term may have a parent that isn't actually present in any book!
-        # We need to add those parents to the collector as well, or later
-        # searches for the parent will fail.
-        for p in n["parents"]:
-            pentry = collector.get(p, {"term": p, "count": 0, "books": []})
-            collector[p] = pentry
-
-
-def _load_hash_from_term(t, term):
-    "Load common data to hash."
-    t["parent"] = ", ".join(term.parents)
-    t["definition"] = term.translation or "-"
-    t["status"] = term.status if term.id is not None else "-"
-    t["children"] = "-"
-    t["childbooks"] = []
-    t["tags"] = ", ".join(term.term_tags)
-
-
-def load_term_data(langid, terms, repo):
-    "Load basic data."
-    totcount = len(terms.keys())
-    i = 0
-    print("Loading term data ...")
-    for k, t in terms.items():  # pylint: disable=unused-variable
-        i += 1
-        if i % 100 == 0:
-            print(f"  {i} of {totcount}", end="\r")
-
-        term = repo.find_or_new(langid, t["term"])
-        _load_hash_from_term(t, term)
-        t["familycount"] = t["count"]
-
-
-def load_parent_data(langid, terms, repo):
-    "Get and print data."
-
-    parents = list({t["parent"] for t in terms.values() if t["parent"] != ""})
-
-    missingparents = [p for p in parents if p not in terms]
-    totcount = len(missingparents)
-    i = 0
-    print("Loading missing parents ...")
-    for p in missingparents:
-        i += 1
-        if i % 100 == 0:
-            print(f"  {i} of {totcount}", end="\r")
-
-        term = repo.find_or_new(langid, p)
-        t = {"term": p, "count": 0, "books": []}
-        _load_hash_from_term(t, term)
-        t["familycount"] = 0
-        terms[p] = t
-
-    totcount = len(parents)
-    i = 0
-    print("Finalizing parent data ...")
-    for p in parents:
-        i += 1
-        if i % 100 == 0:
-            print(f"  {i} of {totcount}", end="\r")
-
-        children = [c for c in terms.values() if c["parent"] == p]
-        csorted = sorted(children, key=lambda c: c["count"], reverse=True)
-        children_string = "; ".join([f"{c['term']} ({c['count']})" for c in csorted])
-        childbooks = [c["books"] for c in children]
-        childbooks = list({b for blist in childbooks for b in blist})
-        childtotcount = sum(c["count"] for c in children)
-
-        terms[p]["children"] = children_string
-        terms[p]["childbooks"] = childbooks
-        terms[p]["familycount"] += childtotcount
-
-
-def get_output_data(terms):
-    "Get the final set of output data."
-    printterms = [
-        t for t in terms.values() if t["parent"] == "" or t["children"] != "-"
-    ]
-
-    # Clean up data for printing.
-    for t in printterms:
-        t["books"] = list(set(t["books"] + t["childbooks"]))
-        t["books"] = "; ".join(t["books"])
-        del t["childbooks"]
-
-    return printterms
-
-
 def generate_file(language_name, outfile_name):
     """
     Generate the datafile for the language.
@@ -169,9 +35,6 @@ def generate_file(language_name, outfile_name):
         sys.exit(0)
 
     lang = books[0].language
-    langid = lang.id
-
-    repo = Repository(db)
     terms = {}
 
     def _add_term_if_needed(t):
diff --git a/tests/unit/cli/test_language_term_export.py b/tests/unit/cli/test_language_term_export.py
index 876f456f..bcaa0568 100644
--- a/tests/unit/cli/test_language_term_export.py
+++ b/tests/unit/cli/test_language_term_export.py
@@ -1,6 +1,5 @@
 "Smoke test only."
 
-import textwrap
 from lute.cli.language_term_export import generate_file
 from lute.models.term import Term, TermTag
 from lute.models.book import Book

From b156b2ad438ad640a36ccab15376f60fbee1149f Mon Sep 17 00:00:00 2001
From: Jeff Zohrab
Date: Mon, 3 Jun 2024 20:29:17 -0700
Subject: [PATCH 3/7] Return object added.

---
 lute/cli/language_term_export.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/lute/cli/language_term_export.py b/lute/cli/language_term_export.py
index dafdefca..7cd60b64 100644
--- a/lute/cli/language_term_export.py
+++ b/lute/cli/language_term_export.py
@@ -37,15 +37,17 @@ def generate_file(language_name, outfile_name):
     lang = books[0].language
     terms = {}
 
-    def _add_term_if_needed(t):
-        if t.text_lc in terms:
-            return
+    def _add_term_to_dict(t):
+        key = t.text_lc
+        if key in terms:
+            return terms[key]
+
         tag_list = ", ".join([tg.text for tg in t.term_tags])
         if tag_list == "":
             tag_list = "-"
 
         zws = "\u200B"
-        terms[t.text_lc] = {
+        hsh = {
             "sourceterm": t,
             "term": t.text.replace(zws, ""),
             "count": 0,
@@ -56,6 +58,8 @@ def generate_file(language_name, outfile_name):
             "children": [],
             "tags": tag_list,
         }
+        terms[key] = hsh
+        return hsh
 
     for b in books:
         print(f"Loading data for book {b.title} ...")
@@ -73,16 +77,14 @@ def generate_file(language_name, outfile_name):
                 if ti.is_word and ti.term is not None
             ]
             for t in displayed_terms:
-                _add_term_if_needed(t)
-                e = terms[t.text_lc]
+                e = _add_term_to_dict(t)
                 e["count"] += 1
                 e["familycount"] += 1
                 if b.title not in e["books"]:
                     e["books"].append(b.title)
 
                 for parent in t.parents:
-                    _add_term_if_needed(parent)
-                    p = terms[parent.text_lc]
+                    p = _add_term_to_dict(parent)
                     p["familycount"] += 1
                     if b.title not in p["books"]:
                         p["books"].append(b.title)

From 9cedbd65ac7ef9536e8c10d09b4c1529e1c4fe4c Mon Sep 17 00:00:00 2001
From: Jeff Zohrab
Date: Mon, 3 Jun 2024 20:30:55 -0700
Subject: [PATCH 4/7] Extract methods.

---
 lute/cli/language_term_export.py | 187 ++++++++++++++++---------------
 1 file changed, 98 insertions(+), 89 deletions(-)

diff --git a/lute/cli/language_term_export.py b/lute/cli/language_term_export.py
index 7cd60b64..4553e479 100644
--- a/lute/cli/language_term_export.py
+++ b/lute/cli/language_term_export.py
@@ -10,87 +10,74 @@
 e.g.
 
 term; count; familycount; books; definition; status; children
 haber; 100; 1500; book1,book2; to exist; 99; hay (500), he (200), has (150)
 ...
-
-There is probably a far better way to do this, likely using something
-fairly heavyweight like pandas.  This works well enough for now.
 """
 
-import sys
 import csv
 
 from lute.db import db
 from lute.models.book import Book
 from lute.read.render.service import get_paragraphs
 
 
-def generate_file(language_name, outfile_name):
-    """
-    Generate the datafile for the language.
-    """
-    # pylint: disable=too-many-locals
-
-    books = db.session.query(Book).all()
-    books = [b for b in books if b.language.name == language_name]
-    if len(books) == 0:
-        print(f"No books for given language {language_name}, quitting.")
-        sys.exit(0)
-
-    lang = books[0].language
-    terms = {}
-
-    def _add_term_to_dict(t):
-        key = t.text_lc
-        if key in terms:
-            return terms[key]
-
-        tag_list = ", ".join([tg.text for tg in t.term_tags])
-        if tag_list == "":
-            tag_list = "-"
-
-        zws = "\u200B"
-        hsh = {
-            "sourceterm": t,
-            "term": t.text.replace(zws, ""),
-            "count": 0,
-            "familycount": 0,
-            "books": [],
-            "definition": t.translation or "-",
-            "status": t.status,
-            "children": [],
-            "tags": tag_list,
-        }
-        terms[key] = hsh
-        return hsh
-
-    for b in books:
-        print(f"Loading data for book {b.title} ...")
-        i = 0
-        for text in b.texts:
-            i += 1
-            if i % 10 == 0:
-                print(f"  page {i} of {b.page_count()}", end="\r")
-            paragraphs = get_paragraphs(text.text, lang)
-            displayed_terms = [
-                ti.term
-                for para in paragraphs
-                for sentence in para
-                for ti in sentence.textitems
-                if ti.is_word and ti.term is not None
-            ]
-            for t in displayed_terms:
-                e = _add_term_to_dict(t)
-                e["count"] += 1
-                e["familycount"] += 1
-                if b.title not in e["books"]:
-                    e["books"].append(b.title)
-
-                for parent in t.parents:
-                    p = _add_term_to_dict(parent)
-                    p["familycount"] += 1
-                    if b.title not in p["books"]:
-                        p["books"].append(b.title)
-                    if t.text_lc not in p["children"]:
-                        p["children"].append(t.text_lc)
-
+def _add_term_to_dict(t, terms):
+    "Add term to dictionary and return it."
+    key = t.text_lc
+    if key in terms:
+        return terms[key]
+
+    tag_list = ", ".join([tg.text for tg in t.term_tags])
+    if tag_list == "":
+        tag_list = "-"
+
+    zws = "\u200B"
+    hsh = {
+        "sourceterm": t,
+        "term": t.text.replace(zws, ""),
+        "count": 0,
+        "familycount": 0,
+        "books": [],
+        "definition": t.translation or "-",
+        "status": t.status,
+        "children": [],
+        "tags": tag_list,
+    }
+    terms[key] = hsh
+    return hsh
+
+
+def _process_book(b, terms):
+    "Process pages in book, add to output."
+    print(f"Loading data for book {b.title} ...")
+    i = 0
+    for text in b.texts:
+        i += 1
+        if i % 10 == 0:
+            print(f"  page {i} of {b.page_count()}", end="\r")
+        paragraphs = get_paragraphs(text.text, b.language)
+        displayed_terms = [
+            ti.term
+            for para in paragraphs
+            for sentence in para
+            for ti in sentence.textitems
+            if ti.is_word and ti.term is not None
+        ]
+        for t in displayed_terms:
+            e = _add_term_to_dict(t, terms)
+            e["count"] += 1
+            e["familycount"] += 1
+            if b.title not in e["books"]:
+                e["books"].append(b.title)
+
+            for parent in t.parents:
+                p = _add_term_to_dict(parent, terms)
+                p["familycount"] += 1
+                if b.title not in p["books"]:
+                    p["books"].append(b.title)
+                if t.text_lc not in p["children"]:
+                    p["children"].append(t.text_lc)
+
+
+def _finalize_output(terms):
+    "Convert terms hash to usable output."
     for _, hsh in terms.items():
         hsh["books"] = ", ".join(list(set(hsh["books"])))
         # children to child (count)
@@ -104,23 +91,45 @@ def _add_term_to_dict(t):
         children_string = "; ".join([f"{c['term']} ({c['count']})" for c in csorted])
         if children_string == "":
             children_string = "-"
         hsh["children"] = children_string
 
-    outdata = terms.values()
-
-    ptsorted = sorted(outdata, key=lambda x: (-x["familycount"], x["term"]))
-    keys = [
-        "term",
-        "count",
-        "familycount",
-        "books",
-        "definition",
-        "status",
-        "children",
-        "tags",
-    ]
-    print(f"Writing to {outfile_name}")
+    ret = terms.values()
+    return sorted(ret, key=lambda x: (-x["familycount"], x["term"]))
+
+
+def _generate_file(books, outfile_name):
+    "Write data file for books to outfile_name."
+    terms = {}
+    for b in books:
+        _process_book(b, terms)
+    outdata = _finalize_output(terms)
+
     with open(outfile_name, "w", newline="", encoding="utf-8") as outfile:
+        keys = [
+            "term",
+            "count",
+            "familycount",
+            "books",
+            "definition",
+            "status",
+            "children",
+            "tags",
+        ]
         writer = csv.DictWriter(outfile, fieldnames=keys, extrasaction="ignore")
         writer.writeheader()
-        for r in ptsorted:
+        for r in outdata:
             writer.writerow(r)
-    print("Done.")
+
+
+def generate_file(language_name, outfile_name):
+    """
+    Generate the datafile for the language.
+    """
+    # pylint: disable=too-many-locals
+
+    books = db.session.query(Book).all()
+    books = [b for b in books if b.language.name == language_name]
+    if len(books) == 0:
+        print(f"No books for given language {language_name}, quitting.")
+    else:
+        print(f"Writing to {outfile_name}")
+        _generate_file(books, outfile_name)
+        print("Done.")

From edadde82a35cf10bca5e1accb520f4db402701b6 Mon Sep 17 00:00:00 2001
From: Jeff Zohrab
Date: Mon, 3 Jun 2024 21:42:10 -0700
Subject: [PATCH 5/7] Truncate book list.

---
 lute/cli/language_term_export.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/lute/cli/language_term_export.py b/lute/cli/language_term_export.py
index 4553e479..38cfa662 100644
--- a/lute/cli/language_term_export.py
+++ b/lute/cli/language_term_export.py
@@ -46,12 +46,12 @@ def _add_term_to_dict(t, terms):
 
 def _process_book(b, terms):
     "Process pages in book, add to output."
-    print(f"Loading data for book {b.title} ...")
+    print(f"Processing {b.title} ...")
     i = 0
     for text in b.texts:
         i += 1
         if i % 10 == 0:
-            print(f"  page {i} of {b.page_count()}", end="\r")
+            print(f"  page {i} of {b.page_count}", end="\r")
         paragraphs = get_paragraphs(text.text, b.language)
         displayed_terms = [
             ti.term
@@ -76,10 +76,22 @@ def _process_book(b, terms):
             p["children"].append(t.text_lc)
 
 
+def _book_list_truncated(title_array):
+    "Return first 5 books, + count of rest."
+    titles = list(set(title_array))
+    first_5 = titles[:5]
+    ret = ", ".join(first_5)
+    count_rest = len(titles) - len(first_5)
+    if count_rest > 0:
+        ret += f" [... +{count_rest} more]"
+    return ret
+
+
 def _finalize_output(terms):
     "Convert terms hash to usable output."
     for _, hsh in terms.items():
-        hsh["books"] = ", ".join(list(set(hsh["books"])))
+        hsh["books"] = _book_list_truncated(hsh["books"])
+
         # children to child (count)
         children = []
         for key in hsh["children"]:
@@ -132,4 +144,4 @@ def generate_file(language_name, outfile_name):
     else:
         print(f"Writing to {outfile_name}")
         _generate_file(books, outfile_name)
-        print("Done.")
+        print("Done. ")  # extra space overwrites old output.

From 9eb2cc116df31bed1a59cd813bbb2e9002099887 Mon Sep 17 00:00:00 2001
From: Jeff Zohrab
Date: Mon, 3 Jun 2024 22:13:33 -0700
Subject: [PATCH 6/7] Add book_term_export cli job.
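
Adds a book-level export alongside the language export: generate_file
becomes generate_language_file, and a new generate_book_file dumps the
same term data for a single book id.  With the blueprint registered as
"cli", the commands should be invocable roughly as follows (the flask
app target is an assumption here, so adjust it for your install, and
"42" stands in for a real book id):

    flask --app lute.main cli language_export English terms.csv
    flask --app lute.main cli book_term_export 42 terms.csv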
---
 lute/cli/commands.py                        | 17 ++++++++++++++---
 lute/cli/language_term_export.py            | 18 +++++++++++++++---
 tests/unit/cli/test_language_term_export.py | 12 +++++++++---
 3 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/lute/cli/commands.py b/lute/cli/commands.py
index 469200fc..b52ef2f3 100644
--- a/lute/cli/commands.py
+++ b/lute/cli/commands.py
@@ -5,7 +5,7 @@
 import click
 from flask import Blueprint
 
-from lute.cli.language_term_export import generate_file
+from lute.cli.language_term_export import generate_language_file, generate_book_file
 
 bp = Blueprint("cli", __name__)
 
@@ -32,7 +32,18 @@ def hello():
 @click.argument("output_path")
 def language_export(language, output_path):
     """
-    Get all terms from active books in the language, and write a
+    Get all terms from all books in the language, and write a
     data file of term frequencies and children.
     """
-    generate_file(language, output_path)
+    generate_language_file(language, output_path)
+
+
+@bp.cli.command("book_term_export")
+@click.argument("bookid")
+@click.argument("output_path")
+def book_term_export(bookid, output_path):
+    """
+    Get all terms for the given book, and write a
+    data file of term frequencies and children.
+    """
+    generate_book_file(bookid, output_path)
diff --git a/lute/cli/language_term_export.py b/lute/cli/language_term_export.py
index 38cfa662..76ac4b4f 100644
--- a/lute/cli/language_term_export.py
+++ b/lute/cli/language_term_export.py
@@ -131,12 +131,10 @@ def _generate_file(books, outfile_name):
         writer.writerow(r)
 
 
-def generate_file(language_name, outfile_name):
+def generate_language_file(language_name, outfile_name):
     """
     Generate the datafile for the language.
     """
-    # pylint: disable=too-many-locals
-
     books = db.session.query(Book).all()
     books = [b for b in books if b.language.name == language_name]
     if len(books) == 0:
@@ -145,3 +143,17 @@ def generate_file(language_name, outfile_name):
         print(f"Writing to {outfile_name}")
         _generate_file(books, outfile_name)
         print("Done. ")  # extra space overwrites old output.
+
+
+def generate_book_file(bookid, outfile_name):
+    """
+    Generate the datafile for the book.
+    """
+    books = db.session.query(Book).all()
+    books = [b for b in books if f"{b.id}" == f"{bookid}"]
+    if len(books) == 0:
+        print(f"No book with id = {bookid}.")
+    else:
+        print(f"Writing to {outfile_name}")
+        _generate_file(books, outfile_name)
+        print("Done. ")  # extra space overwrites old output.
diff --git a/tests/unit/cli/test_language_term_export.py b/tests/unit/cli/test_language_term_export.py
index bcaa0568..5061bd10 100644
--- a/tests/unit/cli/test_language_term_export.py
+++ b/tests/unit/cli/test_language_term_export.py
@@ -1,6 +1,6 @@
 "Smoke test only."
 
-from lute.cli.language_term_export import generate_file
+from lute.cli.language_term_export import generate_language_file, generate_book_file
 from lute.models.term import Term, TermTag
 from lute.models.book import Book
@@ -18,7 +18,7 @@ def test_smoke_test(app_context, tmp_path, english):
     db.session.commit()
 
     outfile = tmp_path / "outfile.csv"
-    generate_file("English", outfile)
+    generate_language_file("English", outfile)
     with open(outfile, "r", encoding="utf-8") as ofhandle:
         text = ofhandle.read()
     print(text)
@@ -67,7 +67,7 @@ def _find(term_string):
     db.session.commit()
 
     outfile = tmp_path / "outfile.csv"
-    generate_file("English", outfile)
+    generate_language_file("English", outfile)
     with open(outfile, "r", encoding="utf-8") as ofhandle:
         text = ofhandle.read()
     print(text)
@@ -97,3 +97,9 @@ def _find(term_string):
     # .lower() because sometimes the text file returned B, and
     # sometimes b ... which is _very_ odd, but don't really care.
     assert text.lower() == "\n".join(expected).lower(), "content"
+
+    generate_book_file(b.id, outfile)
+    with open(outfile, "r", encoding="utf-8") as ofhandle:
+        text = ofhandle.read()
+    print(text)
+    assert text.lower() == "\n".join(expected).lower(), "book file"

From 0e21a31b2da5f1e8fd7c54d5559f9f30bd591481 Mon Sep 17 00:00:00 2001
From: Jeff Zohrab
Date: Thu, 6 Jun 2024 16:11:58 -0700
Subject: [PATCH 7/7] Slight performance improvement to rendering search methods.

---
 lute/read/render/service.py | 111 ++++++++++++++++++++++++++----------
 lute/utils/debug_helpers.py |   3 +-
 2 files changed, 84 insertions(+), 30 deletions(-)

diff --git a/lute/read/render/service.py b/lute/read/render/service.py
index 92afc612..0125b80a 100644
--- a/lute/read/render/service.py
+++ b/lute/read/render/service.py
@@ -10,6 +10,8 @@
 from lute.read.render.renderable_calculator import RenderableCalculator
 from lute.db import db
 
+# from lute.utils.debug_helpers import DebugTimer
+
 
 def find_all_Terms_in_string(s, language):  # pylint: disable=too-many-locals
     """
@@ -39,28 +41,69 @@ def _find_all_terms_in_tokens(tokens, language):
 
     The code first queries for exact single-token
     matches, and then multiword matches, because that's much faster
-    than querying for everthing at once. (This may no longer
-    be true, can change it later.)
+    than querying for everything at once.
     """
 
+    # Future performance improvement considerations:
+    #
+    # 1. I considered keeping a cache of multiword terms strings and
+    # IDs, but IMO the payoff isn't worth the extra complexity at this
+    # time.
+    #
+    # 2. Maybe a different search method like Aho-Corasick (ref
+    # https://github.com/abusix/ahocorapy) would be useful ... again
+    # it would imply that all keywords (existing Terms) are loaded
+    # into the Aho-Corasick automaton. This could be cached, but would
+    # again need methods for cache invalidation and reload etc.
+
+    # dt = DebugTimer("_find_all_terms_in_tokens", False)
+
     parser = language.parser
 
-    # fyi - Manually searching for terms was slow (i.e., querying for
-    # all terms, and checking if the strings were in the string s).
-
-    # Query for terms with a single token that match the unique word tokens
+    # Single word terms
+    #
+    # Build query for terms with a single token that match the unique
+    # word tokens. Note it's much faster to use a query for this,
+    # rather than loading all term text and checking for the strings
+    # using python, as we can rely on the database indexes.
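+    #
+    # For illustration only (the real SQL is generated by SQLAlchemy),
+    # the single-token query built below is roughly:
+    #
+    #   SELECT * FROM words
+    #   WHERE WoTextLC IN ('the', 'dog', ...) AND WoLgID = 42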
     word_tokens = filter(lambda t: t.is_word, tokens)
     tok_strings = [parser.get_lowercase(t.token) for t in word_tokens]
     tok_strings = list(set(tok_strings))
-    terms_matching_tokens = (
-        db.session.query(Term)
-        .filter(
-            Term.language == language,
-            Term.text_lc.in_(tok_strings),
-            Term.token_count == 1,
-        )
-        .all()
+    terms_matching_tokens_qry = db.session.query(Term).filter(
+        Term.text_lc.in_(tok_strings), Term.language == language
     )
+    # dt.step("single, query prep")
+
+    # Multiword terms
+    #
+    # Multiword terms are harder to find as we have to do a full text
+    # match.
+    #
+    # The "obvious" method of using the model is quite slow:
+    #
+    #   contained_term_qry = db.session.query(Term).filter(
+    #       Term.language == language,
+    #       Term.token_count > 1,
+    #       func.instr(content, Term.text_lc) > 0,
+    #   )
+    #   contained_terms = contained_term_qry.all()
+    #
+    # This code first finds the IDs of the terms that are in the content,
+    # and then loads the terms.
+    #
+    # Note that querying using 'LIKE' is again slow, i.e.:
+    #   sql = sqltext(
+    #       """
+    #       SELECT WoID FROM words
+    #       WHERE WoLgID=:lid and WoTokenCount>1
+    #       AND :content LIKE '%' || :zws || WoTextLC || :zws || '%'
+    #       """
+    #   )
+    #   sql = sql.bindparams(lid=language.id, content=content, zws=zws)
+    #
+    # It is actually faster to load all Term text_lc and use python to
+    # check if the strings are in the content string, and only then
+    # load the terms.
 
     # Multiword terms have zws between all tokens.
     # Create content string with zws between all tokens for the match.
@@ -70,26 +113,27 @@ def _find_all_terms_in_tokens(tokens, language):
 
     sql = sqltext(
         """
-        SELECT WoID FROM words
+        SELECT WoID, WoTextLC FROM words
         WHERE WoLgID=:language_id and WoTokenCount>1
-        AND :content LIKE '%' || :zws || WoTextLC || :zws || '%'
         """
     )
-    sql = sql.bindparams(language_id=language.id, content=content, zws=zws)
-    idlist = db.session.execute(sql).all()
-    woids = [int(p[0]) for p in idlist]
-    contained_terms = db.session.query(Term).filter(Term.id.in_(woids)).all()
+    sql = sql.bindparams(language_id=language.id)
+    reclist = db.session.execute(sql).all()
+    # dt.step(f"mwords, loaded {len(reclist)} records")
+    woids = [int(p[0]) for p in reclist if f"{zws}{p[1]}{zws}" in content]
+    # dt.step("mwords, filtered ids")
+    # dt.step("mword ids")
+
+    contained_terms_qry = db.session.query(Term).filter(Term.id.in_(woids))
 
-    # Note that the above method (querying for ids, then getting terms)
-    # is faster than using the model as shown below!
-    ### contained_term_query = db.session.query(Term).filter(
-    ###     Term.language == language,
-    ###     Term.token_count > 1,
-    ###     func.instr(content, Term.text_lc) > 0,
-    ### )
-    ### contained_terms = contained_term_query.all()
+    # Some term entity relationship objects (tags, parents) could be
+    # eagerly loaded using ".options(joinedload(Term.term_tags),
+    # joinedload(Term.parents))", but any gains in subsequent usage
+    # are offset by the slower query!
+    all_terms = terms_matching_tokens_qry.union(contained_terms_qry).all()
+    # dt.step("union, exec query")
 
-    return terms_matching_tokens + contained_terms
+    return all_terms
 
 
 class RenderableSentence:
@@ -168,12 +212,16 @@ def get_paragraphs(s, language):
     """
     Get array of arrays of RenderableSentences for the given string s.
     """
+    # dt = DebugTimer("get_paragraphs", False)
+
    # Hacky reset of state of ParsedToken state.
     # _Shouldn't_ be needed but doesn't hurt, even if it's lame.
     ParsedToken.reset_counters()
 
     cleaned = re.sub(r" +", " ", s)
+    # dt.step("start get_parsed_tokens")
     tokens = language.get_parsed_tokens(cleaned)
+    # dt.step("done get_parsed_tokens")
 
     # Brutal hack ... for some reason the tests fail in
     # CI, but _inconsistently_, with the token order numbers. The
@@ -186,10 +234,13 @@ def get_paragraphs(s, language):
     for t in tokens:
         t.order = n
         n += 1
+    # dt.step("done token.sort")
 
     terms = _find_all_terms_in_tokens(tokens, language)
+    # dt.step("done _find_all_terms_in_tokens")
 
     paragraphs = _split_tokens_by_paragraph(tokens)
+    # dt.step("done _split_tokens_by_paragraph")
 
     renderable_paragraphs = []
     pnum = 0
@@ -201,7 +252,9 @@ def get_paragraphs(s, language):
         ]
         renderable_paragraphs.append(renderable_sentences)
         pnum += 1
+    # dt.step("done renderable_paragraphs load")
 
     _add_status_0_terms(renderable_paragraphs, language)
+    # dt.step("done add status 0 terms")
 
     return renderable_paragraphs
diff --git a/lute/utils/debug_helpers.py b/lute/utils/debug_helpers.py
index 8c253f70..a883d5a3 100644
--- a/lute/utils/debug_helpers.py
+++ b/lute/utils/debug_helpers.py
@@ -16,7 +16,8 @@ def __init__(self, name, display=True):
         self.name = name
         self.step_map = {}
         self.display = display
-        print(f"{name} timer started")
+        if display:
+            print(f"{name} timer started")
 
     def step(self, s):
         "Dump time spent in step, total time since start."
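
Below is a minimal standalone sketch of the multiword matching idea in
PATCH 7/7: load (WoID, WoTextLC) records, then substring-check each
zero-width-space-joined term text against the zws-joined token content.
Names and data here are illustrative; only the matching logic mirrors
the patch.

    ZWS = "\u200B"  # zero-width space; multiword terms join tokens with it

    def find_multiword_ids(records, lc_tokens):
        "records: (term_id, text_lc) pairs; lc_tokens: lowercased token strings."
        content = ZWS + ZWS.join(lc_tokens) + ZWS
        return [wid for wid, text_lc in records if f"{ZWS}{text_lc}{ZWS}" in content]

    # "c d" is stored as "c<ZWS>d", so it is found in the a-b-c-d token stream:
    records = [(1, f"c{ZWS}d"), (2, f"x{ZWS}y")]
    print(find_multiword_ids(records, ["a", "b", "c", "d"]))  # -> [1]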