Commit

Merge branch 'add_book_term_export_cli' into develop

jzohrab committed Jun 9, 2024
2 parents 9d9e7fa + 0e21a31 commit def0589

Showing 5 changed files with 299 additions and 195 deletions.
17 changes: 14 additions & 3 deletions lute/cli/commands.py
@@ -5,7 +5,7 @@
 import click
 from flask import Blueprint

-from lute.cli.language_term_export import generate_file
+from lute.cli.language_term_export import generate_language_file, generate_book_file

 bp = Blueprint("cli", __name__)

@@ -32,7 +32,18 @@ def hello():
 @click.argument("output_path")
 def language_export(language, output_path):
     """
-    Get all terms from active books in the language, and write a
+    Get all terms from all books in the language, and write a
     data file of term frequencies and children.
     """
-    generate_file(language, output_path)
+    generate_language_file(language, output_path)
+
+
+@bp.cli.command("book_term_export")
+@click.argument("bookid")
+@click.argument("output_path")
+def book_term_export(bookid, output_path):
+    """
+    Get all terms for the given book, and write a
+    data file of term frequencies and children.
+    """
+    generate_book_file(bookid, output_path)
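
Both commands are registered on the "cli" blueprint, and Flask groups blueprint commands by blueprint name, so from a shell they would run as "flask cli language_export <language> <output_path>" and "flask cli book_term_export <bookid> <output_path>". A minimal sketch of driving them programmatically via Flask's test CLI runner (assumptions: an app factory at lute.app_factory.create_app, and placeholder argument values):

    # Sketch: invoke the new blueprint CLI commands through Flask's CLI runner.
    # Assumptions: lute.app_factory.create_app exists with no required args;
    # "Spanish" and book id "42" are placeholders.
    from lute.app_factory import create_app

    app = create_app()
    runner = app.test_cli_runner()

    # Equivalent to: flask cli language_export Spanish spanish_terms.csv
    result = runner.invoke(args=["cli", "language_export", "Spanish", "spanish_terms.csv"])
    print(result.output)

    # Equivalent to: flask cli book_term_export 42 book_42_terms.csv
    result = runner.invoke(args=["cli", "book_term_export", "42", "book_42_terms.csv"])
    print(result.output)
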
283 changes: 123 additions & 160 deletions lute/cli/language_term_export.py
@@ -10,187 +10,150 @@
 e.g.

 term; count; familycount; books; definition; status; children
 haber; 100; 1500; book1,book2; to exist; 99; hay (500), he (200), has (150) ...
-
-There is probably a far better way to do this, likely using something
-fairly heavyweight like pandas.  This works well enough for now.
 """

-import sys
 import csv
-from collections import Counter
 from lute.db import db
 from lute.models.book import Book
-from lute.term.model import Repository
-
-
-def get_dist(book, collector, termrepo, language_id):  # pylint: disable=too-many-locals
-    """
-    Get word distribution in book.
-
-    The data is added to the collector dictionary.
-    """
-
-    # Get all terms and counts.
-    fulltext = "\n".join([t.text for t in book.texts])
-    pts = book.language.get_parsed_tokens(fulltext)
-    words = [pt.token for pt in pts if pt.is_word]
-
-    # distrib = { 'term1': count1, 'term2': count2, ... }
-    distrib = dict(Counter(words))
-
-    # The distribution doesn't handle capitalization, so it will
-    # contain things like { 'There': 10, 'there': 20 }.  Do a lookup
-    # for each term ('There', 'there') in the repository to see if a
-    # matching term with a standardized term.text is found.
-    normalized = {}
-
-    totcount = len(distrib.keys())
+from lute.read.render.service import get_paragraphs
+
+
+def _add_term_to_dict(t, terms):
+    "Add term to dictionary and return it."
+    key = t.text_lc
+    if key in terms:
+        return terms[key]
+
+    tag_list = ", ".join([tg.text for tg in t.term_tags])
+    if tag_list == "":
+        tag_list = "-"
+
+    zws = "\u200B"
+    hsh = {
+        "sourceterm": t,
+        "term": t.text.replace(zws, ""),
+        "count": 0,
+        "familycount": 0,
+        "books": [],
+        "definition": t.translation or "-",
+        "status": t.status,
+        "children": [],
+        "tags": tag_list,
+    }
+    terms[key] = hsh
+    return hsh
+
+
+def _process_book(b, terms):
+    "Process pages in book, add to output."
+    print(f"Processing {b.title} ...")
     i = 0
-    print(f"Loading data for book {book.title} ...")
-    for k, v in distrib.items():
+    for text in b.texts:
         i += 1
-        if i % 100 == 0:
-            print(f"  {i} of {totcount}", end="\r")
-        norm_word = termrepo.find_or_new(language_id, k)
-        norm_entry = normalized.get(norm_word.text, {"count": 0, "parents": []})
-        norm_entry["count"] += v
-        norm_entry["parents"] = norm_word.parents
-        normalized[norm_word.text] = norm_entry
-
-    # normalized = { 'there': { 'count': 30, 'parents': [...] }, ... }.
-    #
-    # The collector may already have the term ('there') from prior
-    # books, so combine those.
-    for t, n in normalized.items():  # pylint: disable=redefined-outer-name
-        entry = collector.get(t, {"term": t, "count": 0, "books": []})
-        entry["count"] += n["count"]
-        entry["books"].append(book.title)
-        collector[t] = entry
-
-        # The term may have a parent that isn't actually present in any book!
-        # We need to add those parents to the collector as well, or later
-        # searches for the parent will fail.
-        for p in n["parents"]:
-            pentry = collector.get(p, {"term": p, "count": 0, "books": []})
-            collector[p] = pentry
-
-
-def _load_hash_from_term(t, term):
-    "Load common data to hash."
-    t["parent"] = ", ".join(term.parents)
-    t["definition"] = term.translation or "-"
-    t["status"] = term.status if term.id is not None else "-"
-    t["children"] = "-"
-    t["childbooks"] = []
-    t["tags"] = ", ".join(term.term_tags)
-
-
-def load_term_data(langid, terms, repo):
-    "Load basic data."
-    totcount = len(terms.keys())
-    i = 0
-    print("Loading term data ...")
-    for k, t in terms.items():  # pylint: disable=unused-variable
-        i += 1
-        if i % 100 == 0:
-            print(f"  {i} of {totcount}", end="\r")
-
-        term = repo.find_or_new(langid, t["term"])
-        _load_hash_from_term(t, term)
-        t["familycount"] = t["count"]
-
-
-def load_parent_data(langid, terms, repo):
-    "Get and print data."
-
-    parents = list({t["parent"] for t in terms.values() if t["parent"] != ""})
-
-    missingparents = [p for p in parents if p not in terms]
-    totcount = len(missingparents)
-    i = 0
-    print("Loading missing parents ...")
-    for p in missingparents:
-        i += 1
-        if i % 100 == 0:
-            print(f"  {i} of {totcount}", end="\r")
-
-        term = repo.find_or_new(langid, p)
-        t = {"term": p, "count": 0, "books": []}
-        _load_hash_from_term(t, term)
-        t["familycount"] = 0
-        terms[p] = t
-
-    totcount = len(parents)
-    i = 0
-    print("Finalizing parent data ...")
-    for p in parents:
-        i += 1
-        if i % 100 == 0:
-            print(f"  {i} of {totcount}", end="\r")
-
-        children = [c for c in terms.values() if c["parent"] == p]
+        if i % 10 == 0:
+            print(f"  page {i} of {b.page_count}", end="\r")
+        paragraphs = get_paragraphs(text.text, b.language)
+        displayed_terms = [
+            ti.term
+            for para in paragraphs
+            for sentence in para
+            for ti in sentence.textitems
+            if ti.is_word and ti.term is not None
+        ]
+        for t in displayed_terms:
+            e = _add_term_to_dict(t, terms)
+            e["count"] += 1
+            e["familycount"] += 1
+            if b.title not in e["books"]:
+                e["books"].append(b.title)
+
+            for parent in t.parents:
+                p = _add_term_to_dict(parent, terms)
+                p["familycount"] += 1
+                if b.title not in p["books"]:
+                    p["books"].append(b.title)
+                if t.text_lc not in p["children"]:
+                    p["children"].append(t.text_lc)
+
+
+def _book_list_truncated(title_array):
+    "Return first 5 books, + count of rest."
+    titles = list(set(title_array))
+    first_5 = titles[:5]
+    ret = ", ".join(first_5)
+    count_rest = len(titles) - len(first_5)
+    if count_rest > 0:
+        ret += f" [... +{count_rest} more]"
+    return ret
+
+
+def _finalize_output(terms):
+    "Convert terms hash to usable output."
+    for _, hsh in terms.items():
+        hsh["books"] = _book_list_truncated(hsh["books"])
+
+        # children to child (count)
+        children = []
+        for key in hsh["children"]:
+            t = terms[key]
+            children.append({"count": t["count"], "term": t["sourceterm"].text})
         csorted = sorted(children, key=lambda c: c["count"], reverse=True)
         children_string = "; ".join([f"{c['term']} ({c['count']})" for c in csorted])
-        childbooks = [c["books"] for c in children]
-        childbooks = list({b for blist in childbooks for b in blist})
-        childtotcount = sum(c["count"] for c in children)
+        if children_string == "":
+            children_string = "-"
+        hsh["children"] = children_string

-        terms[p]["children"] = children_string
-        terms[p]["childbooks"] = childbooks
-        terms[p]["familycount"] += childtotcount
+    ret = terms.values()
+    return sorted(ret, key=lambda x: (-x["familycount"], x["term"]))


-def get_output_data(terms):
-    "Get the final set of output data."
-    printterms = [
-        t for t in terms.values() if t["parent"] == "" or t["children"] != "-"
-    ]
-
-    # Clean up data for printing.
-    for t in printterms:
-        t["books"] = list(set(t["books"] + t["childbooks"]))
-        t["books"] = "; ".join(t["books"])
-        del t["childbooks"]
+def _generate_file(books, outfile_name):
+    "Write data file for books to outfile_name."
+    terms = {}
+    for b in books:
+        _process_book(b, terms)
+    outdata = _finalize_output(terms)

-    return printterms
+    with open(outfile_name, "w", newline="", encoding="utf-8") as outfile:
+        keys = [
+            "term",
+            "count",
+            "familycount",
+            "books",
+            "definition",
+            "status",
+            "children",
+            "tags",
+        ]
+        writer = csv.DictWriter(outfile, fieldnames=keys, extrasaction="ignore")
+        writer.writeheader()
+        for r in outdata:
+            writer.writerow(r)


-def generate_file(language_name, outfile_name):
+def generate_language_file(language_name, outfile_name):
     """
     Generate the datafile for the language.
     """
     books = db.session.query(Book).all()
     books = [b for b in books if b.language.name == language_name]
     if len(books) == 0:
         print(f"No books for given language {language_name}, quitting.")
-        sys.exit(0)
+    else:
+        print(f"Writing to {outfile_name}")
+        _generate_file(books, outfile_name)
+        print("Done.  ")  # extra space overwrites old output.

-    langid = books[0].language.id
-
-    repo = Repository(db)
-    terms = {}
-    for b in books:
-        get_dist(b, terms, repo, langid)
-
-    load_term_data(langid, terms, repo)
-    load_parent_data(langid, terms, repo)
-    outdata = get_output_data(terms)
-
-    ptsorted = sorted(outdata, key=lambda c: c["familycount"], reverse=True)
-    keys = [
-        "term",
-        "count",
-        "familycount",
-        "books",
-        "definition",
-        "status",
-        "children",
-        "tags",
-    ]
-    print(f"Writing to {outfile_name}")
-    with open(outfile_name, "w", newline="", encoding="utf-8") as outfile:
-        writer = csv.DictWriter(outfile, fieldnames=keys, extrasaction="ignore")
-        writer.writeheader()
-        for r in ptsorted:
-            writer.writerow(r)
-    print("Done.")
+
+def generate_book_file(bookid, outfile_name):
+    """
+    Generate the datafile for the book.
+    """
+    books = db.session.query(Book).all()
+    books = [b for b in books if f"{b.id}" == f"{bookid}"]
+    if len(books) == 0:
+        print(f"No book with id = {bookid}.")
+    else:
+        print(f"Writing to {outfile_name}")
+        _generate_file(books, outfile_name)
+        print("Done.  ")  # extra space overwrites old output.