From 694c180876bccf1d7206cd8f536ddc410d7ba4aa Mon Sep 17 00:00:00 2001
From: Jeff Zohrab
Date: Mon, 3 Jun 2024 19:48:21 -0700
Subject: [PATCH 1/7] Use renderer for language export.

---
 lute/cli/language_term_export.py            | 77 +++++++++++++++++++--
 tests/unit/cli/test_language_term_export.py | 71 +++++++++++++++++++
 2 files changed, 142 insertions(+), 6 deletions(-)

diff --git a/lute/cli/language_term_export.py b/lute/cli/language_term_export.py
index e104972f..4d5b307c 100644
--- a/lute/cli/language_term_export.py
+++ b/lute/cli/language_term_export.py
@@ -21,6 +21,7 @@ from lute.db import db
 from lute.models.book import Book
 from lute.term.model import Repository
+from lute.read.render.service import get_paragraphs
 
 
 def get_dist(book, collector, termrepo, language_id):  # pylint: disable=too-many-locals
@@ -159,24 +160,88 @@ def generate_file(language_name, outfile_name):
     """
     Generate the datafile for the language.
     """
+    # pylint: disable=too-many-locals
+
     books = db.session.query(Book).all()
     books = [b for b in books if b.language.name == language_name]
     if len(books) == 0:
         print(f"No books for given language {language_name}, quitting.")
         sys.exit(0)
 
-    langid = books[0].language.id
+    lang = books[0].language
+    langid = lang.id
 
     repo = Repository(db)
     terms = {}
+
+    def _add_term_if_needed(t):
+        if t.text_lc in terms:
+            return
+        tag_list = ", ".join([tg.text for tg in t.term_tags])
+        if tag_list == "":
+            tag_list = "-"
+
+        zws = "\u200B"
+        terms[t.text_lc] = {
+            "sourceterm": t,
+            "term": t.text.replace(zws, ""),
+            "count": 0,
+            "familycount": 0,
+            "books": [],
+            "definition": t.translation or "-",
+            "status": t.status,
+            "children": [],
+            "tags": tag_list,
+        }
+
     for b in books:
-        get_dist(b, terms, repo, langid)
+        print(f"Loading data for book {b.title} ...")
+        i = 0
+        for text in b.texts:
+            i += 1
+            if i % 10 == 0:
+                print(f"  page {i} of {b.page_count()}", end="\r")
+            paragraphs = get_paragraphs(text.text, lang)
+            displayed_terms = [
+                ti.term
+                for para in paragraphs
+                for sentence in para
+                for ti in sentence.textitems
+                if ti.is_word and ti.term is not None
+            ]
+            for t in displayed_terms:
+                _add_term_if_needed(t)
+                e = terms[t.text_lc]
+                e["count"] += 1
+                e["familycount"] += 1
+                if b.title not in e["books"]:
+                    e["books"].append(b.title)
+
+                for parent in t.parents:
+                    _add_term_if_needed(parent)
+                    p = terms[parent.text_lc]
+                    p["familycount"] += 1
+                    if b.title not in p["books"]:
+                        p["books"].append(b.title)
+                    if t.text_lc not in p["children"]:
+                        p["children"].append(t.text_lc)
+
+    for _, hsh in terms.items():
+        hsh["books"] = ", ".join(list(set(hsh["books"])))
+        # children to child (count)
+        children = []
+        for key in hsh["children"]:
+            t = terms[key]
+            children.append({"count": t["count"], "term": t["sourceterm"].text})
+        csorted = sorted(children, key=lambda c: c["count"], reverse=True)
+        children_string = "; ".join([f"{c['term']} ({c['count']})" for c in csorted])
+        if children_string == "":
+            children_string = "-"
+        hsh["children"] = children_string
 
-    load_term_data(langid, terms, repo)
-    load_parent_data(langid, terms, repo)
-    outdata = get_output_data(terms)
+    outdata = terms.values()
 
-    ptsorted = sorted(outdata, key=lambda c: c["familycount"], reverse=True)
+    ptsorted = sorted(outdata, key=lambda x: (-x["familycount"], x["term"]))
     keys = [
         "term",
         "count",
diff --git a/tests/unit/cli/test_language_term_export.py b/tests/unit/cli/test_language_term_export.py
index d71f609f..876f456f 100644
--- a/tests/unit/cli/test_language_term_export.py
+++ b/tests/unit/cli/test_language_term_export.py
@@ -1,9 +1,12 @@
 "Smoke test only."
 
+import textwrap
 from lute.cli.language_term_export import generate_file
 from lute.models.term import Term, TermTag
+from lute.models.book import Book
 from lute.db import db
+from tests.dbasserts import assert_sql_result
 
 
 def test_smoke_test(app_context, tmp_path, english):
@@ -27,3 +30,71 @@ def test_smoke_test(app_context, tmp_path, english):
     firstline = lines[1]
     assert firstline.startswith("the,"), "the is most common"
     assert firstline.endswith('article,1,-,"a, b"'), "ending data"
+
+
+def test_single_book_export(app_context, empty_db, tmp_path, english):
+    "dump data for english."
+
+    assert_sql_result("select * from books", [], "no books")
+    assert_sql_result("select * from words", [], "no terms")
+
+    fulltext = "a b c d e A B C\n---\nG H I c d e d"
+    b = Book.create_book("hi", english, fulltext)
+    db.session.add(b)
+    db.session.commit()
+
+    for c in ["a", "d", "c d"]:
+        t = Term(english, c)
+        t.status = 1
+        db.session.add(t)
+    for c in ["e", "g", "h"]:
+        t = Term(english, c)
+        t.status = 0
+        db.session.add(t)
+    db.session.commit()
+
+    def _find(term_string):
+        "Find term with the text."
+        spec = Term(english, term_string)
+        ret = Term.find_by_spec(spec)
+        assert ret is not None, f"Have {term_string}"
+        return ret
+
+    a = _find("a")
+    for c in ["e", "h"]:
+        t = _find(c)
+        t.add_parent(a)
+        db.session.add(t)
+    db.session.commit()
+
+    outfile = tmp_path / "outfile.csv"
+    generate_file("English", outfile)
+    with open(outfile, "r", encoding="utf-8") as ofhandle:
+        text = ofhandle.read()
+    print(text)
+
+    expected = [
+        # Headings
+        "term,count,familycount,books,definition,status,children,tags",
+        # a has two children, e and h
+        "a,2,5,hi,-,1,e (2); h (1),-",
+        # b occurs twice.
+        "b,2,2,hi,-,0,-,-",
+        # 'c d' occurs twice
+        "c d,2,2,hi,-,1,-,-",
+        # e is a new term
+        "e,2,2,hi,-,0,-,-",
+        # c is a new term, status 0.
+        # Occurs once as c, once as C.
+        "C,1,1,hi,-,0,-,-",
+        "I,1,1,hi,-,0,-,-",
+        "d,1,1,hi,-,1,-,-",
+        # g and h are new
+        "g,1,1,hi,-,0,-,-",
+        "h,1,1,hi,-,0,-,-",
+        "",
+    ]
+
+    # .lower() because sometimes the text file returned B, and
+    # sometimes b ... which is _very_ odd, but don't really care.
+    assert text.lower() == "\n".join(expected).lower(), "content"

From 1b849ea65881ea9415989bec13f3179e7c9ddfee Mon Sep 17 00:00:00 2001
From: Jeff Zohrab
Date: Mon, 3 Jun 2024 19:50:50 -0700
Subject: [PATCH 2/7] Remove unused methods.

---
 lute/cli/language_term_export.py            | 137 --------------------
 tests/unit/cli/test_language_term_export.py |   1 -
 2 files changed, 138 deletions(-)

diff --git a/lute/cli/language_term_export.py b/lute/cli/language_term_export.py
index 4d5b307c..dafdefca 100644
--- a/lute/cli/language_term_export.py
+++ b/lute/cli/language_term_export.py
@@ -17,145 +17,11 @@
 import sys
 import csv
-from collections import Counter
 
 from lute.db import db
 from lute.models.book import Book
-from lute.term.model import Repository
 from lute.read.render.service import get_paragraphs
 
 
-def get_dist(book, collector, termrepo, language_id):  # pylint: disable=too-many-locals
-    """
-    Get word distribution in book.
-
-    The data is added to the collector dictionary.
-    """
-
-    # Get all terms and counts.
-    fulltext = "\n".join([t.text for t in book.texts])
-    pts = book.language.get_parsed_tokens(fulltext)
-    words = [pt.token for pt in pts if pt.is_word]
-
-    # distrib = { 'term1': count1, 'term2': count2, ... }
-    distrib = dict(Counter(words))
-
-    # The distribution doesn't handle capitalization, so it will
-    # contain things like { 'There': 10, 'there': 20 }. Do a lookup
-    # for each term ('There', 'there') in the repository to see if a
-    # matching term with a standardized term.text is found.
-    normalized = {}
-
-    totcount = len(distrib.keys())
-    i = 0
-    print(f"Loading data for book {book.title} ...")
-    for k, v in distrib.items():
-        i += 1
-        if i % 100 == 0:
-            print(f"  {i} of {totcount}", end="\r")
-        norm_word = termrepo.find_or_new(language_id, k)
-        norm_entry = normalized.get(norm_word.text, {"count": 0, "parents": []})
-        norm_entry["count"] += v
-        norm_entry["parents"] = norm_word.parents
-        normalized[norm_word.text] = norm_entry
-
-    # normalized = { 'there': { 'count': 30, 'parents': [...] }, ... }.
-    #
-    # The collector may already have the term ('there') from prior
-    # books, so combine those.
-    for t, n in normalized.items():  # pylint: disable=redefined-outer-name
-        entry = collector.get(t, {"term": t, "count": 0, "books": []})
-        entry["count"] += n["count"]
-        entry["books"].append(book.title)
-        collector[t] = entry
-
-        # The term may have a parent that isn't actually present in any book!
-        # We need to add those parents to the collector as well, or later
-        # searches for the parent will fail.
-        for p in n["parents"]:
-            pentry = collector.get(p, {"term": p, "count": 0, "books": []})
-            collector[p] = pentry
-
-
-def _load_hash_from_term(t, term):
-    "Load common data to hash."
-    t["parent"] = ", ".join(term.parents)
-    t["definition"] = term.translation or "-"
-    t["status"] = term.status if term.id is not None else "-"
-    t["children"] = "-"
-    t["childbooks"] = []
-    t["tags"] = ", ".join(term.term_tags)
-
-
-def load_term_data(langid, terms, repo):
-    "Load basic data."
-    totcount = len(terms.keys())
-    i = 0
-    print("Loading term data ...")
-    for k, t in terms.items():  # pylint: disable=unused-variable
-        i += 1
-        if i % 100 == 0:
-            print(f"  {i} of {totcount}", end="\r")
-
-        term = repo.find_or_new(langid, t["term"])
-        _load_hash_from_term(t, term)
-        t["familycount"] = t["count"]
-
-
-def load_parent_data(langid, terms, repo):
-    "Get and print data."
-
-    parents = list({t["parent"] for t in terms.values() if t["parent"] != ""})
-
-    missingparents = [p for p in parents if p not in terms]
-    totcount = len(missingparents)
-    i = 0
-    print("Loading missing parents ...")
-    for p in missingparents:
-        i += 1
-        if i % 100 == 0:
-            print(f"  {i} of {totcount}", end="\r")
-
-        term = repo.find_or_new(langid, p)
-        t = {"term": p, "count": 0, "books": []}
-        _load_hash_from_term(t, term)
-        t["familycount"] = 0
-        terms[p] = t
-
-    totcount = len(parents)
-    i = 0
-    print("Finalizing parent data ...")
-    for p in parents:
-        i += 1
-        if i % 100 == 0:
-            print(f"  {i} of {totcount}", end="\r")
-
-        children = [c for c in terms.values() if c["parent"] == p]
-        csorted = sorted(children, key=lambda c: c["count"], reverse=True)
-        children_string = "; ".join([f"{c['term']} ({c['count']})" for c in csorted])
-        childbooks = [c["books"] for c in children]
-        childbooks = list({b for blist in childbooks for b in blist})
-        childtotcount = sum(c["count"] for c in children)
-
-        terms[p]["children"] = children_string
-        terms[p]["childbooks"] = childbooks
-        terms[p]["familycount"] += childtotcount
-
-
-def get_output_data(terms):
-    "Get the final set of output data."
-    printterms = [
-        t for t in terms.values() if t["parent"] == "" or t["children"] != "-"
-    ]
-
-    # Clean up data for printing.
-    for t in printterms:
-        t["books"] = list(set(t["books"] + t["childbooks"]))
-        t["books"] = "; ".join(t["books"])
-        del t["childbooks"]
-
-    return printterms
-
-
 def generate_file(language_name, outfile_name):
     """
     Generate the datafile for the language.
@@ -169,9 +35,6 @@ def generate_file(language_name, outfile_name):
         sys.exit(0)
 
     lang = books[0].language
-    langid = lang.id
-
-    repo = Repository(db)
     terms = {}
 
     def _add_term_if_needed(t):
diff --git a/tests/unit/cli/test_language_term_export.py b/tests/unit/cli/test_language_term_export.py
index 876f456f..bcaa0568 100644
--- a/tests/unit/cli/test_language_term_export.py
+++ b/tests/unit/cli/test_language_term_export.py
@@ -1,6 +1,5 @@
 "Smoke test only."
 
-import textwrap
 from lute.cli.language_term_export import generate_file
 from lute.models.term import Term, TermTag
 from lute.models.book import Book

From b156b2ad438ad640a36ccab15376f60fbee1149f Mon Sep 17 00:00:00 2001
From: Jeff Zohrab
Date: Mon, 3 Jun 2024 20:29:17 -0700
Subject: [PATCH 3/7] Return object added.

---
 lute/cli/language_term_export.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/lute/cli/language_term_export.py b/lute/cli/language_term_export.py
index dafdefca..7cd60b64 100644
--- a/lute/cli/language_term_export.py
+++ b/lute/cli/language_term_export.py
@@ -37,15 +37,17 @@ def generate_file(language_name, outfile_name):
     lang = books[0].language
     terms = {}
 
-    def _add_term_if_needed(t):
-        if t.text_lc in terms:
-            return
+    def _add_term_to_dict(t):
+        key = t.text_lc
+        if key in terms:
+            return terms[key]
+
         tag_list = ", ".join([tg.text for tg in t.term_tags])
         if tag_list == "":
             tag_list = "-"
 
         zws = "\u200B"
-        terms[t.text_lc] = {
+        hsh = {
             "sourceterm": t,
             "term": t.text.replace(zws, ""),
             "count": 0,
@@ -56,6 +58,8 @@ def generate_file(language_name, outfile_name):
             "children": [],
             "tags": tag_list,
         }
+        terms[key] = hsh
+        return hsh
 
     for b in books:
         print(f"Loading data for book {b.title} ...")
@@ -73,16 +77,14 @@ def generate_file(language_name, outfile_name):
                 if ti.is_word and ti.term is not None
             ]
             for t in displayed_terms:
-                _add_term_if_needed(t)
-                e = terms[t.text_lc]
+                e = _add_term_to_dict(t)
                 e["count"] += 1
                 e["familycount"] += 1
                 if b.title not in e["books"]:
                     e["books"].append(b.title)
 
                 for parent in t.parents:
-                    _add_term_if_needed(parent)
-                    p = terms[parent.text_lc]
+                    p = _add_term_to_dict(parent)
                     p["familycount"] += 1
                     if b.title not in p["books"]:
                         p["books"].append(b.title)

From 9cedbd65ac7ef9536e8c10d09b4c1529e1c4fe4c Mon Sep 17 00:00:00 2001
From: Jeff Zohrab
Date: Mon, 3 Jun 2024 20:30:55 -0700
Subject: [PATCH 4/7] Extract methods.

---
 lute/cli/language_term_export.py | 187 ++++++++++++++++---------------
 1 file changed, 98 insertions(+), 89 deletions(-)

diff --git a/lute/cli/language_term_export.py b/lute/cli/language_term_export.py
index 7cd60b64..4553e479 100644
--- a/lute/cli/language_term_export.py
+++ b/lute/cli/language_term_export.py
@@ -10,87 +10,74 @@
 e.g.
 
 term; count; familycount; books; definition; status; children
 haber; 100; 1500; book1,book2; to exist; 99; hay (500), he (200), has (150)
 ...
-
-There is probably a far better way to do this, likely using something
-fairly heavyweight like pandas.  This works well enough for now.
 """
 
-import sys
 import csv
 
 from lute.db import db
 from lute.models.book import Book
 from lute.read.render.service import get_paragraphs
 
 
-def generate_file(language_name, outfile_name):
-    """
-    Generate the datafile for the language.
-    """
-    # pylint: disable=too-many-locals
-
-    books = db.session.query(Book).all()
-    books = [b for b in books if b.language.name == language_name]
-    if len(books) == 0:
-        print(f"No books for given language {language_name}, quitting.")
-        sys.exit(0)
-
-    lang = books[0].language
-    terms = {}
-
-    def _add_term_to_dict(t):
-        key = t.text_lc
-        if key in terms:
-            return terms[key]
-
-        tag_list = ", ".join([tg.text for tg in t.term_tags])
-        if tag_list == "":
-            tag_list = "-"
-
-        zws = "\u200B"
-        hsh = {
-            "sourceterm": t,
-            "term": t.text.replace(zws, ""),
-            "count": 0,
-            "familycount": 0,
-            "books": [],
-            "definition": t.translation or "-",
-            "status": t.status,
-            "children": [],
-            "tags": tag_list,
-        }
-        terms[key] = hsh
-        return hsh
-
-    for b in books:
-        print(f"Loading data for book {b.title} ...")
-        i = 0
-        for text in b.texts:
-            i += 1
-            if i % 10 == 0:
-                print(f"  page {i} of {b.page_count()}", end="\r")
-            paragraphs = get_paragraphs(text.text, lang)
-            displayed_terms = [
-                ti.term
-                for para in paragraphs
-                for sentence in para
-                for ti in sentence.textitems
-                if ti.is_word and ti.term is not None
-            ]
-            for t in displayed_terms:
-                e = _add_term_to_dict(t)
-                e["count"] += 1
-                e["familycount"] += 1
-                if b.title not in e["books"]:
-                    e["books"].append(b.title)
-
-                for parent in t.parents:
-                    p = _add_term_to_dict(parent)
-                    p["familycount"] += 1
-                    if b.title not in p["books"]:
-                        p["books"].append(b.title)
-                    if t.text_lc not in p["children"]:
-                        p["children"].append(t.text_lc)
-
+def _add_term_to_dict(t, terms):
+    "Add term to dictionary and return it."
+    key = t.text_lc
+    if key in terms:
+        return terms[key]
+
+    tag_list = ", ".join([tg.text for tg in t.term_tags])
+    if tag_list == "":
+        tag_list = "-"
+
+    zws = "\u200B"
+    hsh = {
+        "sourceterm": t,
+        "term": t.text.replace(zws, ""),
+        "count": 0,
+        "familycount": 0,
+        "books": [],
+        "definition": t.translation or "-",
+        "status": t.status,
+        "children": [],
+        "tags": tag_list,
+    }
+    terms[key] = hsh
+    return hsh
+
+
+def _process_book(b, terms):
+    "Process pages in book, add to output."
+    print(f"Loading data for book {b.title} ...")
+    i = 0
+    for text in b.texts:
+        i += 1
+        if i % 10 == 0:
+            print(f"  page {i} of {b.page_count()}", end="\r")
+        paragraphs = get_paragraphs(text.text, b.language)
+        displayed_terms = [
+            ti.term
+            for para in paragraphs
+            for sentence in para
+            for ti in sentence.textitems
+            if ti.is_word and ti.term is not None
+        ]
+        for t in displayed_terms:
+            e = _add_term_to_dict(t, terms)
+            e["count"] += 1
+            e["familycount"] += 1
+            if b.title not in e["books"]:
+                e["books"].append(b.title)
+
+            for parent in t.parents:
+                p = _add_term_to_dict(parent, terms)
+                p["familycount"] += 1
+                if b.title not in p["books"]:
+                    p["books"].append(b.title)
+                if t.text_lc not in p["children"]:
+                    p["children"].append(t.text_lc)
+
+
+def _finalize_output(terms):
+    "Convert terms hash to usable output."
     for _, hsh in terms.items():
         hsh["books"] = ", ".join(list(set(hsh["books"])))
         # children to child (count)
@@ -104,23 +91,45 @@ def _add_term_to_dict(t):
         children_string = "; ".join([f"{c['term']} ({c['count']})" for c in csorted])
         if children_string == "":
             children_string = "-"
         hsh["children"] = children_string
 
-    outdata = terms.values()
-
-    ptsorted = sorted(outdata, key=lambda x: (-x["familycount"], x["term"]))
-    keys = [
-        "term",
-        "count",
-        "familycount",
-        "books",
-        "definition",
-        "status",
-        "children",
-        "tags",
-    ]
-    print(f"Writing to {outfile_name}")
+    ret = terms.values()
+    return sorted(ret, key=lambda x: (-x["familycount"], x["term"]))
+
+
+def _generate_file(books, outfile_name):
+    "Write data file for books to outfile_name."
+    terms = {}
+    for b in books:
+        _process_book(b, terms)
+    outdata = _finalize_output(terms)
+
     with open(outfile_name, "w", newline="", encoding="utf-8") as outfile:
+        keys = [
+            "term",
+            "count",
+            "familycount",
+            "books",
+            "definition",
+            "status",
+            "children",
+            "tags",
+        ]
         writer = csv.DictWriter(outfile, fieldnames=keys, extrasaction="ignore")
         writer.writeheader()
-        for r in ptsorted:
+        for r in outdata:
             writer.writerow(r)
-    print("Done.")
+
+
+def generate_file(language_name, outfile_name):
+    """
+    Generate the datafile for the language.
+    """
+    # pylint: disable=too-many-locals
+
+    books = db.session.query(Book).all()
+    books = [b for b in books if b.language.name == language_name]
+    if len(books) == 0:
+        print(f"No books for given language {language_name}, quitting.")
+    else:
+        print(f"Writing to {outfile_name}")
+        _generate_file(books, outfile_name)
+        print("Done.")

From edadde82a35cf10bca5e1accb520f4db402701b6 Mon Sep 17 00:00:00 2001
From: Jeff Zohrab
Date: Mon, 3 Jun 2024 21:42:10 -0700
Subject: [PATCH 5/7] Truncate book list.

---
 lute/cli/language_term_export.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/lute/cli/language_term_export.py b/lute/cli/language_term_export.py
index 4553e479..38cfa662 100644
--- a/lute/cli/language_term_export.py
+++ b/lute/cli/language_term_export.py
@@ -46,12 +46,12 @@ def _add_term_to_dict(t, terms):
 
 def _process_book(b, terms):
     "Process pages in book, add to output."
-    print(f"Loading data for book {b.title} ...")
+    print(f"Processing {b.title} ...")
     i = 0
     for text in b.texts:
         i += 1
         if i % 10 == 0:
-            print(f"  page {i} of {b.page_count()}", end="\r")
+            print(f"  page {i} of {b.page_count}", end="\r")
         paragraphs = get_paragraphs(text.text, b.language)
         displayed_terms = [
             ti.term
@@ -76,10 +76,22 @@ def _process_book(b, terms):
             p["children"].append(t.text_lc)
 
 
+def _book_list_truncated(title_array):
+    "Return first 5 books, + count of rest."
+    titles = list(set(title_array))
+    first_5 = titles[:5]
+    ret = ", ".join(first_5)
+    count_rest = len(titles) - len(first_5)
+    if count_rest > 0:
+        ret += f" [... +{count_rest} more]"
+    return ret
+
+
 def _finalize_output(terms):
     "Convert terms hash to usable output."
     for _, hsh in terms.items():
-        hsh["books"] = ", ".join(list(set(hsh["books"])))
+        hsh["books"] = _book_list_truncated(hsh["books"])
+
         # children to child (count)
         children = []
         for key in hsh["children"]:
@@ -132,4 +144,4 @@ def generate_file(language_name, outfile_name):
     else:
         print(f"Writing to {outfile_name}")
         _generate_file(books, outfile_name)
-        print("Done.")
+        print("Done. ")  # extra space overwrites old output.

From 9eb2cc116df31bed1a59cd813bbb2e9002099887 Mon Sep 17 00:00:00 2001
From: Jeff Zohrab
Date: Mon, 3 Jun 2024 22:13:33 -0700
Subject: [PATCH 6/7] Add book_term_export cli job.
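
Adds a book-level export alongside the language export: generate_file
becomes generate_language_file, and a new generate_book_file dumps the
same term data for a single book id.  With the blueprint registered as
"cli", the commands should be invocable roughly as follows (the flask
app target is an assumption here, so adjust it for your install, and
"42" stands in for a real book id):

    flask --app lute.main cli language_export English terms.csv
    flask --app lute.main cli book_term_export 42 terms.csv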
---
 lute/cli/commands.py                        | 17 ++++++++++++++---
 lute/cli/language_term_export.py            | 18 +++++++++++++++---
 tests/unit/cli/test_language_term_export.py | 12 +++++++++---
 3 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/lute/cli/commands.py b/lute/cli/commands.py
index 469200fc..b52ef2f3 100644
--- a/lute/cli/commands.py
+++ b/lute/cli/commands.py
@@ -5,7 +5,7 @@
 import click
 from flask import Blueprint
 
-from lute.cli.language_term_export import generate_file
+from lute.cli.language_term_export import generate_language_file, generate_book_file
 
 bp = Blueprint("cli", __name__)
 
@@ -32,7 +32,18 @@ def hello():
 @click.argument("output_path")
 def language_export(language, output_path):
     """
-    Get all terms from active books in the language, and write a
+    Get all terms from all books in the language, and write a
     data file of term frequencies and children.
     """
-    generate_file(language, output_path)
+    generate_language_file(language, output_path)
+
+
+@bp.cli.command("book_term_export")
+@click.argument("bookid")
+@click.argument("output_path")
+def book_term_export(bookid, output_path):
+    """
+    Get all terms for the given book, and write a
+    data file of term frequencies and children.
+    """
+    generate_book_file(bookid, output_path)
diff --git a/lute/cli/language_term_export.py b/lute/cli/language_term_export.py
index 38cfa662..76ac4b4f 100644
--- a/lute/cli/language_term_export.py
+++ b/lute/cli/language_term_export.py
@@ -131,12 +131,10 @@ def _generate_file(books, outfile_name):
         writer.writerow(r)
 
 
-def generate_file(language_name, outfile_name):
+def generate_language_file(language_name, outfile_name):
     """
     Generate the datafile for the language.
     """
-    # pylint: disable=too-many-locals
-
     books = db.session.query(Book).all()
     books = [b for b in books if b.language.name == language_name]
     if len(books) == 0:
@@ -145,3 +143,17 @@ def generate_file(language_name, outfile_name):
         print(f"Writing to {outfile_name}")
         _generate_file(books, outfile_name)
         print("Done. ")  # extra space overwrites old output.
+
+
+def generate_book_file(bookid, outfile_name):
+    """
+    Generate the datafile for the book.
+    """
+    books = db.session.query(Book).all()
+    books = [b for b in books if f"{b.id}" == f"{bookid}"]
+    if len(books) == 0:
+        print(f"No book with id = {bookid}.")
+    else:
+        print(f"Writing to {outfile_name}")
+        _generate_file(books, outfile_name)
+        print("Done. ")  # extra space overwrites old output.
diff --git a/tests/unit/cli/test_language_term_export.py b/tests/unit/cli/test_language_term_export.py
index bcaa0568..5061bd10 100644
--- a/tests/unit/cli/test_language_term_export.py
+++ b/tests/unit/cli/test_language_term_export.py
@@ -1,6 +1,6 @@
 "Smoke test only."
 
-from lute.cli.language_term_export import generate_file
+from lute.cli.language_term_export import generate_language_file, generate_book_file
 from lute.models.term import Term, TermTag
 from lute.models.book import Book
@@ -18,7 +18,7 @@ def test_smoke_test(app_context, tmp_path, english):
     db.session.commit()
 
     outfile = tmp_path / "outfile.csv"
-    generate_file("English", outfile)
+    generate_language_file("English", outfile)
     with open(outfile, "r", encoding="utf-8") as ofhandle:
         text = ofhandle.read()
     print(text)
@@ -67,7 +67,7 @@ def _find(term_string):
     db.session.commit()
 
     outfile = tmp_path / "outfile.csv"
-    generate_file("English", outfile)
+    generate_language_file("English", outfile)
     with open(outfile, "r", encoding="utf-8") as ofhandle:
         text = ofhandle.read()
     print(text)
@@ -97,3 +97,9 @@ def _find(term_string):
     # .lower() because sometimes the text file returned B, and
     # sometimes b ... which is _very_ odd, but don't really care.
     assert text.lower() == "\n".join(expected).lower(), "content"
+
+    generate_book_file(b.id, outfile)
+    with open(outfile, "r", encoding="utf-8") as ofhandle:
+        text = ofhandle.read()
+    print(text)
+    assert text.lower() == "\n".join(expected).lower(), "book file"

From 0e21a31b2da5f1e8fd7c54d5559f9f30bd591481 Mon Sep 17 00:00:00 2001
From: Jeff Zohrab
Date: Thu, 6 Jun 2024 16:11:58 -0700
Subject: [PATCH 7/7] Slight performance improvement to rendering search methods.

---
 lute/read/render/service.py | 111 ++++++++++++++++++++++++++----------
 lute/utils/debug_helpers.py |   3 +-
 2 files changed, 84 insertions(+), 30 deletions(-)

diff --git a/lute/read/render/service.py b/lute/read/render/service.py
index 92afc612..0125b80a 100644
--- a/lute/read/render/service.py
+++ b/lute/read/render/service.py
@@ -10,6 +10,8 @@
 from lute.read.render.renderable_calculator import RenderableCalculator
 from lute.db import db
 
+# from lute.utils.debug_helpers import DebugTimer
+
 
 def find_all_Terms_in_string(s, language):  # pylint: disable=too-many-locals
     """
@@ -39,28 +41,69 @@ def _find_all_terms_in_tokens(tokens, language):
 
     The code first queries for exact single-token
     matches, and then multiword matches, because that's much faster
-    than querying for everthing at once. (This may no longer
-    be true, can change it later.)
+    than querying for everything at once.
     """
 
+    # Future performance improvement considerations:
+    #
+    # 1. I considered keeping a cache of multiword terms strings and
+    # IDs, but IMO the payoff isn't worth the extra complexity at this
+    # time.
+    #
+    # 2. Maybe a different search method like Aho-Corasick (ref
+    # https://github.com/abusix/ahocorapy) would be useful ... again
+    # it would imply that all keywords (existing Terms) are loaded
+    # into the Aho-Corasick automaton. This could be cached, but would
+    # again need methods for cache invalidation and reload etc.
+
+    # dt = DebugTimer("_find_all_terms_in_tokens", False)
+
     parser = language.parser
 
-    # fyi - Manually searching for terms was slow (i.e., querying for
-    # all terms, and checking if the strings were in the string s).
-
-    # Query for terms with a single token that match the unique word tokens
+    # Single word terms
+    #
+    # Build query for terms with a single token that match the unique
+    # word tokens. Note it's much faster to use a query for this,
+    # rather than loading all term text and checking for the strings
+    # using python, as we can rely on the database indexes.
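+    #
+    # For illustration only (the real SQL is generated by SQLAlchemy),
+    # the single-token query built below is roughly:
+    #
+    #   SELECT * FROM words
+    #   WHERE WoTextLC IN ('the', 'dog', ...) AND WoLgID = 42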
     word_tokens = filter(lambda t: t.is_word, tokens)
     tok_strings = [parser.get_lowercase(t.token) for t in word_tokens]
     tok_strings = list(set(tok_strings))
-    terms_matching_tokens = (
-        db.session.query(Term)
-        .filter(
-            Term.language == language,
-            Term.text_lc.in_(tok_strings),
-            Term.token_count == 1,
-        )
-        .all()
+    terms_matching_tokens_qry = db.session.query(Term).filter(
+        Term.text_lc.in_(tok_strings), Term.language == language
     )
+    # dt.step("single, query prep")
+
+    # Multiword terms
+    #
+    # Multiword terms are harder to find as we have to do a full text
+    # match.
+    #
+    # The "obvious" method of using the model is quite slow:
+    #
+    #   contained_term_qry = db.session.query(Term).filter(
+    #       Term.language == language,
+    #       Term.token_count > 1,
+    #       func.instr(content, Term.text_lc) > 0,
+    #   )
+    #   contained_terms = contained_term_qry.all()
+    #
+    # This code first finds the IDs of the terms that are in the content,
+    # and then loads the terms.
+    #
+    # Note that querying using 'LIKE' is again slow, i.e.:
+    #   sql = sqltext(
+    #       """
+    #       SELECT WoID FROM words
+    #       WHERE WoLgID=:lid and WoTokenCount>1
+    #       AND :content LIKE '%' || :zws || WoTextLC || :zws || '%'
+    #       """
+    #   )
+    #   sql = sql.bindparams(lid=language.id, content=content, zws=zws)
+    #
+    # It is actually faster to load all Term text_lc and use python to
+    # check if the strings are in the content string, and only then
+    # load the terms.
 
     # Multiword terms have zws between all tokens.
     # Create content string with zws between all tokens for the match.
@@ -70,26 +113,27 @@ def _find_all_terms_in_tokens(tokens, language):
 
     sql = sqltext(
         """
-        SELECT WoID FROM words
+        SELECT WoID, WoTextLC FROM words
         WHERE WoLgID=:language_id and WoTokenCount>1
-        AND :content LIKE '%' || :zws || WoTextLC || :zws || '%'
         """
     )
-    sql = sql.bindparams(language_id=language.id, content=content, zws=zws)
-    idlist = db.session.execute(sql).all()
-    woids = [int(p[0]) for p in idlist]
-    contained_terms = db.session.query(Term).filter(Term.id.in_(woids)).all()
+    sql = sql.bindparams(language_id=language.id)
+    reclist = db.session.execute(sql).all()
+    # dt.step(f"mwords, loaded {len(reclist)} records")
+    woids = [int(p[0]) for p in reclist if f"{zws}{p[1]}{zws}" in content]
+    # dt.step("mwords, filtered ids")
+    # dt.step("mword ids")
+
+    contained_terms_qry = db.session.query(Term).filter(Term.id.in_(woids))
 
-    # Note that the above method (querying for ids, then getting terms)
-    # is faster than using the model as shown below!
-    ### contained_term_query = db.session.query(Term).filter(
-    ###     Term.language == language,
-    ###     Term.token_count > 1,
-    ###     func.instr(content, Term.text_lc) > 0,
-    ### )
-    ### contained_terms = contained_term_query.all()
+    # Some term entity relationship objects (tags, parents) could be
+    # eagerly loaded using ".options(joinedload(Term.term_tags),
+    # joinedload(Term.parents))", but any gains in subsequent usage
+    # are offset by the slower query!
+    all_terms = terms_matching_tokens_qry.union(contained_terms_qry).all()
+    # dt.step("union, exec query")
 
-    return terms_matching_tokens + contained_terms
+    return all_terms
 
 
 class RenderableSentence:
@@ -168,12 +212,16 @@ def get_paragraphs(s, language):
     """
     Get array of arrays of RenderableSentences for the given string s.
     """
+    # dt = DebugTimer("get_paragraphs", False)
+
    # Hacky reset of state of ParsedToken state.
     # _Shouldn't_ be needed but doesn't hurt, even if it's lame.
     ParsedToken.reset_counters()
 
     cleaned = re.sub(r" +", " ", s)
+    # dt.step("start get_parsed_tokens")
     tokens = language.get_parsed_tokens(cleaned)
+    # dt.step("done get_parsed_tokens")
 
     # Brutal hack ... for some reason the tests fail in
     # CI, but _inconsistently_, with the token order numbers. The
@@ -186,10 +234,13 @@ def get_paragraphs(s, language):
     for t in tokens:
         t.order = n
         n += 1
+    # dt.step("done token.sort")
 
     terms = _find_all_terms_in_tokens(tokens, language)
+    # dt.step("done _find_all_terms_in_tokens")
 
     paragraphs = _split_tokens_by_paragraph(tokens)
+    # dt.step("done _split_tokens_by_paragraph")
 
     renderable_paragraphs = []
     pnum = 0
@@ -201,7 +252,9 @@ def get_paragraphs(s, language):
         ]
         renderable_paragraphs.append(renderable_sentences)
         pnum += 1
+    # dt.step("done renderable_paragraphs load")
 
     _add_status_0_terms(renderable_paragraphs, language)
+    # dt.step("done add status 0 terms")
 
     return renderable_paragraphs
diff --git a/lute/utils/debug_helpers.py b/lute/utils/debug_helpers.py
index 8c253f70..a883d5a3 100644
--- a/lute/utils/debug_helpers.py
+++ b/lute/utils/debug_helpers.py
@@ -16,7 +16,8 @@ def __init__(self, name, display=True):
         self.name = name
         self.step_map = {}
         self.display = display
-        print(f"{name} timer started")
+        if display:
+            print(f"{name} timer started")
 
     def step(self, s):
         "Dump time spent in step, total time since start."
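
Below is a minimal standalone sketch of the multiword matching idea in
PATCH 7/7: load (WoID, WoTextLC) records, then substring-check each
zero-width-space-joined term text against the zws-joined token content.
Names and data here are illustrative; only the matching logic mirrors
the patch.

    ZWS = "\u200B"  # zero-width space; multiword terms join tokens with it

    def find_multiword_ids(records, lc_tokens):
        "records: (term_id, text_lc) pairs; lc_tokens: lowercased token strings."
        content = ZWS + ZWS.join(lc_tokens) + ZWS
        return [wid for wid, text_lc in records if f"{ZWS}{text_lc}{ZWS}" in content]

    # "c d" is stored as "c<ZWS>d", so it is found in the a-b-c-d token stream:
    records = [(1, f"c{ZWS}d"), (2, f"x{ZWS}y")]
    print(find_multiword_ids(records, ["a", "b", "c", "d"]))  # -> [1]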