Commit

Merge branch 'add_book_term_export_cli' into develop

jzohrab committed Jun 9, 2024
2 parents 9d9e7fa + 0e21a31 commit def0589

Showing 5 changed files with 299 additions and 195 deletions.
17 changes: 14 additions & 3 deletions lute/cli/commands.py
@@ -5,7 +5,7 @@
 import click
 from flask import Blueprint

-from lute.cli.language_term_export import generate_file
+from lute.cli.language_term_export import generate_language_file, generate_book_file

 bp = Blueprint("cli", __name__)

@@ -32,7 +32,18 @@ def hello():
 @click.argument("output_path")
 def language_export(language, output_path):
     """
-    Get all terms from active books in the language, and write a
+    Get all terms from all books in the language, and write a
     data file of term frequencies and children.
     """
-    generate_file(language, output_path)
+    generate_language_file(language, output_path)
+
+
+@bp.cli.command("book_term_export")
+@click.argument("bookid")
+@click.argument("output_path")
+def book_term_export(bookid, output_path):
+    """
+    Get all terms for the given book, and write a
+    data file of term frequencies and children.
+    """
+    generate_book_file(bookid, output_path)
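
Both commands are registered on the "cli" blueprint, and Flask groups blueprint commands by blueprint name, so from a shell they would run as "flask cli language_export <language> <output_path>" and "flask cli book_term_export <bookid> <output_path>". A minimal sketch of driving them programmatically via Flask's test CLI runner (assumptions: an app factory at lute.app_factory.create_app, and placeholder argument values):

    # Sketch: invoke the new blueprint CLI commands through Flask's CLI runner.
    # Assumptions: lute.app_factory.create_app exists with no required args;
    # "Spanish" and book id "42" are placeholders.
    from lute.app_factory import create_app

    app = create_app()
    runner = app.test_cli_runner()

    # Equivalent to: flask cli language_export Spanish spanish_terms.csv
    result = runner.invoke(args=["cli", "language_export", "Spanish", "spanish_terms.csv"])
    print(result.output)

    # Equivalent to: flask cli book_term_export 42 book_42_terms.csv
    result = runner.invoke(args=["cli", "book_term_export", "42", "book_42_terms.csv"])
    print(result.output)
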
283 changes: 123 additions & 160 deletions lute/cli/language_term_export.py
@@ -10,187 +10,150 @@
 e.g.

 term; count; familycount; books; definition; status; children
 haber; 100; 1500; book1,book2; to exist; 99; hay (500), he (200), has (150) ...
-
-There is probably a far better way to do this, likely using something
-fairly heavyweight like pandas.  This works well enough for now.
 """

-import sys
 import csv
-from collections import Counter
 from lute.db import db
 from lute.models.book import Book
-from lute.term.model import Repository
-
-
-def get_dist(book, collector, termrepo, language_id):  # pylint: disable=too-many-locals
-    """
-    Get word distribution in book.
-
-    The data is added to the collector dictionary.
-    """
-
-    # Get all terms and counts.
-    fulltext = "\n".join([t.text for t in book.texts])
-    pts = book.language.get_parsed_tokens(fulltext)
-    words = [pt.token for pt in pts if pt.is_word]
-
-    # distrib = { 'term1': count1, 'term2': count2, ... }
-    distrib = dict(Counter(words))
-
-    # The distribution doesn't handle capitalization, so it will
-    # contain things like { 'There': 10, 'there': 20 }.  Do a lookup
-    # for each term ('There', 'there') in the repository to see if a
-    # matching term with a standardized term.text is found.
-    normalized = {}
-
-    totcount = len(distrib.keys())
+from lute.read.render.service import get_paragraphs
+
+
+def _add_term_to_dict(t, terms):
+    "Add term to dictionary and return it."
+    key = t.text_lc
+    if key in terms:
+        return terms[key]
+
+    tag_list = ", ".join([tg.text for tg in t.term_tags])
+    if tag_list == "":
+        tag_list = "-"
+
+    zws = "\u200B"
+    hsh = {
+        "sourceterm": t,
+        "term": t.text.replace(zws, ""),
+        "count": 0,
+        "familycount": 0,
+        "books": [],
+        "definition": t.translation or "-",
+        "status": t.status,
+        "children": [],
+        "tags": tag_list,
+    }
+    terms[key] = hsh
+    return hsh
+
+
+def _process_book(b, terms):
+    "Process pages in book, add to output."
+    print(f"Processing {b.title} ...")
     i = 0
-    print(f"Loading data for book {book.title} ...")
-    for k, v in distrib.items():
+    for text in b.texts:
         i += 1
-        if i % 100 == 0:
-            print(f"  {i} of {totcount}", end="\r")
-        norm_word = termrepo.find_or_new(language_id, k)
-        norm_entry = normalized.get(norm_word.text, {"count": 0, "parents": []})
-        norm_entry["count"] += v
-        norm_entry["parents"] = norm_word.parents
-        normalized[norm_word.text] = norm_entry
-
-    # normalized = { 'there': { 'count': 30, 'parents': [...] }, ... }.
-    #
-    # The collector may already have the term ('there') from prior
-    # books, so combine those.
-    for t, n in normalized.items():  # pylint: disable=redefined-outer-name
-        entry = collector.get(t, {"term": t, "count": 0, "books": []})
-        entry["count"] += n["count"]
-        entry["books"].append(book.title)
-        collector[t] = entry
-
-        # The term may have a parent that isn't actually present in any book!
-        # We need to add those parents to the collector as well, or later
-        # searches for the parent will fail.
-        for p in n["parents"]:
-            pentry = collector.get(p, {"term": p, "count": 0, "books": []})
-            collector[p] = pentry
-
-
-def _load_hash_from_term(t, term):
-    "Load common data to hash."
-    t["parent"] = ", ".join(term.parents)
-    t["definition"] = term.translation or "-"
-    t["status"] = term.status if term.id is not None else "-"
-    t["children"] = "-"
-    t["childbooks"] = []
-    t["tags"] = ", ".join(term.term_tags)
-
-
-def load_term_data(langid, terms, repo):
-    "Load basic data."
-    totcount = len(terms.keys())
-    i = 0
-    print("Loading term data ...")
-    for k, t in terms.items():  # pylint: disable=unused-variable
-        i += 1
-        if i % 100 == 0:
-            print(f"  {i} of {totcount}", end="\r")
-
-        term = repo.find_or_new(langid, t["term"])
-        _load_hash_from_term(t, term)
-        t["familycount"] = t["count"]
-
-
-def load_parent_data(langid, terms, repo):
-    "Get and print data."
-
-    parents = list({t["parent"] for t in terms.values() if t["parent"] != ""})
-
-    missingparents = [p for p in parents if p not in terms]
-    totcount = len(missingparents)
-    i = 0
-    print("Loading missing parents ...")
-    for p in missingparents:
-        i += 1
-        if i % 100 == 0:
-            print(f"  {i} of {totcount}", end="\r")
-
-        term = repo.find_or_new(langid, p)
-        t = {"term": p, "count": 0, "books": []}
-        _load_hash_from_term(t, term)
-        t["familycount"] = 0
-        terms[p] = t
-
-    totcount = len(parents)
-    i = 0
-    print("Finalizing parent data ...")
-    for p in parents:
-        i += 1
-        if i % 100 == 0:
-            print(f"  {i} of {totcount}", end="\r")
-
-        children = [c for c in terms.values() if c["parent"] == p]
+        if i % 10 == 0:
+            print(f"  page {i} of {b.page_count}", end="\r")
+        paragraphs = get_paragraphs(text.text, b.language)
+        displayed_terms = [
+            ti.term
+            for para in paragraphs
+            for sentence in para
+            for ti in sentence.textitems
+            if ti.is_word and ti.term is not None
+        ]
+        for t in displayed_terms:
+            e = _add_term_to_dict(t, terms)
+            e["count"] += 1
+            e["familycount"] += 1
+            if b.title not in e["books"]:
+                e["books"].append(b.title)
+
+            for parent in t.parents:
+                p = _add_term_to_dict(parent, terms)
+                p["familycount"] += 1
+                if b.title not in p["books"]:
+                    p["books"].append(b.title)
+                if t.text_lc not in p["children"]:
+                    p["children"].append(t.text_lc)
+
+
+def _book_list_truncated(title_array):
+    "Return first 5 books, + count of rest."
+    titles = list(set(title_array))
+    first_5 = titles[:5]
+    ret = ", ".join(first_5)
+    count_rest = len(titles) - len(first_5)
+    if count_rest > 0:
+        ret += f" [... +{count_rest} more]"
+    return ret
+
+
+def _finalize_output(terms):
+    "Convert terms hash to usable output."
+    for _, hsh in terms.items():
+        hsh["books"] = _book_list_truncated(hsh["books"])
+
+        # children to child (count)
+        children = []
+        for key in hsh["children"]:
+            t = terms[key]
+            children.append({"count": t["count"], "term": t["sourceterm"].text})
         csorted = sorted(children, key=lambda c: c["count"], reverse=True)
         children_string = "; ".join([f"{c['term']} ({c['count']})" for c in csorted])
-        childbooks = [c["books"] for c in children]
-        childbooks = list({b for blist in childbooks for b in blist})
-        childtotcount = sum(c["count"] for c in children)
+        if children_string == "":
+            children_string = "-"
+        hsh["children"] = children_string

-        terms[p]["children"] = children_string
-        terms[p]["childbooks"] = childbooks
-        terms[p]["familycount"] += childtotcount
+    ret = terms.values()
+    return sorted(ret, key=lambda x: (-x["familycount"], x["term"]))


-def get_output_data(terms):
-    "Get the final set of output data."
-    printterms = [
-        t for t in terms.values() if t["parent"] == "" or t["children"] != "-"
-    ]
-
-    # Clean up data for printing.
-    for t in printterms:
-        t["books"] = list(set(t["books"] + t["childbooks"]))
-        t["books"] = "; ".join(t["books"])
-        del t["childbooks"]
+def _generate_file(books, outfile_name):
+    "Write data file for books to outfile_name."
+    terms = {}
+    for b in books:
+        _process_book(b, terms)
+    outdata = _finalize_output(terms)

-    return printterms
+    with open(outfile_name, "w", newline="", encoding="utf-8") as outfile:
+        keys = [
+            "term",
+            "count",
+            "familycount",
+            "books",
+            "definition",
+            "status",
+            "children",
+            "tags",
+        ]
+        writer = csv.DictWriter(outfile, fieldnames=keys, extrasaction="ignore")
+        writer.writeheader()
+        for r in outdata:
+            writer.writerow(r)


-def generate_file(language_name, outfile_name):
+def generate_language_file(language_name, outfile_name):
     """
     Generate the datafile for the language.
     """
     books = db.session.query(Book).all()
     books = [b for b in books if b.language.name == language_name]
     if len(books) == 0:
         print(f"No books for given language {language_name}, quitting.")
-        sys.exit(0)
+    else:
+        print(f"Writing to {outfile_name}")
+        _generate_file(books, outfile_name)
+        print("Done.  ")  # extra space overwrites old output.

-    langid = books[0].language.id
-
-    repo = Repository(db)
-    terms = {}
-    for b in books:
-        get_dist(b, terms, repo, langid)
-
-    load_term_data(langid, terms, repo)
-    load_parent_data(langid, terms, repo)
-    outdata = get_output_data(terms)
-
-    ptsorted = sorted(outdata, key=lambda c: c["familycount"], reverse=True)
-    keys = [
-        "term",
-        "count",
-        "familycount",
-        "books",
-        "definition",
-        "status",
-        "children",
-        "tags",
-    ]
-    print(f"Writing to {outfile_name}")
-    with open(outfile_name, "w", newline="", encoding="utf-8") as outfile:
-        writer = csv.DictWriter(outfile, fieldnames=keys, extrasaction="ignore")
-        writer.writeheader()
-        for r in ptsorted:
-            writer.writerow(r)
-    print("Done.")
+
+def generate_book_file(bookid, outfile_name):
+    """
+    Generate the datafile for the book.
+    """
+    books = db.session.query(Book).all()
+    books = [b for b in books if f"{b.id}" == f"{bookid}"]
+    if len(books) == 0:
+        print(f"No book with id = {bookid}.")
+    else:
+        print(f"Writing to {outfile_name}")
+        _generate_file(books, outfile_name)
+        print("Done.  ")  # extra space overwrites old output.