Skip to content

Commit

Permalink
Merge branch 'get_paragraphs_creates_status_0_terms' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
jzohrab committed Jun 3, 2024
2 parents 1380794 + 6870244 commit 9d9e7fa
Show file tree
Hide file tree
Showing 8 changed files with 172 additions and 98 deletions.
12 changes: 10 additions & 2 deletions lute/parse/mecab_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from natto import MeCab
import jaconv
from lute.parse.base import ParsedToken, AbstractParser
from lute.models.setting import UserSetting
from lute.models.setting import UserSetting, MissingUserSettingKeyException


class JapaneseParser(AbstractParser):
Expand Down Expand Up @@ -128,6 +128,15 @@ def get_reading(self, text: str):
if self._string_is_hiragana(text):
return None

jp_reading_setting = ""
try:
jp_reading_setting = UserSetting.get_value("japanese_reading")
except MissingUserSettingKeyException:
# During loading of demo data, the key isn't set, but the
# reading isn't needed either, as this is only called when
# calculating stats.
return None

flags = r"-O yomi"
readings = []
with MeCab(flags) as nm:
Expand All @@ -139,7 +148,6 @@ def get_reading(self, text: str):
if ret in ("", text):
return None

jp_reading_setting = UserSetting.get_value("japanese_reading")
if jp_reading_setting == "katakana":
return ret
if jp_reading_setting == "hiragana":
Expand Down
9 changes: 7 additions & 2 deletions lute/read/render/renderable_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,15 +434,20 @@ def __repr__(self):
def term(self):
return self._term

@property
def wo_id(self):
    "Id of the attached term (the wo_id), or None when no term is set."
    term = self._term
    return None if term is None else term.id

@term.setter
def term(self, t):
self.wo_id = None
self.wo_status = None
self._term = t
if t is None:
return

self.wo_id = t.id
self.wo_status = t.status
if t.status >= 1 and t.status <= 5:
self._show_tooltip = True
Expand Down
95 changes: 64 additions & 31 deletions lute/read/render/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,64 @@ def __repr__(self):
return f'<RendSent {self.sentence_id}, {len(self.textitems)} items, "{s}">'


## Getting paragraphs ##############################


def _split_tokens_by_paragraph(tokens):
"Split tokens by ¶"
ret = []
curr_para = []
for t in tokens:
if t.token == "¶":
ret.append(curr_para)
curr_para = []
else:
curr_para.append(t)
if len(curr_para) > 0:
ret.append(curr_para)
return ret


def _make_renderable_sentence(language, pnum, sentence_num, tokens, terms):
    """
    Build the RenderableSentence for sentence_num.

    Only the tokens belonging to that sentence are rendered; each
    renderable item is converted to a text item tagged with the
    paragraph and sentence numbers.
    """
    in_sentence = [tok for tok in tokens if tok.sentence_number == sentence_num]
    rendered = RenderableCalculator.get_renderable(language, terms, in_sentence)
    items = [r.make_text_item(pnum, sentence_num, language) for r in rendered]
    return RenderableSentence(sentence_num, items)


def _sentence_nums(paratokens):
"Sentence numbers in the paragraph tokens."
senums = [t.sentence_number for t in paratokens]
return sorted(list(set(senums)))


def _add_status_0_terms(paragraphs, lang):
    "Add status 0 terms for new textitems in paragraph."
    # Collect word textitems that don't have a Term attached yet.
    untracked = []
    for para in paragraphs:
        for sentence in para:
            for ti in sentence.textitems:
                if ti.is_word and ti.term is None:
                    untracked.append(ti)

    # Build one status-0 Term per distinct text.  Different casings
    # (e.g. "cat" and "CAT") each create a term, so key the lookup
    # on the lowercased text to keep a single representative.
    by_text_lc = {}
    for txt in {ti.text for ti in untracked}:
        term = Term.create_term_no_parsing(lang, txt)
        term.status = 0
        by_text_lc[term.text_lc] = term

    for ti in untracked:
        ti.term = by_text_lc[ti.text_lc]


def get_paragraphs(s, language):
"""
Get array of arrays of RenderableSentences for the given string s.
Expand All @@ -131,44 +189,19 @@ def get_paragraphs(s, language):

terms = _find_all_terms_in_tokens(tokens, language)

# Split into paragraphs.
paragraphs = []
curr_para = []
for t in tokens:
if t.token == "¶":
paragraphs.append(curr_para)
curr_para = []
else:
curr_para.append(t)
if len(curr_para) > 0:
paragraphs.append(curr_para)

def make_RenderableSentence(pnum, sentence_num, tokens, terms):
"""
Make a RenderableSentences using the tokens present in
that sentence. The current text and language are pulled
into the function from the closure.
"""
sentence_tokens = [t for t in tokens if t.sentence_number == sentence_num]
renderable = RenderableCalculator.get_renderable(
language, terms, sentence_tokens
)
textitems = [i.make_text_item(pnum, sentence_num, language) for i in renderable]
ret = RenderableSentence(sentence_num, textitems)
return ret

def unique(arr):
return list(set(arr))
paragraphs = _split_tokens_by_paragraph(tokens)

renderable_paragraphs = []
pnum = 0
for paratokens in paragraphs:
for p in paragraphs:
# A renderable paragraph is a collection of RenderableSentences.
renderable_sentences = [
make_RenderableSentence(pnum, senum, paratokens, terms)
for senum in sorted(unique([t.sentence_number for t in paratokens]))
_make_renderable_sentence(language, pnum, senum, p, terms)
for senum in _sentence_nums(p)
]
renderable_paragraphs.append(renderable_sentences)
pnum += 1

_add_status_0_terms(renderable_paragraphs, language)

return renderable_paragraphs
71 changes: 12 additions & 59 deletions lute/read/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,30 +19,21 @@ def set_unknowns_to_known(text: Text):
Given a text, create new Terms with status Well-Known
for any new Terms.
"""
language = text.book.language
paragraphs = get_paragraphs(text.text, text.book.language)
# Just in case.
_add_status_0_terms(paragraphs, language)
_save_new_status_0_terms(paragraphs)

tis = [
ti
unknowns = [
ti.term
for para in paragraphs
for sentence in para
for ti in sentence.textitems
if ti.is_word
if ti.is_word and ti.term.status == 0
]

def is_unknown(ti):
return ti.is_word == 1 and ti.term is not None and ti.wo_status == 0

unknown_ids = [t.wo_id for t in tis if is_unknown(t)]
uniques = list(set(unknown_ids))

batch_size = 100
i = 0

for u in uniques:
t = Term.find(u)
for t in unknowns:
t.status = Status.WELLKNOWN
db.session.add(t)
i += 1
Expand All @@ -67,57 +58,19 @@ def bulk_status_update(text: Text, terms_text_array, new_status):
repo.commit()


def _create_unknown_terms(textitems, lang):
    """
    Create any terms required for the page.

    Creates one status-0 Term per unique lowercased text among the
    given textitems, adds them to the db session, commits, and
    returns the newly created terms.
    """
    toks = [t.text for t in textitems]
    unique_word_tokens = list(set(toks))
    all_new_terms = [Term.create_term_no_parsing(lang, t) for t in unique_word_tokens]

    # Different casings (e.g. "cat" and "CAT") each produce a term;
    # keep only the first term seen for each lowercased text.
    unique_text_lcs = {}
    for t in all_new_terms:
        if t.text_lc not in unique_text_lcs:
            unique_text_lcs[t.text_lc] = t
    unique_new_terms = unique_text_lcs.values()

    # Status 0 marks "unknown" placeholder terms.
    for t in unique_new_terms:
        t.status = 0
        db.session.add(t)
    db.session.commit()

    return unique_new_terms


def _add_status_0_terms(paragraphs, lang):
def _save_new_status_0_terms(paragraphs):
"Add status 0 terms for new textitems in paragraph."
new_textitems = [
tis_with_new_terms = [
ti
for para in paragraphs
for sentence in para
for ti in sentence.textitems
if ti.is_word and ti.term is None
if ti.is_word and ti.term.id is None and ti.term.status == 0
]
# Create new terms for all unknown word tokens in the text.
new_terms = _create_unknown_terms(new_textitems, lang)

# Set the terms for the unknown_textitems
textlc_to_term_map = {}
for t in new_terms:
textlc_to_term_map[t.text_lc] = t
# print("map: textlc_to_term_map")
# for k, v in textlc_to_term_map.items():
# print(f"{k}: {v}", flush=True)
for ti in new_textitems:
# print(f'Assigning term from map to ti with ti.text_lc = "{ti.text_lc}"')
ti.term = textlc_to_term_map[ti.text_lc]
for ti in tis_with_new_terms:
db.session.add(ti.term)
db.session.commit()


def start_reading(dbbook, pagenum, db_session):
Expand All @@ -134,7 +87,7 @@ def start_reading(dbbook, pagenum, db_session):

lang = text.book.language
paragraphs = get_paragraphs(text.text, lang)
_add_status_0_terms(paragraphs, lang)
_save_new_status_0_terms(paragraphs)

return paragraphs

Expand Down
12 changes: 12 additions & 0 deletions tests/unit/book/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,18 @@ def test_single_word(spanish):
)


def test_new_terms_are_not_created(spanish):
    "No new terms created accidentally on calc stats."
    text = "Tengo un gato. Tengo un perro."
    known_terms = [["gato", 3], ["un", 0]]
    expected_status_counts = {0: 3, 1: 0, 2: 0, 3: 1, 4: 0, 5: 0, 98: 0, 99: 0}
    scenario(spanish, text, known_terms, expected_status_counts)

    # Only the two pre-existing terms should be in the db.
    sql = "select WoText from words order by WoText"
    assert_sql_result(sql, ["gato", "un"], "no new terms.")


def test_with_multiword(spanish):
scenario(
spanish,
Expand Down
10 changes: 10 additions & 0 deletions tests/unit/read/render/test_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
from lute.parse.base import ParsedToken
from lute.read.render.service import find_all_Terms_in_string, get_paragraphs
from lute.db import db
from lute.models.term import Term

from tests.utils import add_terms, make_text, assert_rendered_text_equals
from tests.dbasserts import assert_sql_result


def _run_scenario(language, content, expected_found, msg=""):
Expand Down Expand Up @@ -86,12 +88,18 @@ def test_smoke_get_paragraphs(spanish, app_context):
Smoke test to get paragraph information.
"""
add_terms(spanish, ["tengo un", "un gato"])
perro = Term(spanish, "perro")
perro.status = 0
db.session.add(perro)

content = "Tengo un gato. Hay un perro.\nTengo un perro."
t = make_text("Hola", content, spanish)
db.session.add(t)
db.session.commit()

sql = "select WoText from words order by WoText"
assert_sql_result(sql, ["perro", "tengo/ /un", "un/ /gato"], "initial")

ParsedToken.reset_counters()
paras = get_paragraphs(t.text, t.book.language)
assert len(paras) == 2
Expand All @@ -117,6 +125,8 @@ def stringize(t):
]
assert actual == expected

assert_sql_result(sql, ["perro", "tengo/ /un", "un/ /gato"], "No new terms")


def test_smoke_rendered(spanish, app_context):
"""
Expand Down
31 changes: 30 additions & 1 deletion tests/unit/read/test_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,41 @@

from lute.models.term import Term
from lute.book.model import Book, Repository
from lute.read.service import start_reading
from lute.read.service import set_unknowns_to_known, start_reading
from lute.db import db

from tests.dbasserts import assert_record_count_equals, assert_sql_result


def test_set_unknowns_to_known(english, app_context):
    "Unknowns (status 0) or new are set to well known."
    # Pre-existing term; its status should never be touched.
    dog = Term(english, "dog")
    db.session.add(dog)
    db.session.commit()

    book = Book()
    book.title = "blah"
    book.language_id = english.id
    book.text = "Dog CAT dog cat."
    repo = Repository(db)
    dbbook = repo.add(book)
    repo.commit()

    sql = "select WoTextLC, WoStatus from words order by WoText"
    assert_sql_result(sql, ["dog; 1"], "before start")

    # Opening the page creates status-0 terms for unknown words.
    start_reading(dbbook, 1, db.session)
    assert_sql_result(sql, ["cat; 0", "dog; 1"], "after start")

    page = dbbook.texts[0]
    page.text = "Dog CAT dog cat extra."
    db.session.add(page)
    db.session.commit()

    # Both the existing status-0 "cat" and the brand-new "extra" are
    # promoted to well known (99); "dog" keeps its status.
    set_unknowns_to_known(page)
    assert_sql_result(sql, ["cat; 99", "dog; 1", "extra; 99"], "after set")


def test_smoke_start_reading(english, app_context):
"Smoke test book."
b = Book()
Expand Down
Loading

0 comments on commit 9d9e7fa

Please sign in to comment.