Skip to content

Commit

Permalink
Fix: ASL gloss head forms
Browse files Browse the repository at this point in the history
American Sign Language headers are a bit irregular;
this commit extracts the ASL gloss from them as a form.
  • Loading branch information
kristian-clausal committed Feb 2, 2024
1 parent a19d708 commit 9ad8e16
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 1 deletion.
3 changes: 3 additions & 0 deletions src/wiktextract/extractor/en/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1333,6 +1333,7 @@ def process_gloss_header(
wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn
)
header_text = re.sub(r"\s+", " ", header_text)
# print(f"{header_text=}")
parse_word_head(
wxr,
pos_type,
Expand Down Expand Up @@ -3551,6 +3552,8 @@ def usex_template_fn(name, ht):
tr = "\n".join(lines[i:])
lines = lines[:i]



roman = re.sub(r"[ \t\r]+", " ", roman).strip()
roman = re.sub(r"\[\s*…\s*\]", "[…]", roman)
tr = re.sub(r"^[#*:]+\s*", "", tr)
Expand Down
28 changes: 27 additions & 1 deletion src/wiktextract/form_descriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1753,6 +1753,7 @@ def parse_word_head(
assert isinstance(ruby, (list, tuple))
assert is_reconstruction in (True, False)
# print("PARSE_WORD_HEAD: {}: {!r}".format(wxr.wtp.section, text))
# print(f"PARSE_WORD_HEAD: {data=}")

if "Lua execution error" in text or "Lua timeout error" in text:
return
Expand Down Expand Up @@ -1804,7 +1805,6 @@ def parse_word_head(
# Many languages use • as a punctuation mark separating the base
# from the rest of the head. στάδιος/Ancient Greek, issue #176
base = base.strip()
# print("parse_word_head: base={!r}".format(base))

# Check for certain endings in head (mostly for compatibility with weird
# heads, e.g. rata/Romanian "1st conj." at end)
Expand Down Expand Up @@ -1910,7 +1910,9 @@ def parse_word_head(
alts[-1] += " or " + last
elif last:
alts.append(last)

# print("parse_word_head alts: {}".format(alts))
# print(f"{base=}")

# Process the head alternatives
canonicals = []
Expand Down Expand Up @@ -2283,6 +2285,23 @@ def strokes_repl(m):
following_tags = None
continue

# American Sign Language entries have images (or requests for images)
# as heads, followed by an ASL gloss.
m2 = re.search(r"\(ASL gloss:\s+(.*)\)", text)
if m2:
add_related(
wxr,
data,
["ASL-gloss"],
[m2.group(1)],
text,
True,
is_reconstruction,
head_group,
ruby,
)
continue

parts = list(m.group(0) for m in re.finditer(word_re, desc))
if not parts:
prev_tags = None
Expand Down Expand Up @@ -2410,6 +2429,9 @@ def strokes_repl(m):
related = alt_related
tagsets = alt_tagsets




# print("FORM END: tagsets={} related={}".format(tagsets, related))
if not tagsets:
continue
Expand All @@ -2422,6 +2444,8 @@ def strokes_repl(m):
alts = split_at_comma_semi(related, separators=[" or "])
if not alts:
alts = [""]
print(f"!!!!!!PARSE_WORD_HEAD: {data.get('tags')=}")
print(f"{alts=}")
for related in alts:
if related:
if prev_tags and (
Expand Down Expand Up @@ -2510,6 +2534,8 @@ def strokes_repl(m):
prev_tags = tagsets
following_tags = None

print(f"?????PARSE_WORD_HEAD: {data.get('tags')=}")

# Finally, if we collected hiragana/katakana, add them now
if hiragana:
add_related(
Expand Down

0 comments on commit 9ad8e16

Please sign in to comment.