Merge pull request #490 from tatuylonen/asl

Fix: ASL gloss head forms
tatuylonen · Feb 2, 2024 · 1118175 · 1118175
2 parents a19d708 + 9ad8e16
commit 1118175
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 1 deletion.
diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py
@@ -1333,6 +1333,7 @@ def process_gloss_header(
             wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn
         )
         header_text = re.sub(r"\s+", " ", header_text)
+        # print(f"{header_text=}")
         parse_word_head(
             wxr,
             pos_type,
@@ -3551,6 +3552,8 @@ def usex_template_fn(name, ht):
                             tr = "\n".join(lines[i:])
                             lines = lines[:i]
 
+
+
                 roman = re.sub(r"[ \t\r]+", " ", roman).strip()
                 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman)
                 tr = re.sub(r"^[#*:]+\s*", "", tr)

diff --git a/src/wiktextract/form_descriptions.py b/src/wiktextract/form_descriptions.py
@@ -1753,6 +1753,7 @@ def parse_word_head(
     assert isinstance(ruby, (list, tuple))
     assert is_reconstruction in (True, False)
     # print("PARSE_WORD_HEAD: {}: {!r}".format(wxr.wtp.section, text))
+    # print(f"PARSE_WORD_HEAD: {data=}")
 
     if "Lua execution error" in text or "Lua timeout error" in text:
         return
@@ -1804,7 +1805,6 @@ def parse_word_head(
     # Many languages use • as a punctuation mark separating the base
     # from the rest of the head. στάδιος/Ancient Greek, issue #176
     base = base.strip()
-    # print("parse_word_head: base={!r}".format(base))
 
     # Check for certain endings in head (mostly for compatibility with weird
     # heads, e.g. rata/Romanian "1st conj." at end)
@@ -1910,7 +1910,9 @@ def parse_word_head(
         alts[-1] += " or " + last
     elif last:
         alts.append(last)
+
     # print("parse_word_head alts: {}".format(alts))
+    # print(f"{base=}")
 
     # Process the head alternatives
     canonicals = []
@@ -2283,6 +2285,23 @@ def strokes_repl(m):
                 following_tags = None
                 continue
 
+            # American Sign Language entries have images (or requests for
+            # an image) as heads, followed by an "(ASL gloss: ...)" note.
+            m2 = re.search(r"\(ASL gloss:\s+(.*)\)", text)
+            if m2:
+                add_related(
+                    wxr,
+                    data,
+                    ["ASL-gloss"],
+                    [m2.group(1)],
+                    text,
+                    True,
+                    is_reconstruction,
+                    head_group,
+                    ruby,
+                )
+                continue
+
             parts = list(m.group(0) for m in re.finditer(word_re, desc))
             if not parts:
                 prev_tags = None
@@ -2410,6 +2429,9 @@ def strokes_repl(m):
                     related = alt_related
                     tagsets = alt_tagsets
 
+
+
+
             # print("FORM END: tagsets={} related={}".format(tagsets, related))
             if not tagsets:
                 continue
@@ -2422,6 +2444,8 @@ def strokes_repl(m):
                 alts = split_at_comma_semi(related, separators=[" or "])
                 if not alts:
                     alts = [""]
+            print(f"!!!!!!PARSE_WORD_HEAD: {data.get('tags')=}")
+            print(f"{alts=}")
             for related in alts:
                 if related:
                     if prev_tags and (
@@ -2510,6 +2534,8 @@ def strokes_repl(m):
                     prev_tags = tagsets
                     following_tags = None
 
+            print(f"?????PARSE_WORD_HEAD: {data.get('tags')=}")
+
     # Finally, if we collected hiragana/katakana, add them now
     if hiragana:
         add_related(