diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index 23156cb4..119b8e9b 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -1333,6 +1333,7 @@ def process_gloss_header( wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn ) header_text = re.sub(r"\s+", " ", header_text) + # print(f"{header_text=}") parse_word_head( wxr, pos_type, @@ -3551,6 +3552,8 @@ def usex_template_fn(name, ht): tr = "\n".join(lines[i:]) lines = lines[:i] + + roman = re.sub(r"[ \t\r]+", " ", roman).strip() roman = re.sub(r"\[\s*…\s*\]", "[…]", roman) tr = re.sub(r"^[#*:]+\s*", "", tr) diff --git a/src/wiktextract/form_descriptions.py b/src/wiktextract/form_descriptions.py index d987bb0c..f635963e 100644 --- a/src/wiktextract/form_descriptions.py +++ b/src/wiktextract/form_descriptions.py @@ -1753,6 +1753,7 @@ def parse_word_head( assert isinstance(ruby, (list, tuple)) assert is_reconstruction in (True, False) # print("PARSE_WORD_HEAD: {}: {!r}".format(wxr.wtp.section, text)) + # print(f"PARSE_WORD_HEAD: {data=}") if "Lua execution error" in text or "Lua timeout error" in text: return @@ -1804,7 +1805,6 @@ def parse_word_head( # Many languages use • as a punctuation mark separating the base # from the rest of the head. στάδιος/Ancient Greek, issue #176 base = base.strip() - # print("parse_word_head: base={!r}".format(base)) # Check for certain endings in head (mostly for compatibility with weird # heads, e.g. rata/Romanian "1st conj." at end) @@ -1910,7 +1910,9 @@ def parse_word_head( alts[-1] += " or " + last elif last: alts.append(last) + # print("parse_word_head alts: {}".format(alts)) + # print(f"{base=}") # Process the head alternatives canonicals = [] @@ -2283,6 +2285,23 @@ def strokes_repl(m): following_tags = None continue + # American Sign Language has images (or requests for image) + # as heads, + this ASL gloss after. + m2 = re.search(r"\(ASL gloss:\s+(.*)\)", text) + if m2: + add_related( + wxr, + data, + ["ASL-gloss"], + [m2.group(1)], + text, + True, + is_reconstruction, + head_group, + ruby, + ) + continue + parts = list(m.group(0) for m in re.finditer(word_re, desc)) if not parts: prev_tags = None @@ -2410,6 +2429,9 @@ def strokes_repl(m): related = alt_related tagsets = alt_tagsets + + + # print("FORM END: tagsets={} related={}".format(tagsets, related)) if not tagsets: continue @@ -2422,6 +2444,8 @@ def strokes_repl(m): alts = split_at_comma_semi(related, separators=[" or "]) if not alts: alts = [""] + print(f"!!!!!!PARSE_WORD_HEAD: {data.get('tags')=}") + print(f"{alts=}") for related in alts: if related: if prev_tags and ( @@ -2510,6 +2534,8 @@ def strokes_repl(m): prev_tags = tagsets following_tags = None + print(f"?????PARSE_WORD_HEAD: {data.get('tags')=}") + # Finally, if we collected hirakana/katakana, add them now if hiragana: add_related(