Skip to content

Commit

Permalink
Merge pull request #580 from tatuylonen/linkage
Browse files Browse the repository at this point in the history
Use `skipped=` with link data when splitting on commas for word heads
  • Loading branch information
kristian-clausal authored Apr 11, 2024
2 parents f2bddeb + f415db7 commit cbc9a9e
Show file tree
Hide file tree
Showing 3 changed files with 453 additions and 392 deletions.
21 changes: 21 additions & 0 deletions src/wiktextract/extractor/en/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1314,6 +1314,25 @@ def process_gloss_header(
header_tags: list[str],
) -> None:
ruby = []
links: list[str] = []
if not word.isalnum():
# if the word contains non-letter or -number characters, it might
# have something that messes with split-at-semi-comma; we collect
# links so that we can skip splitting them.
exp = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
)
link_nodes, _ = recursively_extract(
exp.children,
lambda x: isinstance(x, WikiNode)
and x.kind == NodeKind.LINK
)
for ln in link_nodes:
ltext = "".join(ln.largs[-1]) # type: ignore
if not ltext.isalnum():
links.append(ltext)
if word not in links:
links.append(word)
if lang_code == "ja":
exp = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
Expand Down Expand Up @@ -1346,6 +1365,7 @@ def process_gloss_header(
is_reconstruction,
header_group,
ruby=ruby,
links=links,
)
if "tags" in pos_data:
# pos_data can get "tags" data from some source; type-checkers
Expand Down Expand Up @@ -3893,6 +3913,7 @@ def parse_page(
wxr.wtp.start_section(lang)

# Collect all words from the page.
# print(f"{langnode=}")
datas = parse_language(wxr, langnode, lang, lang_code)

# Propagate fields resulting from top-level templates to this
Expand Down
10 changes: 5 additions & 5 deletions src/wiktextract/form_descriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1716,7 +1716,7 @@ def check_related(related):


def parse_word_head(
wxr, pos, text, data, is_reconstruction, head_group, ruby=[]
wxr, pos, text, data, is_reconstruction, head_group, ruby=[], links=[],
):
"""Parses the head line for a word for in a particular language and
part-of-speech, extracting tags and related forms."""
Expand Down Expand Up @@ -1803,7 +1803,7 @@ def parse_word_head(
if m:
tag, readings = m.groups()
tag = re.sub(r"\s+", "-", tag)
for reading in split_at_comma_semi(readings):
for reading in split_at_comma_semi(readings, skipped=links):
add_related(
wxr,
data,
Expand Down Expand Up @@ -2035,7 +2035,7 @@ def strokes_repl(m):
for desc in descriptors:
new_desc.extend(
map_with(
xlat_tags_map, split_at_comma_semi(desc, extra=[", or "])
xlat_tags_map, split_at_comma_semi(desc, extra=[", or "], skipped=links)
)
)
prev_tags = None
Expand Down Expand Up @@ -2388,7 +2388,7 @@ def strokes_repl(m):
and desc in data["categories"]
)
):
for r in split_at_comma_semi(paren, extra=[" or "]):
for r in split_at_comma_semi(paren, extra=[" or "], skipped=links):
add_romanization(
wxr,
data,
Expand Down Expand Up @@ -2420,7 +2420,7 @@ def strokes_repl(m):
if "or" in titleparts:
alts = [related]
else:
alts = split_at_comma_semi(related, separators=[" or "])
alts = split_at_comma_semi(related, separators=[" or "], skipped=links)
if not alts:
alts = [""]
for related in alts:
Expand Down
Loading

0 comments on commit cbc9a9e

Please sign in to comment.