Skip to content

Commit

Permalink
Merge pull request #580 from tatuylonen/linkage
Browse files Browse the repository at this point in the history
Use `skipped=` with link data when splitting on commas for word heads
  • Loading branch information
kristian-clausal authored Apr 11, 2024
2 parents f2bddeb + f415db7 commit cbc9a9e
Show file tree
Hide file tree
Showing 3 changed files with 453 additions and 392 deletions.
21 changes: 21 additions & 0 deletions src/wiktextract/extractor/en/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1314,6 +1314,25 @@ def process_gloss_header(
header_tags: list[str],
) -> None:
ruby = []
links: list[str] = []
if not word.isalnum():
# if the word contains non-letter or -number characters, it might
# have something that messes with split-at-semi-comma; we collect
# links so that we can skip splitting them.
exp = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
)
link_nodes, _ = recursively_extract(
exp.children,
lambda x: isinstance(x, WikiNode)
and x.kind == NodeKind.LINK
)
for ln in link_nodes:
ltext = "".join(ln.largs[-1]) # type: ignore
if not ltext.isalnum():
links.append(ltext)
if word not in links:
links.append(word)
if lang_code == "ja":
exp = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
Expand Down Expand Up @@ -1346,6 +1365,7 @@ def process_gloss_header(
is_reconstruction,
header_group,
ruby=ruby,
links=links,
)
if "tags" in pos_data:
# pos_data can get "tags" data from some source; type-checkers
Expand Down Expand Up @@ -3893,6 +3913,7 @@ def parse_page(
wxr.wtp.start_section(lang)

# Collect all words from the page.
# print(f"{langnode=}")
datas = parse_language(wxr, langnode, lang, lang_code)

# Propagate fields resulting from top-level templates to this
Expand Down
10 changes: 5 additions & 5 deletions src/wiktextract/form_descriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1716,7 +1716,7 @@ def check_related(related):


def parse_word_head(
wxr, pos, text, data, is_reconstruction, head_group, ruby=[]
wxr, pos, text, data, is_reconstruction, head_group, ruby=[], links=[],
):
"""Parses the head line for a word for in a particular language and
part-of-speech, extracting tags and related forms."""
Expand Down Expand Up @@ -1803,7 +1803,7 @@ def parse_word_head(
if m:
tag, readings = m.groups()
tag = re.sub(r"\s+", "-", tag)
for reading in split_at_comma_semi(readings):
for reading in split_at_comma_semi(readings, skipped=links):
add_related(
wxr,
data,
Expand Down Expand Up @@ -2035,7 +2035,7 @@ def strokes_repl(m):
for desc in descriptors:
new_desc.extend(
map_with(
xlat_tags_map, split_at_comma_semi(desc, extra=[", or "])
xlat_tags_map, split_at_comma_semi(desc, extra=[", or "], skipped=links)
)
)
prev_tags = None
Expand Down Expand Up @@ -2388,7 +2388,7 @@ def strokes_repl(m):
and desc in data["categories"]
)
):
for r in split_at_comma_semi(paren, extra=[" or "]):
for r in split_at_comma_semi(paren, extra=[" or "], skipped=links):
add_romanization(
wxr,
data,
Expand Down Expand Up @@ -2420,7 +2420,7 @@ def strokes_repl(m):
if "or" in titleparts:
alts = [related]
else:
alts = split_at_comma_semi(related, separators=[" or "])
alts = split_at_comma_semi(related, separators=[" or "], skipped=links)
if not alts:
alts = [""]
for related in alts:
Expand Down
Loading

0 comments on commit cbc9a9e

Please sign in to comment.