diff --git a/src/wiktextract/extractor/nl/linkage.py b/src/wiktextract/extractor/nl/linkage.py index 972a8650..ec65cb1e 100644 --- a/src/wiktextract/extractor/nl/linkage.py +++ b/src/wiktextract/extractor/nl/linkage.py @@ -5,6 +5,7 @@ from ...page import clean_node from ...wxr_context import WiktextractContext from .models import Linkage, WordEntry +from .tags import LIST_ITEM_TAG_TEMPLATES def extract_linkage_section( @@ -73,24 +74,53 @@ def extract_linkage_list_item( sense: str, sense_index: str, ) -> None: - for node in list_item.children: + linkage_list = getattr(word_entry, linkage_type) + orig_len = len(linkage_list) + tags = [] + for index, node in enumerate(list_item.children): if isinstance(node, str): m = re.search(r"\[(\d+)\]", node) if m is not None: sense_index = int(m.group(1)) - elif node.strip().startswith("="): - sense = node.strip().removeprefix("=").strip() - linkage_list = getattr(word_entry, linkage_type) - if len(linkage_list) > 0: + elif node.strip().startswith(("=", "–")): + sense = clean_node(wxr, None, list_item.children[index:]).strip( + "=– " + ) + if len(linkage_list) > orig_len: linkage_list[-1].sense = sense + else: + word_nodes = [ + n + for n in list_item.children[:index] + if not isinstance(n, TemplateNode) + ] + word = clean_node(wxr, None, word_nodes) + if word != "": + linkage_list.append( + Linkage( + word=word, + sense=sense, + sense_index=sense_index, + tags=tags, + ) + ) + return elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK: word = clean_node(wxr, None, node) if word != "": - getattr(word_entry, linkage_type).append( + linkage_list.append( Linkage(word=word, sense=sense, sense_index=sense_index) ) - elif isinstance(node, TemplateNode) and node.template_name == "expr": - extract_expr_template(wxr, word_entry, node, linkage_type) + elif isinstance(node, TemplateNode): + if node.template_name == "expr": + extract_expr_template(wxr, word_entry, node, linkage_type) + elif node.template_name in LIST_ITEM_TAG_TEMPLATES: + if len(linkage_list) > orig_len: + linkage_list[-1].tags.append( + LIST_ITEM_TAG_TEMPLATES[node.template_name] + ) + else: + tags.append(LIST_ITEM_TAG_TEMPLATES[node.template_name]) def extract_nld_template( diff --git a/src/wiktextract/extractor/nl/pos.py b/src/wiktextract/extractor/nl/pos.py index bb740267..c16de747 100644 --- a/src/wiktextract/extractor/nl/pos.py +++ b/src/wiktextract/extractor/nl/pos.py @@ -11,7 +11,11 @@ ) from .models import AltForm, Sense, WordEntry from .section_titles import POS_DATA -from .tags import translate_raw_tags +from .tags import ( + GLOSS_TAG_TEMPLATES, + LIST_ITEM_TAG_TEMPLATES, + translate_raw_tags, +) def extract_pos_section( @@ -74,34 +78,46 @@ def extract_pos_section_nodes( and len(page_data[-1].senses) > 0 ): extract_example_template(wxr, page_data[-1].senses[-1], node) - elif isinstance(node, TemplateNode) and node.template_name in [ - "noun-pl", - "noun-form", - ]: + elif isinstance(node, TemplateNode) and ( + node.template_name + in [ + "noun-pl", + "nl-advb-form", + "noun-dim", + "noun-dim-pl", + "num-form", + "ordn-form", + "prep-form", + "pronom-dem-form", + "pronom-pos-form", + "xh-pronom-pos-form", + ] + or node.template_name.endswith( + ("adjc-form", "adverb-form", "noun-form") + ) + ): extract_noun_form_of_template(wxr, page_data[-1], node) - elif isinstance(node, TemplateNode) and node.template_name.startswith( - ( - "1ps", - "2ps", - "aanv-w", - "onv-d", - "ott-", - "ovt-", - "tps", - "volt-d", - "eng-onv-d", + elif isinstance(node, TemplateNode) and ( + node.template_name.startswith( + ( + "1ps", + "2ps", + "aanv-w", + "onv-d", + "ott-", + "ovt-", + "tps", + "volt-d", + "eng-onv-d", + ) ) + or node.template_name.endswith("verb-form") ): extract_verb_form_of_template( wxr, page_data, base_data, forms_data, node ) -# https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen -# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen -GLOSS_TAG_TEMPLATES = frozenset(["auxl", "erga", "inerg"]) - - def extract_gloss_list_item( wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode ) -> None: @@ -111,6 +127,8 @@ def extract_gloss_list_item( if isinstance(child, TemplateNode): if child.template_name in GLOSS_TAG_TEMPLATES: sense.raw_tags.append(clean_node(wxr, sense, child)) + elif child.template_name in LIST_ITEM_TAG_TEMPLATES: + sense.tags.append(LIST_ITEM_TAG_TEMPLATES[child.template_name]) else: expanded_text = clean_node(wxr, sense, child) if expanded_text.startswith("(") and expanded_text.endswith( @@ -133,7 +151,7 @@ def extract_gloss_list_item( gloss_nodes.append(child) gloss_text = clean_node(wxr, sense, gloss_nodes) - if gloss_text.startswith(","): # between qualifier templates + while gloss_text.startswith(","): # between qualifier templates gloss_text = gloss_text.removeprefix(",").strip() m = re.match(r"\(([^()]+)\)", gloss_text) if m is not None: # expanded "verouderd" template in "2ps" template @@ -153,8 +171,12 @@ def extract_pos_header_line_nodes( m = re.search(r"\[(.+)\]", node.strip()) if m is not None: word_entry.etymology_index = m.group(1).strip() - elif isinstance(node, TemplateNode) and node.template_name == "-l-": - extract_l_template(wxr, word_entry, node) + elif isinstance(node, TemplateNode): + if node.template_name == "-l-": + extract_l_template(wxr, word_entry, node) + elif node.template_name == "dimt": + word_entry.raw_tags.append(clean_node(wxr, word_entry, node)) + translate_raw_tags(word_entry) def extract_l_template( @@ -198,8 +220,9 @@ def extract_l_template( def extract_noun_form_of_template( wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode ) -> None: + # https://nl.wiktionary.org/wiki/Categorie:Vormsjablonen sense = Sense(tags=["form-of"]) - if t_node.template_name == "noun-pl": + if t_node.template_name.endswith("-pl"): sense.tags.append("plural") else: num_arg = t_node.template_parameters.get("getal", "") @@ -236,6 +259,7 @@ def extract_verb_form_of_template( t_node: TemplateNode, ) -> None: # https://nl.wiktionary.org/wiki/Categorie:Werkwoordsvormsjablonen_voor_het_Nederlands + # https://nl.wiktionary.org/wiki/Categorie:Werkwoordsvormsjablonen from .page import extract_section_categories orig_data_len = len(page_data) diff --git a/src/wiktextract/extractor/nl/tags.py b/src/wiktextract/extractor/nl/tags.py index 5330a647..fbc1f118 100644 --- a/src/wiktextract/extractor/nl/tags.py +++ b/src/wiktextract/extractor/nl/tags.py @@ -1,12 +1,32 @@ from .models import WordEntry +# https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen # https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen -VERB_TAGS = { - "ergatief": "ergative", # Sjabloon:erga - "inergatief": "unergative", # Sjabloon:inerg - "hulpwerkwoord": "auxiliary", # Sjabloon:auxl -} +GLOSS_TAG_TEMPLATES = frozenset( + [ + "absol", + "accus", + "auxl", + "copl", + "deponens", + "ditr", + "erga", + "inerg", + "intr", + "modl", + "onpr", + "ov", + "rcpq", + "refl", + "s-verb", + "plurt", + "singt", + "versterkend voorvoegsel", + ] +) + +# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen # https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels GLOSS_TAGS = { "figuurlijk": "figuratively", @@ -55,6 +75,23 @@ "zegswijze": "idiomatic", "zeldzaam": "rare", "Latijns-Amerika": "Latin-America", + "absoluut": "absolute", # Sjabloon:absol + "accusatief": "accusative", # Sjabloon:accus + "hulpwerkwoord": "auxiliary", # Sjabloon:auxl + "koppelwerkwoord": "copulative", # Sjabloon:copl + "deponens": "deponent", + "ditransitief": "ditransitive", # Sjabloon:ditr + "ergatief": "ergative", # Sjabloon:erga + "inergatief": "unergative", # Sjabloon:inerg + "onovergankelijk": "intransitive", # Sjabloon:intr + "modaal werkwoord": ["modal", "verb"], # Sjabloon:modl + "onpersoonlijk": "impersonal", # Sjabloon:onpr + "overgankelijk": "transitive", # Sjabloon:ov + "wederkerig": "reciprocal", # Sjabloon:rcpq + "wederkerend": "reflexive", # Sjabloon:refl + "alleen meervoud": "plural-only", # Sjabloon:plurt + "geen meervoud": "no-plural", # Sjabloon:singt + "versterkend voorvoegsel": ["intensifier", "prefix"], } TABLE_TAGS = { @@ -89,7 +126,12 @@ } -TAGS = {**VERB_TAGS, **GLOSS_TAGS, **TABLE_TAGS} +HEADER_LINE_TAGS = { + "dim. tant.": ["diminutive", "noun"], # Sjabloon:dimt +} + + +TAGS = {**GLOSS_TAGS, **TABLE_TAGS, **HEADER_LINE_TAGS} # https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels TOPICS = { @@ -342,3 +384,18 @@ def translate_raw_tags(data: WordEntry) -> None: else: raw_tags.append(raw_tag) data.raw_tags = raw_tags + + +# used in translation, linkage and gloss lists +LIST_ITEM_TAG_TEMPLATES = { + "m": "masculine", + "f": "feminine", + "n": "neuter", + "c": "common", + "s": "singular", + "p": "plural", + "a": "animate", + "i": "inanimate", + "impf": "imperfective", + "pf": "perfective", +} diff --git a/src/wiktextract/extractor/nl/translation.py b/src/wiktextract/extractor/nl/translation.py index 32b4794a..474b8475 100644 --- a/src/wiktextract/extractor/nl/translation.py +++ b/src/wiktextract/extractor/nl/translation.py @@ -5,6 +5,7 @@ from ...page import clean_node from ...wxr_context import WiktextractContext from .models import Translation, WordEntry +from .tags import LIST_ITEM_TAG_TEMPLATES def extract_translation_section( @@ -30,18 +31,6 @@ def extract_translation_section( ) -TR_TEMPLATES = { - "m": "masculine", - "f": "feminine", - "n": "neuter", - "c": "common", - "s": "singular", - "p": "plural", - "a": "animate", - "i": "inanimate", -} - - def extract_translation_list_item( wxr: WiktextractContext, word_entry: WordEntry, @@ -75,11 +64,11 @@ def extract_translation_list_item( ) ) elif ( - node.template_name in TR_TEMPLATES + node.template_name in LIST_ITEM_TAG_TEMPLATES and len(word_entry.translations) > 0 ): word_entry.translations[-1].tags.append( - TR_TEMPLATES[node.template_name] + LIST_ITEM_TAG_TEMPLATES[node.template_name] ) elif isinstance(node, str): for c in node: @@ -93,5 +82,10 @@ def extract_translation_list_item( roman_str = "" elif brackets > 0: roman_str += c + elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: + for next_list_item in node.find_child(NodeKind.LIST_ITEM): + extract_translation_list_item( + wxr, word_entry, next_list_item, sense, sense_index + ) elif brackets > 0: roman_str += clean_node(wxr, None, node) diff --git a/tests/test_nl_linkage.py b/tests/test_nl_linkage.py index 76516949..c3389341 100644 --- a/tests/test_nl_linkage.py +++ b/tests/test_nl_linkage.py @@ -148,3 +148,23 @@ def test_sense_text_after_link(self): data[0]["related"], [{"sense": "met grote passen lopen", "word": "benen"}], ) + + data = parse_page( + self.wxr, + "omyl", + """==Tsjechisch== +====Zelfstandig naamwoord==== +# fout +=====Typische woordcombinaties===== +* justiční ''omyl'' {{m}}{{i}} – justitiële ''dwaling''""", + ) + self.assertEqual( + data[0]["derived"], + [ + { + "sense": "justitiële dwaling", + "word": "justiční omyl", + "tags": ["masculine", "inanimate"], + } + ], + ) diff --git a/tests/test_nl_translation.py b/tests/test_nl_translation.py index 86ed039b..cac75422 100644 --- a/tests/test_nl_translation.py +++ b/tests/test_nl_translation.py @@ -90,3 +90,22 @@ def test_plain_text_lang_name(self): }, ], ) + + def test_nested_list(self): + self.wxr.wtp.add_page("Sjabloon:cmn", 10, "Mandarijn") + data = parse_page( + self.wxr, + "kijken", + """==Nederlands== +====Werkwoord==== +# met de ogen waarnemen +=====Vertalingen===== +* Chinees: +** {{cmn}}: {{trad|cmn|看}}""", + ) + self.assertEqual( + data[0]["translations"], + [ + {"word": "看", "lang": "Mandarijn", "lang_code": "cmn"}, + ], + )