diff --git a/src/wiktextract/extractor/fr/gloss.py b/src/wiktextract/extractor/fr/gloss.py index bcb03994c..c3eb1f8c7 100644 --- a/src/wiktextract/extractor/fr/gloss.py +++ b/src/wiktextract/extractor/fr/gloss.py @@ -43,15 +43,29 @@ def extract_gloss( ) gloss_only_nodes = [] - # extract italic tags - for node in gloss_nodes[gloss_start:]: - if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC: + tag_indexes = set() + for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start): + # if an italic node is between parentheses then it's a tag, also + # don't add the parenthese strings to `gloss_only_nodes` + if ( + isinstance(node, WikiNode) + and node.kind == NodeKind.ITALIC + and index > gloss_start + and isinstance(gloss_nodes[index - 1], str) + and gloss_nodes[index - 1].strip() == "(" + and index + 1 < len(gloss_nodes) + and isinstance(gloss_nodes[index + 1], str) + and gloss_nodes[index + 1].strip() == ")" + ): gloss_data["tags"].append(clean_node(wxr, None, node)) + tag_indexes |= {index - 1, index, index + 1} continue - elif isinstance(node, str) and node.strip() in ["(", ")"]: - # remove parentheses around italic node - continue - gloss_only_nodes.append(node) + + gloss_only_nodes = [ + node + for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start) + if index not in tag_indexes + ] gloss_text = clean_node(wxr, gloss_data, gloss_only_nodes) gloss_data["glosses"] = [gloss_text] extract_examples(wxr, gloss_data, list_item_node) diff --git a/src/wiktextract/extractor/fr/inflection.py b/src/wiktextract/extractor/fr/inflection.py index c9ee14e3a..d65f3d88d 100644 --- a/src/wiktextract/extractor/fr/inflection.py +++ b/src/wiktextract/extractor/fr/inflection.py @@ -36,6 +36,7 @@ def extract_inflection( } ) + @dataclass class ColspanHeader: text: str @@ -123,7 +124,9 @@ def process_inflection_table( ) else: column_headers.append(table_header_text) - column_cell_index += int(table_cell.attrs.get("colspan", 1)) + column_cell_index += int( + table_cell.attrs.get("colspan", 1) + ) elif row_num > 0: row_headers.append(table_header_text) if "rowspan" in table_cell.attrs: diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py index 845bb34a6..95d601c00 100644 --- a/tests/test_fr_gloss.py +++ b/tests/test_fr_gloss.py @@ -196,3 +196,22 @@ def test_italic_tag(self): } ], ) + + def test_not_italic_tag(self): + # https://fr.wiktionary.org/wiki/bec-en-ciseaux + self.wxr.wtp.start_page("bec-en-ciseaux") + root = self.wxr.wtp.parse( + "# [[oiseau|Oiseau]] aquatique de taille moyenne du genre ''[[Rhynchops]]''." + ) + page_data = [defaultdict(list)] + extract_gloss(self.wxr, page_data, root.children[0]) + self.assertEqual( + page_data, + [ + { + "senses": [ + {"glosses": ["Oiseau aquatique de taille moyenne du genre Rhynchops."]} + ] + } + ], + )