Skip to content

Commit

Permalink
Only add the italic node as tag if it's between parentheses
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Oct 16, 2023
1 parent 7e4451f commit b2b54d3
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 8 deletions.
28 changes: 21 additions & 7 deletions src/wiktextract/extractor/fr/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,29 @@ def extract_gloss(
)

gloss_only_nodes = []
# extract italic tags
for node in gloss_nodes[gloss_start:]:
if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
tag_indexes = set()
for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start):
# if an italic node is between parentheses then it's a tag, also
# don't add the parenthesis strings to `gloss_only_nodes`
if (
isinstance(node, WikiNode)
and node.kind == NodeKind.ITALIC
and index > gloss_start
and isinstance(gloss_nodes[index - 1], str)
and gloss_nodes[index - 1].strip() == "("
and index + 1 < len(gloss_nodes)
and isinstance(gloss_nodes[index + 1], str)
and gloss_nodes[index + 1].strip() == ")"
):
gloss_data["tags"].append(clean_node(wxr, None, node))
tag_indexes |= {index - 1, index, index + 1}
continue
elif isinstance(node, str) and node.strip() in ["(", ")"]:
# remove parentheses around italic node
continue
gloss_only_nodes.append(node)

gloss_only_nodes = [
node
for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start)
if index not in tag_indexes
]
gloss_text = clean_node(wxr, gloss_data, gloss_only_nodes)
gloss_data["glosses"] = [gloss_text]
extract_examples(wxr, gloss_data, list_item_node)
Expand Down
5 changes: 4 additions & 1 deletion src/wiktextract/extractor/fr/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def extract_inflection(
}
)


@dataclass
class ColspanHeader:
text: str
Expand Down Expand Up @@ -123,7 +124,9 @@ def process_inflection_table(
)
else:
column_headers.append(table_header_text)
column_cell_index += int(table_cell.attrs.get("colspan", 1))
column_cell_index += int(
table_cell.attrs.get("colspan", 1)
)
elif row_num > 0:
row_headers.append(table_header_text)
if "rowspan" in table_cell.attrs:
Expand Down
19 changes: 19 additions & 0 deletions tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,3 +196,22 @@ def test_italic_tag(self):
}
],
)

def test_not_italic_tag(self):
    # Sample taken from https://fr.wiktionary.org/wiki/bec-en-ciseaux
    # The italic node here is not wrapped in parentheses, so the expected
    # sense carries only the gloss text and no "tags" entry.
    wikitext = "# [[oiseau|Oiseau]] aquatique de taille moyenne du genre ''[[Rhynchops]]''."
    expected_gloss = "Oiseau aquatique de taille moyenne du genre Rhynchops."

    self.wxr.wtp.start_page("bec-en-ciseaux")
    parsed_root = self.wxr.wtp.parse(wikitext)
    page_data = [defaultdict(list)]
    # The first child of the parsed root is handed to the extractor.
    extract_gloss(self.wxr, page_data, parsed_root.children[0])

    expected_page_data = [{"senses": [{"glosses": [expected_gloss]}]}]
    self.assertEqual(page_data, expected_page_data)

0 comments on commit b2b54d3

Please sign in to comment.