Skip to content

Commit

Permalink
Merge pull request #451 from xxyzz/lang
Browse files Browse the repository at this point in the history
Change "lang_name" field to "lang" in fr and zh extractor
  • Loading branch information
xxyzz authored Jan 5, 2024
2 parents 1f6450f + 64c52fa commit 90f9f71
Show file tree
Hide file tree
Showing 28 changed files with 135 additions and 135 deletions.
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/fr/conjugation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def extract_conjugation(wxr: WiktextractContext, entry: WordEntry) -> None:
"""
conj_ns = wxr.wtp.NAMESPACE_DATA["Conjugaison"]
conj_page_title = (
f"{conj_ns['name']}:{entry.lang_name.lower()}/{entry.word}"
f"{conj_ns['name']}:{entry.lang.lower()}/{entry.word}"
)
conj_page = wxr.wtp.get_page_body(conj_page_title, conj_ns["id"])
if conj_page is None:
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/fr/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def process_derives_autres_list(
):
word = clean_node(wxr, None, node)
page_data[-1].derived.append(
Linkage(lang_code=lang_code, lang_name=lang_name, word=word)
Linkage(lang_code=lang_code, lang=lang_name, word=word)
)


Expand Down
6 changes: 3 additions & 3 deletions src/wiktextract/extractor/fr/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class Translation(FrenchBaseModel):
lang_code: str = Field(
"", description="Wiktionary language code of the translation term"
)
lang_name: str = Field("", description="Translation language name")
lang: str = Field("", description="Translation language name")
word: str = Field("", description="Translation term")
sense: str = Field("", description="Translation gloss")
sense_index: int = Field(
Expand All @@ -68,7 +68,7 @@ class Linkage(FrenchBaseModel):
sense_index: int = Field(
0, ge=0, description="Number of the definition, start from 1"
)
lang_name: str = Field("", description="Localized language name")
lang: str = Field("", description="Localized language name")
lang_code: str = Field("", description="Wiktionary language code")


Expand All @@ -84,7 +84,7 @@ class WordEntry(FrenchBaseModel):

word: str = Field(description="Word string")
lang_code: str = Field(description="Wiktionary language code")
lang_name: str = Field(description="Localized language name")
lang: str = Field(description="Localized language name")
pos: str = Field("", description="Part of speech type")
pos_title: str = Field(
"", description="Original POS title for matching etymology texts"
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/fr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def parse_page(
base_data = WordEntry(
word=wxr.wtp.title,
lang_code=lang_code,
lang_name=lang_name,
lang=lang_name,
categories=categories.get("categories", []),
)
etymology_data: Optional[EtymologyData] = None
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/fr/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def process_translation_templates(
base_translation_data.lang_code = template_node.template_parameters.get(
1
)
base_translation_data.lang_name = clean_node(
base_translation_data.lang = clean_node(
wxr, page_data[-1], template_node
)
elif template_node.template_name.startswith("trad"):
Expand Down
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/zh/descendant.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def extract_descendant_list_item(
for child_index, child_node in enumerate(nodes_without_ruby):
if isinstance(child_node, str) and child_node.endswith(":"):
lang_name = child_node.strip(" :")
descendant_data.lang_name = lang_name
descendant_data.lang = lang_name
elif (
isinstance(child_node, WikiNode)
and child_node.kind == NodeKind.HTML
Expand All @@ -61,7 +61,7 @@ def extract_descendant_list_item(
if len(descendant_data.word) > 0:
parent_data.descendants.append(descendant_data)
descendant_data = Descendant(
lang_code=lang_code, lang_name=lang_name
lang_code=lang_code, lang=lang_name
)
if len(ruby_data) > 0:
descendant_data.ruby = ruby_data
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/zh/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def extract_linkages(

base_data = WordEntry(
lang_code=page_data[-1].lang_code,
lang_name=page_data[-1].lang_name,
lang=page_data[-1].lang,
word=page_data[-1].word,
)
parse_section(wxr, page_data, base_data, node)
Expand Down
8 changes: 4 additions & 4 deletions src/wiktextract/extractor/zh/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ class Translation(ChineseBaseModel):
lang_code: str = Field(
"", description="Wiktionary language code of the translation term"
)
lang_name: str = Field("", description="Translation language name")
lang: str = Field("", description="Translation language name")
word: str = Field("", description="Translation term")
sense: str = Field("", description="Translation gloss")
tags: list[str] = []
Expand All @@ -83,8 +83,8 @@ class Linkage(ChineseBaseModel):


class Descendant(ChineseBaseModel):
lang_code: str = Field("", description="Wiktionary language code of")
lang_name: str = Field("", description="Translation language name")
lang_code: str = Field("", description="Wiktionary language code")
lang: str = Field("", description="Language name")
word: str = ""
roman: str = ""
tags: list[str] = []
Expand All @@ -97,7 +97,7 @@ class WordEntry(ChineseBaseModel):

word: str = Field(description="Word string")
lang_code: str = Field(description="Wiktionary language code")
lang_name: str = Field(description="Localized language name")
lang: str = Field(description="Localized language name")
pos: str = Field("", description="Part of speech type")
etymology_text: str = ""
senses: list[Sense] = Field([], description="Sense list")
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/zh/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def parse_page(
continue
wxr.wtp.start_section(lang_name)
base_data = WordEntry(
word=wxr.wtp.title, lang_code=lang_code, lang_name=lang_name
word=wxr.wtp.title, lang_code=lang_code, lang=lang_name
)
base_data.categories = categories.get("categories", [])
page_data.append(base_data.model_copy(deep=True))
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/zh/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def process_translation_list_item(
tags, word = capture_text_in_parentheses(word_and_tags)
tags = [tag for tag in tags if tag != lang_code] # rm Wiktionary link
translation_data = Translation(
lang_code=lang_code, lang_name=lang_text, word=word
lang_code=lang_code, lang=lang_text, word=word
)
tags_without_roman = []
for tag in tags:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_fr_conj.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def test_fr_conj_1(self):
|}
</div>""",
)
entry = WordEntry(lang_code="fr", lang_name="Français", word="lancer")
entry = WordEntry(lang_code="fr", lang="Français", word="lancer")
extract_conjugation(self.wxr, entry)
self.assertEqual(
[f.model_dump(exclude_defaults=True) for f in entry.forms],
Expand Down
20 changes: 10 additions & 10 deletions tests/test_fr_etymology.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,14 @@ def test_list_etymologies(self):
WordEntry(
word="test",
lang_code="fr",
lang_name="Français",
lang="Français",
pos="noun",
pos_title="Nom commun 1",
),
WordEntry(
word="test",
lang_code="fr",
lang_name="Français",
lang="Français",
pos="noun",
pos_title="Nom commun 2",
),
Expand All @@ -73,7 +73,7 @@ def test_list_etymologies(self):
{
"word": "test",
"lang_code": "fr",
"lang_name": "Français",
"lang": "Français",
"pos": "noun",
"pos_title": "Nom commun 1",
"etymology_texts": [
Expand All @@ -84,7 +84,7 @@ def test_list_etymologies(self):
{
"word": "test",
"lang_code": "fr",
"lang_name": "Français",
"lang": "Français",
"pos": "noun",
"pos_title": "Nom commun 2",
"etymology_texts": [
Expand Down Expand Up @@ -119,21 +119,21 @@ def test_indent_etymology_with_pos_template(self):
WordEntry(
word="test",
lang_code="fr",
lang_name="Français",
lang="Français",
pos="noun",
pos_title="Nom commun 1",
),
WordEntry(
word="test",
lang_code="fr",
lang_name="Français",
lang="Français",
pos="noun",
pos_title="Nom commun 2",
),
WordEntry(
word="test",
lang_code="fr",
lang_name="Français",
lang="Français",
pos="intj",
pos_title="Interjection",
),
Expand All @@ -145,7 +145,7 @@ def test_indent_etymology_with_pos_template(self):
{
"word": "test",
"lang_code": "fr",
"lang_name": "Français",
"lang": "Français",
"pos": "noun",
"pos_title": "Nom commun 1",
"etymology_texts": [
Expand All @@ -155,7 +155,7 @@ def test_indent_etymology_with_pos_template(self):
{
"word": "test",
"lang_code": "fr",
"lang_name": "Français",
"lang": "Français",
"pos": "noun",
"pos_title": "Nom commun 2",
"etymology_texts": [
Expand All @@ -165,7 +165,7 @@ def test_indent_etymology_with_pos_template(self):
{
"word": "test",
"lang_code": "fr",
"lang_name": "Français",
"lang": "Français",
"pos": "intj",
"pos_title": "Interjection",
"etymology_texts": [
Expand Down
16 changes: 8 additions & 8 deletions tests/test_fr_form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_ipa(self):
self.wxr.wtp.add_page("Modèle:pron", 10, "\\bɔ̃.ʒuʁ\\")
root = self.wxr.wtp.parse("'''bonjour''' {{pron|bɔ̃.ʒuʁ|fr}}")
page_data = [
WordEntry(word="bonjour", lang_code="fr", lang_name="Français")
WordEntry(word="bonjour", lang_code="fr", lang="Français")
]
extract_form_line(self.wxr, page_data, root.children)
self.assertEqual(
Expand All @@ -38,7 +38,7 @@ def test_gender(self):
self.wxr.wtp.add_page("Modèle:m", 10, "masculin")
root = self.wxr.wtp.parse("'''bonjour''' {{m}}")
page_data = [
WordEntry(word="bonjour", lang_code="fr", lang_name="Français")
WordEntry(word="bonjour", lang_code="fr", lang="Français")
]
extract_form_line(self.wxr, page_data, root.children)
self.assertEqual(page_data[-1].tags, ["masculin"])
Expand All @@ -50,7 +50,7 @@ def test_zh_mot(self):
self.wxr.wtp.add_page("Modèle:pron", 10, body="\\ma̠˨˩˦\\")
root = self.wxr.wtp.parse("{{zh-mot|马|mǎ}}")
page_data = [
WordEntry(word="test", lang_code="fr", lang_name="Français")
WordEntry(word="test", lang_code="fr", lang="Français")
]
process_zh_mot_template(self.wxr, root.children[0], page_data)
self.assertEqual(
Expand All @@ -72,15 +72,15 @@ def test_ipa_location_tag(self):
"{{pron|bas.kɛt.bol|fr}} {{FR|nocat=1}} ''ou'' {{pron|bas.kɛt.bɔl|fr}} {{FR|nocat=1}} ''ou'' {{pron|bas.kɛt.bɑl|fr}} {{CA|nocat=1}} {{m}}"
)
page_data = [
WordEntry(word="basket-ball", lang_code="fr", lang_name="Français")
WordEntry(word="basket-ball", lang_code="fr", lang="Français")
]
extract_form_line(self.wxr, page_data, root.children)
self.assertEqual(
page_data[-1].model_dump(exclude_defaults=True),
{
"word": "basket-ball",
"lang_code": "fr",
"lang_name": "Français",
"lang": "Français",
"tags": ["masculin"],
"sounds": [
{"ipa": "bas.kɛt.bol", "tags": ["France"]},
Expand All @@ -99,7 +99,7 @@ def test_template_in_pron_argument(self):
"'''minéral argileux''' {{pron|mi.ne.ʁa.l{{liaison|fr}}aʁ.ʒi.lø|fr}}"
)
page_data = [
WordEntry(word="test", lang_code="fr", lang_name="Français")
WordEntry(word="test", lang_code="fr", lang="Français")
]
extract_form_line(self.wxr, page_data, root.children)
self.assertEqual(
Expand All @@ -118,15 +118,15 @@ def test_equiv_pour_template(self, mock_node_to_wikitext):
"{{équiv-pour|un homme|auteur|2egenre=une personne non-binaire|2egenre1=autaire|2egenre2=auteurice|2egenre3=auteur·ice|lang=fr}}"
)
page_data = [
WordEntry(word="autrice", lang_code="fr", lang_name="Français")
WordEntry(word="autrice", lang_code="fr", lang="Français")
]
extract_form_line(self.wxr, page_data, root.children)
self.assertEqual(
page_data[-1].model_dump(exclude_defaults=True),
{
"word": "autrice",
"lang_code": "fr",
"lang_name": "Français",
"lang": "Français",
"forms": [
{
"form": "auteur",
Expand Down
Loading

0 comments on commit 90f9f71

Please sign in to comment.