Skip to content

Commit

Permalink
Merge pull request #391 from empiriker/es
Browse files Browse the repository at this point in the history
Add languages.json for Spanish Wiktionary
  • Loading branch information
kristian-clausal authored Oct 31, 2023
2 parents fdb3c43 + a64e729 commit 595ff1d
Show file tree
Hide file tree
Showing 5 changed files with 3,715 additions and 16 deletions.
27 changes: 12 additions & 15 deletions languages/get_de_data.py → languages/get_data_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,21 @@
# python get_data_de.py de dewiktionary_dump_file

import argparse
from wikitextprocessor import Wtp
from wiktextract.config import WiktionaryConfig
from wiktextract.wxr_context import WiktextractContext
from wiktextract.page import clean_node
from wikitextprocessor.dumpparser import process_dump
from wikitextprocessor import NodeKind, WikiNode

import json

from wikitextprocessor import NodeKind, WikiNode, Wtp
from wikitextprocessor.dumpparser import process_dump

from wiktextract.config import WiktionaryConfig
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Export Wiktionary language data to JSON"
)
parser.add_argument("lang_code", type=str, help="Dump file language code")
parser.add_argument("dump", type=str, help="Wiktionary xml dump file path")
parser.add_argument(
"--languages",
type=str,
default="languages.json",
help="Language data output file path",
)
args = parser.parse_args()
wxr = WiktextractContext(Wtp(lang_code=args.lang_code), WiktionaryConfig())

Expand All @@ -40,7 +33,7 @@
template_ns_id = wxr.wtp.NAMESPACE_DATA["Template"]["id"]
process_dump(wxr.wtp, args.dump, {help_ns_id, template_ns_id})

# The page 'Hilfe:Sprachkürzel seems to be the only central collection of
# The page 'Hilfe:Sprachkürzel' seems to be the only central collection of
# language codes and their German expansions. We will use it until we find
# a more authoritative source.
sprachkuerzel = wxr.wtp.get_page("Hilfe:Sprachkürzel")
Expand Down Expand Up @@ -68,5 +61,9 @@

languages[lang_code] = [clean_node(wxr, None, third_row_content)]

with open(args.languages, "w", encoding="utf-8") as fout:
with open(
f"src/wiktextract/data/{args.lang_code}/languages.json",
"w",
encoding="utf-8",
) as fout:
json.dump(languages, fout, indent=2, ensure_ascii=False, sort_keys=True)
60 changes: 60 additions & 0 deletions languages/get_data_es.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Export Spanish Wiktionary language data to JSON.
#
# Usage:
#
# python get_data_es.py es eswiktionary_dump_file
#
# The result is written to src/wiktextract/data/<lang_code>/languages.json
# (path relative to the current working directory).

import argparse
import json

from wikitextprocessor import NodeKind, WikiNode, Wtp
from wikitextprocessor.dumpparser import process_dump

from wiktextract.config import WiktionaryConfig
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

if __name__ == "__main__":
    # Build a {language code: [language name]} mapping from the Spanish
    # Wiktionary page "Apéndice:Códigos de idioma" and write it out as JSON.
    parser = argparse.ArgumentParser(
        description="Export Wiktionary language data to JSON"
    )
    parser.add_argument("lang_code", type=str, help="Dump file language code")
    parser.add_argument("dump", type=str, help="Wiktionary xml dump file path")
    args = parser.parse_args()

    # Create the context exactly once. An earlier revision built a throwaway
    # context first and immediately replaced it with this one.
    wxr = WiktextractContext(
        Wtp(
            lang_code=args.lang_code, db_path="wikt-db_es_language_data_temp.db"
        ),
        WiktionaryConfig(),
    )
    # Only pages from the Appendix namespace are requested from the dump.
    appendix_ns_id = wxr.wtp.NAMESPACE_DATA["Appendix"]["id"]
    process_dump(wxr.wtp, args.dump, {appendix_ns_id})

    # https://es.wiktionary.org/wiki/Ap%C3%A9ndice:C%C3%B3digos_de_idioma
    page_title = "Apéndice:Códigos de idioma"
    codigos_de_idioma = wxr.wtp.get_page(page_title)
    if codigos_de_idioma is None:
        # Fail with a readable message instead of an AttributeError below.
        raise SystemExit(
            f"Page '{page_title}' was not found in dump '{args.dump}'"
        )

    wxr.config.word = codigos_de_idioma.title
    wxr.wtp.start_page(codigos_de_idioma.title)
    tree = wxr.wtp.parse(
        codigos_de_idioma.body,
        pre_expand=True,
    )

    languages = {}
    for table in tree.find_child_recursively(NodeKind.TABLE):
        for table_row in table.find_child(NodeKind.TABLE_ROW):
            cells = list(table_row.find_child(NodeKind.TABLE_CELL))
            # The first cell holds the code and the second the language
            # name. Rows without two non-empty data cells (presumably header
            # or malformed rows) carry no mapping and are skipped rather
            # than raising IndexError.
            if len(cells) < 2:
                continue
            code_cell, name_cell = cells[0], cells[1]
            if not code_cell.children or not name_cell.children:
                continue

            lang_code = clean_node(wxr, None, code_cell.children[0])
            lang_name = clean_node(wxr, None, name_cell.children[0])
            languages[lang_code] = [lang_name]

    with open(
        f"src/wiktextract/data/{args.lang_code}/languages.json",
        "w",
        encoding="utf-8",
    ) as fout:
        json.dump(languages, fout, indent=2, ensure_ascii=False, sort_keys=True)
4 changes: 4 additions & 0 deletions src/wiktextract/data/es/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"analyze_templates": false,
"extract_thesaurus_pages": false
}
Loading

0 comments on commit 595ff1d

Please sign in to comment.