Skip to content

Commit

Permalink
feat: import wikidata language list
Browse files Browse the repository at this point in the history
- Remove the "wikidata_id" field from the Language model, because the language codes supported by Wikidata are not bound to a QID. See: https://www.wikidata.org/w/api.php?action=help&modules=query%2Blanguageinfo
- In the Django command "import_languages", get the Wikidata language code list by calling the Wikidata API, then fetch the details for each language code, also through the Wikidata API

Refs: #157
  • Loading branch information
kunfang98927 committed Sep 23, 2024
1 parent b8e649c commit 2748205
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 39 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,58 +4,161 @@
from django.core.management.base import BaseCommand
from VIM.apps.instruments.models import Language

WIKIDATA_URL = "https://www.wikidata.org/w/api.php"


def get_languages_from_wikidata():
    """
    Retrieve the list of languages known to Wikidata through its web API.

    The request uses the `siteinfo` meta module with `siprop=languages`.
    API help:
    https://www.wikidata.org/wiki/Special:ApiHelp/query%2Bsiteinfo
    Equivalent request in the API sandbox:
    https://www.wikidata.org/wiki/Special:ApiSandbox#action=query&format=json&prop=&list=&meta=siteinfo&formatversion=2&siprop=languages

    Returns:
        list: One dictionary per language, for example:
            [
                {"code": "aa", "bcp47": "aa", "name": "Qafár af"},
                {"code": "aae", "bcp47": "aae", "name": "Arbërisht"},
                ...
            ]
        An empty list is returned when the response status is not 200.
    """
    response = requests.get(
        WIKIDATA_URL,
        params={
            "action": "query",
            "format": "json",
            "prop": "",
            "list": "",
            "meta": "siteinfo",
            "formatversion": "2",
            "siprop": "languages",
        },
        timeout=50,
    )

    # Guard clause: report the failure and hand back an empty result.
    if response.status_code != 200:
        print(f"Error: Failed to fetch data. Status code {response.status_code}")
        return []

    # The language entries live under query -> languages in the JSON payload.
    payload = response.json()
    query_section = payload.get("query", {})
    return query_section.get("languages", [])


def get_language_details(language_codes):
    """
    Fetch the details of the given languages from Wikidata through its web API.

    The request uses the `languageinfo` meta module with
    `liprop=code|autonym|name`.
    API help:
    https://www.wikidata.org/w/api.php?action=help&modules=query%2Blanguageinfo
    Equivalent request in the API sandbox:
    https://www.wikidata.org/wiki/Special:ApiSandbox#action=query&format=json&prop=&list=&meta=languageinfo&formatversion=2&liprop=autonym%7Ccode%7Cname&licode=aa%7Caae

    Args:
        language_codes (list): Language codes for which details are fetched.
            They are joined with "|" into a single `licode` parameter.

    Returns:
        dict: Language details keyed by language code, for example:
            {
                "aa": {"code": "aa", "autonym": "Qafár af", "name": "Afar"},
                "aae": {"code": "aae", "autonym": "Arbërisht", "name": "Arbëresh"},
                ...
            }
        An empty dict is returned when `language_codes` is empty or when the
        response status is not 200, so the result is always a dict.
    """
    # Nothing to look up: skip the network round-trip entirely.
    if not language_codes:
        return {}

    params = {
        "action": "query",
        "format": "json",
        "prop": "",
        "meta": "languageinfo",
        "formatversion": "2",
        "liprop": "code|autonym|name",
        "licode": "|".join(language_codes),
    }

    response = requests.get(WIKIDATA_URL, params=params, timeout=50)

    if response.status_code != 200:
        print(f"Error: Failed to fetch data. Status code {response.status_code}")
        # Empty dict (not None) keeps the return type consistent with the
        # documented contract; it is still falsy for truthiness checks.
        return {}

    # The details live under query -> languageinfo in the JSON payload.
    data = response.json()
    return data.get("query", {}).get("languageinfo", {})


class Command(BaseCommand):
    """
    The import_languages command populates the database with languages in which instrument
    names can be provided in VIM. It fetches the language list from Wikidata, retrieves the
    'wikidata_code', 'autonym', and 'en_label', and stores them in the database.
    """

    help = "Imports possible languages for instrument names from Wikidata."

    # Retained from the earlier SPARQL-based implementation so existing
    # references to the attribute keep working; handle() does not use it.
    WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"

    def handle(self, *args, **options):
        """
        Import the Wikidata language list into the Language table.

        Steps:
        1. Fetch the language list and collect the language codes.
        2. Fetch the details for the codes in batches of 50.
        3. Create or update one Language row per returned language, keyed
           on `wikidata_code`.
        """
        # Fetch the list of languages; entries without a code are skipped
        # because the codes are later joined into a single request parameter.
        languages = get_languages_from_wikidata()
        language_codes = [lang["code"] for lang in languages if lang.get("code")]

        self.stdout.write(
            self.style.SUCCESS(
                f"Successfully fetched {len(language_codes)} language codes."
            )
        )

        # Fetch details for specific language codes, 50 at a time
        # (presumably the API's per-request limit for `licode` -- confirm
        # against the languageinfo documentation before raising it).
        for i in range(0, len(language_codes), 50):
            language_batch = language_codes[i : i + 50]
            language_details = get_language_details(language_batch)
            if not language_details:
                # A failed or empty batch is skipped; the rest still run.
                continue

            for details in language_details.values():
                Language.objects.update_or_create(
                    wikidata_code=details["code"],
                    defaults={
                        "en_label": details["name"],
                        "autonym": details["autonym"],
                    },
                )

        # Reports the total number of Language rows now in the database.
        self.stdout.write(
            self.style.SUCCESS(
                f"Successfully imported {Language.objects.count()} languages."
            )
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Generated by Django 4.2.5 on 2024-09-23 15:45

from django.db import migrations


class Migration(migrations.Migration):
    """Schema migration that removes the `wikidata_id` field from the `language` model."""

    # Must run after the merge migration of the `instruments` app.
    dependencies = [("instruments", "0004_merge_20240816_2008")]

    operations = [
        migrations.RemoveField(model_name="language", name="wikidata_id"),
    ]
3 changes: 0 additions & 3 deletions web-app/django/VIM/apps/instruments/models/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@ class Language(models.Model):
wikidata_code = models.CharField(
unique=True, blank=False, help_text="Language code in Wikidata"
)
wikidata_id = models.CharField(
unique=True, blank=False, help_text="Language ID (Q number) in Wikidata"
)
en_label = models.CharField(blank=False, help_text="Language label in English")
autonym = models.CharField(
blank=False, help_text="Language label in the language itself"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def get_active_language_en_label(self) -> str:
language_en = self.request.GET.get("language")
if language_en:
return language_en
return self.request.session.get("active_language_en", "english")
return self.request.session.get("active_language_en", "English")

def get_context_data(self, **kwargs):
context = super().get_context_data(**kwargs)
Expand Down

0 comments on commit 2748205

Please sign in to comment.