Skip to content

Commit

Permalink
Initial support for Polish language. (#48)
Browse files Browse the repository at this point in the history
  • Loading branch information
tirkarthi committed Oct 4, 2023
1 parent 077d531 commit 6597d21
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 3 deletions.
4 changes: 2 additions & 2 deletions app/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ android {
applicationId "com.xtreak.notificationdictionary"
minSdk 24
targetSdk 33
versionCode 21
versionName "0.0.21"
versionCode 22
versionName "0.0.22"

testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,13 @@ class MainActivity : AppCompatActivity() {
) {
default_language_value = current_locale
default_database_value = "dictionary_de.db"
} else if (current_locale.startsWith(
"pl",
ignoreCase = true
)
) {
default_language_value = current_locale
default_database_value = "dictionary_pl.db"
}
// Set values here so that
with(sharedPref.edit()) {
Expand Down Expand Up @@ -189,7 +196,7 @@ class MainActivity : AppCompatActivity() {

fun initialize_spinner(database_name: String) {
val spinner = findViewById<View>(R.id.spinner) as Spinner
val languages = arrayOf("English", "French", "German")
val languages = arrayOf("English", "French", "German", "Polish")
val adapter: ArrayAdapter<String> = ArrayAdapter<String>(
this@MainActivity,
android.R.layout.simple_spinner_item, languages
Expand All @@ -205,6 +212,8 @@ class MainActivity : AppCompatActivity() {
spinner.setSelection(1, false)
} else if (database_name == "dictionary_de.db") {
spinner.setSelection(2, false)
} else if (database_name == "dictionary_pl.db") {
spinner.setSelection(3, false)
} else {
spinner.setSelection(0, false)
}
Expand Down Expand Up @@ -265,6 +274,9 @@ class MainActivity : AppCompatActivity() {
} else if (item == "German") {
database_name = "dictionary_de.db"
selected_language = "de"
} else if (item == "Polish") {
database_name = "dictionary_pl.db"
selected_language = "pl"
}

with(sharedPref.edit()) {
Expand Down
132 changes: 132 additions & 0 deletions parse_wikitionary_pl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import sys
import re
import time
import sqlite3
from zipfile import ZipFile, ZIP_DEFLATED

from xml.etree import ElementTree as ET

from wikitextparser import remove_markup, parse

"""
Downloading latest dump file :
wget https://dumps.wikimedia.org/plwiktionary/latest/plwiktionary-latest-pages-articles.xml.bz2
Parsing structure
znaczenia (meaning start marker)
rzeczownik (part of speech)
(1.1) poet. wieczór
(1.2) vespers: rel. nieszpory
(1.3) rel. dzwon wzywający na nieszpory
odmiana (meaning end marker)
"""


# Schema of the `dictionary` table that main() fills with one row per meaning.
# IF NOT EXISTS lets the script be re-run against an existing database file.
SQL_CREATE_TABLE = """
CREATE TABLE IF NOT EXISTS dictionary
(
id INTEGER PRIMARY KEY,
word TEXT,
lexical_category TEXT,
etymology_no INTEGER,
definition_no INTEGER,
definition TEXT
);
"""

# Executed right after table creation so rows from a previous run are removed
# before the table is refilled.
SQL_DELETE_ENTRIES = "DELETE from dictionary"
# SQLite file written by main(); main() also zips it as "<DATABASE_FILE>.zip".
DATABASE_FILE = "dictionary_pl.db"


# Matches the leading "(1.1)"-style numbering of a meaning line.
# Raw string: the pattern contains backslash escapes that are not valid
# Python string escapes.
_MEANING_NUMBER_RE = re.compile(r"^\s*\(\s*\d+\s*\.\s*\d+\s*\)")


def _iter_meanings(wikitext):
    """Yield ``(part_of_speech, meaning)`` pairs found in one page's wikitext.

    For every section that contains both a ``znaczenia`` (meaning start
    marker) and an ``odmiana`` (meaning end marker) template, the text between
    them is re-parsed; the first italic run is taken as the part of speech and
    the items of the first list as the meanings.

    Raises whatever the parser raises for malformed input, and ``IndexError``
    when a matched section has no italics or no list; the caller decides how
    to handle that.
    """
    for section in parse(wikitext).sections:
        if not section.templates:
            continue

        start, end = None, None
        for template in section.templates:
            if template.name == "odmiana":
                end = template

            if template.name == "znaczenia":
                start = template

        if not (start and end):
            continue

        # Template offsets are rebased by the section's own start offset
        # before slicing the section text.
        offset = section.span[0]
        body = parse(section.string[start.span[0] - offset : end.span[1] - offset])
        part_of_speech = body.get_italics()[0].text

        for item in body.lists()[0].items:
            meaning = _MEANING_NUMBER_RE.sub("", remove_markup(item).strip())
            yield part_of_speech, meaning


def main():
    """Build ``DATABASE_FILE`` from a Polish Wiktionary XML dump and zip it.

    The path of the (uncompressed) dump is read from ``sys.argv[1]``. The
    ``dictionary`` table is created if needed, emptied, and refilled with one
    row per meaning. Pages that fail to parse are logged and skipped so a
    single malformed page does not abort the whole import.

    Work is committed roughly every 1000 successfully processed pages and once
    more at the end; without the final commit the rows inserted since the last
    periodic commit would be discarded when the connection is closed.
    """
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <plwiktionary-pages-articles.xml>")

    connection = sqlite3.connect(DATABASE_FILE)
    try:
        cursor = connection.cursor()
        cursor.execute(SQL_CREATE_TABLE)
        cursor.execute(SQL_DELETE_ENTRIES)
        start_time = time.time()
        index = 0  # running row id across all pages
        words = 0  # pages processed without error
        count = 0  # pages processed since the last commit

        for _event, elem in ET.iterparse(sys.argv[1]):
            if "page" not in elem.tag:
                continue

            try:
                # Reuse the namespace of the <page> element itself instead of
                # hard-coding one export schema version, so dumps produced
                # with a different schema version are still readable.
                namespace = (
                    elem.tag[: elem.tag.index("}") + 1]
                    if elem.tag.startswith("{")
                    else ""
                )
                title = elem.find(f".//{namespace}title").text
                content = (
                    elem.find(f".//{namespace}revision")
                    .find(f".//{namespace}text")
                    .text
                )

                for part_of_speech, meaning in _iter_meanings(content):
                    index += 1
                    cursor.execute(
                        "INSERT INTO dictionary VALUES (?, ?, ?, ?, ?, ?)",
                        (index, title, part_of_speech, 1, 1, meaning),
                    )
                    print(index, title, part_of_speech, 1, 1, meaning)
            except Exception as e:
                # Best-effort import: report the problem and move on to the
                # next page. Rows already inserted for this page are kept.
                elem.clear()
                print(e)
                continue

            if count > 1000:
                count = 0
                # The sqlite3 module opens the next transaction implicitly on
                # the following INSERT, so no explicit BEGIN is needed.
                connection.commit()
                print(
                    f"Processing {words} words and {index} meanings took"
                    f" {time.time()-start_time} seconds"
                )

            # https://stackoverflow.com/questions/12160418/why-is-lxml-etree-iterparse-eating-up-all-my-memory
            elem.clear()
            words += 1
            count += 1

        # Persist the rows inserted since the last periodic commit; closing
        # the connection does not commit pending changes.
        connection.commit()
        cursor.close()
    finally:
        connection.close()

    print(f"Processing {words} words took {time.time()-start_time} seconds")

    with ZipFile(f"{DATABASE_FILE}.zip", "w", ZIP_DEFLATED) as zipf:
        zipf.write(DATABASE_FILE)


# Run the import only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

0 comments on commit 6597d21

Please sign in to comment.