Initial support for Polish language. (#48)

tirkarthi · Oct 4, 2023 · 6597d21 · 6597d21
1 parent 077d531
commit 6597d21
Show file tree

Hide file tree

Showing 3 changed files with 147 additions and 3 deletions.
diff --git a/app/build.gradle b/app/build.gradle
@@ -28,8 +28,8 @@ android {
         applicationId "com.xtreak.notificationdictionary"
         minSdk 24
         targetSdk 33
-        versionCode 21
-        versionName "0.0.21"
+        versionCode 22
+        versionName "0.0.22"
 
         testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner"
     }

diff --git a/app/src/main/java/com/xtreak/notificationdictionary/MainActivity.kt b/app/src/main/java/com/xtreak/notificationdictionary/MainActivity.kt
@@ -89,6 +89,13 @@ class MainActivity : AppCompatActivity() {
             ) {
                 default_language_value = current_locale
                 default_database_value = "dictionary_de.db"
+            } else if (current_locale.startsWith(
+                    "pl",
+                    ignoreCase = true
+                )
+            ) {
+                default_language_value = current_locale
+                default_database_value = "dictionary_pl.db"
             }
             // Set values here so that
             with(sharedPref.edit()) {
@@ -189,7 +196,7 @@ class MainActivity : AppCompatActivity() {
 
     fun initialize_spinner(database_name: String) {
         val spinner = findViewById<View>(R.id.spinner) as Spinner
-        val languages = arrayOf("English", "French", "German")
+        val languages = arrayOf("English", "French", "German", "Polish")
         val adapter: ArrayAdapter<String> = ArrayAdapter<String>(
             this@MainActivity,
             android.R.layout.simple_spinner_item, languages
@@ -205,6 +212,8 @@ class MainActivity : AppCompatActivity() {
             spinner.setSelection(1, false)
         } else if (database_name == "dictionary_de.db") {
             spinner.setSelection(2, false)
+        } else if (database_name == "dictionary_pl.db") {
+            spinner.setSelection(3, false)
         } else {
             spinner.setSelection(0, false)
         }
@@ -265,6 +274,9 @@ class MainActivity : AppCompatActivity() {
                                 } else if (item == "German") {
                                     database_name = "dictionary_de.db"
                                     selected_language = "de"
+                                } else if (item == "Polish") {
+                                    database_name = "dictionary_pl.db"
+                                    selected_language = "pl"
                                 }
 
                                 with(sharedPref.edit()) {

diff --git a/parse_wikitionary_pl.py b/parse_wikitionary_pl.py
@@ -0,0 +1,132 @@
+import sys
+import re
+import time
+import sqlite3
+from zipfile import ZipFile, ZIP_DEFLATED
+
+from xml.etree import ElementTree as ET
+
+from wikitextparser import remove_markup, parse
+
+"""
+Download the latest dump file:
+
+wget https://dumps.wikimedia.org/plwiktionary/latest/plwiktionary-latest-pages-articles.xml.bz2
+
+Parsing structure
+
+znaczenia (meaning start marker)
+
+rzeczownik (part of speech)
+
+    (1.1) poet. wieczór
+    (1.2) vespers: rel. nieszpory
+    (1.3) rel. dzwon wzywający na nieszpory
+
+odmiana (meaning end marker)
+"""
+
+# One row per meaning; main() fills etymology_no and definition_no with 1.
+SQL_CREATE_TABLE = """
+CREATE TABLE IF NOT EXISTS dictionary
+                             (
+                                 id INTEGER PRIMARY KEY,
+                                 word TEXT,
+                                 lexical_category TEXT,
+                                 etymology_no INTEGER,
+                                 definition_no INTEGER,
+                                 definition TEXT
+                             );
+"""
+# Executed by main() right after table creation, before any row is inserted.
+SQL_DELETE_ENTRIES = "DELETE from dictionary"
+DATABASE_FILE = "dictionary_pl.db"
+
+
+def main():
+    connection = sqlite3.connect(DATABASE_FILE)
+    cursor = connection.cursor()
+    cursor.execute(SQL_CREATE_TABLE)
+    cursor.execute(SQL_DELETE_ENTRIES)
+    start_time = time.time()
+    doc = ET.iterparse(sys.argv[1])
+    index = 0
+    words = 0
+    count = 0
+
+    for event, elem in doc:
+
+        if "page" in elem.tag:
+            title = elem.find(
+                ".//{http://www.mediawiki.org/xml/export-0.10/}title"
+            ).text
+            content = (
+                elem.find(".//{http://www.mediawiki.org/xml/export-0.10/}revision")
+                .find(".//{http://www.mediawiki.org/xml/export-0.10/}text")
+                .text
+            )
+
+            try:
+                sections = parse(content).sections
+                for section in sections:
+                    if section.templates:
+                        start, end = None, None
+                        for template in section.templates:
+                            if template.name == "odmiana":
+                                end = template
+
+                            if template.name == "znaczenia":
+                                start = template
+
+                        if start and end:
+                            content = parse(
+                                section.string[
+                                    start.span[0]
+                                    - section.span[0] : end.span[1]
+                                    - section.span[0]
+                                ]
+                            )
+                            part_of_speech = content.get_italics()[0].text
+                            meanings = content.lists()[0].items
+
+                            for meaning_ in meanings:
+                                index += 1
+                                meaning = remove_markup(meaning_).strip()
+                                meaning = re.sub(
+                                    "^\s*\(\s*\d+\s*\.\s*\d+\s*\)", "", meaning
+                                )
+                                cursor.execute(
+                                    "INSERT INTO dictionary VALUES (?, ?, ?, ?, ?, ?)",
+                                    (index, title, part_of_speech, 1, 1, meaning),
+                                )
+                                print(index, title, part_of_speech, 1, 1, meaning)
+            except (Exception, IndexError) as e:
+                elem.clear()
+                print(e)
+                continue
+
+            if count > 1000:
+                count = 0
+                cursor.execute("COMMIT")
+                connection.commit()
+                cursor.execute("BEGIN TRANSACTION")
+                print(
+                    f"Processing {words} words and {index} meanings took"
+                    f" {time.time()-start_time} seconds"
+                )
+
+            # https://stackoverflow.com/questions/12160418/why-is-lxml-etree-iterparse-eating-up-all-my-memory
+            elem.clear()
+            words += 1
+            count += 1
+
+    cursor.close()
+    connection.close()
+    print(f"Processing {words} words took {time.time()-start_time} seconds")
+
+    with ZipFile(f"{DATABASE_FILE}.zip", "w", ZIP_DEFLATED) as zipf:
+        zipf.write(DATABASE_FILE)
+
+
+if __name__ == "__main__":
+    main()