This repository was archived by the owner on Oct 19, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcedict_to_sqlite.py
111 lines (89 loc) · 4.35 KB
/
cedict_to_sqlite.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python3
import sqlite3
import gzip
from pathlib import Path
from argparse import ArgumentParser
import requests
from pinyin import convert_pinyin
class CLI:
    """Very basic command line interface to convert a cedict file to a sqlite
    database.

    Instantiating the class runs the whole pipeline: it creates the
    ``build/`` directory, parses the command line, downloads the gzipped
    cedict file if it is not already present, (re)creates the ``entries``
    table in ``build/cedict.db`` and fills it from the dictionary file.
    """

    WEB_CEDICT_FILE = ("https://www.mdbg.net/chinese/export/cedict/"
                       "cedict_1_0_ts_utf-8_mdbg.txt.gz")

    # cedict writes the vowel "ü" as "u:" followed by the tone number. Only
    # the vowel is matched (not e.g. "lu:3") because some of the pinyin is
    # capitalized, so the preceding consonant is deliberately left out.
    _U_DIAERESIS_REPLACEMENTS = (
        ("u:1", "ǖ"),
        ("u:2", "ǘ"),
        ("u:3", "ǚ"),
        ("u:4", "ǜ"),
        ("u:è", "üè"),
    )

    def __init__(self):
        Path("build/").mkdir(exist_ok=True)
        self.init_args()
        self.download_cedict()
        self.init_db()
        self.populate_db()

    @staticmethod
    def _str_to_bool(value):
        """Convert a command line string such as "true" or "0" to a bool.

        ``type=bool`` cannot be used with argparse for this purpose because
        ``bool("False")`` is ``True`` (any non-empty string is truthy).

        Raises:
            argparse.ArgumentTypeError: if the text is not a recognised
                boolean spelling; argparse reports this as a usage error.
        """
        from argparse import ArgumentTypeError

        text = value.strip().lower()
        if text in ("1", "true", "t", "yes", "y", "on"):
            return True
        # The empty string is accepted as False because that is what the
        # previous ``type=bool`` conversion produced for it.
        if text in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise ArgumentTypeError("expected a boolean value, got %r" % value)

    def init_args(self, argv=None):
        """Inits the argument parser and stores the result in ``self.args``.

        Args:
            argv: Optional list of argument strings to parse. When ``None``
                (the default) the arguments are taken from ``sys.argv``.

        Both options accept an explicit value (``--enable-tone-accents
        True``) or can be given bare (``--enable-tone-accents``), which
        means True.
        """
        parser = ArgumentParser(
            description="Converts cedict to a sqlite database.")
        parser.add_argument("--enable-tone-accents",
                            dest="enable_tone_accents",
                            nargs="?", const=True,
                            default=False, type=self._str_to_bool,
                            help="Boolean toggle to add pinyin with character "
                            "tones as seperate column. Defaults to False.")
        parser.add_argument("--erhua-keep-space",
                            dest="erhua_keep_space",
                            nargs="?", const=True,
                            default=False, type=self._str_to_bool,
                            help="Boolean toggle to keep space before r if "
                            "--enable-tone-accents is set to true. "
                            "Defaults to False.")
        self.args = parser.parse_args(argv)

    def download_cedict(self):
        """Downloads the cedict file and stores it on the filesystem.

        Nothing is downloaded when ``build/cedict.txt.gz`` already exists.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        target = Path("build/cedict.txt.gz")
        if target.is_file():
            return
        # Fetch and validate the response *before* creating the file: an
        # empty or partial file left behind by a failed request would make
        # every later run skip the download and then fail to parse it.
        response = requests.get(self.WEB_CEDICT_FILE, timeout=60)
        response.raise_for_status()
        target.write_bytes(response.content)

    def init_db(self):
        """Drops the entries table if it already exists, and then creates
        it again together with its index.

        The connection is kept in ``self.conn``. The ``pinyin_char_tone``
        column is only added when ``--enable-tone-accents`` is set.
        """
        self.conn = sqlite3.connect("build/cedict.db")
        cursor = self.conn.cursor()
        try:
            cursor.execute("DROP TABLE IF EXISTS entries")
            cursor.execute("CREATE TABLE entries (traditional TEXT,"
                           "simplified TEXT, pinyin TEXT, english TEXT)")
            if self.args.enable_tone_accents:
                cursor.execute("ALTER TABLE entries "
                               "ADD COLUMN pinyin_char_tone TEXT")
            cursor.execute("CREATE INDEX entries_index "
                           "ON entries (traditional, simplified)")
        finally:
            cursor.close()

    @staticmethod
    def _parse_line(line):
        """Splits one cedict line into its fields.

        A dictionary line looks like
        ``traditional simplified [pin1 yin1] /definition 1/definition 2/``.

        Returns:
            A ``(traditional, simplified, pinyin, english)`` tuple, or
            ``None`` for blank lines and ``#`` comment lines. The
            definitions are returned as one string with the enclosing
            slashes removed.

        Raises:
            ValueError: if a non-comment line does not have the expected
                layout.
        """
        text = line.strip()
        if not text or text.startswith("#"):
            return None
        trad, simp = text.split(" ")[:2]
        open_bracket = text.index("[")
        close_bracket = text.index("]", open_bracket)
        pinyin = text[open_bracket + 1:close_bracket]
        # The definitions start at the first slash after the pinyin.
        english = text[text.index("/", close_bracket) + 1:]
        if english.endswith("/"):
            english = english[:-1]
        return trad, simp, pinyin, english.strip()

    @classmethod
    def _replace_u_diaeresis(cls, text):
        """Replaces the cedict "u:" + tone spellings with accented "ü"."""
        for plain, accented in cls._U_DIAERESIS_REPLACEMENTS:
            text = text.replace(plain, accented)
        return text

    def _accent_pinyin(self, pinyin):
        """Returns the pinyin with tone accents instead of tone numbers."""
        accented = convert_pinyin(pinyin)
        # The erhua suffix is written as a separate neutral-tone "r5"
        # syllable; either keep or drop the space in front of it.
        if self.args.erhua_keep_space:
            accented = accented.replace("r5", "r")
        else:
            accented = accented.replace(" r5", "r")
        return self._replace_u_diaeresis(accented)

    def _iter_rows(self, lines):
        """Yields one row tuple per dictionary entry found in ``lines``.

        Rows have four fields, plus the accented pinyin as a fifth field
        when ``--enable-tone-accents`` is set.
        """
        with_tones = self.args.enable_tone_accents
        for line in lines:
            entry = self._parse_line(line)
            if entry is None:
                continue
            if with_tones:
                entry += (self._accent_pinyin(entry[2]),)
            yield entry

    def populate_db(self):
        """Parses the cedict text file, and populates the cedict database
        with the relevant fields. The inserts are committed at the end."""
        if self.args.enable_tone_accents:
            statement = ("INSERT INTO entries (traditional,"
                         "simplified, pinyin, english,"
                         "pinyin_char_tone) VALUES (?,?,?,?,?)")
        else:
            statement = ("INSERT INTO entries (traditional,"
                         "simplified, pinyin, english) "
                         "VALUES (?,?,?,?)")
        cursor = self.conn.cursor()
        try:
            with gzip.open("build/cedict.txt.gz", "rt",
                           encoding="utf-8") as file:
                cursor.executemany(statement, self._iter_rows(file))
        finally:
            cursor.close()
        self.conn.commit()
# Run the conversion only when this file is executed as a script, so that
# importing the module does not download the dictionary or rebuild the
# database as a side effect.
if __name__ == "__main__":
    CLI()