-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathdb_fill.py
87 lines (71 loc) · 3.06 KB
/
db_fill.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import sqlite3
import json
from zhudi.data import Data
from zhudi.processing import DictionaryTools
def load_edict_dictionary(language: str, filename: str) -> None:
print(f"Importing {filename} as {language} table...")
# Remove commented license at the top of the file first
# table needs: traditional, simplified, pinyin, zhuyin, definitions
c = sqlite3.connect("zhudi-data/dictionaries.db")
cursor = c.cursor()
cursor.execute(f"""
create table if not exists {language} (
traditional text not null,
simplified text not null,
pinyin text not null,
zhuyin text not null,
definitions text not null
) strict;
""")
cursor.execute(f"delete from {language}")
c.commit()
query = f"insert into {language}(traditional, simplified, pinyin, zhuyin, definitions) values (?, ?, ?, ?, ?)"
with open(f"zhudi-data/{filename}", 'r') as fd:
lines = fd.readlines()
data = []
for line in lines:
clean_line = line.replace('\n', '')
parts = clean_line.split(' ')
if (len(parts) < 4):
print(f"Warning, ignored: {parts}")
else:
# Parse based on https://cc-cedict.org/wiki/format:syntax
traditional = parts[0]
simplified = parts[1]
pinyin = clean_line.replace('[', '|').replace(']', '|').split('|')[1]
# do not convert pinyin into unicode in db, but in app
z = DictionaryTools.pinyin_to_zhuyin(pinyin.split(' '), Data())
zhuyin = ' '.join(z)
definitions = clean_line.split('/')[1:-1]
data.append((traditional, simplified, pinyin, zhuyin, json.dumps(definitions)))
if len(data) == 1000:
c.executemany(query, data)
data = []
if len(data) != 0:
c.executemany(query, data)
c.commit()
def generate_full_text_search_table(language: str) -> None:
    """(Re)build the FTS5 full-text index ``<language>_fts``.

    The virtual table is created in ``zhudi-data/dictionaries.db`` if it does
    not exist yet, declared with ``content=<language>`` so it indexes the
    ``language`` table. Its index is then emptied, refilled from every row of
    ``language`` and optimized, so the function can be run repeatedly.

    Args:
        language: Name of the existing dictionary table to index. It is
            interpolated into the SQL text as an identifier, so it must be a
            trusted, valid table name.
    """
    print(f"Generating SQLite FTS5 (Full Text Search) for {language}...")
    c = sqlite3.connect("zhudi-data/dictionaries.db")
    try:
        cursor = c.cursor()
        cursor.execute(f"""
            create virtual table if not exists {language}_fts using fts5(
                traditional,
                simplified,
                pinyin,
                zhuyin,
                definitions,
                content={language}
            );
        """)
        # Special FTS5 command: drop every entry currently in the index.
        cursor.execute(f"insert into {language}_fts({language}_fts) values('delete-all');")
        # Re-index all rows, keeping rowids aligned with the content table.
        cursor.execute(f"insert into {language}_fts (rowid, traditional, simplified, pinyin, zhuyin, definitions) select rowid, traditional, simplified, pinyin, zhuyin, definitions from {language};")
        # Special FTS5 command: merge the index structures after the bulk load.
        cursor.execute(f"insert into {language}_fts({language}_fts) values('optimize');")
        c.commit()
    finally:
        # Always release the database handle, even if indexing fails.
        c.close()
if __name__ == '__main__':
    # Dictionary sources in import order: (table name, file in zhudi-data/).
    sources = (
        ('english', 'cedict_1_0_ts_utf-8_mdbg.txt'),
        ('german', 'handedict.u8'),
        ('french', 'cfdict.u8'),
    )
    # Fill every dictionary table first...
    for table, source_file in sources:
        load_edict_dictionary(table, source_file)
    # ...then build the full-text search index of each one.
    for table, _ in sources:
        generate_full_text_search_table(table)