forked from iconclass/data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_sqlite.py
175 lines (156 loc) · 5.28 KB
/
make_sqlite.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import sqlite3
import os
import traceback
import sys
SCHEMA = [
"""CREATE TABLE IF NOT EXISTS "notations" (
"id" INTEGER PRIMARY KEY AUTOINCREMENT,
"notation" TEXT,
"children" TEXT,
"refs" TEXT,
"key" TEXT
)""",
"""CREATE TABLE IF NOT EXISTS "keys" (
"id" INTEGER PRIMARY KEY AUTOINCREMENT,
"code" TEXT,
"suffix" TEXT
)""",
"""CREATE TABLE IF NOT EXISTS "texts" (
"ref" INTEGER,
"type" INTEGER,
"language" TEXT,
"text" TEXT
)""",
"""CREATE INDEX IF NOT EXISTS "texts_ref" ON "texts" ("ref")""",
"""CREATE INDEX IF NOT EXISTS "notations_notation" ON "notations" ("notation")""",
"""CREATE INDEX IF NOT EXISTS "keys_code" ON "keys" ("code")""",
]
def parse_dbtxt(data):
obj = {}
buf = []
last_field = None
for line in data.split("\n"):
if line.startswith("#"):
continue
data = line.split(" ")
if len(data) < 2:
continue
field = data[0].lower()
data = " ".join(data[1:])
if field == ";":
buf.append(data)
elif field != last_field:
if buf:
obj[last_field] = buf
buf = [data]
last_field = field
if field in ("n", "k"):
buf = buf[0]
if buf:
obj[last_field] = buf
for k, v in obj.copy().items():
if k.startswith("txt_"):
obj.setdefault("txt", {})[k[4:]] = v
del obj[k]
if k.startswith("kwd_"):
obj.setdefault("kw", {})[k[4:]] = v
del obj[k]
return obj
def read_notations(filename, cursor):
print("Reading notations")
rowid = 1
INSERT_SQL = "INSERT INTO notations VALUES (?, ?, ?, ?, ?)"
notation_ids = {}
with open(filename, "rt", encoding="utf8") as input_file:
for lineno, chunk in enumerate(input_file.read().split("\n$")):
try:
obj = parse_dbtxt(chunk)
except:
print(
f"Problem with notations in {filename} on line {lineno}: {repr(chunk)}"
)
return None
notation = obj.get("n")
children = "|".join(obj.get("c", [])) or None
refs = "|".join(obj.get("r", [])) or None
key = obj.get("k")
if notation:
data = (rowid, notation, children, refs, key)
cursor.execute(INSERT_SQL, data)
notation_ids[notation] = rowid
rowid += 1
return notation_ids
def read_texts(txt_type, notation_ids, filename, language, cursor):
print("Reading %s texts from %s" % (language, filename))
INSERT_SQL = (
"INSERT INTO texts (ref, type, language, text) VALUES (?, %s, ?, ?)" % txt_type
)
with open(filename, "rt", encoding="utf8") as input_file:
for line in input_file.read().split("\n"):
if line.startswith("#"):
continue
line = line.strip()
tmp = line.split("|")
if len(tmp) != 2:
continue
notation, txt = tmp
ref = notation_ids.get(notation)
if not ref:
continue
data = (ref, language, txt)
cursor.execute(INSERT_SQL, data)
def read_keys(notation_ids, filename, cursor):
print("Reading keys")
# get the maximum row id for notations
# the keys id should start from the maximum notation ids plus 1
# as the texts table have infor on both notations and keys
row_id = max(notation_ids.values())
INSERT_SQL1 = "INSERT INTO keys (id, code, suffix) VALUES (?, ?, ?)"
keys_ids = {}
with open(filename, "rt", encoding="utf8") as input_file:
for chunk in input_file.read().split("\n$"):
obj = parse_dbtxt(chunk)
code = obj.get("k")
for suffix in obj.get("s", []):
row_id += 1
data = (row_id, code, suffix)
cursor.execute(INSERT_SQL1, data)
keys_ids[f"{code}{suffix}"] = row_id
notation_ids[code] = row_id
return keys_ids
if __name__ == "__main__":
db = sqlite3.connect("iconclass.sqlite")
cursor = db.cursor()
for statement in SCHEMA:
try:
cursor.execute(statement)
except sqlite3.OperationalError:
traceback.print_exc()
print("Problem with ---> [", end="")
print(statement, end="")
print("]")
sys.exit(1)
# Read the structure
notation_ids = read_notations("notations.txt", cursor)
if not notation_ids:
sys.exit(1)
# read the keys
keys_ids = read_keys(notation_ids, "keys.txt", cursor)
# Read the texts
for dirpath, dirs, files in os.walk("."):
for filename in files:
if filename.find("_keys") > 0:
thebuf = keys_ids
else:
thebuf = notation_ids
if filename.startswith("kw_"):
language = filename[3:5]
read_texts(
1, thebuf, os.path.join(dirpath, filename), language, cursor
)
elif filename.startswith("txt_"):
language = filename[4:6]
read_texts(
0, thebuf, os.path.join(dirpath, filename), language, cursor
)
db.commit()