-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathreading.py
291 lines (235 loc) · 10.6 KB
/
reading.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
# -*- coding: utf-8 -*-
# This file is part of Japanese Furigana <https://github.com/obynio/anki-japanese-furigana>.
#
# Japanese Furigana is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Japanese Furigana is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Japanese Furigana. If not, see <http://www.gnu.org/licenses/>.
import sys
import os
import re
import subprocess
import platform
from typing import Any, List, Mapping, Optional, Union
# Output templates passed to MeCab. In MeCab's format syntax %m is the node's
# surface text and %f[7] is feature column 7, which this module treats as the
# reading. Known nodes come back as "surface[reading] ", unknown nodes as
# "surface[] ", and each sentence is terminated by a newline.
mecabArgs = [
    '--node-format=%m[%f[7]] ',
    '--eos-format=\n',
    '--unk-format=%m[] ',
]

# Directory next to this module; the MeCab command line is built from paths
# inside it (the binary, the dictionary directory and "mecabrc").
mecabDir = os.path.join(os.path.dirname(__file__), "support")

# Single-character placeholders substituted for HTML tags and for <br> line
# breaks while the text is being processed.
HTML_REPLACER = '▦'
NEWLINE_REPLACER = '▧'

# Stand-in for ASCII Space (0x20) in the expression sent to MeCab. The node
# format above separates nodes with ASCII spaces, so a literal space in the
# input would be indistinguishable from a node separator. Only ASCII Space is
# swapped; other whitespace (eg CJK Space) is left alone. The code point was
# picked because it is unlikely to ever appear in an Anki card.
ASCII_SPACE_TOKEN = "\U0000FFFF"
def htmlReplace(text):
    """Swap every HTML tag in ``text`` for the HTML_REPLACER placeholder.

    A tag is a "<", one or more characters other than "<" (matched lazily),
    and a closing ">".

    Returns a ``(matches, text)`` tuple: the removed tags in document order,
    and the text with each tag replaced by one placeholder character.
    """
    removedTags = re.findall(r"(?:<[^<]+?>)", text)
    placeholderText = re.sub(r"<[^<]+?>", HTML_REPLACER, text)
    return removedTags, placeholderText
def escapeText(text):
    """Normalise ``text`` and pull its HTML tags out of the way.

    Newlines become spaces and the fullwidth tilde (U+FF5E) becomes "~".
    "<br>" and "<br />" are kept in the text as a plain "<br>"; every other
    tag is replaced by the HTML_REPLACER placeholder.

    Returns a ``(matches, text)`` tuple: the replaced tags in document order
    (line breaks are not among them), and the processed text.
    """
    flattened = text.replace("\n", " ").replace(u'\uff5e', "~")
    # Hide line breaks behind their own placeholder so the generic tag
    # replacement below does not treat them like any other tag
    shielded = re.sub("<br( /)?>", NEWLINE_REPLACER, flattened)
    matches, stripped = htmlReplace(shielded)
    return matches, stripped.replace(NEWLINE_REPLACER, "<br>")
# STARTUPINFO handed to subprocess.Popen when MeCab is launched.
# On Windows, STARTF_USESHOWWINDOW makes the child honour
# STARTUPINFO.wShowWindow, which is left at its default here.
# Everywhere else no startup info is needed.
if sys.platform == "win32":
    si = subprocess.STARTUPINFO()
    try:
        si.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    except AttributeError:
        # The flag is not exported by this subprocess module; fall back to
        # the private helper module. Only a missing attribute is expected
        # here, so nothing broader is caught.
        si.dwFlags |= subprocess._subprocess.STARTF_USESHOWWINDOW
else:
    si: Optional[Any] = None
# Syllabary utilities
UNICODE_HIRAGANA_START = 0x3041
UNICODE_HIRAGANA_END = 0x309F
UNICODE_KATAKANA_START = 0x30A1
UNICODE_KATAKANA_END = 0x30FF
UNICODE_MIDDLE_DOT = 0x30FB  # '・'
UNICODE_PROLONGED_SOUND_MARK = 0x30FC  # 'ー'

# Code points inside the katakana range that must NOT be shifted onto the
# hiragana block, because the fixed offset would produce the wrong character.
_KATAKANA_LEFT_UNCONVERTED = frozenset({
    # Katakana VA, VI, VE, VO have no hiragana counterpart: shifting them
    # lands on U+3097/U+3098 (unassigned) and U+3099/U+309A (combining marks)
    0x30F7, 0x30F8, 0x30F9, 0x30FA,
    # General punctuation that happens to live in the Katakana block
    UNICODE_MIDDLE_DOT,
    UNICODE_PROLONGED_SOUND_MARK,
    # Digraph KOTO would land on U+309F, the unrelated hiragana digraph YORI
    0x30FF,
})


class Translator(Mapping[int, Union[str, int, None]]):
    """Translation table for ``str.translate`` mapping katakana to hiragana.

    Keys are Unicode code points. Katakana code points are mapped to the
    hiragana code point at the same offset within its block; every other
    key is reported as missing, which ``str.translate`` treats as "leave the
    character unchanged".
    """

    def __getitem__(self, key: int) -> Union[str, int, None]:
        """Return the hiragana code point for the katakana code point ``key``.

        Raises:
            KeyError: if ``key`` is not an int or is not a convertible
                katakana code point. KeyError is a LookupError, so
                ``str.translate`` still skips the character, and the
                inherited ``Mapping.get`` / ``in`` (which only catch
                KeyError) behave correctly.
        """
        if not isinstance(key, int):
            # Argument error
            raise KeyError(key)

        if key < UNICODE_KATAKANA_START or key > UNICODE_KATAKANA_END:
            # Not a character we're converting
            raise KeyError(key)

        if key in _KATAKANA_LEFT_UNCONVERTED:
            # Inside the katakana range but SHOULDN'T be transformed
            raise KeyError(key)

        # Regular katakana: same position within the hiragana block
        return UNICODE_HIRAGANA_START + (key - UNICODE_KATAKANA_START)

    def __len__(self) -> int:
        # Exists only to satisfy base type
        raise NotImplementedError()

    def __iter__(self):
        # Exists only to satisfy base type
        raise NotImplementedError()


translator = Translator()


def convertToHiragana(expr: str) -> str:
    """Return ``expr`` with its katakana converted to hiragana.

    Characters that are not convertible katakana (kanji, Latin text, the
    middle dot, the prolonged sound mark, ...) are returned unchanged.
    """
    return expr.translate(translator)
def getAdditionalPossibleReadings(hiragana: str) -> Optional[List[str]]:
    """Return extra kana a single hiragana character may be read as.

    Returns a list of alternative readings, or None when the character has
    no alternatives.
    """
    # The small ka/ke (hiragana forms of ヵ and ヶ) can show up in readings
    # as a full-size "か" (eg: ヶ月, ヵ国, etc)
    if hiragana in ('ゕ', 'ゖ'):
        return ['か']
    return None
def isKana(char: str) -> bool:
    """Tell whether the single character ``char`` is hiragana or katakana.

    The test is a plain range check against the UNICODE_HIRAGANA_* and
    UNICODE_KATAKANA_* bounds, so the middle dot and the prolonged sound
    mark (both inside the katakana range) count as kana too.
    """
    codepoint = ord(char)
    return (
        UNICODE_HIRAGANA_START <= codepoint <= UNICODE_HIRAGANA_END
        or UNICODE_KATAKANA_START <= codepoint <= UNICODE_KATAKANA_END
    )
# Mecab
def mungeForPlatform(popen):
    """Return the command line ``popen`` adjusted for the current platform.

    The first element (the executable path) gets a platform suffix:
    ".exe" on Windows, ".arm" on macOS when the machine name starts with
    "arm", nothing on other macOS machines, and ".lin" everywhere else.
    On Windows every element is additionally passed through
    ``os.path.normpath``.

    The input list is never modified; a new list is always returned.
    Previously only the Windows branch copied, while the other branches
    appended the suffix to the caller's list in place.
    """
    if sys.platform.startswith("win32"):
        command = [os.path.normpath(x) for x in popen]
        suffix = ".exe"
    else:
        command = list(popen)
        if not sys.platform.startswith("darwin"):
            suffix = ".lin"
        elif platform.machine().startswith("arm"):
            suffix = ".arm"
        else:
            suffix = ""

    if suffix:
        command[0] += suffix
    return command
class ReadingNode:
    """A piece of output text together with its optional furigana reading."""

    def __init__(self, text: str, reading: Optional[str]):
        # Reading to attach to the text; None means "emit the text as-is"
        self.reading = reading
        # The text exactly as it should appear in the output
        self.text = text

    def format(self, useRubyTags: bool) -> str:
        """Render the node as a string.

        Without a reading the bare text is returned. Otherwise the result is
        an HTML <ruby> element when ``useRubyTags`` is true, or the
        bracket form ``text[reading]`` when it is false.
        """
        if self.reading is None:
            return self.text

        template = (
            "<ruby>%s<rp>(</rp><rt>%s</rt><rp>)</rp></ruby>"
            if useRubyTags
            else '%s[%s]'
        )
        return template % (self.text, self.reading)
class RegexDefinition:
    """One segment of a word and the regex capture group that holds its reading."""

    def __init__(self, text: str, regexGroupIndex: Optional[int]):
        # Zero-based number of the capture group matching this segment's
        # reading, or None when the segment is inlined into the pattern
        # literally and therefore has no group of its own
        self.regexGroupIndex = regexGroupIndex
        # The segment's original characters
        self.text = text
def kanjiToRegex(kanji: str):
    """Build a regex that distributes a hiragana reading over ``kanji``.

    The word is walked left to right and cut into segments:

    * each run of consecutive non-kana characters becomes one lazy capture
      group ``(.+?)``;
    * a kana character with alternative readings (see
      getAdditionalPossibleReadings) becomes a capture group listing every
      candidate, so it receives furigana like a kanji would;
    * any other kana character is inlined, converted to hiragana, as a
      literal.

    Returns a ``(pattern, definitions)`` tuple: the pattern anchored with
    ``^``/``$``, and one RegexDefinition per segment in order, carrying the
    segment's original text and its zero-based capture-group index (None for
    inlined kana).
    """
    patternParts: list[str] = []
    definitions: list[RegexDefinition] = []
    groupCount = 0
    length = len(kanji)
    pos = 0

    while pos < length:
        char = kanji[pos]

        if not isKana(char):
            # Absorb the whole run of sequential non-kana characters into a
            # single lazy capture group
            runStart = pos
            while pos < length and not isKana(kanji[pos]):
                pos += 1
            patternParts.append("(.+?)")
            definitions.append(RegexDefinition(kanji[runStart:pos], groupCount))
            groupCount += 1
            continue

        # The reading is ALWAYS hiragana, so match against the hiragana form
        # while keeping the original character for the output text
        hiragana = convertToHiragana(char)
        alternatives = getAdditionalPossibleReadings(hiragana)
        if alternatives:
            # We don't know for SURE which reading to expect, so every
            # candidate goes into the group; capturing it also routes this
            # kana through the furigana path normally used for kanji
            patternParts.append("(" + "|".join([hiragana] + alternatives) + ")")
            definitions.append(RegexDefinition(char, groupCount))
            groupCount += 1
        else:
            patternParts.append(hiragana)
            definitions.append(RegexDefinition(char, None))

        # Advance to the next character
        pos += 1

    return ("^%s$" % ''.join(patternParts), definitions)
class MecabController(object):
    """Drive a MeCab subprocess and turn its output into furigana.

    The process is started lazily by ``ensureOpen`` and then reused: each
    ``reading`` call writes one line to its stdin and reads one line back
    from its stdout.
    """

    def __init__(self):
        # Popen handle of the MeCab process; None until ensureOpen() starts it
        self.mecab = None

    def setup(self):
        """Build the MeCab command line and prepare the environment for it."""
        self.mecabCmd = mungeForPlatform(
            [os.path.join(mecabDir, "mecab")]
            + mecabArgs
            + ['-d', mecabDir, '-r', os.path.join(mecabDir, "mecabrc")])
        # Let the dynamic loader look in the support directory
        os.environ['DYLD_LIBRARY_PATH'] = mecabDir
        os.environ['LD_LIBRARY_PATH'] = mecabDir
        if not sys.platform.startswith("win32"):
            # Make sure the binary is executable
            os.chmod(self.mecabCmd[0], 0o755)

    def ensureOpen(self):
        """Start the MeCab process if it has not been started yet.

        Raises:
            Exception: if the process cannot be launched; the underlying
                OSError is attached as the cause.
        """
        if not self.mecab:
            self.setup()
            try:
                self.mecab = subprocess.Popen(
                    self.mecabCmd, bufsize=-1, stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                    startupinfo=si)
            except OSError as err:
                raise Exception(
                    "Please ensure your Linux system has 64 bit binary support.") from err

    def reading(self, expr, ignoreNumbers = True, useRubyTags = False):
        """Return ``expr`` annotated with furigana.

        Args:
            expr: text to annotate; may contain HTML tags, which are put
                back unchanged into the result.
            ignoreNumbers: when true, a node found inside the numeral string
                below is left without a reading.
            useRubyTags: when true readings are emitted as <ruby> markup,
                otherwise as ``text[reading]``.

        Returns:
            The annotated text.
        """
        self.ensureOpen()
        matches, expr = escapeText(expr)
        # MeCab's output uses ASCII spaces as node separators, so spaces in
        # the input are swapped for a token and restored at the end
        expr = expr.replace(" ", ASCII_SPACE_TOKEN)
        self.mecab.stdin.write(expr.encode("utf-8", "ignore") + b'\n')
        self.mecab.stdin.flush()
        expr = self.mecab.stdout.readline().rstrip(b'\r\n').decode('utf-8', "ignore")

        nodes: list[ReadingNode] = []

        for node in expr.split(" "):
            if not node:
                break

            parsed = re.match(r"(.+)\[(.*)\]", node)
            if parsed is None:
                # Not in the expected "text[reading]" shape; keep the text
                # verbatim instead of failing on a None match
                nodes.append(ReadingNode(node, None))
                continue
            (kanji, reading) = parsed.groups()

            # katakana, punctuation, not japanese, or lacking a reading
            # NOTE: Katakana goes down this path because Mecab returns all
            # readings in katakana, so a katakana word looks like 'カリン[カリン]'
            if kanji == reading or not reading:
                nodes.append(ReadingNode(kanji, None))
                continue

            # convert reading from katakana to hiragana
            reading = convertToHiragana(reading)

            # Text in sentence is hiragana
            if kanji == reading:
                nodes.append(ReadingNode(kanji, None))
                continue

            # don't add readings of numbers
            if ignoreNumbers and kanji in u"一二三四五六七八九十0123456789":
                nodes.append(ReadingNode(kanji, None))
                continue

            # Convert the kanji variable into a Regex pattern where non-kana are
            # turned into Regex capture groups, and then apply it to the reading
            # to figure out (using lazy matching) what the smallest furigana readings
            # are for the kanji
            (regexPattern, regexDefinitions) = kanjiToRegex(kanji)
            match = re.search(regexPattern, reading)
            if match is None:
                # The kana written in the word do not line up with the
                # reading (eg a full-size ケ that is read as が), so the
                # reading cannot be split per kanji run. Attach the whole
                # reading to the whole word rather than crashing.
                nodes.append(ReadingNode(kanji, reading))
                continue

            for definition in regexDefinitions:
                if definition.regexGroupIndex is None:
                    nodes.append(ReadingNode(definition.text, None))
                else:
                    # Regex groups are 1-based, the stored indices 0-based
                    groupReading = match.group(definition.regexGroupIndex + 1)
                    nodes.append(ReadingNode(definition.text, groupReading))

        # Combine our nodes together into a single sentence
        fin = ''.join(node.format(useRubyTags) for node in nodes)

        # Finalize formatting
        fin = fin.replace(ASCII_SPACE_TOKEN, ' ')
        # Put the original HTML tags back, one placeholder at a time, in order
        for htmlTag in matches:
            fin = fin.replace(HTML_REPLACER, htmlTag, 1)

        fin = re.sub(r'& ?nbsp ?;', ' ', re.sub(r"< ?br ?>", "<br>", re.sub(r"> ", ">", fin.strip())))
        return fin
# Init
# Shared module-level controller. Constructing it only sets its process handle
# to None; the MeCab process itself is started by ensureOpen(), which
# reading() calls before doing any work.
mecab = MecabController()