forked from learningequality/ka-lite
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaccenting.py
164 lines (139 loc) · 5.77 KB
/
accenting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# -*- coding: utf-8 -*-
"""**accenting** allows you to convert certain ascii strings to their
accented versions (the convert function). There's also convert_msg, a
convenience function useful for converting po/mo file entries. Used
mainly for debugging i18n.
"""
import re
import itertools
# Acquired from
# https://github.com/edx/i18n-tools/blob/master/i18n/dummy.py and
# https://github.com/edx/i18n-tools/blob/master/i18n/converter.py
class Converter(object):
    r"""Transform strings into accented look-alikes, leaving embedded tags intact.

    Embedded tags (HTML tags, ``{...}`` fields, ``%(name)s`` Python
    placeholders, HTML entities and ``[[...]]`` widgets -- see
    ``tag_pattern``) are hidden from the transformation; every other
    character listed in ``Converter.TABLE`` is swapped for its accented
    counterpart.

    Strategy:

    1. extract the tags embedded in the string
       a. replace each tag in the string with a numbered placeholder
          (<0>, <1>, etc.)
       b. save the extracted tags in a list, indexed by that number
    2. convert the remaining text (``inner_convert_string``)
    3. re-insert the extracted tags in place of the placeholders

    Subclasses can customise step 2 by overriding ``inner_convert_string()``
    or ``pad()``. In this class ``pad()`` returns its input unchanged, so
    no padding or end-of-string marker is added.

    Example use::

        >>> c = Converter()
        >>> print(c.convert("My name is Bond, James Bond"))
        Mý nämé ïs Bönd, Jämés Bönd
        >>> print(c.convert("don't convert <a href='href'>tag ids</a>"))
        dön't çönvért <a href='href'>täg ïds</a>
        >>> print(c.convert("don't convert %(name)s tags on %(date)s"))
        dön't çönvért %(name)s tägs ön %(date)s
    """

    # Maps each plain ASCII letter to the accented character that replaces it.
    TABLE = dict(zip(
        u"AabCcEeIiOoUuYy",
        u"ÀäßÇçÉéÌïÖöÛüÝý"
    ))

    # matches tags like these:
    #    HTML:   <B>, </B>, <BR/>, <textformat leading="10">
    #    Python: %(date)s, %(name)s
    tag_pattern = re.compile(
        r'''
        (<[^>]+>) | # <tag>
        ({[^}]+}) | # {tag}
        (%\([\w]+\)\w) | # %(tag)s
        (&\w+;) | # &entity;
        (&\#\d+;) | # Ӓ
        (&\#x[0-9a-f]+;) | # ꯍ
        (\[\[.*\]\]) # [[snowman radio]];
        ''',
        re.IGNORECASE | re.VERBOSE
    )

    # Matches the numbered placeholders that detag_string() leaves behind.
    _placeholder_pattern = re.compile(r'<\d+>')

    def convert(self, string):
        """Return ``string`` with its text accented and its tags untouched.

        param: string (may contain HTML tags, format placeholders, entities)
        returns: the converted string; characters inside tags are not
        replaced.
        """
        (string, tags) = self.detag_string(string)
        string = self.inner_convert_string(string)
        string = self.retag_string(string, tags)
        return string

    def detag_string(self, string):
        """Extract the tags from ``string``.

        returns (string, list) where
        string: the input with each tag replaced by its index
                (<BR>... => <0>, <1>, <2>, etc.)
        list: the removed tags, in order ('<BR>', '<I>', '</I>')
        """
        tags = []

        def stash(match):
            # Record the tag and hand back the placeholder carrying its
            # index. Doing both in one callback keeps the tag list and the
            # placeholders in step, and avoids the Python 2-only
            # iterator.next() method.
            tags.append(match.group(0))
            return '<%s>' % (len(tags) - 1)

        new = self.tag_pattern.sub(stash, string)
        return (new, tags)

    def retag_string(self, string, tags):
        """Substitute each tag back into ``string`` in place of <0>, <1> etc.

        All placeholders are restored in a single pass, so a restored tag
        that itself looks like a placeholder (e.g. a literal ``<1>``) is
        never substituted again. Tags are inserted verbatim: backslashes in
        a tag are not interpreted as regex replacement escapes. Placeholders
        with no corresponding entry in ``tags`` are left as they are.
        """
        lookup = dict(('<%s>' % i, tag) for i, tag in enumerate(tags))

        def restore(match):
            placeholder = match.group(0)
            return lookup.get(placeholder, placeholder)

        return self._placeholder_pattern.sub(restore, string)

    def inner_convert_string(self, string):
        """Replace every character found in ``TABLE`` and pad the result."""
        for old, new in self.TABLE.items():
            string = string.replace(old, new)
        return self.pad(string)

    def pad(self, string):
        """Hook for subclasses that want to lengthen the converted string.

        This implementation returns ``string`` unchanged.
        """
        return string

    def convert_msg(self, msg):
        """
        Takes one POEntry object and converts it (adds a dummy translation to it)
        msg is an instance of polib.POEntry

        Entries with an empty msgid are left alone. Entries with a
        msgid_plural get ``msgstr_plural`` set (keys '0' and '1'); all
        others get ``msgstr`` set.
        """
        source = msg.msgid
        if not source:
            # don't translate empty string
            return

        plural = msg.msgid_plural
        if plural:
            # translate singular and plural
            foreign_single = self.convert(source)
            foreign_plural = self.convert(plural)
            plural = {
                '0': self.final_newline(source, foreign_single),
                '1': self.final_newline(plural, foreign_plural),
            }
            msg.msgstr_plural = plural
        else:
            foreign = self.convert(source)
            msg.msgstr = self.final_newline(source, foreign)

    def final_newline(self, original, translated):
        """Return ``translated``, newline-terminated if ``original`` is.

        If the last char of original is a newline, make sure the translation
        has a newline too. An empty ``translated`` is handled (it simply
        becomes a single newline when ``original`` ends with one).
        """
        if original and original.endswith('\n') and not translated.endswith('\n'):
            translated += '\n'
        return translated
# Main API
#
# convert: turn certain ASCII characters in a string into their accented
# versions.
convert = Converter().convert

# convert_msg: convert a (M|P)OEntry, storing the accented version of its
# msgid as the entry's msgstr.
convert_msg = Converter().convert_msg