"""
Translate the English rule files (but not the unicode files) to the target language.
This is done with the function build_all_translations().
The unicode files are not built here because they are large enough to seem to occasionally run into hiccups.
See the end of this file how this is used (typically change 'language' and just run the file)
"""
import re
import os
import sys
import time
sys.stdout.reconfigure(encoding='utf-8')
# Translate text in rules into the target language.
# An alternative (commented out here) is https://github.com/ffreemt/google-stranslate (pip install itranslate):
# from itranslate import itranslate as translate
# TRANSLATE_URL = "https://translate.google.us"
#
# The translation used here is done via googletrans.
# Note: needed to use 'pip install googletrans==4.0.0-rc1' and there is some concern this package might go away
from googletrans import Translator
GoogleTranslate = Translator(service_urls=["translate.google.us"])
# Google allows up to 500K chars translation/month, so using a key likely would be free anyway
# Unlike the unicode file, the rule files don't have a lot of text.
#
# To speed things up and avoid getting blocked, each file is processed in two passes:
# 1. First pass: gather all the text to translate into lists -- the contents of "phrase(...)" comments and of 't: "..."' strings.
# 2. Turn each list into a single string with separators, translate it, and split it back into a list.
# 3. Second pass: reread the file, substituting the translations in order (the lists are in file order), and write the result out.
PhraseToTranslate = re.compile(r'phrase\(([^)]+)\)')
WordToTranslate = re.compile(r't: "([^"]+)"')
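# Illustrative examples of what the regexes capture (hypothetical lines, not taken from an actual rule file):
#   a line containing:   # phrase(the 'square root' of x)
#     -> PhraseToTranslate captures "the 'square root' of x"
#   a line containing:   - t: "square root"
#     -> WordToTranslate captures "square root"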


# run over the file and figure out what phrases and words need to be translated
def collect_phrases_to_translate(file_to_translate: str) -> tuple[list[str], list[str]]:
    with open(file_to_translate, 'r', encoding='utf8') as in_stream:
        phrases = []
        words = []
        for line in in_stream:
            phrase = PhraseToTranslate.search(line)
            if phrase:
                phrases.append(phrase.group(1))
            word = WordToTranslate.search(line)
            if word:
                words.append(word.group(1))
        print(f"#phrases={len(phrases)}, #words={len(words)}")
        return (phrases, words)
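
# For example (made-up data), collect_phrases_to_translate("ClearSpeak_Rules.yaml") might return something like
#   (["the 'square root' of x", ...], ["square root", ...])
# with one entry per matching line, in file order.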

# break the phrases up into chunks to make google translate happy (and to run faster);
# the translations are returned as a list in the same order as the input
MAX_CHARS_IN_CHUNK = 4500   # 4500 sometimes failed (language code "no")
# try to avoid google banning us
TIMEOUT = 2

def translate_phrases(phrases_to_translate: list[str], lang) -> list[str]:
    if lang == 'nb' or lang == 'nn':
        lang = 'no'     # google doesn't know those variants, but SRE uses them

    def do_translation_chunk(phrases: list[str]):
        if len(phrases) == 0:
            return phrases      # file with no "phrase(...)"
        # translate doesn't handle a list properly -- use ".\n" to separate phrases
        phrases_string = ".\n".join(phrases)
        # print("***Phrases to translate: {}\n".format(phrases))
        translated_phrases_str: str = GoogleTranslate.translate(phrases_string, src='en', dest=lang).text
        translated_phrases_str = translated_phrases_str.replace('。', '.')   # happens for Chinese
        translated_phrases_str = translated_phrases_str.replace('"', "'").replace("“", "'").replace("”", "'")   # google occasionally changes quotes
        translated_phrases_str = translated_phrases_str.replace("«", "'").replace("»", "'")   # google occasionally changes quotes to this form
        translated_phrases_str = translated_phrases_str.replace("、", ",")   # Chinese comma
        translated_phrases_str = translated_phrases_str.lower()
        translated_phrases_list = translated_phrases_str.split('.\n')
        if len(translated_phrases_list) != len(phrases):
            print("\n!!!Problem in translation: size of translations ({}) differs from phrases to translate ({})\n".format(len(translated_phrases_list), len(phrases)))
            print("English phrases: {}\n".format(phrases))
            print("Truncated translated phrases: {}\n".format(translated_phrases_list))
            # The Finnish translation (at least) for some reason has a few failures where ".\n" is only "\n" (and translation failed)
            # We try a last attempt by deleting the '.' and splitting at the newline
            print("Retrying by assuming '.' is missing...")
            translated_phrases_list = translated_phrases_str.replace('.', '').split('\n')
            if len(translated_phrases_list) != len(phrases):
                print("!!!***Retry failed: size of translations ({}) differs from phrases to translate ({})\n".format(len(translated_phrases_list), len(phrases)))
                print("Phrases to translate:\n{}".format(list(phrases)))
                print("Translations:\n{}".format(list(translated_phrases_list)))
        return translated_phrases_list
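
    # Chunking: phrases are accumulated until roughly MAX_CHARS_IN_CHUNK characters have been collected,
    # that chunk is sent as one translation request, and then we sleep for TIMEOUT seconds before the next one.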
    translations = []
    char_count = 0
    phrases_chunks_to_translate = []
    for phrase in phrases_to_translate:
        phrases_chunks_to_translate.append(phrase)
        char_count += len(phrase)
        if char_count >= MAX_CHARS_IN_CHUNK:
            print("char_count={}".format(char_count))
            translations += do_translation_chunk(phrases_chunks_to_translate)
            print("Translated {} phrases...".format(len(phrases_chunks_to_translate)))
            char_count = 0
            phrases_chunks_to_translate = []
            time.sleep(TIMEOUT)     # try to avoid google banning us
    return translations + do_translation_chunk(phrases_chunks_to_translate)
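
# A minimal usage sketch (hypothetical phrases; requires network access to Google Translate):
#   translate_phrases(["the 'square root' of x", "'x' over 'y'"], 'fr')
# would return a list of two lowercased French phrases in the same order as the input.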

TargetWord = re.compile(r"'([^']+)'")
TextString = re.compile(r'([ \[{][oc]?t: )"([^"]+)"')


def substitute_in_translated_phrase(line, translated_phrase, translated_word) -> str:
    has_phrase = PhraseToTranslate.search(line)
    target_words = TargetWord.search(translated_phrase)
    text_words = TextString.search(line)
    new_line = line
    if has_phrase and target_words and text_words:      # test for text_words handles "variables: [....]"
        try:
            replacement = text_words.group(1) + '"' + target_words.group(1) + '"'   # add the surrounding context back
        except AttributeError:
            print(f"text_words={text_words}, target_words={target_words}, line='{line}'")
            sys.exit(1)
        new_line = TextString.sub(replacement, line)
        # print("fixed line: {}".format(new_line))
    elif text_words:
        print(f"Failed to find quoted part in translation \"{translated_phrase}\", "
              f"using '{translated_word}'\n original line: {line}")
        replacement = text_words.group(1) + '"' + translated_word + '"'   # add the surrounding context back
        new_line = TextString.sub(replacement, line)
    return new_line
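
# A hypothetical before/after example (not from an actual rule file): given the line
#     - t: "square root"      # phrase('square root' of x)
# and a translated phrase of "'racine carrée' de x", the t: string is rewritten to
#     - t: "racine carrée"    # phrase('square root' of x)
# (the phrase() comment itself is left untouched).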


def create_new_file(file_to_translate: str, output_file: str,
                    phrase_translations: list[str], word_translations: list[str]) -> None:
    with open(output_file, 'w', encoding='utf8') as out_stream:
        with open(file_to_translate, 'r', encoding='utf8') as in_stream:
            iPhraseTranslation = 0
            iWordTranslation = 0
            # need to add an extra element to both lists because the indexes are inc'd after last entry but could be more non-translation lines
            phrase_translations.append("dummy")
            word_translations.append("dummy")
            for line in in_stream:
                out_stream.write(substitute_in_translated_phrase(
                    line, phrase_translations[iPhraseTranslation], word_translations[iWordTranslation]))
                if PhraseToTranslate.search(line):
                    iPhraseTranslation += 1
                if WordToTranslate.search(line):
                    iWordTranslation += 1


def build_new_translation(path_to_mathcat: str, lang: str, rule_file_name: str) -> None:
    print("build_new_translation: rule_file_name=", rule_file_name)
    file_to_translate = "{}/Rules/Languages/en/{}".format(path_to_mathcat, rule_file_name)
    (phrases_to_translate, words_to_translate) = collect_phrases_to_translate(file_to_translate)
    phrase_translations = translate_phrases(phrases_to_translate, lang)
    word_translations = translate_phrases(words_to_translate, lang)
    print(f"file:{rule_file_name}: #phrases={len(phrase_translations)}, #words={len(word_translations)}")
    create_new_file(file_to_translate, os.path.join(lang, rule_file_name), phrase_translations, word_translations)
    print("done\n")


def build_all_translations(path_to_mathcat: str, lang: str, subdir="") -> None:
    dir_to_translate = os.path.join(path_to_mathcat, "Rules", "Languages", "en", subdir)
    entries = os.listdir(dir_to_translate)
    for entry in entries:
        if os.path.isdir(os.path.join(dir_to_translate, entry)):
            build_all_translations(path_to_mathcat, lang, os.path.join(subdir, entry))
        elif entry.endswith('.yaml') and not (entry == "definitions.yaml" or entry == "unicode.yaml" or entry == "unicode-full.yaml"):
            # the excluded files are built in translate-unicode.py and need some manual checking, so they are not included here
            build_new_translation(path_to_mathcat, lang, os.path.join(subdir, entry))
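
# Driver code: 'language' is the target language code ('el' is Greek); change it to another code
# (e.g., 'fr' for French) and rerun the file. Output is written under ./<language>/ relative to the current directory.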
language = 'el'
if not os.path.exists(language):
    os.makedirs(language)
if not os.path.exists(language + "/SharedRules"):
    os.makedirs(language + "/SharedRules")

# build_new_translation("..", language, "ClearSpeak_Rules.yaml")
build_all_translations("..", language)