From 9d45c3d7d8eb689e84fa8b4d7591c15eb59c5da7 Mon Sep 17 00:00:00 2001 From: bookfere Date: Fri, 24 Nov 2023 02:18:55 +0800 Subject: [PATCH] refactor: Optimized the feature for adding translations to elements. --- lib/element.py | 138 +++++++++++++++++++++++------------------- tests/test_element.py | 98 +++++++++++++++++++++++------- 2 files changed, 153 insertions(+), 83 deletions(-) diff --git a/lib/element.py b/lib/element.py index d305f9c..af99fd5 100644 --- a/lib/element.py +++ b/lib/element.py @@ -156,12 +156,12 @@ def add_translation(self, translation, placeholder, position=None, new_element = etree.XML('<{0} xmlns="{1}">{2}'.format( get_name(self.element), ns['x'], trim(translation))) # Preserve all attributes from the original element. - for k, v in self.element.items(): - if k == 'id' and position != 'only': + for name, value in self.element.items(): + if name == 'id' and position != 'only': continue - if k == 'dir': - v = 'auto' - new_element.set(k, v) + if name == 'dir': + value = 'auto' + new_element.set(name, value) if color is not None: new_element.set('style', 'color:%s' % color) if lang is not None: @@ -291,7 +291,9 @@ def __init__(self, placeholder, separator, merge_length=0): self.lang = None self.elements = {} - self.original = [] + self.originals = [] + + self.base_originals = [] def get_merge_length(self): return self.merge_length @@ -305,29 +307,33 @@ def set_translation_color(self, color): def set_translation_lang(self, lang): self.lang = lang - def get_elements(self): - return self.elements - def remove_unused_elements(self): if self.position == 'only': for element in self.elements.values(): element.delete() def prepare_original(self, elements): - for eid, element in enumerate(elements): - self.elements[eid] = element + count = 0 + for oid, element in enumerate(elements): raw = element.get_raw() content = element.get_content(self.placeholder) - md5 = uid('%s%s' % (eid, content)) + md5 = uid('%s%s' % (oid, content)) attrs = element.get_attributes() - self.original.append( - (eid, md5, raw, content, element.ignored, attrs, - element.page_id)) - return self.original + if not element.ignored: + self.elements[count] = element + self.base_originals.append(content) + count += 1 + self.originals.append( + (oid, md5, raw, content, element.ignored, attrs, + element.page_id)) + return self.originals def add_translations(self, paragraphs): + count = 0 for paragraph in paragraphs: - element = self.elements.get(paragraph.id) + if paragraph.original not in self.base_originals: + continue + element = self.elements.get(count) if not element: continue translation = paragraph.translation @@ -335,7 +341,8 @@ def add_translations(self, paragraphs): element.add_translation( translation, self.placeholder, self.position, self.lang, self.color) - self.elements.pop(paragraph.id) + self.elements.pop(count) + count += 1 for eid, element in self.elements.copy().items(): if element.ignored: self.elements.pop(eid) @@ -345,59 +352,69 @@ def add_translations(self, paragraphs): class ElementHandlerMerge(ElementHandler): def prepare_original(self, elements): raw = '' - content = '' + txt = '' + oid = 0 count = 0 - for eid, element in enumerate(elements): + for element in elements: if element.ignored: continue - self.elements[eid] = element - separator = self.separator \ - or ' %s ' % self.placeholder[0].format(eid) + self.elements[count] = element code = element.get_raw() - text = element.get_content(self.placeholder) + separator - if len(content + text) < self.merge_length: - raw += code - content += text + content = element.get_content(self.placeholder) + self.base_originals.append(content) + count += 1 + content += self.separator + if len(txt + content) < self.merge_length: + raw += code + self.separator + txt += content continue - elif content: - md5 = uid('%s%s' % (count, content)) - self.original.append((count, md5, raw, content, False)) - count += 1 + elif txt: + md5 = uid('%s%s' % (oid, txt)) + self.originals.append((oid, md5, raw, txt, False)) + oid += 1 raw = code - content = text - md5 = uid('%s%s' % (count, content)) - content and self.original.append((count, md5, raw, content, False)) - return self.original + txt = content + md5 = uid('%s%s' % (oid, txt)) + txt and self.originals.append((oid, md5, raw, txt, False)) + return self.originals + + def align_paragraph(self, paragraph): + # Compatible with using the placeholder as the separator. + if paragraph.original[-2:] != self.separator: + pattern = re.compile( + r'\s*%s\s*' % self.placeholder[1].format(r'(0|[^0]\d*)')) + paragraph.original = pattern.sub( + self.separator, paragraph.original) + paragraph.translation = pattern.sub( + self.separator, paragraph.translation) + # Ensure the translation count matches the actual elements count. + originals = paragraph.original.strip().split(self.separator) + pattern = re.compile('%s+' % self.separator) + translations = pattern.sub(self.separator, paragraph.translation) + translations = translations.strip().split(self.separator) + offset = len(originals) - len(translations) + if offset > 0: + translations += ['-'] * offset + elif offset < 0: + translations = translations[:offset] + for original in originals: + if original and original not in self.base_originals: + translations.pop(originals.index(original)) + return translations def add_translations(self, paragraphs): - content = '' + translations = [] for paragraph in paragraphs: - tail = paragraph.original[-2:] - tail = tail if tail == self.separator else '' - if paragraph.translation: - content += paragraph.translation + tail - # Check if the translated content contains at least one separator; - # if none is found, use the placeholder to separate paragraphs. - if self.separator and self.separator in content: - pattern = '%s+' % self.separator - content = re.sub(pattern, self.separator, content) - else: - self.separator = None - + translations.extend(self.align_paragraph(paragraph)) + count = 0 for eid, element in self.elements.copy().items(): - separator = self.separator or self.placeholder[1].format(eid) - matches = re.search(separator, content) - if not matches: + if element.ignored: continue - pattern = matches.group(0) - end = content.find(pattern) - part = content[:end] - content = content.replace(part + pattern, '', 1) - if not element.ignored: - element.add_translation( - part.strip(), self.placeholder, self.position, self.lang, - self.color) - self.elements.pop(eid) + element.add_translation( + translations[count], self.placeholder, self.position, + self.lang, self.color) + count += 1 + self.elements.pop(eid) self.remove_unused_elements() @@ -449,5 +466,4 @@ def get_element_handler(placeholder, separator): handler.set_translation_position( config.get('translation_position')) handler.set_translation_color(config.get('translation_color')) - return handler diff --git a/tests/test_element.py b/tests/test_element.py index b6b0604..863e15f 100644 --- a/tests/test_element.py +++ b/tests/test_element.py @@ -474,9 +474,11 @@ def test_prepare_original(self, mock_uid): def test_add_translations(self): self.handler.prepare_original(self.elements) translations = [ - Paragraph(0, 'm1', '

a

', 'a', False, '{"id": "a"}', + Paragraph(0, 'm1', '

x

', 'x', False, '{"id": "x"}', + 'p1', None, 'ENGINE', 'LANG'), + Paragraph(1, 'm1', '

a

', 'a', False, '{"id": "a"}', 'p1', 'A', 'ENGINE', 'LANG'), - Paragraph(1, 'm2', '

b

', 'b', False, '{"id": "b"}', + Paragraph(2, 'm2', '

b

', 'b', False, '{"id": "b"}', 'p1', 'B', 'ENGINE', 'LANG'), Paragraph(3, 'm3', '

c

', 'c', False, '{"id": "c", "class": "c"}', 'p1', 'C', 'ENGINE', @@ -485,6 +487,7 @@ def test_add_translations(self): self.handler.add_translations(translations) elements = self.xhtml.findall('./x:body/*', namespaces=ns) + self.assertEqual(8, len(elements)) self.assertEqual('a', elements[0].text) self.assertEqual('A', elements[1].text) @@ -493,6 +496,7 @@ def test_add_translations(self): self.assertEqual('c', elements[5].text) self.assertEqual('C', elements[6].text) + self.assertIsNone(elements[6].get('id')) self.assertEqual('c', elements[6].get('class')) @@ -540,24 +544,64 @@ def setUp(self): in self.xhtml.findall('./x:body/*', namespaces=ns)] self.elements[-1].set_ignored(True) self.elements[-3].set_ignored(True) - self.handler = ElementHandlerMerge(Base.placeholder, None, 1000) + self.handler = ElementHandlerMerge( + Base.placeholder, Base.separator, 1000) - @patch('calibre_plugins.ebook_translator.lib.element.uid') - def test_prepare_original_merge_placeholder(self, mock_uid): - mock_uid.return_value = 'm1' - self.assertEqual([( - 0, 'm1', '

a

b

c

', - 'a {{id_0}} b {{id_1}} c {{id_3}} ', False)], - self.handler.prepare_original(self.elements)) + def test_align_paragraph(self): + self.handler.prepare_original(self.elements) + + paragraph = Paragraph( + 0, 'm1', '

a

b

c

', + 'a {{id_0}} b {{id_1}} c {{id_3}}', False, None, None, + 'A {{id_0}} B {{id_1}} C {{id_3}}', 'ENGINE', 'LANG') + self.assertEqual( + ['A', 'B', 'C'], self.handler.align_paragraph(paragraph)) + + paragraph = Paragraph( + 0, 'm1', '

a

b

c

', + 'a {{id_0}} b {{id_1}} c {{id_3}}', False, None, None, + 'A {{id_0}} B {{id_1}} C {{id_4}} D {{id_5}}', 'ENGINE', 'LANG') + self.handler.align_paragraph(paragraph) + self.assertEqual( + ['A', 'B', 'C'], self.handler.align_paragraph(paragraph)) + + paragraph = Paragraph( + 0, 'm1', '

a

b

c

', + 'a {{id_0}} b {{id_1}} c {{id_3}}', False, None, None, + 'A {{id_0}} B {{id_1}}', 'ENGINE', 'LANG') + self.assertEqual( + ['A', 'B', '-'], self.handler.align_paragraph(paragraph)) + + paragraph = Paragraph( + 0, 'm1', '

a

b

c

', + 'a\n\nb\n\nc\n\n', False, None, None, 'A\n\nB\n\nC\n\n', + 'ENGINE', 'LANG') + self.assertEqual( + ['A', 'B', 'C'], self.handler.align_paragraph(paragraph)) + + paragraph = Paragraph( + 0, 'm1', '

a

b

c

', + 'a\n\nb\n\nc\n\n', False, None, None, 'A\n\nB\n\nC\n\nD\n\nE\n\n', + 'ENGINE', 'LANG') + self.assertEqual( + ['A', 'B', 'C'], self.handler.align_paragraph(paragraph)) + + paragraph = Paragraph( + 0, 'm1', '

a

b

c

', + 'a\n\nb\n\nc\n\n', False, None, None, 'A\n\nB\n\n', + 'ENGINE', 'LANG') + self.assertEqual( + ['A', 'B', '-'], self.handler.align_paragraph(paragraph)) @patch('calibre_plugins.ebook_translator.lib.element.uid') def test_prepare_original_merge_separator(self, mock_uid): mock_uid.return_value = 'm1' self.handler.separator = Base.separator self.assertEqual([( - 0, 'm1', '

a

b

c

', - 'a\n\nb\n\nc\n\n', False)], + 0, 'm1', '

a

\n\n

b

\n\n

c

\n\n', 'a\n\nb\n\nc\n\n', False)], self.handler.prepare_original(self.elements)) + self.assertEqual(['a', 'b', 'c'], self.handler.base_originals) @patch('calibre_plugins.ebook_translator.lib.element.uid') def test_prepare_original_merge_separator_multiple(self, mock_uid): @@ -569,6 +613,7 @@ def test_prepare_original_merge_separator_multiple(self, mock_uid): (1, 'm2', '

b

', 'b\n\n', False), (2, 'm3', '

c

', 'c\n\n', False)] self.assertEqual(items, self.handler.prepare_original(self.elements)) + self.assertEqual(['a', 'b', 'c'], self.handler.base_originals) def test_add_translations_merge_placeholder(self): self.handler.prepare_original(self.elements) @@ -617,12 +662,13 @@ def test_add_translations_merge_separator(self): elements = self.xhtml.findall('./x:body/*', namespaces=ns) - self.assertEqual(7, len(elements)) + self.assertEqual(8, len(elements)) self.assertEqual('a', elements[0].text) self.assertEqual('A B', elements[1].text) self.assertEqual('b', elements[2].text) self.assertEqual('C', elements[3].text) self.assertEqual('c', elements[5].text) + self.assertEqual('-', elements[6].text) def test_add_translations_merge_separator_multiple(self): self.handler.merge_length = 2 @@ -656,13 +702,15 @@ def test_add_translations_merge_placeholder_missing_id(self): 'A B {{id_1}} C {{id_3}}', 'ENGINE', 'LANG')]) elements = self.xhtml.findall('./x:body/*', namespaces=ns) - self.assertEqual(7, len(elements)) + + self.assertEqual(8, len(elements)) self.assertEqual('a', elements[0].text) - self.assertEqual('b', elements[1].text) - self.assertEqual('A B', elements[2].text) + self.assertEqual('A B', elements[1].text) + self.assertEqual('b', elements[2].text) + self.assertEqual('C', elements[3].text) - self.assertEqual('c', elements[4].text) - self.assertEqual('C', elements[5].text) + self.assertEqual('c', elements[5].text) + self.assertEqual('-', elements[6].text) def test_add_translations_merge_placeholder_missing_newline(self): self.handler.separator = Base.separator @@ -673,13 +721,15 @@ def test_add_translations_merge_placeholder_missing_newline(self): 'A B\n\nC\n\n', 'ENGINE', 'LANG')]) elements = self.xhtml.findall('./x:body/*', namespaces=ns) - self.assertEqual(7, len(elements)) + + self.assertEqual(8, len(elements)) self.assertEqual('a', elements[0].text) self.assertEqual('A B', elements[1].text) self.assertEqual('b', elements[2].text) self.assertEqual('C', elements[3].text) self.assertEqual('c', elements[5].text) + self.assertEqual('-', elements[6].text) def test_add_translations_merge_palceholder_only(self): self.handler.position = 'only' @@ -725,10 +775,11 @@ def test_add_translations_merge_placeholder_only_missing_id(self): 'A B {{id_1}} C {{id_3}}', 'ENGINE', 'LANG')]) elements = self.xhtml.findall('./x:body/*', namespaces=ns) - self.assertEqual(4, len(elements)) + self.assertEqual(5, len(elements)) self.assertEqual('A B', elements[0].text) + self.assertEqual('C', elements[1].text) - self.assertEqual('C', elements[2].text) + self.assertEqual('-', elements[3].text) def test_add_translations_merge_separator_only_missing_id(self): self.handler.position = 'only' @@ -741,6 +792,9 @@ def test_add_translations_merge_separator_only_missing_id(self): 'A B\n\nC\n\n', 'ENGINE', 'LANG')]) elements = self.xhtml.findall('./x:body/*', namespaces=ns) - self.assertEqual(4, len(elements)) + + self.assertEqual(5, len(elements)) self.assertEqual('A B', elements[0].text) self.assertEqual('C', elements[1].text) + + self.assertEqual('-', elements[3].text)