diff --git a/advanced.py b/advanced.py
index 8248dde..d2ffb8f 100644
--- a/advanced.py
+++ b/advanced.py
@@ -467,7 +467,8 @@ def layout_filter(self):
categories = QComboBox()
categories.addItem(_('All'), 'all')
- categories.addItem(_('Non-aligned'), 'non_aligned')
+ if self.merge_enabled:
+ categories.addItem(_('Non-aligned'), 'non_aligned')
categories.addItem(_('Translated'), 'translated')
categories.addItem(_('Untranslated'), 'untranslated')
diff --git a/components/table.py b/components/table.py
index e453cca..d4341db 100644
--- a/components/table.py
+++ b/components/table.py
@@ -84,7 +84,7 @@ def track_row_data(self, row):
paragraph = self.paragraph(row)
if paragraph.translation:
before_aligned = paragraph.aligned
- self.check_row_alignment(paragraph)
+ self.parent.merge_enabled and self.check_row_alignment(paragraph)
# If the alignment of before and after is the same, do nothing.
if before_aligned and not paragraph.aligned:
self.non_aligned_count += 1
diff --git a/lib/config.py b/lib/config.py
index f3b5002..a3babdb 100644
--- a/lib/config.py
+++ b/lib/config.py
@@ -28,6 +28,7 @@
'filter_scope': 'text',
'filter_rules': [],
'element_rules': [],
+ 'reserve_rules': [],
'custom_engines': {},
'glossary_enabled': False,
'glossary_path': None,
diff --git a/lib/element.py b/lib/element.py
index 546407b..0a52778 100644
--- a/lib/element.py
+++ b/lib/element.py
@@ -5,7 +5,8 @@
from lxml import etree
from calibre import prepare_string_for_xml as xml_escape
-from .utils import ns, css, uid, trim, sorted_mixed_keys, open_file
+from .utils import (
+ ns, uid, trim, sorted_mixed_keys, open_file, css_to_xpath, create_xpath)
from .config import get_config
@@ -36,6 +37,9 @@ def __init__(self, element, page_id=None):
self.original_color = None
self.translation_color = None
+ self.remove_pattern = None
+ self.reserve_pattern = None
+
def _element_copy(self):
return copy.deepcopy(self.element)
@@ -60,6 +64,12 @@ def set_original_color(self, color):
def set_translation_color(self, color):
self.translation_color = color
+ def set_remove_pattern(self, pattern):
+ self.remove_pattern = pattern
+
+ def set_reserve_pattern(self, pattern):
+ self.reserve_pattern = pattern
+
def get_name(self):
return None
@@ -152,7 +162,7 @@ def add_translation(self, translation=None):
self.element.content = '%s %s' % (
translation, self.element.content)
else:
- self.element.content = '%s %s' %(
+ self.element.content = '%s %s' % (
self.element.content, translation)
@@ -175,11 +185,6 @@ def add_translation(self, translation=None):
class PageElement(Element):
- def _get_descendents(self, element, tags):
- tags = (tags,) if isinstance(tags, str) else tags
- xpath = './/*[%s]' % ' or '.join(['self::x:%s' % tag for tag in tags])
- return element.xpath(xpath, namespaces=ns)
-
def get_name(self):
return get_name(self.element)
@@ -206,18 +211,16 @@ def _safe_remove(self, element, replacement=''):
def get_content(self):
element_copy = self._element_copy()
- for noise in self._get_descendents(element_copy, ('rt', 'rp')):
- self._safe_remove(noise)
- # Reserve the <br> element instead of using a line break to prevent
- # conflicts with the mechanism of merge translation.
- target_elements = (
- 'img', 'code', 'br', 'hr', 'sub', 'sup', 'kbd', 'abbr', 'wbr', 'var',
- 'canvas', 'svg', 'script', 'style')
- self.reserve_elements = self._get_descendents(
- element_copy, target_elements)
+ if self.remove_pattern is not None:
+ for noise in element_copy.xpath(
+ self.remove_pattern, namespaces=ns):
+ self._safe_remove(noise)
+ if self.reserve_pattern is not None:
+ self.reserve_elements = element_copy.xpath(
+ self.reserve_pattern, namespaces=ns)
for eid, reserve in enumerate(self.reserve_elements):
replacement = self.placeholder[0].format(format(eid, '05'))
- if get_name(reserve) in ['sub', 'sup']:
+ if get_name(reserve) in ('sub', 'sup'):
parent = reserve.getparent()
if parent is not None and get_name(parent) == 'a' and \
parent.text is None and reserve.tail is None and \
@@ -423,6 +426,11 @@ def _create_table(self, translation=None):
class Extraction:
+ priority_elements = (
+ 'p', 'pre', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote')
+ default_filter_rules = (
+ r'^[-\d\s\.\'\\"‘’“”,=~!@#$%^&º*|≈<>?/`—…+:–_(){}[\]]+$',)
+
def __init__(
self, pages, rule_mode, filter_scope, filter_rules, element_rules):
self.pages = pages
@@ -438,9 +446,7 @@ def __init__(
self.load_element_patterns()
def load_filter_patterns(self):
- default_rules = [
- r'^[-\d\s\.\'\\"‘’“”,=~!@#$%^&º*|≈<>?/`—…+:–_(){}[\]]+$']
- patterns = [re.compile(rule) for rule in default_rules]
+ patterns = [re.compile(rule) for rule in self.default_filter_rules]
for rule in self.filter_rules:
if self.rule_mode == 'normal':
rule = re.compile(re.escape(rule), re.I)
@@ -452,13 +458,9 @@ def load_filter_patterns(self):
self.filter_patterns = patterns
def load_element_patterns(self):
- rules = ['pre', 'code']
- rules.extend(self.element_rules)
- patterns = []
- for selector in rules:
- rule = css(selector)
- rule and patterns.append(rule)
- self.element_patterns = patterns
+ default_selectors = ['pre', 'code']
+ self.element_patterns = css_to_xpath(
+ default_selectors + self.element_rules)
def get_sorted_pages(self):
pages = []
@@ -483,15 +485,20 @@ def need_ignore(self, element):
return False
def extract_elements(self, page_id, root, elements=[]):
- priority_elements = [
- 'p', 'pre', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote']
+ """Return an empty list when need_ignore() flags the root itself; a
+ direct child flagged by need_ignore() is simply skipped.
+ """
+ if self.need_ignore(root):
+ return []
for element in root.findall('./*'):
+ if self.need_ignore(element):
+ continue
element_has_content = False
if element.text is not None and trim(element.text) != '':
element_has_content = True
else:
children = element.findall('./*')
- if children and get_name(element) in priority_elements:
+ if children and get_name(element) in self.priority_elements:
element_has_content = True
else:
for child in children:
@@ -540,6 +547,9 @@ def __init__(self, placeholder, separator, position, merge_length=0):
self.translation_color = None
self.column_gap = None
+ self.remove_pattern = None
+ self.reserve_pattern = None
+
self.elements = {}
self.originals = []
@@ -559,6 +569,18 @@ def set_column_gap(self, values):
if isinstance(values, tuple) and len(values) == 2:
self.column_gap = values
+ def load_remove_rules(self, rules=[]):
+ default_rules = ('rt', 'rp')
+ self.remove_pattern = create_xpath(default_rules + tuple(rules))
+
+ def load_reserve_rules(self, rules=[]):
+ # Reserve the <br> element instead of using a line break to prevent
+ # conflicts with the mechanism of merge translation.
+ default_rules = (
+ 'img', 'code', 'br', 'hr', 'sub', 'sup', 'kbd', 'abbr', 'wbr',
+ 'var', 'canvas', 'svg', 'script', 'style')
+ self.reserve_pattern = create_xpath(default_rules + tuple(rules))
+
def prepare_original(self, elements):
count = 0
for oid, element in enumerate(elements):
@@ -569,6 +591,8 @@ def prepare_original(self, elements):
element.set_translation_color(self.translation_color)
if self.column_gap is not None:
element.set_column_gap(self.column_gap)
+ element.set_remove_pattern(self.remove_pattern)
+ element.set_reserve_pattern(self.reserve_pattern)
raw = element.get_raw()
content = element.get_content()
md5 = uid('%s%s' % (oid, content))
@@ -618,6 +642,8 @@ def prepare_original(self, elements):
element.set_translation_color(self.translation_color)
if self.column_gap is not None:
element.set_column_gap(self.column_gap)
+ element.set_remove_pattern(self.remove_pattern)
+ element.set_reserve_pattern(self.reserve_pattern)
code = element.get_raw()
content = element.get_content()
content += self.separator
@@ -725,7 +751,7 @@ def get_page_elements(pages):
config = get_config()
rule_mode = config.get('rule_mode')
filter_scope = config.get('filter_scope')
- filter_rules = config.get('filter_rules')
+ filter_rules = config.get('filter_rules', [])
element_rules = config.get('element_rules', [])
extraction = Extraction(
pages, rule_mode, filter_scope, filter_rules, element_rules)
@@ -747,4 +773,6 @@ def get_element_handler(placeholder, separator):
handler.set_column_gap((gap_type, column_gap.get(gap_type)))
handler.set_original_color(config.get('original_color'))
handler.set_translation_color(config.get('translation_color'))
+ handler.load_remove_rules(config.get('element_rules', []))
+ handler.load_reserve_rules(config.get('reserve_rules', []))
return handler
diff --git a/lib/utils.py b/lib/utils.py
index 9c971d0..d170f22 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -25,6 +25,19 @@ def css(seletor):
return None
+def css_to_xpath(selectors):
+ patterns = []
+ for selector in selectors:
+ rule = css(selector)
+ rule and patterns.append(rule)
+ return patterns
+
+
+def create_xpath(selectors):
+ selectors = (selectors,) if isinstance(selectors, str) else selectors
+ return './/*[%s]' % ' or '.join(css_to_xpath(selectors))
+
+
def uid(*args):
md5 = hashlib.md5()
for arg in args:
diff --git a/setting.py b/setting.py
index a00d996..7427289 100644
--- a/setting.py
+++ b/setting.py
@@ -1072,12 +1072,25 @@ def choose_filter_mode(btn_id):
self.element_rules.setMinimumHeight(100)
self.element_rules.insertPlainText(
'\n'.join(self.config.get('element_rules')))
-
element_layout.addWidget(QLabel(
_('CSS selectors to exclude elements. One rule per line:')))
element_layout.addWidget(self.element_rules)
layout.addWidget(element_group)
+ # Reserve element
+ reserve_group = QGroupBox(_('Reserve Element'))
+ reserve_layout = QVBoxLayout(reserve_group)
+ self.reserve_rules = QPlainTextEdit()
+ self.reserve_rules.setPlaceholderText(
+ '%s %s' % (_('e.g.,'), 'span.footnote, a#footnote'))
+ self.reserve_rules.setMinimumHeight(100)
+ self.reserve_rules.insertPlainText(
+ '\n'.join(self.config.get('reserve_rules')))
+ reserve_layout.addWidget(QLabel(
+ _('CSS selectors to reserve elements. One rule per line:')))
+ reserve_layout.addWidget(self.reserve_rules)
+ layout.addWidget(reserve_group)
+
# Ebook Metadata
metadata_group = QGroupBox(_('Ebook Metadata'))
metadata_layout = QFormLayout(metadata_group)
@@ -1280,6 +1293,18 @@ def update_content_config(self):
self.config.delete('element_rules')
element_rules and self.config.update(element_rules=element_rules)
+ # Reserve rules
+ rule_content = self.reserve_rules.toPlainText()
+ reserve_rules = [r for r in rule_content.split('\n') if r.strip()]
+ for rule in reserve_rules:
+ if css(rule) is None:
+ self.alert.pop(
+ _('{} is not a valid CSS seletor.')
+ .format(rule), 'warning')
+ return False
+ self.config.delete('reserve_rules')
+ reserve_rules and self.config.update(reserve_rules=reserve_rules)
+
# Ebook metadata
ebook_metadata = self.config.get('ebook_metadata').copy()
ebook_metadata.clear()
diff --git a/tests/test_config.py b/tests/test_config.py
index 50271d1..11bc8be 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -33,6 +33,7 @@ def test_default(self):
'filter_scope': 'text',
'filter_rules': [],
'element_rules': [],
+ 'reserve_rules': [],
'custom_engines': {},
'glossary_enabled': False,
'glossary_path': None,
diff --git a/tests/test_element.py b/tests/test_element.py
index 9daccbe..f1d8ed8 100644
--- a/tests/test_element.py
+++ b/tests/test_element.py
@@ -1,3 +1,4 @@
+import re
import unittest
from unittest.mock import patch, Mock
@@ -5,7 +6,7 @@
from calibre.ebooks.oeb.base import TOC, Metadata
-from ..lib.utils import ns
+from ..lib.utils import ns, create_xpath
from ..lib.cache import Paragraph
from ..lib.element import (
get_string, get_name, Extraction, ElementHandler, ElementHandlerMerge,
@@ -20,7 +21,7 @@ class TestFunction(unittest.TestCase):
def test_get_string(self):
markup = '
abc
defabc
', get_string(element, False))
@@ -115,6 +116,8 @@ def test_create_element(self):
self.assertIsNone(self.element.translation_lang)
self.assertIsNone(self.element.original_color)
self.assertIsNone(self.element.translation_color)
+ self.assertIsNone(self.element.remove_pattern)
+ self.assertIsNone(self.element.reserve_pattern)
def test_set_ignored(self):
self.element.set_ignored(True)
@@ -144,6 +147,14 @@ def test_set_translation_color(self):
self.element.set_translation_color('green')
self.assertEqual('green', self.element.translation_color)
+ def test_set_remove_pattern(self):
+ self.element.set_remove_pattern('.//*[self::x:sup]')
+ self.assertEqual('.//*[self::x:sup]', self.element.remove_pattern)
+
+ def test_set_reserve_pattern(self):
+ self.element.set_reserve_pattern('.//*[self::x:sup]')
+ self.assertEqual('.//*[self::x:sup]', self.element.reserve_pattern)
+
def test_get_name(self):
self.assertIsNone(self.element.get_name())
@@ -363,6 +374,9 @@ def setUp(self):