Skip to content

Commit

Permalink
feat: Supports reserving elements using CSS selectors. resolve #114, fix #238, resolve #271
Browse files Browse the repository at this point in the history
  • Loading branch information
bookfere committed Apr 8, 2024
1 parent dc1d0d7 commit bcf812a
Show file tree
Hide file tree
Showing 16 changed files with 343 additions and 106 deletions.
3 changes: 2 additions & 1 deletion advanced.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,8 @@ def layout_filter(self):

categories = QComboBox()
categories.addItem(_('All'), 'all')
categories.addItem(_('Non-aligned'), 'non_aligned')
if self.merge_enabled:
categories.addItem(_('Non-aligned'), 'non_aligned')
categories.addItem(_('Translated'), 'translated')
categories.addItem(_('Untranslated'), 'untranslated')

Expand Down
2 changes: 1 addition & 1 deletion components/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def track_row_data(self, row):
paragraph = self.paragraph(row)
if paragraph.translation:
before_aligned = paragraph.aligned
self.check_row_alignment(paragraph)
self.parent.merge_enabled and self.check_row_alignment(paragraph)
# If the alignment of before and after is the same, do nothing.
if before_aligned and not paragraph.aligned:
self.non_aligned_count += 1
Expand Down
1 change: 1 addition & 0 deletions lib/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
'filter_scope': 'text',
'filter_rules': [],
'element_rules': [],
'reserve_rules': [],
'custom_engines': {},
'glossary_enabled': False,
'glossary_path': None,
Expand Down
90 changes: 59 additions & 31 deletions lib/element.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from lxml import etree
from calibre import prepare_string_for_xml as xml_escape

from .utils import ns, css, uid, trim, sorted_mixed_keys, open_file
from .utils import (
ns, uid, trim, sorted_mixed_keys, open_file, css_to_xpath, create_xpath)
from .config import get_config


Expand Down Expand Up @@ -36,6 +37,9 @@ def __init__(self, element, page_id=None):
self.original_color = None
self.translation_color = None

self.remove_pattern = None
self.reserve_pattern = None

def _element_copy(self):
return copy.deepcopy(self.element)

Expand All @@ -60,6 +64,12 @@ def set_original_color(self, color):
def set_translation_color(self, color):
    """Store the color to be used for translated text.

    :param color: Value saved unchanged on ``self.translation_color``.
    """
    self.translation_color = color

def set_remove_pattern(self, pattern):
    """Store the pattern that selects elements to be removed.

    :param pattern: Value saved unchanged on ``self.remove_pattern``;
        ``None`` is accepted and stored as-is.
    """
    self.remove_pattern = pattern

def set_reserve_pattern(self, pattern):
    """Store the pattern that selects elements to be reserved.

    :param pattern: Value saved unchanged on ``self.reserve_pattern``;
        ``None`` is accepted and stored as-is.
    """
    self.reserve_pattern = pattern

def get_name(self):
return None

Expand Down Expand Up @@ -152,7 +162,7 @@ def add_translation(self, translation=None):
self.element.content = '%s %s' % (
translation, self.element.content)
else:
self.element.content = '%s %s' %(
self.element.content = '%s %s' % (
self.element.content, translation)


Expand All @@ -175,11 +185,6 @@ def add_translation(self, translation=None):


class PageElement(Element):
def _get_descendents(self, element, tags):
tags = (tags,) if isinstance(tags, str) else tags
xpath = './/*[%s]' % ' or '.join(['self::x:%s' % tag for tag in tags])
return element.xpath(xpath, namespaces=ns)

def get_name(self):
return get_name(self.element)

Expand All @@ -206,18 +211,16 @@ def _safe_remove(self, element, replacement=''):

def get_content(self):
element_copy = self._element_copy()
for noise in self._get_descendents(element_copy, ('rt', 'rp')):
self._safe_remove(noise)
# Reserve the <br> element instead of using a line break to prevent
# conflicts with the mechanism of merge translation.
target_elements = (
'img', 'code', 'br', 'hr', 'sub', 'sup', 'kbd', 'abbr', 'wbr', 'var',
'canvas', 'svg', 'script', 'style')
self.reserve_elements = self._get_descendents(
element_copy, target_elements)
if self.remove_pattern is not None:
for noise in element_copy.xpath(
self.remove_pattern, namespaces=ns):
self._safe_remove(noise)
if self.reserve_pattern is not None:
self.reserve_elements = element_copy.xpath(
self.reserve_pattern, namespaces=ns)
for eid, reserve in enumerate(self.reserve_elements):
replacement = self.placeholder[0].format(format(eid, '05'))
if get_name(reserve) in ['sub', 'sup']:
if get_name(reserve) in ('sub', 'sup'):
parent = reserve.getparent()
if parent is not None and get_name(parent) == 'a' and \
parent.text is None and reserve.tail is None and \
Expand Down Expand Up @@ -423,6 +426,11 @@ def _create_table(self, translation=None):


class Extraction:
priority_elements = (
'p', 'pre', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote')
default_filter_rules = (
r'^[-\d\s\.\'\\"‘’“”,=~!@#$%^&º*|≈<>?/`—…+:–_(){}[\]]+$',)

def __init__(
self, pages, rule_mode, filter_scope, filter_rules, element_rules):
self.pages = pages
Expand All @@ -438,9 +446,7 @@ def __init__(
self.load_element_patterns()

def load_filter_patterns(self):
default_rules = [
r'^[-\d\s\.\'\\"‘’“”,=~!@#$%^&º*|≈<>?/`—…+:–_(){}[\]]+$']
patterns = [re.compile(rule) for rule in default_rules]
patterns = [re.compile(rule) for rule in self.default_filter_rules]
for rule in self.filter_rules:
if self.rule_mode == 'normal':
rule = re.compile(re.escape(rule), re.I)
Expand All @@ -452,13 +458,9 @@ def load_filter_patterns(self):
self.filter_patterns = patterns

def load_element_patterns(self):
rules = ['pre', 'code']
rules.extend(self.element_rules)
patterns = []
for selector in rules:
rule = css(selector)
rule and patterns.append(rule)
self.element_patterns = patterns
default_selectors = ['pre', 'code']
self.element_patterns = css_to_xpath(
default_selectors + self.element_rules)

def get_sorted_pages(self):
pages = []
Expand All @@ -483,15 +485,20 @@ def need_ignore(self, element):
return False

def extract_elements(self, page_id, root, elements=[]):
priority_elements = [
'p', 'pre', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote']
"""If the root matches the pattern, return an empty list; otherwise,
just break the recursion without doing anything.
"""
if self.need_ignore(root):
return []
for element in root.findall('./*'):
if self.need_ignore(element):
continue
element_has_content = False
if element.text is not None and trim(element.text) != '':
element_has_content = True
else:
children = element.findall('./*')
if children and get_name(element) in priority_elements:
if children and get_name(element) in self.priority_elements:
element_has_content = True
else:
for child in children:
Expand Down Expand Up @@ -540,6 +547,9 @@ def __init__(self, placeholder, separator, position, merge_length=0):
self.translation_color = None
self.column_gap = None

self.remove_pattern = None
self.reserve_pattern = None

self.elements = {}
self.originals = []

Expand All @@ -559,6 +569,18 @@ def set_column_gap(self, values):
if isinstance(values, tuple) and len(values) == 2:
self.column_gap = values

def load_remove_rules(self, rules=()):
    """Build and store the pattern for elements to be removed.

    The built-in selectors ``rt`` and ``rp`` always come first, followed by
    the caller-supplied ones; the combined tuple is handed to
    ``create_xpath`` and the result is kept on ``self.remove_pattern``.

    :param rules: Iterable of extra selectors. Defaults to an empty tuple
        (an immutable default, so no object is shared between calls).
    """
    default_rules = ('rt', 'rp')
    self.remove_pattern = create_xpath(default_rules + tuple(rules))

def load_reserve_rules(self, rules=()):
    """Build and store the pattern for elements to be reserved.

    The built-in selectors listed below always come first, followed by the
    caller-supplied ones; the combined tuple is handed to ``create_xpath``
    and the result is kept on ``self.reserve_pattern``.

    :param rules: Iterable of extra selectors. Defaults to an empty tuple
        (an immutable default, so no object is shared between calls).
    """
    # Reserve the <br> element instead of using a line break to prevent
    # conflicts with the mechanism of merge translation.
    default_rules = (
        'img', 'code', 'br', 'hr', 'sub', 'sup', 'kbd', 'abbr', 'wbr',
        'var', 'canvas', 'svg', 'script', 'style')
    self.reserve_pattern = create_xpath(default_rules + tuple(rules))

def prepare_original(self, elements):
count = 0
for oid, element in enumerate(elements):
Expand All @@ -569,6 +591,8 @@ def prepare_original(self, elements):
element.set_translation_color(self.translation_color)
if self.column_gap is not None:
element.set_column_gap(self.column_gap)
element.set_remove_pattern(self.remove_pattern)
element.set_reserve_pattern(self.reserve_pattern)
raw = element.get_raw()
content = element.get_content()
md5 = uid('%s%s' % (oid, content))
Expand Down Expand Up @@ -618,6 +642,8 @@ def prepare_original(self, elements):
element.set_translation_color(self.translation_color)
if self.column_gap is not None:
element.set_column_gap(self.column_gap)
element.set_remove_pattern(self.remove_pattern)
element.set_reserve_pattern(self.reserve_pattern)
code = element.get_raw()
content = element.get_content()
content += self.separator
Expand Down Expand Up @@ -725,7 +751,7 @@ def get_page_elements(pages):
config = get_config()
rule_mode = config.get('rule_mode')
filter_scope = config.get('filter_scope')
filter_rules = config.get('filter_rules')
filter_rules = config.get('filter_rules', [])
element_rules = config.get('element_rules', [])
extraction = Extraction(
pages, rule_mode, filter_scope, filter_rules, element_rules)
Expand All @@ -747,4 +773,6 @@ def get_element_handler(placeholder, separator):
handler.set_column_gap((gap_type, column_gap.get(gap_type)))
handler.set_original_color(config.get('original_color'))
handler.set_translation_color(config.get('translation_color'))
handler.load_remove_rules(config.get('element_rules', []))
handler.load_reserve_rules(config.get('reserve_rules', []))
return handler
13 changes: 13 additions & 0 deletions lib/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,19 @@ def css(seletor):
return None


def css_to_xpath(selectors):
    """Translate each CSS selector with ``css`` and keep the usable results.

    :param selectors: Iterable of CSS selector strings.
    :return: List of the truthy values returned by ``css``, in input order.
        Selectors for which ``css`` returns a falsy value (such as
        ``None``) are silently dropped.
    """
    translated = (css(selector) for selector in selectors)
    return [rule for rule in translated if rule]


def create_xpath(selectors):
    """Combine selectors into one XPath matching any descendant element.

    :param selectors: A single CSS selector string, or an iterable of them.
    :return: An XPath string of the form ``.//*[A or B ...]`` where each
        operand is a result of ``css_to_xpath``. If no selector could be
        translated, an expression that matches nothing is returned.
    """
    if isinstance(selectors, str):
        selectors = (selectors,)
    conditions = css_to_xpath(selectors)
    if not conditions:
        # An empty predicate (``.//*[]``) is not valid XPath and would fail
        # at evaluation time; an always-false predicate keeps the
        # expression valid while selecting no elements.
        return './/*[false()]'
    return './/*[%s]' % ' or '.join(conditions)


def uid(*args):
md5 = hashlib.md5()
for arg in args:
Expand Down
27 changes: 26 additions & 1 deletion setting.py
Original file line number Diff line number Diff line change
Expand Up @@ -1072,12 +1072,25 @@ def choose_filter_mode(btn_id):
self.element_rules.setMinimumHeight(100)
self.element_rules.insertPlainText(
'\n'.join(self.config.get('element_rules')))

element_layout.addWidget(QLabel(
_('CSS selectors to exclude elements. One rule per line:')))
element_layout.addWidget(self.element_rules)
layout.addWidget(element_group)

# Reserve element
reserve_group = QGroupBox(_('Reserve Element'))
reserve_layout = QVBoxLayout(reserve_group)
self.reserve_rules = QPlainTextEdit()
self.reserve_rules.setPlaceholderText(
'%s %s' % (_('e.g.,'), 'span.footnote, a#footnote'))
self.reserve_rules.setMinimumHeight(100)
self.reserve_rules.insertPlainText(
'\n'.join(self.config.get('reserve_rules')))
reserve_layout.addWidget(QLabel(
_('CSS selectors to reserve elements. One rule per line:')))
reserve_layout.addWidget(self.reserve_rules)
layout.addWidget(reserve_group)

# Ebook Metadata
metadata_group = QGroupBox(_('Ebook Metadata'))
metadata_layout = QFormLayout(metadata_group)
Expand Down Expand Up @@ -1280,6 +1293,18 @@ def update_content_config(self):
self.config.delete('element_rules')
element_rules and self.config.update(element_rules=element_rules)

# Reserve rules
rule_content = self.reserve_rules.toPlainText()
reserve_rules = [r for r in rule_content.split('\n') if r.strip()]
for rule in reserve_rules:
if css(rule) is None:
self.alert.pop(
_('{} is not a valid CSS seletor.')
.format(rule), 'warning')
return False
self.config.delete('reserve_rules')
reserve_rules and self.config.update(reserve_rules=reserve_rules)

# Ebook metadata
ebook_metadata = self.config.get('ebook_metadata').copy()
ebook_metadata.clear()
Expand Down
1 change: 1 addition & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def test_default(self):
'filter_scope': 'text',
'filter_rules': [],
'element_rules': [],
'reserve_rules': [],
'custom_engines': {},
'glossary_enabled': False,
'glossary_path': None,
Expand Down
Loading

0 comments on commit bcf812a

Please sign in to comment.