Skip to content

Commit

Permalink
Replace string-concatenation with QName for attr manipulation
Browse files Browse the repository at this point in the history
In local profiling of the repository from #XXX, this reduces the
time spent in regex.sub from 6 seconds to 1 second.
  • Loading branch information
apasel422 committed Jun 17, 2024
1 parent 6ee7e3a commit 429a112
Show file tree
Hide file tree
Showing 7 changed files with 69 additions and 75 deletions.
43 changes: 17 additions & 26 deletions se/easy_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@
CSS_SELECTOR_CACHE: Dict[str, cssselect.CSSSelector] = {}
CSS_RULES_CACHE: Dict[str, List[se.css.CssRule]] = {}

EPUB_NAMESPACE ="http://www.idpf.org/2007/ops"
EPUB_TYPE_ATTR = etree.QName(EPUB_NAMESPACE, "type")

XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
XML_LANG_ATTR = etree.QName(XML_NAMESPACE, "lang")

XLINK_NAMESPACE = "http://www.w3.org/1999/xlink"

def escape_xpath(string: str) -> str:
"""
Xpath string literals don't have escape sequences for ' and "
Expand Down Expand Up @@ -48,7 +56,7 @@ class EasyXmlTree:
"""

def __init__(self, xml: Union[str, etree._ElementTree]):
self.namespaces = {"re": "http://exslt.org/regular-expressions", "xml": "http://www.w3.org/XML/1998/namespace"} # Enable regular expressions in xpath; xml is the default xml namespace
self.namespaces = {"re": "http://exslt.org/regular-expressions", "xml": XML_NAMESPACE} # Enable regular expressions in xpath; xml is the default xml namespace
self.default_namespace = None

if isinstance(xml, etree._ElementTree):
Expand Down Expand Up @@ -222,23 +230,6 @@ def __init__(self, lxml_element: Union[str, etree._ElementTree], namespaces=None
else:
self.lxml_element = lxml_element

def _replace_shorthand_namespaces(self, value:str) -> str:
"""
Given a string starting with a shorthand namespace, return
the fully qualified namespace.
This is useful for passing to raw lxml operations as lxml doesn't understand
shorthand namespaces.
Example:
epub:type -> {http://www.idpf.org/2007/ops}type
"""

if self.namespaces:
value = regex.sub(r"^(\L<ns>):", lambda m: f"{{{self.namespaces[m[1]]}}}", value, ns=self.namespaces.keys())

return value

def to_tag_string(self) -> str:
"""
Return a string representing the opening tag of the element.
Expand Down Expand Up @@ -308,18 +299,18 @@ def get_css_property(self, property_name: str):

return None

def remove_attr(self, attribute: str) -> None:
def remove_attr(self, attribute: Union[str, etree.QName]) -> None:
"""
Remove an attribute from this node.
"""

try:
self.lxml_element.attrib.pop(self._replace_shorthand_namespaces(attribute))
self.lxml_element.attrib.pop(attribute)
except KeyError:
# If the attribute doesn't exist, just continue
pass

def add_attr_value(self, attribute: str, value: str) -> None:
def add_attr_value(self, attribute: Union[str, etree.QName], value: str) -> None:
"""
Add a space-separated attribute value to the target attribute.
If the attribute doesn't exist, add it.
Expand All @@ -334,7 +325,7 @@ def add_attr_value(self, attribute: str, value: str) -> None:

self.set_attr(attribute, (existing_value + " " + value).strip())

def remove_attr_value(self, attribute: str, value: str) -> None:
def remove_attr_value(self, attribute: Union[str, etree.QName], value: str) -> None:
"""
Remove a space-separated attribute value from the target attribute.
If removing the value makes the attribute empty, remove the attribute.
Expand All @@ -352,19 +343,19 @@ def remove_attr_value(self, attribute: str, value: str) -> None:
if not self.get_attr(attribute):
self.remove_attr(attribute)

def get_attr(self, attribute: str) -> str:
def get_attr(self, attribute: Union[str, etree.QName]) -> str:
"""
Return the value of an attribute on this element.
"""

return self.lxml_element.get(self._replace_shorthand_namespaces(attribute))
return self.lxml_element.get(attribute)

def set_attr(self, attribute: str, value: str) -> str:
def set_attr(self, attribute: Union[str, etree.QName], value: str) -> str:
"""
Set the value of an attribute on this element.
"""

return self.lxml_element.set(self._replace_shorthand_namespaces(attribute), value)
return self.lxml_element.set(attribute, value)

def xpath(self, selector: str, return_string: bool = False):
"""
Expand Down
2 changes: 1 addition & 1 deletion se/epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def convert_toc_to_ncx(epub_root_absolute_path: Path, toc_filename: str, xsl_fil

# Remove empty lang tags
for node in ncx_dom.xpath("//*[@xml:lang and re:test(@xml:lang, '^\\s*$')]"):
node.remove_attr("xml:lang")
node.remove_attr(se.easy_xml.XML_LANG_ATTR)

for node in ncx_dom.xpath("//navMap"):
node.set_attr("id", "navmap")
Expand Down
18 changes: 9 additions & 9 deletions se/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from unidecode import unidecode

import se
from se.easy_xml import EasyXmlTree, EasyXmlElement
from se.easy_xml import EasyXmlTree, EasyXmlElement, EPUB_TYPE_ATTR


# This list of phrasing tags is not intended to be exhaustive. The list is only used
Expand Down Expand Up @@ -1423,7 +1423,7 @@ def generate_title(xhtml: Union[str, EasyXmlTree]) -> str:
raise se.InvalidSeEbookException("No [xhtml]<section>[/] or [xhtml]<article>[/] element for [xhtml]<hgroup>[/].")

# If the closest parent <section> or <article> is a part, division, or volume, then keep all <hgroup> children
closest_parent_section_epub_type = closest_parent_section.get_attr("epub:type")
closest_parent_section_epub_type = closest_parent_section.get_attr(EPUB_TYPE_ATTR)
if not closest_parent_section_epub_type or (closest_parent_section_epub_type and ("part" not in closest_parent_section_epub_type and "division" not in closest_parent_section_epub_type and "volume" not in closest_parent_section_epub_type)):
# Else, if the closest parent <section> or <article> is a halftitlepage, then discard <hgroup> subtitles
if closest_parent_section_epub_type and "halftitlepage" in closest_parent_section_epub_type:
Expand Down Expand Up @@ -1471,9 +1471,9 @@ def generate_title(xhtml: Union[str, EasyXmlTree]) -> str:
top_level_wrapper = top_level_wrappers[0]

# Only guess the title if there is a single value for epub:type
if top_level_wrapper.get_attr("epub:type"):
if top_level_wrapper.get_attr(EPUB_TYPE_ATTR):
# Get the first non-namespaced value as the title
for value in top_level_wrapper.get_attr("epub:type").split(" "):
for value in top_level_wrapper.get_attr(EPUB_TYPE_ATTR).split(" "):
if value == "z3998:frontispiece":
title = "Frontispiece"
break
Expand Down Expand Up @@ -1527,9 +1527,9 @@ def _get_flattened_children(node: EasyXmlElement, allow_header: bool) -> List[Ea
for child in node.children:
is_endnote = False
is_glossdef = False
if child.get_attr("epub:type"):
is_endnote = regex.search(r"\bendnote\b", child.get_attr("epub:type"))
is_glossdef = "glossdef" in child.get_attr("epub:type")
if child.get_attr(EPUB_TYPE_ATTR):
is_endnote = regex.search(r"\bendnote\b", child.get_attr(EPUB_TYPE_ATTR))
is_glossdef = "glossdef" in child.get_attr(EPUB_TYPE_ATTR)

if child.tag not in sectioning_elements and not is_endnote and not is_glossdef:
result.append(child)
Expand Down Expand Up @@ -1571,7 +1571,7 @@ def find_unexpected_ids(dom: EasyXmlTree) -> List[Tuple[EasyXmlElement, str]]:
section_id = section.get_attr("id")
allow_header = not is_poem

section_epub_type = section.get_attr("epub:type")
section_epub_type = section.get_attr(EPUB_TYPE_ATTR)
if section_epub_type:
# If this section is a poem or an endnotes container, reset the counters
if "z3998:poem" in section_epub_type:
Expand All @@ -1583,7 +1583,7 @@ def find_unexpected_ids(dom: EasyXmlTree) -> List[Tuple[EasyXmlElement, str]]:
endnote_number = 0

# If this section is an endnote, increment the note number and check the ID right now
if regex.search(r"\bendnote\b", section.get_attr("epub:type")):
if regex.search(r"\bendnote\b", section.get_attr(EPUB_TYPE_ATTR)):
endnote_number = endnote_number + 1
expected_id = f"note-{endnote_number}"

Expand Down
19 changes: 10 additions & 9 deletions se/se_epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import se
import se.easy_xml
from se.easy_xml import EPUB_TYPE_ATTR, XML_LANG_ATTR
import se.formatting
import se.images

Expand Down Expand Up @@ -487,7 +488,7 @@ def _recompose_xhtml(self, section: se.easy_xml.EasyXmlElement, output_dom: se.e
raise se.InvalidXhtmlException(f"Section without [attr]id[/] attribute: [html]{section.to_tag_string()}[/]")

if section.parent.tag.lower() == "body" and not section.get_attr("data-parent"):
section.set_attr("epub:type", f"{section.get_attr('epub:type')} {section.parent.get_attr('epub:type')}".strip())
section.set_attr(EPUB_TYPE_ATTR, f"{section.get_attr(EPUB_TYPE_ATTR)} {section.parent.get_attr(EPUB_TYPE_ATTR)}".strip())

# Try to find our parent element in the current output dom, by ID.
# If it's not in the output, then append this element to the element's closest parent by ID (or <body>), then iterate over its children and do the same.
Expand Down Expand Up @@ -657,18 +658,18 @@ def recompose(self, output_xhtml5: bool, extra_css_file: Union[Path,None] = None
node.remove()

for node in output_dom.xpath("//*[@xml:lang]"):
node.set_attr("lang", node.get_attr("xml:lang"))
node.set_attr("lang", node.get_attr(XML_LANG_ATTR))
else:
for node in output_dom.xpath("/html[@epub:prefix]"):
node.remove_attr("epub:prefix")
node.remove_attr(etree.QName(se.easy_xml.EPUB_NAMESPACE, "prefix"))

for node in output_dom.xpath("//*[@xml:lang]"):
node.set_attr("lang", node.get_attr("xml:lang"))
node.remove_attr("xml:lang")
node.set_attr("lang", node.get_attr(XML_LANG_ATTR))
node.remove_attr(XML_LANG_ATTR)

for node in output_dom.xpath("//*[@epub:type]"):
node.set_attr("data-epub-type", node.get_attr("epub:type"))
node.remove_attr("epub:type")
node.set_attr("data-epub-type", node.get_attr(EPUB_TYPE_ATTR))
node.remove_attr(EPUB_TYPE_ATTR)

# Get the output XHTML as a string
output_xhtml = output_dom.to_string()
Expand Down Expand Up @@ -757,7 +758,7 @@ def generate_cover_svg(self) -> None:

# Embed the file
for node in dom.xpath("//*[re:test(@xlink:href, 'cover\\.jpg$')]"):
node.set_attr("xlink:href", "data:image/jpeg;base64," + source_cover_jpg_base64)
node.set_attr(etree.QName(se.easy_xml.XLINK_NAMESPACE, "href"), "data:image/jpeg;base64," + source_cover_jpg_base64)

# For the cover we want to keep the path.title-box style, and add an additional
# style to color our new paths white
Expand Down Expand Up @@ -1479,7 +1480,7 @@ def __process_direct_link(self, change_list, link) -> bool:
Returns a boolean of needs_write (whether object needs to be re-written)
"""
epub_type = link.get_attr("epub:type") or ""
epub_type = link.get_attr(EPUB_TYPE_ATTR) or ""
if not epub_type: # it wasn't an actual endnote reference but a direct link (we hope!)
href = link.get_attr("href") or ""
if href:
Expand Down
25 changes: 13 additions & 12 deletions se/se_epub_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

import se
import se.easy_xml
from se.easy_xml import EPUB_TYPE_ATTR
import se.epub
import se.formatting
import se.images
Expand Down Expand Up @@ -448,7 +449,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k
# Since we added an outlining stroke to the titlepage/publisher logo images, we
# want to remove the se:image.color-depth.black-on-transparent semantic
for node in dom.xpath("/html/body//img[ (contains(@epub:type, 'z3998:publisher-logo') or ancestor-or-self::*[re:test(@epub:type, '\\btitlepage\\b')]) and contains(@epub:type, 'se:image.color-depth.black-on-transparent')]"):
node.remove_attr_value("epub:type", "se:image.color-depth.black-on-transparent")
node.remove_attr_value(EPUB_TYPE_ATTR, "se:image.color-depth.black-on-transparent")

# Add ARIA roles, which are just mostly duplicate attributes to epub:type
for role in ARIA_ROLES:
Expand All @@ -459,7 +460,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k
if node.tag == "article":
continue

attr_values = regex.split(r"\s", node.get_attr("epub:type"))
attr_values = regex.split(r"\s", node.get_attr(EPUB_TYPE_ATTR))

if len(attr_values) > 1:
# If there is more than one value for epub:type, ace expects the `role` attribute
Expand All @@ -477,10 +478,10 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k
# Matching `endnote` will also catch `endnotes`
for node in dom.xpath("/html/body//*[contains(@epub:type, 'endnote')]"):
plural = ""
if "endnotes" in node.get_attr("epub:type"):
if "endnotes" in node.get_attr(EPUB_TYPE_ATTR):
plural = "s"

node.add_attr_value("epub:type", "footnote" + plural)
node.add_attr_value(EPUB_TYPE_ATTR, "footnote" + plural)

# Remember to get our custom style selectors that we added, too
if "epub-type-endnote" + plural in (node.get_attr("class") or ""):
Expand All @@ -492,7 +493,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k

# Include extra lang tag for accessibility compatibility
for node in dom.xpath("//*[@xml:lang]"):
node.set_attr("lang", node.get_attr("xml:lang"))
node.set_attr("lang", node.get_attr(se.easy_xml.XML_LANG_ATTR))

processed_xhtml = se.formatting.format_xhtml(dom.to_string())

Expand Down Expand Up @@ -718,7 +719,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k

# Change 'noteref' to 'endnote' so that popup endnotes work in Kobo. Kobo doesn't understand 'noteref', only 'endnote'.
for node in dom.xpath("/html/body//a[contains(@epub:type, 'noteref')]"):
node.set_attr("epub:type", node.get_attr("epub:type") + " endnote")
node.set_attr(EPUB_TYPE_ATTR, node.get_attr(EPUB_TYPE_ATTR) + " endnote")

# Now add the kobo spans
kobo.add_kobo_spans_to_node(dom.xpath("/html/body")[0].lxml_element)
Expand Down Expand Up @@ -977,7 +978,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k

img_node = se.easy_xml.EasyXmlElement("<img/>", {"epub": "http://www.idpf.org/2007/ops"})
img_node.set_attr("class", "mathml epub-type-se-image-color-depth-black-on-transparent")
img_node.set_attr("epub:type", "se:image.color-depth.black-on-transparent")
img_node.set_attr(EPUB_TYPE_ATTR, "se:image.color-depth.black-on-transparent")
img_node.set_attr("src", f"../images/mathml-{mathml_count}-2x.png")
if node.get_attr("alttext"):
img_node.set_attr("alt", node.get_attr("alttext"))
Expand Down Expand Up @@ -1074,9 +1075,9 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k

ref_node.set_attr("title", node.text)
ref_node.set_attr("href", node.get_attr("href"))
if node.get_attr("epub:type"):
if node.get_attr(EPUB_TYPE_ATTR):
# Set the `type` attribute and remove any z3998 items, as well as front/body/backmatter
ref_node.set_attr("type", node.get_attr("epub:type"))
ref_node.set_attr("type", node.get_attr(EPUB_TYPE_ATTR))
ref_node.set_attr("type", regex.sub(r"\s*\b(front|body|back)matter\b\s*", "", ref_node.get_attr("type")))
ref_node.set_attr("type", regex.sub(r"\s*\bz3998:.+\b\s*", "", ref_node.get_attr("type")))

Expand Down Expand Up @@ -1329,8 +1330,8 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k
if node.get_attr("class"):
node.set_attr("class", node.get_attr("class").replace("epub-type-se-image-color-depth-black-on-transparent", "").replace("epub-type-se-image-style-realistic", ""))

if node.get_attr("epub:type"):
node.set_attr("epub:type", node.get_attr("epub:type").replace("se:image.color-depth.black-on-transparent", "").replace("se:image.style.realistic", ""))
if node.get_attr(EPUB_TYPE_ATTR):
node.set_attr(EPUB_TYPE_ATTR, node.get_attr(EPUB_TYPE_ATTR).replace("se:image.color-depth.black-on-transparent", "").replace("se:image.style.realistic", ""))

# If the only element on the page is an absolutely positioned image, Kindle will ignore the file in the reading order.
# So, in that case we add a `<div>` with some text content to fool Kindle.
Expand Down Expand Up @@ -1387,7 +1388,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k

# Remove the epub:type attribute, as Calibre turns it into just "type"
for node in dom.xpath("//*[@epub:type]"):
node.remove_attr("epub:type")
node.remove_attr(EPUB_TYPE_ATTR)

# Kindle doesn't recognize most zero-width spaces or word joiners, so just remove them.
# It does recognize the word joiner character, but only in the old mobi7 format. The new format renders them as spaces.
Expand Down
Loading

0 comments on commit 429a112

Please sign in to comment.