diff --git a/se/easy_xml.py b/se/easy_xml.py index b2d8680e..5fae5fe8 100644 --- a/se/easy_xml.py +++ b/se/easy_xml.py @@ -20,6 +20,14 @@ CSS_SELECTOR_CACHE: Dict[str, cssselect.CSSSelector] = {} CSS_RULES_CACHE: Dict[str, List[se.css.CssRule]] = {} +EPUB_NAMESPACE ="http://www.idpf.org/2007/ops" +EPUB_TYPE_ATTR = etree.QName(EPUB_NAMESPACE, "type") + +XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace" +XML_LANG_ATTR = etree.QName(XML_NAMESPACE, "lang") + +XLINK_NAMESPACE = "http://www.w3.org/1999/xlink" + def escape_xpath(string: str) -> str: """ Xpath string literals don't have escape sequences for ' and " @@ -48,7 +56,7 @@ class EasyXmlTree: """ def __init__(self, xml: Union[str, etree._ElementTree]): - self.namespaces = {"re": "http://exslt.org/regular-expressions", "xml": "http://www.w3.org/XML/1998/namespace"} # Enable regular expressions in xpath; xml is the default xml namespace + self.namespaces = {"re": "http://exslt.org/regular-expressions", "xml": XML_NAMESPACE} # Enable regular expressions in xpath; xml is the default xml namespace self.default_namespace = None if isinstance(xml, etree._ElementTree): @@ -222,23 +230,6 @@ def __init__(self, lxml_element: Union[str, etree._ElementTree], namespaces=None else: self.lxml_element = lxml_element - def _replace_shorthand_namespaces(self, value:str) -> str: - """ - Given a string starting with a shorthand namespace, return - the fully qualified namespace. - - This is useful for passing to raw lxml operations as lxml doesn't understand - shorthand namespaces. - - Example: - epub:type -> {http://www.idpf.org/2007/ops}type - """ - - if self.namespaces: - value = regex.sub(r"^(\L):", lambda m: f"{{{self.namespaces[m[1]]}}}", value, ns=self.namespaces.keys()) - - return value - def to_tag_string(self) -> str: """ Return a string representing the opening tag of the element. @@ -308,18 +299,18 @@ def get_css_property(self, property_name: str): return None - def remove_attr(self, attribute: str) -> None: + def remove_attr(self, attribute: Union[str, etree.QName]) -> None: """ Remove an attribute from this node. """ try: - self.lxml_element.attrib.pop(self._replace_shorthand_namespaces(attribute)) + self.lxml_element.attrib.pop(attribute) except KeyError: # If the attribute doesn't exist, just continue pass - def add_attr_value(self, attribute: str, value: str) -> None: + def add_attr_value(self, attribute: Union[str, etree.QName], value: str) -> None: """ Add a space-separated attribute value to the target attribute. If the attribute doesn't exist, add it. @@ -334,7 +325,7 @@ def add_attr_value(self, attribute: str, value: str) -> None: self.set_attr(attribute, (existing_value + " " + value).strip()) - def remove_attr_value(self, attribute: str, value: str) -> None: + def remove_attr_value(self, attribute: Union[str, etree.QName], value: str) -> None: """ Remove a space-separated attribute value from the target attribute. If removing the value makes the attribute empty, remove the attribute. @@ -352,19 +343,19 @@ def remove_attr_value(self, attribute: str, value: str) -> None: if not self.get_attr(attribute): self.remove_attr(attribute) - def get_attr(self, attribute: str) -> str: + def get_attr(self, attribute: Union[str, etree.QName]) -> str: """ Return the value of an attribute on this element. """ - return self.lxml_element.get(self._replace_shorthand_namespaces(attribute)) + return self.lxml_element.get(attribute) - def set_attr(self, attribute: str, value: str) -> str: + def set_attr(self, attribute: Union[str, etree.QName], value: str) -> str: """ Set the value of an attribute on this element. """ - return self.lxml_element.set(self._replace_shorthand_namespaces(attribute), value) + return self.lxml_element.set(attribute, value) def xpath(self, selector: str, return_string: bool = False): """ diff --git a/se/epub.py b/se/epub.py index 2911b0de..23135c86 100644 --- a/se/epub.py +++ b/se/epub.py @@ -35,7 +35,7 @@ def convert_toc_to_ncx(epub_root_absolute_path: Path, toc_filename: str, xsl_fil # Remove empty lang tags for node in ncx_dom.xpath("//*[@xml:lang and re:test(@xml:lang, '^\\s*$')]"): - node.remove_attr("xml:lang") + node.remove_attr(se.easy_xml.XML_LANG_ATTR) for node in ncx_dom.xpath("//navMap"): node.set_attr("id", "navmap") diff --git a/se/formatting.py b/se/formatting.py index 6a1677ef..e35d5075 100644 --- a/se/formatting.py +++ b/se/formatting.py @@ -20,7 +20,7 @@ from unidecode import unidecode import se -from se.easy_xml import EasyXmlTree, EasyXmlElement +from se.easy_xml import EasyXmlTree, EasyXmlElement, EPUB_TYPE_ATTR # This list of phrasing tags is not intended to be exhaustive. The list is only used @@ -1423,7 +1423,7 @@ def generate_title(xhtml: Union[str, EasyXmlTree]) -> str: raise se.InvalidSeEbookException("No [xhtml]
[/] or [xhtml]
[/] element for [xhtml]
[/].") # If the closest parent
or
is a part, division, or volume, then keep all
children - closest_parent_section_epub_type = closest_parent_section.get_attr("epub:type") + closest_parent_section_epub_type = closest_parent_section.get_attr(EPUB_TYPE_ATTR) if not closest_parent_section_epub_type or (closest_parent_section_epub_type and ("part" not in closest_parent_section_epub_type and "division" not in closest_parent_section_epub_type and "volume" not in closest_parent_section_epub_type)): # Else, if the closest parent
or
is a halftitlepage, then discard
subtitles if closest_parent_section_epub_type and "halftitlepage" in closest_parent_section_epub_type: @@ -1471,9 +1471,9 @@ def generate_title(xhtml: Union[str, EasyXmlTree]) -> str: top_level_wrapper = top_level_wrappers[0] # Only guess the title if there is a single value for epub:type - if top_level_wrapper.get_attr("epub:type"): + if top_level_wrapper.get_attr(EPUB_TYPE_ATTR): # Get the first non-namespaced value as the title - for value in top_level_wrapper.get_attr("epub:type").split(" "): + for value in top_level_wrapper.get_attr(EPUB_TYPE_ATTR).split(" "): if value == "z3998:frontispiece": title = "Frontispiece" break @@ -1527,9 +1527,9 @@ def _get_flattened_children(node: EasyXmlElement, allow_header: bool) -> List[Ea for child in node.children: is_endnote = False is_glossdef = False - if child.get_attr("epub:type"): - is_endnote = regex.search(r"\bendnote\b", child.get_attr("epub:type")) - is_glossdef = "glossdef" in child.get_attr("epub:type") + if child.get_attr(EPUB_TYPE_ATTR): + is_endnote = regex.search(r"\bendnote\b", child.get_attr(EPUB_TYPE_ATTR)) + is_glossdef = "glossdef" in child.get_attr(EPUB_TYPE_ATTR) if child.tag not in sectioning_elements and not is_endnote and not is_glossdef: result.append(child) @@ -1571,7 +1571,7 @@ def find_unexpected_ids(dom: EasyXmlTree) -> List[Tuple[EasyXmlElement, str]]: section_id = section.get_attr("id") allow_header = not is_poem - section_epub_type = section.get_attr("epub:type") + section_epub_type = section.get_attr(EPUB_TYPE_ATTR) if section_epub_type: # If this section is a poem or an endnotes container, reset the counters if "z3998:poem" in section_epub_type: @@ -1583,7 +1583,7 @@ def find_unexpected_ids(dom: EasyXmlTree) -> List[Tuple[EasyXmlElement, str]]: endnote_number = 0 # If this section is an endnote, increment the note number and check the ID right now - if regex.search(r"\bendnote\b", section.get_attr("epub:type")): + if regex.search(r"\bendnote\b", section.get_attr(EPUB_TYPE_ATTR)): endnote_number = endnote_number + 1 expected_id = f"note-{endnote_number}" diff --git a/se/se_epub.py b/se/se_epub.py index 06857dce..b417a840 100644 --- a/se/se_epub.py +++ b/se/se_epub.py @@ -17,6 +17,7 @@ import se import se.easy_xml +from se.easy_xml import EPUB_TYPE_ATTR, XML_LANG_ATTR import se.formatting import se.images @@ -487,7 +488,7 @@ def _recompose_xhtml(self, section: se.easy_xml.EasyXmlElement, output_dom: se.e raise se.InvalidXhtmlException(f"Section without [attr]id[/] attribute: [html]{section.to_tag_string()}[/]") if section.parent.tag.lower() == "body" and not section.get_attr("data-parent"): - section.set_attr("epub:type", f"{section.get_attr('epub:type')} {section.parent.get_attr('epub:type')}".strip()) + section.set_attr(EPUB_TYPE_ATTR, f"{section.get_attr(EPUB_TYPE_ATTR)} {section.parent.get_attr(EPUB_TYPE_ATTR)}".strip()) # Try to find our parent element in the current output dom, by ID. # If it's not in the output, then append this element to the elements's closest parent by ID (or ), then iterate over its children and do the same. @@ -657,18 +658,18 @@ def recompose(self, output_xhtml5: bool, extra_css_file: Union[Path,None] = None node.remove() for node in output_dom.xpath("//*[@xml:lang]"): - node.set_attr("lang", node.get_attr("xml:lang")) + node.set_attr("lang", node.get_attr(XML_LANG_ATTR)) else: for node in output_dom.xpath("/html[@epub:prefix]"): - node.remove_attr("epub:prefix") + node.remove_attr(etree.QName(se.easy_xml.EPUB_NAMESPACE, "prefix")) for node in output_dom.xpath("//*[@xml:lang]"): - node.set_attr("lang", node.get_attr("xml:lang")) - node.remove_attr("xml:lang") + node.set_attr("lang", node.get_attr(XML_LANG_ATTR)) + node.remove_attr(XML_LANG_ATTR) for node in output_dom.xpath("//*[@epub:type]"): - node.set_attr("data-epub-type", node.get_attr("epub:type")) - node.remove_attr("epub:type") + node.set_attr("data-epub-type", node.get_attr(EPUB_TYPE_ATTR)) + node.remove_attr(EPUB_TYPE_ATTR) # Get the output XHTML as a string output_xhtml = output_dom.to_string() @@ -757,7 +758,7 @@ def generate_cover_svg(self) -> None: # Embed the file for node in dom.xpath("//*[re:test(@xlink:href, 'cover\\.jpg$')]"): - node.set_attr("xlink:href", "data:image/jpeg;base64," + source_cover_jpg_base64) + node.set_attr(etree.QName(se.easy_xml.XLINK_NAMESPACE, "href"), "data:image/jpeg;base64," + source_cover_jpg_base64) # For the cover we want to keep the path.title-box style, and add an additional # style to color our new paths white @@ -1479,7 +1480,7 @@ def __process_direct_link(self, change_list, link) -> bool: Returns a boolean of needs_write (whether object needs to be re-written) """ - epub_type = link.get_attr("epub:type") or "" + epub_type = link.get_attr(EPUB_TYPE_ATTR) or "" if not epub_type: # it wasn't an actual endnote reference but a direct link (we hope!) href = link.get_attr("href") or "" if href: diff --git a/se/se_epub_build.py b/se/se_epub_build.py index 6ec74efa..5f42b16e 100644 --- a/se/se_epub_build.py +++ b/se/se_epub_build.py @@ -26,6 +26,7 @@ import se import se.easy_xml +from se.easy_xml import EPUB_TYPE_ATTR import se.epub import se.formatting import se.images @@ -448,7 +449,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k # Since we added an outlining stroke to the titlepage/publisher logo images, we # want to remove the se:image.color-depth.black-on-transparent semantic for node in dom.xpath("/html/body//img[ (contains(@epub:type, 'z3998:publisher-logo') or ancestor-or-self::*[re:test(@epub:type, '\\btitlepage\\b')]) and contains(@epub:type, 'se:image.color-depth.black-on-transparent')]"): - node.remove_attr_value("epub:type", "se:image.color-depth.black-on-transparent") + node.remove_attr_value(EPUB_TYPE_ATTR, "se:image.color-depth.black-on-transparent") # Add ARIA roles, which are just mostly duplicate attributes to epub:type for role in ARIA_ROLES: @@ -459,7 +460,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k if node.tag == "article": continue - attr_values = regex.split(r"\s", node.get_attr("epub:type")) + attr_values = regex.split(r"\s", node.get_attr(EPUB_TYPE_ATTR)) if len(attr_values) > 1: # If there is more than one value for epub:type, ace expects the `role` attribute @@ -477,10 +478,10 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k # Matching `endnote` will also catch `endnotes` for node in dom.xpath("/html/body//*[contains(@epub:type, 'endnote')]"): plural = "" - if "endnotes" in node.get_attr("epub:type"): + if "endnotes" in node.get_attr(EPUB_TYPE_ATTR): plural = "s" - node.add_attr_value("epub:type", "footnote" + plural) + node.add_attr_value(EPUB_TYPE_ATTR, "footnote" + plural) # Remember to get our custom style selectors that we added, too if "epub-type-endnote" + plural in (node.get_attr("class") or ""): @@ -492,7 +493,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k # Include extra lang tag for accessibility compatibility for node in dom.xpath("//*[@xml:lang]"): - node.set_attr("lang", node.get_attr("xml:lang")) + node.set_attr("lang", node.get_attr(se.easy_xml.XML_LANG_ATTR)) processed_xhtml = se.formatting.format_xhtml(dom.to_string()) @@ -718,7 +719,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k # Change 'noteref' to 'endnote' so that popup endnotes work in Kobo. Kobo doesn't understand 'noteref', only 'endnote'. for node in dom.xpath("/html/body//a[contains(@epub:type, 'noteref')]"): - node.set_attr("epub:type", node.get_attr("epub:type") + " endnote") + node.set_attr(EPUB_TYPE_ATTR, node.get_attr(EPUB_TYPE_ATTR) + " endnote") # Now add the kobo spans kobo.add_kobo_spans_to_node(dom.xpath("/html/body")[0].lxml_element) @@ -977,7 +978,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k img_node = se.easy_xml.EasyXmlElement("", {"epub": "http://www.idpf.org/2007/ops"}) img_node.set_attr("class", "mathml epub-type-se-image-color-depth-black-on-transparent") - img_node.set_attr("epub:type", "se:image.color-depth.black-on-transparent") + img_node.set_attr(EPUB_TYPE_ATTR, "se:image.color-depth.black-on-transparent") img_node.set_attr("src", f"../images/mathml-{mathml_count}-2x.png") if node.get_attr("alttext"): img_node.set_attr("alt", node.get_attr("alttext")) @@ -1074,9 +1075,9 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k ref_node.set_attr("title", node.text) ref_node.set_attr("href", node.get_attr("href")) - if node.get_attr("epub:type"): + if node.get_attr(EPUB_TYPE_ATTR): # Set the `type` attribute and remove any z3998 items, as well as front/body/backmatter - ref_node.set_attr("type", node.get_attr("epub:type")) + ref_node.set_attr("type", node.get_attr(EPUB_TYPE_ATTR)) ref_node.set_attr("type", regex.sub(r"\s*\b(front|body|back)matter\b\s*", "", ref_node.get_attr("type"))) ref_node.set_attr("type", regex.sub(r"\s*\bz3998:.+\b\s*", "", ref_node.get_attr("type"))) @@ -1329,8 +1330,8 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k if node.get_attr("class"): node.set_attr("class", node.get_attr("class").replace("epub-type-se-image-color-depth-black-on-transparent", "").replace("epub-type-se-image-style-realistic", "")) - if node.get_attr("epub:type"): - node.set_attr("epub:type", node.get_attr("epub:type").replace("se:image.color-depth.black-on-transparent", "").replace("se:image.style.realistic", "")) + if node.get_attr(EPUB_TYPE_ATTR): + node.set_attr(EPUB_TYPE_ATTR, node.get_attr(EPUB_TYPE_ATTR).replace("se:image.color-depth.black-on-transparent", "").replace("se:image.style.realistic", "")) # If the only element on the page is an absolutely positioned image, Kindle will ignore the file in the reading order. # So, in that case we add a `
` with some text content to fool Kindle. @@ -1387,7 +1388,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k # Remove the epub:type attribute, as Calibre turns it into just "type" for node in dom.xpath("//*[@epub:type]"): - node.remove_attr("epub:type") + node.remove_attr(EPUB_TYPE_ATTR) # Kindle doesn't recognize most zero-width spaces or word joiners, so just remove them. # It does recognize the word joiner character, but only in the old mobi7 format. The new format renders them as spaces. diff --git a/se/se_epub_generate_toc.py b/se/se_epub_generate_toc.py index f25eedaa..f5222d50 100644 --- a/se/se_epub_generate_toc.py +++ b/se/se_epub_generate_toc.py @@ -15,7 +15,7 @@ import se import se.formatting import se.easy_xml -from se.easy_xml import EasyXmlTree, EasyXmlElement +from se.easy_xml import EasyXmlTree, EasyXmlElement, EPUB_TYPE_ATTR, XML_LANG_ATTR class BookDivision(Enum): @@ -146,7 +146,7 @@ def get_place(node: EasyXmlElement) -> Position: a Position enum value indicating the place in the book """ - epub_type = node.get_attr("epub:type") + epub_type = node.get_attr(EPUB_TYPE_ATTR) if not epub_type: return Position.NONE @@ -184,13 +184,13 @@ def add_landmark(dom: EasyXmlTree, textf: str, landmarks: list) -> None: sections = dom.xpath("//body/*[name() = 'section' or name() = 'article' or name() = 'nav']") if not sections: raise se.InvalidInputException("Couldn’t locate first [xhtml]
[/], [xhtml]
[/], or [xhtml]