Replace regex-based shorthand-namespace expansion with QName for attr manipulation

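In local profiling of the repository from #XXX, this reduces the time spent in `regex.sub` from 6 seconds to 1 second, because `get_attr`, `set_attr`, and `remove_attr` no longer run `_replace_shorthand_namespaces` (a `regex.sub` call) on every attribute access; callers now pass a prebuilt `etree.QName` instead of a shorthand string such as `epub:type`.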
apasel422 · Jun 17, 2024 · 429a112
1 parent 6ee7e3a
commit 429a112
Show file tree

Hide file tree

Showing 7 changed files with 69 additions and 75 deletions.
diff --git a/se/easy_xml.py b/se/easy_xml.py
@@ -20,6 +20,14 @@
 CSS_SELECTOR_CACHE: Dict[str, cssselect.CSSSelector] = {}
 CSS_RULES_CACHE: Dict[str, List[se.css.CssRule]] = {}
 
+EPUB_NAMESPACE ="http://www.idpf.org/2007/ops"
+EPUB_TYPE_ATTR = etree.QName(EPUB_NAMESPACE, "type")
+
+XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
+XML_LANG_ATTR = etree.QName(XML_NAMESPACE, "lang")
+
+XLINK_NAMESPACE = "http://www.w3.org/1999/xlink"
+
 def escape_xpath(string: str) -> str:
 	"""
 	Xpath string literals don't have escape sequences for ' and "
@@ -48,7 +56,7 @@ class EasyXmlTree:
 	"""
 
 	def __init__(self, xml: Union[str, etree._ElementTree]):
-		self.namespaces = {"re": "http://exslt.org/regular-expressions", "xml": "http://www.w3.org/XML/1998/namespace"} # Enable regular expressions in xpath; xml is the default xml namespace
+		self.namespaces = {"re": "http://exslt.org/regular-expressions", "xml": XML_NAMESPACE} # Enable regular expressions in xpath; xml is the default xml namespace
 		self.default_namespace = None
 
 		if isinstance(xml, etree._ElementTree):
@@ -222,23 +230,6 @@ def __init__(self, lxml_element: Union[str, etree._ElementTree], namespaces=None
 		else:
 			self.lxml_element = lxml_element
 
-	def _replace_shorthand_namespaces(self, value:str) -> str:
-		"""
-		Given a string starting with a shorthand namespace, return
-		the fully qualified namespace.
-
-		This is useful for passing to raw lxml operations as lxml doesn't understand
-		shorthand namespaces.
-
-		Example:
-		epub:type -> {http://www.idpf.org/2007/ops}type
-		"""
-
-		if self.namespaces:
-			value = regex.sub(r"^(\L<ns>):", lambda m: f"{{{self.namespaces[m[1]]}}}", value, ns=self.namespaces.keys())
-
-		return value
-
 	def to_tag_string(self) -> str:
 		"""
 		Return a string representing the opening tag of the element.
@@ -308,18 +299,18 @@ def get_css_property(self, property_name: str):
 
 		return None
 
-	def remove_attr(self, attribute: str) -> None:
+	def remove_attr(self, attribute: Union[str, etree.QName]) -> None:
 		"""
 		Remove an attribute from this node.
 		"""
 
 		try:
-			self.lxml_element.attrib.pop(self._replace_shorthand_namespaces(attribute))
+			self.lxml_element.attrib.pop(attribute)
 		except KeyError:
 			# If the attribute doesn't exist, just continue
 			pass
 
-	def add_attr_value(self, attribute: str, value: str) -> None:
+	def add_attr_value(self, attribute: Union[str, etree.QName], value: str) -> None:
 		"""
 		Add a space-separated attribute value to the target attribute.
 		If the attribute doesn't exist, add it.
@@ -334,7 +325,7 @@ def add_attr_value(self, attribute: str, value: str) -> None:
 
 		self.set_attr(attribute, (existing_value + " " + value).strip())
 
-	def remove_attr_value(self, attribute: str, value: str) -> None:
+	def remove_attr_value(self, attribute: Union[str, etree.QName], value: str) -> None:
 		"""
 		Remove a space-separated attribute value from the target attribute.
 		If removing the value makes the attribute empty, remove the attribute.
@@ -352,19 +343,19 @@ def remove_attr_value(self, attribute: str, value: str) -> None:
 			if not self.get_attr(attribute):
 				self.remove_attr(attribute)
 
-	def get_attr(self, attribute: str) -> str:
+	def get_attr(self, attribute: Union[str, etree.QName]) -> str:
 		"""
 		Return the value of an attribute on this element.
 		"""
 
-		return self.lxml_element.get(self._replace_shorthand_namespaces(attribute))
+		return self.lxml_element.get(attribute)
 
-	def set_attr(self, attribute: str, value: str) -> str:
+	def set_attr(self, attribute: Union[str, etree.QName], value: str) -> str:
 		"""
 		Set the value of an attribute on this element.
 		"""
 
-		return self.lxml_element.set(self._replace_shorthand_namespaces(attribute), value)
+		return self.lxml_element.set(attribute, value)
 
 	def xpath(self, selector: str, return_string: bool = False):
 		"""

diff --git a/se/epub.py b/se/epub.py
@@ -35,7 +35,7 @@ def convert_toc_to_ncx(epub_root_absolute_path: Path, toc_filename: str, xsl_fil
 
 	# Remove empty lang tags
 	for node in ncx_dom.xpath("//*[@xml:lang and re:test(@xml:lang, '^\\s*$')]"):
-		node.remove_attr("xml:lang")
+		node.remove_attr(se.easy_xml.XML_LANG_ATTR)
 
 	for node in ncx_dom.xpath("//navMap"):
 		node.set_attr("id", "navmap")

diff --git a/se/formatting.py b/se/formatting.py
@@ -20,7 +20,7 @@
 from unidecode import unidecode
 
 import se
-from se.easy_xml import EasyXmlTree, EasyXmlElement
+from se.easy_xml import EasyXmlTree, EasyXmlElement, EPUB_TYPE_ATTR
 
 
 # This list of phrasing tags is not intended to be exhaustive. The list is only used
@@ -1423,7 +1423,7 @@ def generate_title(xhtml: Union[str, EasyXmlTree]) -> str:
 			raise se.InvalidSeEbookException("No [xhtml]<section>[/] or [xhtml]<article>[/] element for [xhtml]<hgroup>[/].")
 
 		# If the closest parent <section> or <article> is a part, division, or volume, then keep all <hgroup> children
-		closest_parent_section_epub_type = closest_parent_section.get_attr("epub:type")
+		closest_parent_section_epub_type = closest_parent_section.get_attr(EPUB_TYPE_ATTR)
 		if not closest_parent_section_epub_type or (closest_parent_section_epub_type and ("part" not in closest_parent_section_epub_type and "division" not in closest_parent_section_epub_type and "volume" not in closest_parent_section_epub_type)):
 			# Else, if the closest parent <section> or <article> is a halftitlepage, then discard <hgroup> subtitles
 			if closest_parent_section_epub_type and "halftitlepage" in closest_parent_section_epub_type:
@@ -1471,9 +1471,9 @@ def generate_title(xhtml: Union[str, EasyXmlTree]) -> str:
 				top_level_wrapper = top_level_wrappers[0]
 
 				# Only guess the title if there is a single value for epub:type
-				if top_level_wrapper.get_attr("epub:type"):
+				if top_level_wrapper.get_attr(EPUB_TYPE_ATTR):
 					# Get the first non-namespaced value as the title
-					for value in top_level_wrapper.get_attr("epub:type").split(" "):
+					for value in top_level_wrapper.get_attr(EPUB_TYPE_ATTR).split(" "):
 						if value == "z3998:frontispiece":
 							title = "Frontispiece"
 							break
@@ -1527,9 +1527,9 @@ def _get_flattened_children(node: EasyXmlElement, allow_header: bool) -> List[Ea
 	for child in node.children:
 		is_endnote = False
 		is_glossdef = False
-		if child.get_attr("epub:type"):
-			is_endnote = regex.search(r"\bendnote\b", child.get_attr("epub:type"))
-			is_glossdef = "glossdef" in child.get_attr("epub:type")
+		if child.get_attr(EPUB_TYPE_ATTR):
+			is_endnote = regex.search(r"\bendnote\b", child.get_attr(EPUB_TYPE_ATTR))
+			is_glossdef = "glossdef" in child.get_attr(EPUB_TYPE_ATTR)
 
 		if child.tag not in sectioning_elements and not is_endnote and not is_glossdef:
 			result.append(child)
@@ -1571,7 +1571,7 @@ def find_unexpected_ids(dom: EasyXmlTree) -> List[Tuple[EasyXmlElement, str]]:
 		section_id = section.get_attr("id")
 		allow_header = not is_poem
 
-		section_epub_type = section.get_attr("epub:type")
+		section_epub_type = section.get_attr(EPUB_TYPE_ATTR)
 		if section_epub_type:
 			# If this section is a poem or an endnotes container, reset the counters
 			if "z3998:poem" in section_epub_type:
@@ -1583,7 +1583,7 @@ def find_unexpected_ids(dom: EasyXmlTree) -> List[Tuple[EasyXmlElement, str]]:
 				endnote_number = 0
 
 			# If this section is an endnote, increment the note number and check the ID right now
-			if regex.search(r"\bendnote\b", section.get_attr("epub:type")):
+			if regex.search(r"\bendnote\b", section.get_attr(EPUB_TYPE_ATTR)):
 				endnote_number = endnote_number + 1
 				expected_id = f"note-{endnote_number}"
 

diff --git a/se/se_epub.py b/se/se_epub.py
@@ -17,6 +17,7 @@
 
 import se
 import se.easy_xml
+from se.easy_xml import EPUB_TYPE_ATTR, XML_LANG_ATTR
 import se.formatting
 import se.images
 
@@ -487,7 +488,7 @@ def _recompose_xhtml(self, section: se.easy_xml.EasyXmlElement, output_dom: se.e
 			raise se.InvalidXhtmlException(f"Section without [attr]id[/] attribute: [html]{section.to_tag_string()}[/]")
 
 		if section.parent.tag.lower() == "body" and not section.get_attr("data-parent"):
-			section.set_attr("epub:type", f"{section.get_attr('epub:type')} {section.parent.get_attr('epub:type')}".strip())
+			section.set_attr(EPUB_TYPE_ATTR, f"{section.get_attr(EPUB_TYPE_ATTR)} {section.parent.get_attr(EPUB_TYPE_ATTR)}".strip())
 
 		# Try to find our parent element in the current output dom, by ID.
 		# If it's not in the output, then append this element to the elements's closest parent by ID (or <body>), then iterate over its children and do the same.
@@ -657,18 +658,18 @@ def recompose(self, output_xhtml5: bool, extra_css_file: Union[Path,None] = None
 				node.remove()
 
 			for node in output_dom.xpath("//*[@xml:lang]"):
-				node.set_attr("lang", node.get_attr("xml:lang"))
+				node.set_attr("lang", node.get_attr(XML_LANG_ATTR))
 		else:
 			for node in output_dom.xpath("/html[@epub:prefix]"):
-				node.remove_attr("epub:prefix")
+				node.remove_attr(etree.QName(se.easy_xml.EPUB_NAMESPACE, "prefix"))
 
 			for node in output_dom.xpath("//*[@xml:lang]"):
-				node.set_attr("lang", node.get_attr("xml:lang"))
-				node.remove_attr("xml:lang")
+				node.set_attr("lang", node.get_attr(XML_LANG_ATTR))
+				node.remove_attr(XML_LANG_ATTR)
 
 			for node in output_dom.xpath("//*[@epub:type]"):
-				node.set_attr("data-epub-type", node.get_attr("epub:type"))
-				node.remove_attr("epub:type")
+				node.set_attr("data-epub-type", node.get_attr(EPUB_TYPE_ATTR))
+				node.remove_attr(EPUB_TYPE_ATTR)
 
 		# Get the output XHTML as a string
 		output_xhtml = output_dom.to_string()
@@ -757,7 +758,7 @@ def generate_cover_svg(self) -> None:
 
 			# Embed the file
 			for node in dom.xpath("//*[re:test(@xlink:href, 'cover\\.jpg$')]"):
-				node.set_attr("xlink:href", "data:image/jpeg;base64," + source_cover_jpg_base64)
+				node.set_attr(etree.QName(se.easy_xml.XLINK_NAMESPACE, "href"), "data:image/jpeg;base64," + source_cover_jpg_base64)
 
 			# For the cover we want to keep the path.title-box style, and add an additional
 			# style to color our new paths white
@@ -1479,7 +1480,7 @@ def __process_direct_link(self, change_list, link) -> bool:
 
 		Returns a boolean of needs_write (whether object needs to be re-written)
 		"""
-		epub_type = link.get_attr("epub:type") or ""
+		epub_type = link.get_attr(EPUB_TYPE_ATTR) or ""
 		if not epub_type: # it wasn't an actual endnote reference but a direct link (we hope!)
 			href = link.get_attr("href") or ""
 			if href:

diff --git a/se/se_epub_build.py b/se/se_epub_build.py
@@ -26,6 +26,7 @@
 
 import se
 import se.easy_xml
+from se.easy_xml import EPUB_TYPE_ATTR
 import se.epub
 import se.formatting
 import se.images
@@ -448,7 +449,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k
 				# Since we added an outlining stroke to the titlepage/publisher logo images, we
 				# want to remove the se:image.color-depth.black-on-transparent semantic
 				for node in dom.xpath("/html/body//img[ (contains(@epub:type, 'z3998:publisher-logo') or ancestor-or-self::*[re:test(@epub:type, '\\btitlepage\\b')]) and contains(@epub:type, 'se:image.color-depth.black-on-transparent')]"):
-					node.remove_attr_value("epub:type", "se:image.color-depth.black-on-transparent")
+					node.remove_attr_value(EPUB_TYPE_ATTR, "se:image.color-depth.black-on-transparent")
 
 				# Add ARIA roles, which are just mostly duplicate attributes to epub:type
 				for role in ARIA_ROLES:
@@ -459,7 +460,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k
 						if node.tag == "article":
 							continue
 
-						attr_values = regex.split(r"\s", node.get_attr("epub:type"))
+						attr_values = regex.split(r"\s", node.get_attr(EPUB_TYPE_ATTR))
 
 						if len(attr_values) > 1:
 							# If there is more than one value for epub:type, ace expects the `role` attribute
@@ -477,10 +478,10 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k
 				# Matching `endnote` will also catch `endnotes`
 				for node in dom.xpath("/html/body//*[contains(@epub:type, 'endnote')]"):
 					plural = ""
-					if "endnotes" in node.get_attr("epub:type"):
+					if "endnotes" in node.get_attr(EPUB_TYPE_ATTR):
 						plural = "s"
 
-					node.add_attr_value("epub:type", "footnote" + plural)
+					node.add_attr_value(EPUB_TYPE_ATTR, "footnote" + plural)
 
 					# Remember to get our custom style selectors that we added, too
 					if "epub-type-endnote" + plural in (node.get_attr("class") or ""):
@@ -492,7 +493,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k
 
 				# Include extra lang tag for accessibility compatibility
 				for node in dom.xpath("//*[@xml:lang]"):
-					node.set_attr("lang", node.get_attr("xml:lang"))
+					node.set_attr("lang", node.get_attr(se.easy_xml.XML_LANG_ATTR))
 
 				processed_xhtml = se.formatting.format_xhtml(dom.to_string())
 
@@ -718,7 +719,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k
 
 					# Change 'noteref' to 'endnote' so that popup endnotes work in Kobo. Kobo doesn't understand 'noteref', only 'endnote'.
 					for node in dom.xpath("/html/body//a[contains(@epub:type, 'noteref')]"):
-						node.set_attr("epub:type", node.get_attr("epub:type") + " endnote")
+						node.set_attr(EPUB_TYPE_ATTR, node.get_attr(EPUB_TYPE_ATTR) + " endnote")
 
 					# Now add the kobo spans
 					kobo.add_kobo_spans_to_node(dom.xpath("/html/body")[0].lxml_element)
@@ -977,7 +978,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k
 
 							img_node = se.easy_xml.EasyXmlElement("<img/>", {"epub": "http://www.idpf.org/2007/ops"})
 							img_node.set_attr("class", "mathml epub-type-se-image-color-depth-black-on-transparent")
-							img_node.set_attr("epub:type", "se:image.color-depth.black-on-transparent")
+							img_node.set_attr(EPUB_TYPE_ATTR, "se:image.color-depth.black-on-transparent")
 							img_node.set_attr("src", f"../images/mathml-{mathml_count}-2x.png")
 							if node.get_attr("alttext"):
 								img_node.set_attr("alt", node.get_attr("alttext"))
@@ -1074,9 +1075,9 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k
 
 			ref_node.set_attr("title", node.text)
 			ref_node.set_attr("href", node.get_attr("href"))
-			if node.get_attr("epub:type"):
+			if node.get_attr(EPUB_TYPE_ATTR):
 				# Set the `type` attribute and remove any z3998 items, as well as front/body/backmatter
-				ref_node.set_attr("type", node.get_attr("epub:type"))
+				ref_node.set_attr("type", node.get_attr(EPUB_TYPE_ATTR))
 				ref_node.set_attr("type", regex.sub(r"\s*\b(front|body|back)matter\b\s*", "", ref_node.get_attr("type")))
 				ref_node.set_attr("type", regex.sub(r"\s*\bz3998:.+\b\s*", "", ref_node.get_attr("type")))
 
@@ -1329,8 +1330,8 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k
 					if node.get_attr("class"):
 						node.set_attr("class", node.get_attr("class").replace("epub-type-se-image-color-depth-black-on-transparent", "").replace("epub-type-se-image-style-realistic", ""))
 
-					if node.get_attr("epub:type"):
-						node.set_attr("epub:type", node.get_attr("epub:type").replace("se:image.color-depth.black-on-transparent", "").replace("se:image.style.realistic", ""))
+					if node.get_attr(EPUB_TYPE_ATTR):
+						node.set_attr(EPUB_TYPE_ATTR, node.get_attr(EPUB_TYPE_ATTR).replace("se:image.color-depth.black-on-transparent", "").replace("se:image.style.realistic", ""))
 
 				# If the only element on the page is an absolutely positioned image, Kindle will ignore the file in the reading order.
 				# So, in that case we add a `<div>` with some text content to fool Kindle.
@@ -1387,7 +1388,7 @@ def build(self, run_epubcheck: bool, check_only: bool, build_kobo: bool, build_k
 
 				# Remove the epub:type attribute, as Calibre turns it into just "type"
 				for node in dom.xpath("//*[@epub:type]"):
-					node.remove_attr("epub:type")
+					node.remove_attr(EPUB_TYPE_ATTR)
 
 				# Kindle doesn't recognize most zero-width spaces or word joiners, so just remove them.
 				# It does recognize the word joiner character, but only in the old mobi7 format.  The new format renders them as spaces.