Skip to content

Commit

Permalink
Clean up regex flags
Browse files Browse the repository at this point in the history
- MULTILINE is irrelevant unless ^ is used to match line starts
- IGNORECASE is irrelevant when case-agnostic character classes are
  already being used (e.g. \w or \p{Letter}) or when only
  punctuation/spaces are being matched
- DOTALL is irrelevant unless . is used to match all characters
  • Loading branch information
apasel422 committed Jul 15, 2024
1 parent a394344 commit 8665308
Show file tree
Hide file tree
Showing 8 changed files with 51 additions and 51 deletions.
6 changes: 3 additions & 3 deletions se/commands/create_draft.py
Original file line number Diff line number Diff line change
Expand Up @@ -727,8 +727,8 @@ def _create_draft(args: Namespace, plain_output: bool):

producers_text = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", producers_text, flags=regex.DOTALL)
producers_text = regex.sub(r"\(.+?\)", "", producers_text, flags=regex.DOTALL)
producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net", "", producers_text, flags=regex.DOTALL)
producers_text = regex.sub(r"[\r\n]+", " ", producers_text, flags=regex.DOTALL)
producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net", "", producers_text)
producers_text = regex.sub(r"[\r\n]+", " ", producers_text)
producers_text = regex.sub(r",? and ", ", and ", producers_text)
producers_text = producers_text.replace(" and the Online", " and The Online")
producers_text = producers_text.replace(", and ", ", ").strip()
Expand Down Expand Up @@ -945,7 +945,7 @@ def _create_draft(args: Namespace, plain_output: bool):

i = i + 1

metadata_xml = regex.sub(r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>", "\t\t" + producers_xhtml.strip(), metadata_xml, flags=regex.DOTALL)
metadata_xml = regex.sub(r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url\.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>", "\t\t" + producers_xhtml.strip(), metadata_xml)

if ebook_wiki_url:
metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<", f">{ebook_wiki_url}<")
Expand Down
2 changes: 1 addition & 1 deletion se/commands/word_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def word_count(plain_output: bool) -> int:

else:
# We couldn't generate a dom, fall back to regex replacements
xhtml = regex.sub(r"<(pre|div|p)[^>]*?>[^<]*Project Gutenberg[^<]+?</\1>", "", xhtml, flags=regex.IGNORECASE|regex.DOTALL)
xhtml = regex.sub(r"<(pre|div|p)[^>]*?>[^<]*Project Gutenberg[^<]+?</\1>", "", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"<span class=\"pagenum\">.+?</span>", "", xhtml, flags=regex.IGNORECASE|regex.DOTALL)

total_word_count += se.formatting.get_word_count(xhtml)
Expand Down
18 changes: 9 additions & 9 deletions se/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,16 +362,16 @@ def get_word_count(xhtml: str) -> int:
xhtml = regex.sub(r"<.+?>", " ", xhtml, flags=regex.DOTALL)

# Replace some formatting characters
xhtml = regex.sub(r"[…–—― ‘’“”\{\}\(\)]", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
xhtml = regex.sub(r"[…–—― ‘’“”\{\}\(\)]", " ", xhtml)

	# Remove word-connecting dashes, apostrophes, commas, and slashes (and/or), they count as a word boundary but they shouldn't
xhtml = regex.sub(fr"[\p{{Letter}}0-9][\-\'\,\.\/{se.NO_BREAK_HYPHEN}{se.SHY_HYPHEN}][\p{{Letter}}0-9]", "aa", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
xhtml = regex.sub(fr"[\p{{Letter}}0-9][\-\'\,\.\/{se.NO_BREAK_HYPHEN}{se.SHY_HYPHEN}][\p{{Letter}}0-9]", "aa", xhtml)

# Replace sequential spaces with one space
xhtml = regex.sub(r"\s+", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
xhtml = regex.sub(r"\s+", " ", xhtml)

# Get the word count
return len(regex.findall(r"\b\w+\b", xhtml, flags=regex.IGNORECASE | regex.DOTALL))
return len(regex.findall(r"\b\w+\b", xhtml))

def _replace_character_references(match_object) -> str:
"""Replace most XML character references with literal characters.
Expand Down Expand Up @@ -660,13 +660,13 @@ def format_xhtml(xhtml: str) -> str:
xhtml = regex.sub(r"&#?\w+;", _replace_character_references, xhtml)

# Remove unnecessary doctypes which can cause xmllint to hang
xhtml = regex.sub(r"<!DOCTYPE[^>]+?>", "", xhtml, flags=regex.DOTALL)
xhtml = regex.sub(r"<!DOCTYPE[^>]+?>", "", xhtml)

# Remove white space between opening/closing tag and text nodes
# We do this first so that we can still format line breaks after <br/>
# Exclude comments
xhtml = regex.sub(r"(<(?:[^!/][^>]*?[^/]|[a-z])>)\s+([^\s<])", r"\1\2", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"([^\s>])\s+(</[^>]+?>)", r"\1\2", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"([^\s>])\s+(</[^>]+?>)", r"\1\2", xhtml)

try:
tree = _format_xml_str(xhtml)
Expand Down Expand Up @@ -1088,7 +1088,7 @@ def format_css(css: str) -> str:
output = regex.sub(r"(@[\p{Letter}]+) \(", "\\1(", output)

# Remove empty rules
output = regex.sub(r"^\t*[^\{\}]+?\{\s*\}\n", "", output, flags=regex.DOTALL|regex.MULTILINE)
output = regex.sub(r"^\t*[^\{\}]+?\{\s*\}\n", "", output, flags=regex.MULTILINE)

return output

Expand All @@ -1103,7 +1103,7 @@ def remove_tags(text: str) -> str:
A string with all HTML tags removed
"""

return regex.sub(r"</?[\p{Letter}]+[^>]*?>", "", text, flags=regex.DOTALL)
return regex.sub(r"</?[\p{Letter}]+[^>]*?>", "", text)

def get_ordinal(number: str) -> str:
"""
Expand Down Expand Up @@ -1296,7 +1296,7 @@ def make_url_safe(text: str) -> str:
text = regex.sub(r"['‘’`]", "", text)

# 5. Convert any non-digit, non-letter character to a space
text = regex.sub(r"[^0-9\p{Letter}]", " ", text, flags=regex.IGNORECASE)
text = regex.sub(r"[^0-9\p{Letter}]", " ", text)

# 6. Convert any instance of one or more space to a dash
text = regex.sub(r"\s+", "-", text)
Expand Down
2 changes: 1 addition & 1 deletion se/se_epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,7 +699,7 @@ def recompose(self, output_xhtml5: bool, extra_css_file: Union[Path,None] = None
output_xhtml = output_xhtml.replace("epub|type", "data-epub-type")
output_xhtml = output_xhtml.replace("xml|lang", "lang")
output_xhtml = regex.sub(r" xmlns.+?=\".+?\"", "", output_xhtml)
output_xhtml = regex.sub(r"@namespace (epub|xml).+?\s+", "", output_xhtml, flags=regex.MULTILINE)
output_xhtml = regex.sub(r"@namespace (epub|xml).+?\s+", "", output_xhtml)

# The Nu HTML5 Validator barfs if non-void elements are self-closed (like <td/>)
# Try to un-self-close them for HTML5 output.
Expand Down
2 changes: 1 addition & 1 deletion se/se_epub_generate_toc.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def toc_link(self) -> str:
out_string += f"<a href=\"text/{self.file_link}\">{self.title}</a>\n"

# Replace <br/> with a single space
out_string = regex.sub(r"<br/>\s*", " ", out_string, flags=regex.DOTALL)
out_string = regex.sub(r"<br/>\s*", " ", out_string)

return out_string

Expand Down
14 changes: 7 additions & 7 deletions se/se_epub_lint.py
Original file line number Diff line number Diff line change
Expand Up @@ -1827,7 +1827,7 @@ def _lint_xhtml_syntax_checks(self, filename: Path, dom: se.easy_xml.EasyXmlTree
title = regex.sub(r"^[\s\.\,\!\?\:\;]*", "", title)

# Normalize whitespace
title = regex.sub(r"\s+", " ", title, flags=regex.DOTALL).strip()
title = regex.sub(r"\s+", " ", title).strip()

# Do we have a subtitle? If so the first letter of that must be capitalized, so we pull that out
subtitle_matches = regex.findall(r"(.*?)<span epub:type=\"subtitle\">(.*?)</span>(.*?)", title, flags=regex.DOTALL)
Expand Down Expand Up @@ -2313,7 +2313,7 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree,

# Check for repeated punctuation, but first remove `&amp;` so we don't match `&amp;,`
# Remove tds with repeated ” as they are probably ditto marks
matches = regex.findall(r"[,;]{2,}.{0,20}", file_contents.replace("&amp;", "")) + regex.findall(r"(?:“\s*“|”\s*”|’ ’|‘\s*‘).{0,20}", regex.sub(r"<td>[”\s]+?(<a .+?epub:type=\"noteref\">.+?</a>)?</td>", "", file_contents)) + regex.findall(r"[\p{Letter}][,\.:;]\s[,\.:;]\s?[\p{Letter}<].{0,20}", file_contents, flags=regex.IGNORECASE)
matches = regex.findall(r"[,;]{2,}.{0,20}", file_contents.replace("&amp;", "")) + regex.findall(r"(?:“\s*“|”\s*”|’ ’|‘\s*‘).{0,20}", regex.sub(r"<td>[”\s]+?(<a .+?epub:type=\"noteref\">.+?</a>)?</td>", "", file_contents)) + regex.findall(r"[\p{Letter}][,\.:;]\s[,\.:;]\s?[\p{Letter}<].{0,20}", file_contents)
if matches:
messages.append(LintMessage("t-008", "Repeated punctuation.", se.MESSAGE_TYPE_WARNING, filename, matches))

Expand Down Expand Up @@ -2608,7 +2608,7 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree,
messages.append(LintMessage("t-048", "Chapter opening text in all-caps.", se.MESSAGE_TYPE_ERROR, filename, [node.to_string() for node in nodes]))

# Check for two-em-dashes used for elision instead of three-em-dashes
matches = regex.findall(fr"[^{se.WORD_JOINER}\p{{Letter}}”]⸺[^“{se.WORD_JOINER}\p{{Letter}}].*", file_contents, flags=regex.MULTILINE)
matches = regex.findall(fr"[^{se.WORD_JOINER}\p{{Letter}}”]⸺[^“{se.WORD_JOINER}\p{{Letter}}].*", file_contents)
if matches:
messages.append(LintMessage("t-049", "Two-em-dash used for eliding an entire word. Use a three-em-dash instead.", se.MESSAGE_TYPE_WARNING, filename, matches))

Expand Down Expand Up @@ -2934,7 +2934,7 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c
# Exclude paragraphs in blockquotes, which may have special quoting rules, and "continued" paragraphs, which may be continued dialog without an “
for node in dom_copy.xpath("/html/body//p[not(ancestor::blockquote) and not(contains(@class, 'continued'))]"):
node.set_attr("id", "lint-" + str(node_number))
temp_xhtml = temp_xhtml + f"<p id=\"lint-{node_number}\">" + regex.sub(r"[\s\n]+", " ", node.inner_text(), flags=regex.DOTALL) + "\n"
temp_xhtml = temp_xhtml + f"<p id=\"lint-{node_number}\">" + regex.sub(r"\s+", " ", node.inner_text()) + "\n"
node_number = node_number + 1

replacement_count = 1
Expand All @@ -2943,12 +2943,12 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c
(temp_xhtml, replacement_count) = regex.subn(r"“[^“]+?”", " ", temp_xhtml) # Remove all regular quotes

# Remove contractions to reduce rsquo for next regex
temp_xhtml = regex.sub(r"[\p{Letter}]’[\p{Letter}]", " ", temp_xhtml, flags=regex.MULTILINE)
temp_xhtml = regex.sub(r"[\p{Letter}]’[\p{Letter}]", " ", temp_xhtml)

# Remove all runs of ldquo that are likely to spill to the next <p>
replacement_count = 1
while replacement_count > 0:
(temp_xhtml, replacement_count) = regex.subn(r"“[^“”]+?$", " ", temp_xhtml, flags=regex.MULTILINE)
(temp_xhtml, replacement_count) = regex.subn(r"“[^“”]+?$", " ", temp_xhtml)

# Match problem `‘` using regex, and if found, get the actual node text from the dom to return.
typos = []
Expand Down Expand Up @@ -3009,7 +3009,7 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c

# Check for closing rdquo without opening ldquo.
# Remove tds in case rdquo means "ditto mark"
typos = regex.findall(r"”[^“‘]+?”", regex.sub(r"<td[^>]*?>[”\s]+?(<a .+?epub:type=\"noteref\">.+?</a>)?</td>", "", file_contents), flags=regex.DOTALL)
typos = regex.findall(r"”[^“‘]+?”", regex.sub(r"<td[^>]*?>[”\s]+?(<a .+?epub:type=\"noteref\">.+?</a>)?</td>", "", file_contents))

# We create a filter to try to exclude nested quotations
# Remove tags in case they're enclosing punctuation we want to match against at the end of a sentence.
Expand Down
56 changes: 28 additions & 28 deletions se/typography.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,18 +103,18 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
xhtml = xhtml.replace("——", "⸺")

# Smartypants doesn't do well on em dashes followed by open quotes. Fix that here
xhtml = regex.sub(r"—”([\p{Letter}])", r"—“\1", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"—’([\p{Letter}])", r"—‘\1", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"-“</p>", r"—”</p>", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"‘”</p>", fr"’{se.HAIR_SPACE}”</p>", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"—”([\p{Letter}])", r"—“\1", xhtml)
xhtml = regex.sub(r"—’([\p{Letter}])", r"—‘\1", xhtml)
xhtml = regex.sub(r"-“</p>", r"—”</p>", xhtml)
xhtml = regex.sub(r"‘”</p>", fr"’{se.HAIR_SPACE}”</p>", xhtml)

# Now that we've fixed Smartypants' output, put our quotes back in
xhtml = xhtml.replace("!#se:rsquo#!", "’")

# Remove spaces between en and em dashes
# Note that we match at least one character before the dashes, so that we don't catch start-of-line em dashes like in poetry.
# We do a negative lookbehind for <br/ to prevent newlines/indents after <br/>s from being included
xhtml = regex.sub(r"(?<!<br/)([^\.…\s])\s*([–—])\s*", r"\1\2", xhtml, flags=regex.DOTALL)
xhtml = regex.sub(r"(?<!<br/)([^\.…\s])\s*([–—])\s*", r"\1\2", xhtml)

# First, remove stray word joiners
xhtml = xhtml.replace(se.WORD_JOINER, "")
Expand All @@ -123,12 +123,12 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
xhtml = xhtml.replace(se.SHY_HYPHEN, "")

# Fix some common em-dash transcription errors
xhtml = regex.sub(r"([:;])-([\p{Letter}])", r"\1—\2", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"([\p{Letter}])-“", r"\1—“", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"([:;])-([\p{Letter}])", r"\1—\2", xhtml)
xhtml = regex.sub(r"([\p{Letter}])-“", r"\1—“", xhtml)
xhtml = regex.sub(r":-</", fr":{se.WORD_JOINER}—</", xhtml)

# Em dashes and two-em-dashes can be broken before, so add a word joiner between letters/punctuation and the following em dash
xhtml = regex.sub(fr"([^\s{se.WORD_JOINER}{se.NO_BREAK_SPACE}{se.HAIR_SPACE}])([—⸻])", fr"\1{se.WORD_JOINER}\2", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"([^\s{se.WORD_JOINER}{se.NO_BREAK_SPACE}{se.HAIR_SPACE}])([—⸻])", fr"\1{se.WORD_JOINER}\2", xhtml)

# Add en dashes; don't replace match that is within an html tag, since ids and attrs often contain the pattern DIGIT-DIGIT
xhtml = regex.sub(r"(?<!<[^>]*)([0-9]+)\-([0-9]+)", r"\1–\2", xhtml)
Expand All @@ -146,7 +146,7 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
xhtml = regex.sub(fr"([\p{{Lowercase_Letter}}]){se.WORD_JOINER}—th\b", r"\1 —th", xhtml)

# Remove word joiners from following opening tags--they're usually never correct
xhtml = regex.sub(fr"<([\p{{Letter}}]+)([^>]*?)>{se.WORD_JOINER}", r"<\1\2>", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"<([\p{{Letter}}]+)([^>]*?)>{se.WORD_JOINER}", r"<\1\2>", xhtml)

# Add a word joiner after em dashes within <cite> elements
xhtml = regex.sub(r"<cite([^>]*?)>—", fr"<cite\1>—{se.WORD_JOINER}", xhtml)
Expand Down Expand Up @@ -213,7 +213,7 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
xhtml = regex.sub(r"(\s)‘a’(\s)", r"\1’a’\2", xhtml, flags=regex.IGNORECASE)

# Years
xhtml = regex.sub(r"‘([0-9]{2,}[^\p{Letter}0-9’])", r"’\1", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"‘([0-9]{2,}[^\p{Letter}0-9’])", r"’\1", xhtml)

xhtml = regex.sub(r"‘([Aa]ve|[Oo]me|[Ii]m|[Mm]idst|[Gg]ainst|[Nn]eath|[Ee]m|[Cc]os|[Tt]is|[Tt]isn’t|[Tt]was|[Tt]ain’t|[Tt]wixt|[Tt]were|[Tt]would|[Tt]wouldn|[Tt]won|[Tt]ween|[Tt]will|[Rr]ound|[Pp]on|[Uu]ns?|[Uu]d|[Cc]ept|[Oo]w|[Aa]ppen|[Ee])\b", r"’\1", xhtml)

Expand Down Expand Up @@ -245,34 +245,34 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
xhtml = regex.sub(r"(?<!A\. )B\.\s+C\.", r"BC", xhtml)

# Put spacing next to close quotes
xhtml = regex.sub(fr"“[\s{se.NO_BREAK_SPACE}]*‘", fr"“{se.HAIR_SPACE}‘", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"’[\s{se.NO_BREAK_SPACE}]*”", fr"’{se.HAIR_SPACE}”", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"“[\s{se.NO_BREAK_SPACE}]*’", fr"“{se.HAIR_SPACE}’", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"‘[\s{se.NO_BREAK_SPACE}]*“", fr"‘{se.HAIR_SPACE}“", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"‘[\s{se.NO_BREAK_SPACE}]*’", fr"‘{se.HAIR_SPACE}’", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"“[\s{se.NO_BREAK_SPACE}]*‘", fr"“{se.HAIR_SPACE}‘", xhtml)
xhtml = regex.sub(fr"’[\s{se.NO_BREAK_SPACE}]*”", fr"’{se.HAIR_SPACE}”", xhtml)
xhtml = regex.sub(fr"“[\s{se.NO_BREAK_SPACE}]*’", fr"“{se.HAIR_SPACE}’", xhtml)
xhtml = regex.sub(fr"‘[\s{se.NO_BREAK_SPACE}]*“", fr"‘{se.HAIR_SPACE}“", xhtml)
xhtml = regex.sub(fr"‘[\s{se.NO_BREAK_SPACE}]*’", fr"‘{se.HAIR_SPACE}’", xhtml)

# We require a non-letter char at the end, otherwise we might match a contraction: “Hello,” ’e said.
xhtml = regex.sub(fr"”[\s{se.NO_BREAK_SPACE}]*’([^\p{{Letter}}])", fr"”{se.HAIR_SPACE}’\1", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"”[\s{se.NO_BREAK_SPACE}]*’([^\p{{Letter}}])", fr"”{se.HAIR_SPACE}’\1", xhtml)

# Fix ellipses spacing
xhtml = regex.sub(r"\s*\.\s*\.\s*\.\s*", r"…", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"[\s{se.NO_BREAK_SPACE}]?…[\s{se.NO_BREAK_SPACE}]?\.", fr".{se.HAIR_SPACE}…", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"[\s{se.NO_BREAK_SPACE}]?…[\s{se.NO_BREAK_SPACE}]?", fr"{se.HAIR_SPACE}… ", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"<p([^>]*?)>{se.HAIR_SPACE}…", r"<p\1>…", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"\s*\.\s*\.\s*\.\s*", r"…", xhtml)
xhtml = regex.sub(fr"[\s{se.NO_BREAK_SPACE}]?…[\s{se.NO_BREAK_SPACE}]?\.", fr".{se.HAIR_SPACE}…", xhtml)
xhtml = regex.sub(fr"[\s{se.NO_BREAK_SPACE}]?…[\s{se.NO_BREAK_SPACE}]?", fr"{se.HAIR_SPACE}… ", xhtml)
xhtml = regex.sub(fr"<p([^>]*?)>{se.HAIR_SPACE}…", r"<p\1>…", xhtml)

# Remove spaces between opening tags and ellipses
xhtml = regex.sub(fr"(<[\p{{Letter}}0-9]+[^<]+?>)[\s{se.NO_BREAK_SPACE}]+?…", r"\1…", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"(<[\p{{Letter}}0-9]+[^<]+?>)[\s{se.NO_BREAK_SPACE}]+?…", r"\1…", xhtml)

# Remove spaces between closing tags and ellipses
xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?(</[\p{{Letter}}0-9]+>)", r"…\1", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]+([\)”’])(?![\p{{Letter}}])", r"…\1", xhtml, flags=regex.IGNORECASE) # If followed by a letter, the single quote is probably a leading elision
xhtml = regex.sub(fr"([\(“‘])[\s{se.NO_BREAK_SPACE}]+…", r"\1…", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?([\!\?\.\;\,])", fr"…{se.HAIR_SPACE}\1", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"([\!\?\.\;”’])[\s{se.NO_BREAK_SPACE}]?…", fr"\1{se.HAIR_SPACE}…", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"\,[\s{se.NO_BREAK_SPACE}]?…", fr",{se.HAIR_SPACE}…", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?(</[\p{{Letter}}0-9]+>)", r"…\1", xhtml)
xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]+([\)”’])(?![\p{{Letter}}])", r"…\1", xhtml) # If followed by a letter, the single quote is probably a leading elision
xhtml = regex.sub(fr"([\(“‘])[\s{se.NO_BREAK_SPACE}]+…", r"\1…", xhtml)
xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?([\!\?\.\;\,])", fr"…{se.HAIR_SPACE}\1", xhtml)
xhtml = regex.sub(fr"([\!\?\.\;”’])[\s{se.NO_BREAK_SPACE}]?…", fr"\1{se.HAIR_SPACE}…", xhtml)
xhtml = regex.sub(fr"\,[\s{se.NO_BREAK_SPACE}]?…", fr",{se.HAIR_SPACE}…", xhtml)

# Add nbsp to ellipses that open dialog
xhtml = regex.sub(r"([“‘])…\s([\p{Letter}0-9])", fr"\1…{se.NO_BREAK_SPACE}\2", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"([“‘])…\s([\p{Letter}0-9])", fr"\1…{se.NO_BREAK_SPACE}\2", xhtml)

# Don't use . ... if within a clause
xhtml = regex.sub(r"\.(\s…\s[\p{Lowercase_Letter}])", r"\1", xhtml)
Expand Down
2 changes: 1 addition & 1 deletion se/vendor/kobo_touch_extended/kobo.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def append_kobo_spans_from_text(node, text):
return False
else:
# Split text in sentences
groups = regex.split(fr'(.*?[\.\!\?\:](?:{se.HAIR_SPACE}…)?[\'"\u201d\u2019]?(?:{se.HAIR_SPACE}\u201d)?\s*)', text, flags=regex.MULTILINE)
groups = regex.split(fr'(.*?[\.\!\?\:](?:{se.HAIR_SPACE}…)?[\'"\u201d\u2019]?(?:{se.HAIR_SPACE}\u201d)?\s*)', text)
# Remove empty strings resulting from split()
groups = [g for g in groups if g != ""]

Expand Down

0 comments on commit 8665308

Please sign in to comment.