From db49424651bfea7c2628c60eb81a2c0c91da52c7 Mon Sep 17 00:00:00 2001 From: Andrew Paseltiner Date: Mon, 24 Jun 2024 11:12:08 -0400 Subject: [PATCH] Fix check for LoI text against either figcaption or alt text in s-041 And reword the corresponding lint message to indicate that alt is allowed. --- se/se_epub_lint.py | 31 ++++--- .../lint/semantic/s-041/golden/s-041-out.txt | 9 ++ .../semantic/s-041/in/src/epub/content.opf | 93 +++++++++++++++++++ .../s-041/in/src/epub/text/chapter-1.xhtml | 21 +++++ .../s-041/in/src/epub/text/colophon.xhtml | 42 +++++++++ .../semantic/s-041/in/src/epub/text/loi.xhtml | 23 +++++ 6 files changed, 204 insertions(+), 15 deletions(-) create mode 100644 tests/lint/semantic/s-041/golden/s-041-out.txt create mode 100644 tests/lint/semantic/s-041/in/src/epub/content.opf create mode 100644 tests/lint/semantic/s-041/in/src/epub/text/chapter-1.xhtml create mode 100644 tests/lint/semantic/s-041/in/src/epub/text/colophon.xhtml create mode 100644 tests/lint/semantic/s-041/in/src/epub/text/loi.xhtml diff --git a/se/se_epub_lint.py b/se/se_epub_lint.py index ad126b80..5db15a9c 100644 --- a/se/se_epub_lint.py +++ b/se/se_epub_lint.py @@ -289,7 +289,7 @@ "s-038", "Illegal asterism. Section/scene breaks must be defined by an [xhtml]
[/] element." "s-039", "[text]Ibid[/] in endnotes. “Ibid” means “The previous reference” which is meaningless with popup endnotes" "s-040", f"[attr]#{figure_ref}[/] not found in file [path][link=file://{self.path / 'src/epub/text' / chapter_ref}]{chapter_ref}[/][/]." -"s-041", f"The [xhtml]
[/] element of [attr]#{figure_ref}[/] does not match the text in its LoI entry." +"s-041", f"The text in [attr]#{figure_ref}[/]'s LoI entry does not match either its [xhtml]
[/] element or its [xhtml][/] [attr]alt[/] attribute." "s-042", "[xhtml][/] element without [xhtml][/] child." "s-043", "[val]se:short-story[/] semantic on element that is not [xhtml]
[/]." "s-044", "Element with poem or verse semantic, without descendant [xhtml]

[/] (stanza) element." @@ -1504,8 +1504,6 @@ def _lint_special_file_checks(self, filename: Path, dom: se.easy_xml.EasyXmlTree for node in dom.xpath("/html/body/nav[contains(@epub:type, 'loi')]//li//a"): figure_ref = node.get_attr("href").split("#")[1] chapter_ref = regex.findall(r"(.*?)#.*", node.get_attr("href"))[0] - figure_img_alt = "" - figcaption_text = "" loi_text = node.inner_text() file_dom = self.get_dom(self.content_path / "text" / chapter_ref) @@ -1515,19 +1513,21 @@ def _lint_special_file_checks(self, filename: Path, dom: se.easy_xml.EasyXmlTree messages.append(LintMessage("s-040", f"[attr]#{figure_ref}[/] not found in file [path][link=file://{self.path / 'src/epub/text' / chapter_ref}]{chapter_ref}[/][/].", se.MESSAGE_TYPE_ERROR, filename)) continue - for child in figure.xpath("./*"): + loi_text_matches_figure = False + for child in figure.xpath("./img|./figcaption"): + figure_text = "" if child.tag == "img": - figure_img_alt = child.get_attr("alt") - - if child.tag == "figcaption": - figcaption_text = child.inner_text() - + figure_text = child.get_attr("alt") + elif child.tag == "figcaption": # Replace tabs and newlines with a single space to better match figcaptions that contain
- figcaption_text = regex.sub(r"(\n|\t)", " ", figcaption_text) - figcaption_text = regex.sub(r"[ ]+", " ", figcaption_text) + figure_text = regex.sub(r"[ \n\t]+", " ", child.inner_text()) + + if loi_text == figure_text: + loi_text_matches_figure = True + break - if (figcaption_text != "" and loi_text != "" and figcaption_text != loi_text) and (figure_img_alt != "" and loi_text != "" and figure_img_alt != loi_text): - messages.append(LintMessage("s-041", f"The [xhtml]

[/] element of [attr]#{figure_ref}[/] does not match the text in its LoI entry.", se.MESSAGE_TYPE_WARNING, self.path / "src/epub/text" / chapter_ref)) + if not loi_text_matches_figure: + messages.append(LintMessage("s-041", f"The text in [attr]#{figure_ref}[/]'s LoI entry does not match either its [xhtml]
[/] element or its [xhtml][/] [attr]alt[/] attribute.", se.MESSAGE_TYPE_WARNING, self.path / "src/epub/text" / chapter_ref)) return messages @@ -2445,7 +2445,9 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, img_alt_not_typogrified = [] img_alt_lacking_punctuation = [] for node in nodes: - if "titlepage.svg" not in node.get_attr("src"): + img_src = node.lxml_element.get("src") + # Avoid crashing if the src attribute is missing + if img_src and "titlepage.svg" not in img_src: ebook_flags["has_images"] = True # Save for a later check alt = node.get_attr("alt") @@ -2460,7 +2462,6 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, img_alt_lacking_punctuation.append(node.to_tag_string()) # Check that alt attributes match SVG titles - img_src = node.lxml_element.get("src") if img_src and img_src.endswith("svg"): title_text = "" image_ref = img_src.split("/").pop() diff --git a/tests/lint/semantic/s-041/golden/s-041-out.txt b/tests/lint/semantic/s-041/golden/s-041-out.txt new file mode 100644 index 00000000..ee7259d8 --- /dev/null +++ b/tests/lint/semantic/s-041/golden/s-041-out.txt @@ -0,0 +1,9 @@ +s-004 [Error] chapter-1.xhtml `img` element missing `alt` attribute. + + +s-041 [Manual Review] chapter-1.xhtml The text in `#f-5`'s LoI entry does not +match either its `
` element or its `` `alt` attribute. +s-041 [Manual Review] chapter-1.xhtml The text in `#f-6`'s LoI entry does not +match either its `
` element or its `` `alt` attribute. +s-041 [Manual Review] chapter-1.xhtml The text in `#f-7`'s LoI entry does not +match either its `
` element or its `` `alt` attribute. diff --git a/tests/lint/semantic/s-041/in/src/epub/content.opf b/tests/lint/semantic/s-041/in/src/epub/content.opf new file mode 100644 index 00000000..0af098ae --- /dev/null +++ b/tests/lint/semantic/s-041/in/src/epub/content.opf @@ -0,0 +1,93 @@ + + + + url:https://standardebooks.org/ebooks/jane-austen/unknown-novel/john-doe + 1900-01-01T00:00:00Z + 1900-01-01T00:00:00Z + The source text and artwork in this ebook are believed to be in the United States public domain; that is, they are believed to be free of copyright restrictions in the United States. They may still be copyrighted in other countries, so users located outside of the United States must check their local laws before using this ebook. The creators of, and contributors to, this ebook dedicate their contributions to the worldwide public domain via the terms in the [CC0 1.0 Universal Public Domain Dedication](https://creativecommons.org/publicdomain/zero/1.0/). + Standard Ebooks + Standard Ebooks + https://standardebooks.org + bkd + mdc + pbl + The League of Moveable Type + League of Moveable Type, The + https://www.theleagueofmoveabletype.com + tyd + + Standard Ebooks + textual + textual + readingOrder + structuralNavigation + tableOfContents + unlocked + none + This publication conforms to WCAG 2.2 Level AA. + + Unknown Novel + Unknown Novel + England--Social life and customs--19th century--Fiction + Sisters -- Fiction + LCSH + sh2008114941 + LCSH + sh2008111400 + Fiction + A short test novel for lint testing. + + <p>A short test novel for lint testing.</p> + + en-GB + https://www.gutenberg.org/ebooks/161 + https://archive.org/details/bub_gb_RtT0OLKFMHsC + WORD_COUNT + READING_EASE + https://en.wikipedia.org/wiki/Unknown_Jane_Austen_Novel + https://github.com/standardebooks/jane-austen_unknown-novel_john-doe + Jane Austen + Austen, Jane + https://en.wikipedia.org/wiki/Jane_Austen + http://id.loc.gov/authorities/names/n79032879 + aut + Georg Friedrich Kersting + Kersting, Georg Friedrich + https://en.wikipedia.org/wiki/Georg_Friedrich_Kersting + http://id.loc.gov/authorities/names/n83319941 + art + Anonymous + Anonymous + trc + John Doe + Doe, John + bkp + blw + cov + ill + mrk + pfr + tyg + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/lint/semantic/s-041/in/src/epub/text/chapter-1.xhtml b/tests/lint/semantic/s-041/in/src/epub/text/chapter-1.xhtml new file mode 100644 index 00000000..b0d6e099 --- /dev/null +++ b/tests/lint/semantic/s-041/in/src/epub/text/chapter-1.xhtml @@ -0,0 +1,21 @@ + + + + I + + +
+

I

+
t-1-alt.
+ +
t-2-cap
+
t-3-alt.
t-3-cap
+
t-4-alt.
t-4-cap
+ +
t-5-alt.
+ +
t-6-cap
+
t-7-alt.
t-7-cap
+
+ + diff --git a/tests/lint/semantic/s-041/in/src/epub/text/colophon.xhtml b/tests/lint/semantic/s-041/in/src/epub/text/colophon.xhtml new file mode 100644 index 00000000..41ad0c59 --- /dev/null +++ b/tests/lint/semantic/s-041/in/src/epub/text/colophon.xhtml @@ -0,0 +1,42 @@ + + + + Colophon + + + + +
+
+

Colophon

+ The Standard Ebooks logo. +
+

Unknown Novel
+ was published in 1810 by
+ Jane Austen.

+

This ebook was produced for
+ Standard Ebooks
+ by
+ An Anonymous Volunteer,
+ and is based on a transcription produced in 2010 by
+ An Anonymous Volunteer
+ for
+ Project Gutenberg
+ and is based on digital scans from the
+ Internet Archive.

+

The cover page is adapted from
+ At the Mirror,
+ a painting completed in 1827 by
+ Georg Friedrich Kersting.
+ The cover and title pages feature the
+ League Spartan and Sorts Mill Goudy
+ typefaces created in 2014 and 2009 by
+ The League of Moveable Type.

+

The first edition of this ebook was released on
+ January 1, 1900, 12:00 a.m.
+ You can check for updates to this ebook, view its revision history, or download it for different ereading systems at
+ standardebooks.org/ebooks/jane-austen/unknown-novel/john-doe.

+

The volunteer-driven Standard Ebooks project relies on readers like you to submit typos, corrections, and other improvements. Anyone can contribute at standardebooks.org.

+
+ + diff --git a/tests/lint/semantic/s-041/in/src/epub/text/loi.xhtml b/tests/lint/semantic/s-041/in/src/epub/text/loi.xhtml new file mode 100644 index 00000000..a8703ed8 --- /dev/null +++ b/tests/lint/semantic/s-041/in/src/epub/text/loi.xhtml @@ -0,0 +1,23 @@ + + + + List of Illustrations + + + + +