Skip to content

Commit

Permalink
Fix check for LoI text against either figcaption or alt text in s-041
Browse files Browse the repository at this point in the history
Also reword the corresponding lint message to indicate that matching the
alt text is allowed.
  • Loading branch information
apasel422 committed Jun 24, 2024
1 parent db23ebe commit db49424
Show file tree
Hide file tree
Showing 6 changed files with 204 additions and 15 deletions.
31 changes: 16 additions & 15 deletions se/se_epub_lint.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@
"s-038", "Illegal asterism. Section/scene breaks must be defined by an [xhtml]<hr/>[/] element."
"s-039", "[text]Ibid[/] in endnotes. “Ibid” means “The previous reference” which is meaningless with popup endnotes"
"s-040", f"[attr]#{figure_ref}[/] not found in file [path][link=file://{self.path / 'src/epub/text' / chapter_ref}]{chapter_ref}[/][/]."
"s-041", f"The [xhtml]<figcaption>[/] element of [attr]#{figure_ref}[/] does not match the text in its LoI entry."
"s-041", f"The text in [attr]#{figure_ref}[/]'s LoI entry does not match either its [xhtml]<figcaption>[/] element or its [xhtml]<img>[/] [attr]alt[/] attribute."
"s-042", "[xhtml]<table>[/] element without [xhtml]<tbody>[/] child."
"s-043", "[val]se:short-story[/] semantic on element that is not [xhtml]<article>[/]."
"s-044", "Element with poem or verse semantic, without descendant [xhtml]<p>[/] (stanza) element."
Expand Down Expand Up @@ -1504,8 +1504,6 @@ def _lint_special_file_checks(self, filename: Path, dom: se.easy_xml.EasyXmlTree
for node in dom.xpath("/html/body/nav[contains(@epub:type, 'loi')]//li//a"):
figure_ref = node.get_attr("href").split("#")[1]
chapter_ref = regex.findall(r"(.*?)#.*", node.get_attr("href"))[0]
figure_img_alt = ""
figcaption_text = ""
loi_text = node.inner_text()
file_dom = self.get_dom(self.content_path / "text" / chapter_ref)

Expand All @@ -1515,19 +1513,21 @@ def _lint_special_file_checks(self, filename: Path, dom: se.easy_xml.EasyXmlTree
messages.append(LintMessage("s-040", f"[attr]#{figure_ref}[/] not found in file [path][link=file://{self.path / 'src/epub/text' / chapter_ref}]{chapter_ref}[/][/].", se.MESSAGE_TYPE_ERROR, filename))
continue

for child in figure.xpath("./*"):
loi_text_matches_figure = False
for child in figure.xpath("./img|./figcaption"):
figure_text = ""
if child.tag == "img":
figure_img_alt = child.get_attr("alt")

if child.tag == "figcaption":
figcaption_text = child.inner_text()

figure_text = child.get_attr("alt")
elif child.tag == "figcaption":
# Collapse runs of spaces, tabs, and newlines into a single space to better match figcaptions that contain <br/>
figcaption_text = regex.sub(r"(\n|\t)", " ", figcaption_text)
figcaption_text = regex.sub(r"[ ]+", " ", figcaption_text)
figure_text = regex.sub(r"[ \n\t]+", " ", child.inner_text())

if loi_text == figure_text:
loi_text_matches_figure = True
break

if (figcaption_text != "" and loi_text != "" and figcaption_text != loi_text) and (figure_img_alt != "" and loi_text != "" and figure_img_alt != loi_text):
messages.append(LintMessage("s-041", f"The [xhtml]<figcaption>[/] element of [attr]#{figure_ref}[/] does not match the text in its LoI entry.", se.MESSAGE_TYPE_WARNING, self.path / "src/epub/text" / chapter_ref))
if not loi_text_matches_figure:
messages.append(LintMessage("s-041", f"The text in [attr]#{figure_ref}[/]'s LoI entry does not match either its [xhtml]<figcaption>[/] element or its [xhtml]<img>[/] [attr]alt[/] attribute.", se.MESSAGE_TYPE_WARNING, self.path / "src/epub/text" / chapter_ref))

return messages

Expand Down Expand Up @@ -2445,7 +2445,9 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree,
img_alt_not_typogrified = []
img_alt_lacking_punctuation = []
for node in nodes:
if "titlepage.svg" not in node.get_attr("src"):
img_src = node.lxml_element.get("src")
# Avoid crashing if the src attribute is missing
if img_src and "titlepage.svg" not in img_src:
ebook_flags["has_images"] = True # Save for a later check

alt = node.get_attr("alt")
Expand All @@ -2460,7 +2462,6 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree,
img_alt_lacking_punctuation.append(node.to_tag_string())

# Check that alt attributes match SVG titles
img_src = node.lxml_element.get("src")
if img_src and img_src.endswith("svg"):
title_text = ""
image_ref = img_src.split("/").pop()
Expand Down
9 changes: 9 additions & 0 deletions tests/lint/semantic/s-041/golden/s-041-out.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
s-004 [Error] chapter-1.xhtml `img` element missing `alt` attribute.
<img>
<img>
s-041 [Manual Review] chapter-1.xhtml The text in `#f-5`'s LoI entry does not
match either its `<figcaption>` element or its `<img>` `alt` attribute.
s-041 [Manual Review] chapter-1.xhtml The text in `#f-6`'s LoI entry does not
match either its `<figcaption>` element or its `<img>` `alt` attribute.
s-041 [Manual Review] chapter-1.xhtml The text in `#f-7`'s LoI entry does not
match either its `<figcaption>` element or its `<img>` `alt` attribute.
93 changes: 93 additions & 0 deletions tests/lint/semantic/s-041/in/src/epub/content.opf
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
<?xml version="1.0" encoding="utf-8"?>
<package xmlns="http://www.idpf.org/2007/opf" dir="ltr" prefix="se: https://standardebooks.org/vocab/1.0" unique-identifier="uid" version="3.0" xml:lang="en-US">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:identifier id="uid">url:https://standardebooks.org/ebooks/jane-austen/unknown-novel/john-doe</dc:identifier>
<dc:date>1900-01-01T00:00:00Z</dc:date>
<meta property="dcterms:modified">1900-01-01T00:00:00Z</meta>
<dc:rights>The source text and artwork in this ebook are believed to be in the United States public domain; that is, they are believed to be free of copyright restrictions in the United States. They may still be copyrighted in other countries, so users located outside of the United States must check their local laws before using this ebook. The creators of, and contributors to, this ebook dedicate their contributions to the worldwide public domain via the terms in the [CC0 1.0 Universal Public Domain Dedication](https://creativecommons.org/publicdomain/zero/1.0/).</dc:rights>
<dc:publisher id="publisher">Standard Ebooks</dc:publisher>
<meta property="file-as" refines="#publisher">Standard Ebooks</meta>
<meta property="se:url.homepage" refines="#publisher">https://standardebooks.org</meta>
<meta property="role" refines="#publisher" scheme="marc:relators">bkd</meta>
<meta property="role" refines="#publisher" scheme="marc:relators">mdc</meta>
<meta property="role" refines="#publisher" scheme="marc:relators">pbl</meta>
<dc:contributor id="type-designer">The League of Moveable Type</dc:contributor>
<meta property="file-as" refines="#type-designer">League of Moveable Type, The</meta>
<meta property="se:url.homepage" refines="#type-designer">https://www.theleagueofmoveabletype.com</meta>
<meta property="role" refines="#type-designer" scheme="marc:relators">tyd</meta>
<link href="http://www.idpf.org/epub/a11y/accessibility-20170105.html#wcag-aa" rel="dcterms:conformsTo"/>
<meta property="a11y:certifiedBy">Standard Ebooks</meta>
<meta property="schema:accessMode">textual</meta>
<meta property="schema:accessModeSufficient">textual</meta>
<meta property="schema:accessibilityFeature">readingOrder</meta>
<meta property="schema:accessibilityFeature">structuralNavigation</meta>
<meta property="schema:accessibilityFeature">tableOfContents</meta>
<meta property="schema:accessibilityFeature">unlocked</meta>
<meta property="schema:accessibilityHazard">none</meta>
<meta property="schema:accessibilitySummary">This publication conforms to WCAG 2.2 Level AA.</meta>
<link href="onix.xml" media-type="application/xml" properties="onix" rel="record"/>
<dc:title id="title">Unknown Novel</dc:title>
<meta property="file-as" refines="#title">Unknown Novel</meta>
<dc:subject id="subject-1">England--Social life and customs--19th century--Fiction</dc:subject>
<dc:subject id="subject-2">Sisters -- Fiction</dc:subject>
<meta property="authority" refines="#subject-1">LCSH</meta>
<meta property="term" refines="#subject-1">sh2008114941</meta>
<meta property="authority" refines="#subject-2">LCSH</meta>
<meta property="term" refines="#subject-2">sh2008111400</meta>
<meta property="se:subject">Fiction</meta>
<dc:description id="description">A short test novel for lint testing.</dc:description>
<meta id="long-description" property="se:long-description" refines="#description">
&lt;p&gt;A short test novel for lint testing.&lt;/p&gt;
</meta>
<dc:language>en-GB</dc:language>
<dc:source>https://www.gutenberg.org/ebooks/161</dc:source>
<dc:source>https://archive.org/details/bub_gb_RtT0OLKFMHsC</dc:source>
<meta property="se:word-count">WORD_COUNT</meta>
<meta property="se:reading-ease.flesch">READING_EASE</meta>
<meta property="se:url.encyclopedia.wikipedia">https://en.wikipedia.org/wiki/Unknown_Jane_Austen_Novel</meta>
<meta property="se:url.vcs.github">https://github.com/standardebooks/jane-austen_unknown-novel_john-doe</meta>
<dc:creator id="author">Jane Austen</dc:creator>
<meta property="file-as" refines="#author">Austen, Jane</meta>
<meta property="se:url.encyclopedia.wikipedia" refines="#author">https://en.wikipedia.org/wiki/Jane_Austen</meta>
<meta property="se:url.authority.nacoaf" refines="#author">http://id.loc.gov/authorities/names/n79032879</meta>
<meta property="role" refines="#author" scheme="marc:relators">aut</meta>
<dc:contributor id="artist">Georg Friedrich Kersting</dc:contributor>
<meta property="file-as" refines="#artist">Kersting, Georg Friedrich</meta>
<meta property="se:url.encyclopedia.wikipedia" refines="#artist">https://en.wikipedia.org/wiki/Georg_Friedrich_Kersting</meta>
<meta property="se:url.authority.nacoaf" refines="#artist">http://id.loc.gov/authorities/names/n83319941</meta>
<meta property="role" refines="#artist" scheme="marc:relators">art</meta>
<dc:contributor id="transcriber-1">Anonymous</dc:contributor>
<meta property="file-as" refines="#transcriber-1">Anonymous</meta>
<meta property="role" refines="#transcriber-1" scheme="marc:relators">trc</meta>
<dc:contributor id="producer-1">John Doe</dc:contributor>
<meta property="file-as" refines="#producer-1">Doe, John</meta>
<meta property="role" refines="#producer-1" scheme="marc:relators">bkp</meta>
<meta property="role" refines="#producer-1" scheme="marc:relators">blw</meta>
<meta property="role" refines="#producer-1" scheme="marc:relators">cov</meta>
<meta property="role" refines="#producer-1" scheme="marc:relators">ill</meta>
<meta property="role" refines="#producer-1" scheme="marc:relators">mrk</meta>
<meta property="role" refines="#producer-1" scheme="marc:relators">pfr</meta>
<meta property="role" refines="#producer-1" scheme="marc:relators">tyg</meta>
</metadata>
<manifest>
<item href="css/core.css" id="core.css" media-type="text/css"/>
<item href="css/local.css" id="local.css" media-type="text/css"/>
<item href="css/se.css" id="se.css" media-type="text/css"/>
<item href="images/cover.svg" id="cover.svg" media-type="image/svg+xml" properties="cover-image"/>
<item href="images/logo.svg" id="logo.svg" media-type="image/svg+xml"/>
<item href="images/titlepage.svg" id="titlepage.svg" media-type="image/svg+xml"/>
<item href="text/chapter-1.xhtml" id="chapter-1.xhtml" media-type="application/xhtml+xml"/>
<item href="text/colophon.xhtml" id="colophon.xhtml" media-type="application/xhtml+xml" properties="svg"/>
<item href="text/imprint.xhtml" id="imprint.xhtml" media-type="application/xhtml+xml" properties="svg"/>
<item href="text/titlepage.xhtml" id="titlepage.xhtml" media-type="application/xhtml+xml" properties="svg"/>
<item href="text/uncopyright.xhtml" id="uncopyright.xhtml" media-type="application/xhtml+xml"/>
<item href="toc.xhtml" id="toc.xhtml" media-type="application/xhtml+xml" properties="nav"/>
</manifest>
<spine>
<itemref idref="titlepage.xhtml"/>
<itemref idref="imprint.xhtml"/>
<itemref idref="chapter-1.xhtml"/>
<itemref idref="colophon.xhtml"/>
<itemref idref="uncopyright.xhtml"/>
</spine>
</package>
21 changes: 21 additions & 0 deletions tests/lint/semantic/s-041/in/src/epub/text/chapter-1.xhtml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/" xml:lang="en-GB">
<head>
<title>I</title>
</head>
<body epub:type="bodymatter z3998:fiction">
<section id="chapter-1" epub:type="chapter">
<h2 epub:type="ordinal z3998:roman">I</h2>
<figure id="f-1"><img alt="t-1-alt."/></figure>
<!-- The missing alt attribute below causes s-004 to be emitted; this is deliberate, to test s-041 behavior when alt is absent. -->
<figure id="f-2"><img/><figcaption>t-2-cap</figcaption></figure>
<figure id="f-3"><img alt="t-3-alt."/><figcaption>t-3-cap</figcaption></figure>
<figure id="f-4"><img alt="t-4-alt."/><figcaption>t-4-cap</figcaption></figure>

<figure id="f-5"><img alt="t-5-alt."/></figure>
<!-- The missing alt attribute below causes s-004 to be emitted; this is deliberate, to test s-041 behavior when alt is absent. -->
<figure id="f-6"><img/><figcaption>t-6-cap</figcaption></figure>
<figure id="f-7"><img alt="t-7-alt."/><figcaption>t-7-cap</figcaption></figure>
</section>
</body>
</html>
42 changes: 42 additions & 0 deletions tests/lint/semantic/s-041/in/src/epub/text/colophon.xhtml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/, se: https://standardebooks.org/vocab/1.0" xml:lang="en-US">
<head>
<title>Colophon</title>
<link href="../css/core.css" rel="stylesheet" type="text/css"/>
<link href="../css/se.css" rel="stylesheet" type="text/css"/>
</head>
<body epub:type="backmatter">
<section id="colophon" epub:type="colophon">
<header>
<h2 epub:type="title">Colophon</h2>
<img alt="The Standard Ebooks logo." src="../images/logo.svg" epub:type="z3998:publisher-logo se:image.color-depth.black-on-transparent"/>
</header>
<p><i epub:type="se:name.publication.book">Unknown Novel</i><br/>
was published in 1810 by<br/>
<a href="https://en.wikipedia.org/wiki/Jane_Austen">Jane Austen</a>.</p>
<p>This ebook was produced for<br/>
<a href="https://standardebooks.org">Standard Ebooks</a><br/>
by<br/>
<b>An Anonymous Volunteer</b>,<br/>
and is based on a transcription produced in 2010 by<br/>
<b class="name">An Anonymous Volunteer</b><br/>
for<br/>
<a href="https://www.gutenberg.org/ebooks/161">Project Gutenberg</a><br/>
and is based on digital scans from the<br/>
<a href="https://archive.org/details/bub_gb_RtT0OLKFMHsC">Internet Archive</a>.</p>
<p>The cover page is adapted from<br/>
<i epub:type="se:name.visual-art.painting">At the Mirror</i>,<br/>
a painting completed in 1827 by<br/>
<a href="https://en.wikipedia.org/wiki/Georg_Friedrich_Kersting">Georg Friedrich Kersting</a>.<br/>
The cover and title pages feature the<br/>
<b epub:type="se:name.visual-art.typeface">League Spartan</b> and <b epub:type="se:name.visual-art.typeface">Sorts Mill Goudy</b><br/>
typefaces created in 2014 and 2009 by<br/>
<a href="https://www.theleagueofmoveabletype.com">The League of Moveable Type</a>.</p>
<p>The first edition of this ebook was released on<br/>
<b>January 1, 1900, 12:00 <abbr class="eoc">a.m.</abbr></b><br/>
You can check for updates to this ebook, view its revision history, or download it for different ereading systems at<br/>
<a href="https://standardebooks.org/ebooks/jane-austen/unknown-novel/john-doe">standardebooks.org/ebooks/jane-austen/unknown-novel/john-doe</a>.</p>
<p>The volunteer-driven Standard Ebooks project relies on readers like you to submit typos, corrections, and other improvements. Anyone can contribute at <a href="https://standardebooks.org">standardebooks.org</a>.</p>
</section>
</body>
</html>
23 changes: 23 additions & 0 deletions tests/lint/semantic/s-041/in/src/epub/text/loi.xhtml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/" xml:lang="en-GB">
<head>
<title>List of Illustrations</title>
</head>
<body epub:type="backmatter">
<nav id="loi" epub:type="loi">
<h2 epub:type="title">List of Illustrations</h2>
<ol>
<!-- text matches either alt or caption -->
<li><p><a href="chapter-1.xhtml#f-1">t-1-alt.</a></p></li>
<li><p><a href="chapter-1.xhtml#f-2">t-2-cap</a></p></li>
<li><p><a href="chapter-1.xhtml#f-3">t-3-cap</a></p></li>
<li><p><a href="chapter-1.xhtml#f-4">t-4-alt.</a></p></li>

<!-- text matches neither alt nor caption -->
<li><p><a href="chapter-1.xhtml#f-5">x</a></p></li>
<li><p><a href="chapter-1.xhtml#f-6">x</a></p></li>
<li><p><a href="chapter-1.xhtml#f-7">x</a></p></li>
</ol>
</nav>
</body>
</html>

0 comments on commit db49424

Please sign in to comment.