Skip to content

Commit

Permalink
Search: respect spacing from block elements when indexing (#11658)
Browse files Browse the repository at this point in the history
HTML tags can be divided into two categories: inline and block elements.
Inline elements do not start on a new line, while block elements start on a new line.
This gives block elements an implicit spacing that is not present in
inline elements. If two tags are next to each other
and at least one of them is a block element, there will be a space between them.
If both tags are inline elements, there will be no space between them.
  • Loading branch information
stsewd authored Oct 10, 2024
1 parent 14e4353 commit 24cc300
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 1 deletion.
46 changes: 45 additions & 1 deletion readthedocs/search/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,45 @@ class GenericParser:
# Limit that matches the ``index.mapping.nested_objects.limit`` ES setting.
max_inner_documents = 10000

# Block level elements have an implicit line break before and after them.
# List taken from: https://www.w3schools.com/html/html_blocks.asp.
block_level_elements = [
"address",
"article",
"aside",
"blockquote",
"canvas",
"dd",
"div",
"dl",
"dt",
"fieldset",
"figcaption",
"figure",
"footer",
"form",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"header",
"hr",
"li",
"main",
"nav",
"noscript",
"ol",
"p",
"pre",
"section",
"table",
"tfoot",
"ul",
"video",
]

def __init__(self, version):
self.version = version
self.project = self.version.project
Expand Down Expand Up @@ -334,7 +373,12 @@ def _parse_section_content(self, tag, *, depth=0):
)

if content:
contents.append(content)
is_block_level_element = next_tag.tag in self.block_level_elements
if is_block_level_element:
# Add a line break before and after a block level element.
contents.append(f"\n{content}\n")
else:
contents.append(content)
next_tag = next_tag.next

return self._parse_content("".join(contents)), section_found
Expand Down
10 changes: 10 additions & 0 deletions readthedocs/search/tests/data/sphinx/in/page.html
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,16 @@ <h2>Footnotes and domains<a class="headerlink" href="#footnotes-and-domains" tit
</div>
<!-- End of footnote -->

<!-- Definition list -->
<section id="development">
<h2>Development<a class="headerlink" href="#development" title="Permalink to this heading"></a></h2>
<dl class="simple">
<!-- NOTE: leave this as a single line to test a bug related to spacing -->
<dt><a class="reference internal" href="contributing.html"><span class="doc">Contributing</span></a></dt><dd><p>How to contribute changes to the theme.</p></dd>
</dl>
</section>
<!-- End of definition list -->

</div>
</main>
</body>
Expand Down
5 changes: 5 additions & 0 deletions readthedocs/search/tests/data/sphinx/out/page.json
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@
"title": "Footnotes and domains",
"content": ""
},
{
"content": "Contributing How to contribute changes to the theme.",
"id": "development",
"title": "Development"
},
{
"id": "subsub-title",
"title": "Subsub title",
Expand Down

0 comments on commit 24cc300

Please sign in to comment.