Skip to content

Commit

Permalink
Add support for alternate hreflang link
Browse files Browse the repository at this point in the history
  • Loading branch information
freddyheppell committed Jan 14, 2025
1 parent 32c6478 commit 3af1401
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 1 deletion.
53 changes: 53 additions & 0 deletions tests/tree/test_xml_exts.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,56 @@ def test_xml_image(self, requests_mock):
print(tree)

assert tree == expected_sitemap_tree

# Verify that <xhtml:link hreflang=... href=...> entries inside <url> elements
# are exposed on the resulting pages through the `alternates` attribute.
def test_xml_hreflang(self, requests_mock):
# Register the shared fallback matcher from TreeTestBase
# (presumably it answers any un-mocked URL with a 404 -- confirm in TreeTestBase).
requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher)

# robots.txt advertises a single sitemap at /sitemap.xml.
requests_mock.get(
self.TEST_BASE_URL + "/robots.txt",
headers={"Content-Type": "text/plain"},
text=textwrap.dedent(
f"""
User-agent: *
Disallow: /whatever
Sitemap: {self.TEST_BASE_URL}/sitemap.xml
"""
).strip(),
)

# The sitemap declares the xhtml namespace and lists two pages (/en/page and
# /fr/page); each page carries one alternate link pointing at the other.
requests_mock.get(
self.TEST_BASE_URL + "/sitemap.xml",
headers={"Content-Type": "text/xml"},
text=textwrap.dedent(
f"""
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
<url>
<loc>{self.TEST_BASE_URL}/en/page</loc>
<lastmod>{self.TEST_DATE_STR_ISO8601}</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
<xhtml:link rel="alternate" hreflang="fr-FR" href="{self.TEST_BASE_URL}/fr/page"/>
</url>
<url>
<loc>{self.TEST_BASE_URL}/fr/page</loc>
<lastmod>{self.TEST_DATE_STR_ISO8601}</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
<xhtml:link rel="alternate" hreflang="en-GB" href="{self.TEST_BASE_URL}/en/page"/>
</url>
</urlset>
"""
).strip()
)

# Build the sitemap tree from the mocked homepage.
tree = sitemap_tree_for_homepage(self.TEST_BASE_URL)

# Materialise the pages so they can be indexed; the assertions below rely on
# the pages being yielded in the order they appear in the sitemap.
pages = list(tree.all_pages())
# Each alternate is expected as a (hreflang, href) tuple.
assert pages[0].alternates == [
("fr-FR", f"{self.TEST_BASE_URL}/fr/page"),
]
assert pages[1].alternates == [
("en-GB", f"{self.TEST_BASE_URL}/en/page"),
]
16 changes: 16 additions & 0 deletions usp/fetch_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,7 @@ class Page:
"news_keywords",
"news_stock_tickers",
"images",
"alternates"
]

def __init__(self):
Expand All @@ -659,6 +660,7 @@ def __init__(self):
self.news_keywords = None
self.news_stock_tickers = None
self.images = []
self.alternates = []

def __hash__(self):
return hash(
Expand Down Expand Up @@ -763,13 +765,18 @@ def page(self) -> Optional[SitemapPage]:
for image in self.images
]

alternates = None
if len(self.alternates) > 0:
alternates = self.alternates

return SitemapPage(
url=url,
last_modified=last_modified,
change_frequency=change_frequency,
priority=priority,
news_story=sitemap_news_story,
images=sitemap_images,
alternates=alternates,
)

__slots__ = ["_current_page", "_pages", "_page_urls", "_current_image"]
Expand Down Expand Up @@ -801,6 +808,15 @@ def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None:
"Page is expected to be set before <image:image>."
)
self._current_image = self.Image()
elif name == "link":
if not self._current_page:
raise SitemapXMLParsingException(
"Page is expected to be set before <link>."
)
if "hreflang" not in attrs or "href" not in attrs:
log.warning(f"<link> element is missing hreflang or href attributes: {attrs}.")

self._current_page.alternates.append((attrs["hreflang"], attrs["href"]))

def __require_last_char_data_to_be_set(self, name: str) -> None:
if not self._last_char_data:
Expand Down
13 changes: 12 additions & 1 deletion usp/objects/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import datetime
from decimal import Decimal
from enum import Enum, unique
from typing import List, Optional
from typing import List, Optional, Tuple

SITEMAP_PAGE_DEFAULT_PRIORITY = Decimal("0.5")
"""Default sitemap page priority, as per the spec."""
Expand Down Expand Up @@ -331,6 +331,7 @@ class SitemapPage:
"__change_frequency",
"__news_story",
"__images",
"__alternates",
]

def __init__(
Expand All @@ -341,6 +342,7 @@ def __init__(
change_frequency: Optional[SitemapPageChangeFrequency] = None,
news_story: Optional[SitemapNewsStory] = None,
images: Optional[List[SitemapImage]] = None,
alternates: Optional[List[Tuple[str, str]]] = None,
):
"""
Initialize a new sitemap-derived page.
Expand All @@ -357,6 +359,7 @@ def __init__(
self.__change_frequency = change_frequency
self.__news_story = news_story
self.__images = images
self.__alternates = alternates

def __eq__(self, other) -> bool:
if not isinstance(other, SitemapPage):
Expand All @@ -380,6 +383,9 @@ def __eq__(self, other) -> bool:
if self.images != other.images:
return False

if self.alternates != other.alternates:
return False

return True

def __hash__(self):
Expand Down Expand Up @@ -449,3 +455,8 @@ def news_story(self) -> Optional[SitemapNewsStory]:
def images(self) -> Optional[List[SitemapImage]]:
"""Get the images attached to the URL."""
return self.__images

@property
def alternates(self) -> Optional[List[Tuple[str, str]]]:
    """
    Return the alternate URLs recorded for this page.

    :return: The value given as ``alternates`` at construction time: a list of
        two-string tuples, or ``None`` when no alternates were supplied.
    """
    return self.__alternates

0 comments on commit 3af1401

Please sign in to comment.