diff --git a/tests/tree/test_xml_exts.py b/tests/tree/test_xml_exts.py
index 78735e3..d1e6c0b 100644
--- a/tests/tree/test_xml_exts.py
+++ b/tests/tree/test_xml_exts.py
@@ -105,3 +105,56 @@ def test_xml_image(self, requests_mock):
print(tree)
assert tree == expected_sitemap_tree
+
+ def test_xml_hreflang(self, requests_mock):
+ requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher)
+
+ requests_mock.get(
+ self.TEST_BASE_URL + "/robots.txt",
+ headers={"Content-Type": "text/plain"},
+ text=textwrap.dedent(
+ f"""
+ User-agent: *
+ Disallow: /whatever
+
+ Sitemap: {self.TEST_BASE_URL}/sitemap.xml
+
+ """
+ ).strip(),
+ )
+
+ requests_mock.get(
+ self.TEST_BASE_URL + "/sitemap.xml",
+ headers={"Content-Type": "text/xml"},
+ text=textwrap.dedent(
+ f"""
+
+
+
+ {self.TEST_BASE_URL}/en/page
+ {self.TEST_DATE_STR_ISO8601}
+ monthly
+ 0.8
+
+
+
+ {self.TEST_BASE_URL}/fr/page
+ {self.TEST_DATE_STR_ISO8601}
+ monthly
+ 0.8
+
+
+
+ """
+ ).strip()
+ )
+
+ tree = sitemap_tree_for_homepage(self.TEST_BASE_URL)
+
+ pages = list(tree.all_pages())
+ assert pages[0].alternates == [
+ ("fr-FR", f"{self.TEST_BASE_URL}/fr/page"),
+ ]
+ assert pages[1].alternates == [
+ ("en-GB", f"{self.TEST_BASE_URL}/en/page"),
+ ]
diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py
index e960d0e..c707e2a 100644
--- a/usp/fetch_parse.py
+++ b/usp/fetch_parse.py
@@ -643,6 +643,7 @@ class Page:
"news_keywords",
"news_stock_tickers",
"images",
+ "alternates"
]
def __init__(self):
@@ -659,6 +660,7 @@ def __init__(self):
self.news_keywords = None
self.news_stock_tickers = None
self.images = []
+ self.alternates = []
def __hash__(self):
return hash(
@@ -763,6 +765,10 @@ def page(self) -> Optional[SitemapPage]:
for image in self.images
]
+ alternates = None
+ if len(self.alternates) > 0:
+ alternates = self.alternates
+
return SitemapPage(
url=url,
last_modified=last_modified,
@@ -770,6 +776,7 @@ def page(self) -> Optional[SitemapPage]:
priority=priority,
news_story=sitemap_news_story,
images=sitemap_images,
+ alternates=alternates,
)
__slots__ = ["_current_page", "_pages", "_page_urls", "_current_image"]
@@ -801,6 +808,15 @@ def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None:
"Page is expected to be set before ."
)
self._current_image = self.Image()
+ elif name == "link":
+ if not self._current_page:
+ raise SitemapXMLParsingException(
+ "Page is expected to be set before ."
+ )
+ if "hreflang" not in attrs or "href" not in attrs:
+ log.warning(f" element is missing hreflang or href attributes: {attrs}.")
+
+ self._current_page.alternates.append((attrs["hreflang"], attrs["href"]))
def __require_last_char_data_to_be_set(self, name: str) -> None:
if not self._last_char_data:
diff --git a/usp/objects/page.py b/usp/objects/page.py
index 3829565..7bc366e 100644
--- a/usp/objects/page.py
+++ b/usp/objects/page.py
@@ -3,7 +3,7 @@
import datetime
from decimal import Decimal
from enum import Enum, unique
-from typing import List, Optional
+from typing import List, Optional, Tuple
SITEMAP_PAGE_DEFAULT_PRIORITY = Decimal("0.5")
"""Default sitemap page priority, as per the spec."""
@@ -331,6 +331,7 @@ class SitemapPage:
"__change_frequency",
"__news_story",
"__images",
+ "__alternates",
]
def __init__(
@@ -341,6 +342,7 @@ def __init__(
change_frequency: Optional[SitemapPageChangeFrequency] = None,
news_story: Optional[SitemapNewsStory] = None,
images: Optional[List[SitemapImage]] = None,
+ alternates: Optional[List[Tuple[str, str]]] = None,
):
"""
Initialize a new sitemap-derived page.
@@ -357,6 +359,7 @@ def __init__(
self.__change_frequency = change_frequency
self.__news_story = news_story
self.__images = images
+ self.__alternates = alternates
def __eq__(self, other) -> bool:
if not isinstance(other, SitemapPage):
@@ -380,6 +383,9 @@ def __eq__(self, other) -> bool:
if self.images != other.images:
return False
+ if self.alternates != other.alternates:
+ return False
+
return True
def __hash__(self):
@@ -449,3 +455,8 @@ def news_story(self) -> Optional[SitemapNewsStory]:
def images(self) -> Optional[List[SitemapImage]]:
"""Get the images attached to the URL."""
return self.__images
+
+ @property
+ def alternates(self) -> Optional[List[Tuple[str, str]]]:
+ """Get the alternate URLs given at construction: a list of (hreflang, href) pairs, or None."""
+ return self.__alternates
\ No newline at end of file