diff --git a/tests/tree/test_xml_exts.py b/tests/tree/test_xml_exts.py
index 78735e3..d1e6c0b 100644
--- a/tests/tree/test_xml_exts.py
+++ b/tests/tree/test_xml_exts.py
@@ -105,3 +105,57 @@ def test_xml_image(self, requests_mock):
         print(tree)
 
         assert tree == expected_sitemap_tree
+
+    def test_xml_hreflang(self, requests_mock):
+        """Alternate <xhtml:link> entries are exposed as (hreflang, href) pairs."""
+        requests_mock.add_matcher(TreeTestBase.fallback_to_404_not_found_matcher)
+
+        requests_mock.get(
+            self.TEST_BASE_URL + "/robots.txt",
+            headers={"Content-Type": "text/plain"},
+            text=textwrap.dedent(
+                f"""
+                User-agent: *
+                Disallow: /whatever
+
+                Sitemap: {self.TEST_BASE_URL}/sitemap.xml
+
+            """
+            ).strip(),
+        )
+
+        requests_mock.get(
+            self.TEST_BASE_URL + "/sitemap.xml",
+            headers={"Content-Type": "text/xml"},
+            text=textwrap.dedent(
+                f"""
+                <?xml version="1.0" encoding="UTF-8"?>
+                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
+                    <url>
+                        <loc>{self.TEST_BASE_URL}/en/page</loc>
+                        <lastmod>{self.TEST_DATE_STR_ISO8601}</lastmod>
+                        <changefreq>monthly</changefreq>
+                        <priority>0.8</priority>
+                        <xhtml:link rel="alternate" hreflang="fr-FR" href="{self.TEST_BASE_URL}/fr/page"/>
+                    </url>
+                    <url>
+                        <loc>{self.TEST_BASE_URL}/fr/page</loc>
+                        <lastmod>{self.TEST_DATE_STR_ISO8601}</lastmod>
+                        <changefreq>monthly</changefreq>
+                        <priority>0.8</priority>
+                        <xhtml:link rel="alternate" hreflang="en-GB" href="{self.TEST_BASE_URL}/en/page"/>
+                    </url>
+                </urlset>
+            """
+            ).strip(),
+        )
+
+        tree = sitemap_tree_for_homepage(self.TEST_BASE_URL)
+
+        pages = list(tree.all_pages())
+        assert pages[0].alternates == [
+            ("fr-FR", f"{self.TEST_BASE_URL}/fr/page"),
+        ]
+        assert pages[1].alternates == [
+            ("en-GB", f"{self.TEST_BASE_URL}/en/page"),
+        ]
diff --git a/usp/fetch_parse.py b/usp/fetch_parse.py
index e960d0e..c707e2a 100644
--- a/usp/fetch_parse.py
+++ b/usp/fetch_parse.py
@@ -643,6 +643,7 @@ class Page:
             "news_keywords",
             "news_stock_tickers",
             "images",
+            "alternates",
         ]
 
         def __init__(self):
@@ -659,6 +660,7 @@ def __init__(self):
             self.news_keywords = None
             self.news_stock_tickers = None
             self.images = []
+            self.alternates = []
 
         def __hash__(self):
             return hash(
@@ -763,6 +765,10 @@ def page(self) -> Optional[SitemapPage]:
                     for image in self.images
                 ]
 
+            alternates = None
+            if len(self.alternates) > 0:
+                alternates = self.alternates
+
             return SitemapPage(
                 url=url,
                 last_modified=last_modified,
@@ -770,6 +776,7 @@ def page(self) -> Optional[SitemapPage]:
                 priority=priority,
                 news_story=sitemap_news_story,
                 images=sitemap_images,
+                alternates=alternates,
             )
 
     __slots__ = ["_current_page", "_pages", "_page_urls", "_current_image"]
@@ -801,6 +808,18 @@ def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None:
                     "Page is expected to be set before <image:image>."
                 )
             self._current_image = self.Image()
+        elif name == "link":
+            if not self._current_page:
+                raise SitemapXMLParsingException(
+                    "Page is expected to be set before <link>."
+                )
+            if "hreflang" not in attrs or "href" not in attrs:
+                # Skip malformed alternates instead of failing on a missing key.
+                log.warning(
+                    f"<link> element is missing hreflang or href attributes: {attrs}."
+                )
+            else:
+                self._current_page.alternates.append((attrs["hreflang"], attrs["href"]))
 
     def __require_last_char_data_to_be_set(self, name: str) -> None:
         if not self._last_char_data:
diff --git a/usp/objects/page.py b/usp/objects/page.py
index 3829565..7bc366e 100644
--- a/usp/objects/page.py
+++ b/usp/objects/page.py
@@ -3,7 +3,7 @@
 import datetime
 from decimal import Decimal
 from enum import Enum, unique
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 SITEMAP_PAGE_DEFAULT_PRIORITY = Decimal("0.5")
 """Default sitemap page priority, as per the spec."""
@@ -331,6 +331,7 @@ class SitemapPage:
         "__change_frequency",
         "__news_story",
         "__images",
+        "__alternates",
     ]
 
     def __init__(
@@ -341,6 +342,7 @@ def __init__(
         change_frequency: Optional[SitemapPageChangeFrequency] = None,
         news_story: Optional[SitemapNewsStory] = None,
         images: Optional[List[SitemapImage]] = None,
+        alternates: Optional[List[Tuple[str, str]]] = None,
     ):
         """
         Initialize a new sitemap-derived page.
@@ -357,6 +359,7 @@ def __init__(
         self.__change_frequency = change_frequency
         self.__news_story = news_story
         self.__images = images
+        self.__alternates = alternates
 
     def __eq__(self, other) -> bool:
         if not isinstance(other, SitemapPage):
@@ -380,6 +383,9 @@ def __eq__(self, other) -> bool:
         if self.images != other.images:
             return False
 
+        if self.alternates != other.alternates:
+            return False
+
         return True
 
     def __hash__(self):
@@ -449,3 +455,8 @@ def news_story(self) -> Optional[SitemapNewsStory]:
     def images(self) -> Optional[List[SitemapImage]]:
         """Get the images attached to the URL."""
         return self.__images
+
+    @property
+    def alternates(self) -> Optional[List[Tuple[str, str]]]:
+        """Get the (hreflang, href) alternates of the URL, if any."""
+        return self.__alternates