diff --git a/example/example_site.toml b/example/example_site.toml
index 6be71a7a4..dab806e55 100644
--- a/example/example_site.toml
+++ b/example/example_site.toml
@@ -5,6 +5,12 @@
 # name of the folder that the site will be generated in
 name = "Notion Test Site"
 
+# Sitemap configuration
+# Remove `domain` if you don't want to generate sitemap.xml
+protocol = "https"
+domain = "example.com"
+remove_html_extension = true
+
 # the notion.so page to being parsing from. This page will become the index.html
 # of the generated site, and loconotation will parse all sub-pages present on the page
 page = "https://www.notion.so/Loconotion-Example-Page-03c403f4fdc94cc1b315b9469a8950ef"
diff --git a/loconotion/modules/notionparser.py b/loconotion/modules/notionparser.py
index 4fccf6518..7797b9a20 100644
--- a/loconotion/modules/notionparser.py
+++ b/loconotion/modules/notionparser.py
@@ -754,6 +754,32 @@ def parse_subpages(self, subpages):
                 if sub_page not in self.processed_pages.keys():
                     self.parse_page(sub_page)
 
+    def export_sitemap(self, protocol:str, domain:str, processed_pages:set, remove_html_extension:bool):
+        """Write ``sitemap.xml`` into the dist folder, one entry per page.
+
+        Every URL is built as ``{protocol}://{domain}/{page}``. When
+        ``remove_html_extension`` is true, a trailing ``.html`` is dropped
+        from each page name before the URL is built.
+        """
+        # Function-scope import keeps this method self-contained
+        from xml.sax.saxutils import escape
+
+        entries = []
+        # Sorted so the generated file is identical from one run to the next
+        for page in sorted(processed_pages):
+            # Strip only a trailing ".html", never one inside the page name
+            if remove_html_extension and page.endswith(".html"):
+                page = page[: -len(".html")]
+            # <loc> values must be entity-escaped (e.g. "&" becomes "&amp;")
+            loc = escape(f"{protocol}://{domain}/{page}")
+            entries.append(f"\t<url><loc>{loc}</loc></url>\n")
+        # Write UTF-8 so the bytes match the encoding named in the XML header
+        with open(self.dist_folder / "sitemap.xml", "w", encoding="utf-8") as f:
+            f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
+            f.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
+            f.writelines(entries)
+            f.write("</urlset>\n")
+
     def load(self, url):
         self.driver.get(url)
         WebDriverWait(self.driver, 60).until(notion_page_loaded())
@@ -762,6 +788,12 @@ def run(self):
         start_time = time.time()
         self.processed_pages = {}
         self.parse_page(self.starting_url)
+        if self.config.get("domain",None):
+            self.export_sitemap(
+                self.config.get("protocol", "https"),
+                self.config.get("domain"),
+                set(self.processed_pages.values()),
+                self.config.get("remove_html_extension", False))
         elapsed_time = time.time() - start_time
         formatted_time = "{:02d}:{:02d}:{:02d}".format(
             int(elapsed_time // 3600),