From 6da501e15475b038e0a9e72956610b5a19c64fae Mon Sep 17 00:00:00 2001 From: kevindaffaarr Date: Fri, 6 Jan 2023 19:33:50 +0700 Subject: [PATCH 1/4] generate sitemap with custom domain in toml --- example/example_site.toml | 1 + loconotion/modules/notionparser.py | 15 +++++++++++++++ loconotion/tests/test_sitemap.py | 14 ++++++++++++++ 3 files changed, 30 insertions(+) create mode 100644 loconotion/tests/test_sitemap.py diff --git a/example/example_site.toml b/example/example_site.toml index 6be71a7a4..d4e81657e 100644 --- a/example/example_site.toml +++ b/example/example_site.toml @@ -4,6 +4,7 @@ # name of the folder that the site will be generated in name = "Notion Test Site" +domain = "example.com" # the notion.so page to being parsing from. This page will become the index.html # of the generated site, and loconotation will parse all sub-pages present on the page diff --git a/loconotion/modules/notionparser.py b/loconotion/modules/notionparser.py index 4fccf6518..c01fecc98 100644 --- a/loconotion/modules/notionparser.py +++ b/loconotion/modules/notionparser.py @@ -754,6 +754,19 @@ def parse_subpages(self, subpages): if sub_page not in self.processed_pages.keys(): self.parse_page(sub_page) + def export_sitemap(self, domain:str, processed_pages:list): + # Open file in dist/sitemap.xml to write sitemap + with open(self.dist_folder / "sitemap.xml", "w") as f: + # Write XML header + f.write('\r') + # Write sitemap index opening tag + f.write('\r') + # Write the sitemap from domain and processed pages + for page in processed_pages: + f.write(f'https://{domain}/{page}\r') + # Write sitemap index closing tag + f.write("") + def load(self, url): self.driver.get(url) WebDriverWait(self.driver, 60).until(notion_page_loaded()) @@ -762,6 +775,8 @@ def run(self): start_time = time.time() self.processed_pages = {} self.parse_page(self.starting_url) + if self.config.get("domain",None): + self.export_sitemap(self.config.get("domain"),list(self.processed_pages.values())) 
elapsed_time = time.time() - start_time formatted_time = "{:02d}:{:02d}:{:02d}".format( int(elapsed_time // 3600), diff --git a/loconotion/tests/test_sitemap.py b/loconotion/tests/test_sitemap.py new file mode 100644 index 000000000..068e86c4f --- /dev/null +++ b/loconotion/tests/test_sitemap.py @@ -0,0 +1,14 @@ +import sys +sys.path.insert(0, "D:\\Other Projects\\loconotion\\loconotion") + +from modules.notionparser import Parser + +def test_parse_sample_page(): + config={"page": "https://www.notion.so/Loconotion-Example-Page-03c403f4fdc94cc1b315b9469a8950ef", "domain": "example.com"} + args = {"timeout": 10, "single_page": True} + parser = Parser(config, args) + parser.run() + pass + +if __name__ == "__main__": + test_parse_sample_page() From d8253499c01da9ccf60b75ff19b9def67faba648 Mon Sep 17 00:00:00 2001 From: kevindaffaarr Date: Fri, 6 Jan 2023 20:23:30 +0700 Subject: [PATCH 2/4] adding protocol and remove_html_extension. Also not export duplicate links --- loconotion/modules/notionparser.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/loconotion/modules/notionparser.py b/loconotion/modules/notionparser.py index c01fecc98..7797b9a20 100644 --- a/loconotion/modules/notionparser.py +++ b/loconotion/modules/notionparser.py @@ -754,7 +754,7 @@ def parse_subpages(self, subpages): if sub_page not in self.processed_pages.keys(): self.parse_page(sub_page) - def export_sitemap(self, domain:str, processed_pages:list): + def export_sitemap(self, protocol:str, domain:str, processed_pages:set, remove_html_extension:bool): # Open file in dist/sitemap.xml to write sitemap with open(self.dist_folder / "sitemap.xml", "w") as f: # Write XML header @@ -763,7 +763,9 @@ def export_sitemap(self, domain:str, processed_pages:list): f.write('\r') # Write the sitemap from domain and processed pages for page in processed_pages: - f.write(f'https://{domain}/{page}\r') + if remove_html_extension: + page = page.replace(".html", "") + 
f.write(f'\t{protocol}://{domain}/{page}\r') # Write sitemap index closing tag f.write("") @@ -776,7 +778,11 @@ def run(self): self.processed_pages = {} self.parse_page(self.starting_url) if self.config.get("domain",None): - self.export_sitemap(self.config.get("domain"),list(self.processed_pages.values())) + self.export_sitemap( + self.config.get("protocol", "https"), + self.config.get("domain"), + set(self.processed_pages.values()), + self.config.get("remove_html_extension", False)) elapsed_time = time.time() - start_time formatted_time = "{:02d}:{:02d}:{:02d}".format( int(elapsed_time // 3600), From 8b0b99843ebc9088ac8fd56a58ac09c2cdb0e258 Mon Sep 17 00:00:00 2001 From: kevindaffaarr Date: Fri, 6 Jan 2023 20:31:08 +0700 Subject: [PATCH 3/4] add sitemap configuration in example toml --- example/example_site.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/example/example_site.toml b/example/example_site.toml index d4e81657e..dab806e55 100644 --- a/example/example_site.toml +++ b/example/example_site.toml @@ -4,7 +4,12 @@ # name of the folder that the site will be generated in name = "Notion Test Site" + +# Sitemap configuration: sitemap.xml is generated only when `domain` is set; remove `domain` to skip it. +# `protocol` defaults to "https" and `remove_html_extension` defaults to false when omitted. +protocol = "https" domain = "example.com" +remove_html_extension = true # the notion.so page to being parsing from. 
This page will become the index.html # of the generated site, and loconotation will parse all sub-pages present on the page From dcf53962f59a23e86ff3d64289d334aaa7530d7f Mon Sep 17 00:00:00 2001 From: kevindaffaarr Date: Fri, 6 Jan 2023 20:33:51 +0700 Subject: [PATCH 4/4] - --- loconotion/tests/test_sitemap.py | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 loconotion/tests/test_sitemap.py diff --git a/loconotion/tests/test_sitemap.py b/loconotion/tests/test_sitemap.py deleted file mode 100644 index 068e86c4f..000000000 --- a/loconotion/tests/test_sitemap.py +++ /dev/null @@ -1,14 +0,0 @@ -import sys -sys.path.insert(0, "D:\\Other Projects\\loconotion\\loconotion") - -from modules.notionparser import Parser - -def test_parse_sample_page(): - config={"page": "https://www.notion.so/Loconotion-Example-Page-03c403f4fdc94cc1b315b9469a8950ef", "domain": "example.com"} - args = {"timeout": 10, "single_page": True} - parser = Parser(config, args) - parser.run() - pass - -if __name__ == "__main__": - test_parse_sample_page()