Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor sitemap generation code #264

Merged
merged 10 commits into from
Jul 27, 2023
4 changes: 4 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,11 @@ ADD \

ENV CLASSPATH=${CLASSPATH}:/usr/lib/jvm/java-11-openjdk/saxon/saxon.jar

# Pinned for build issue: https://github.com/pyproj4/pyproj/issues/1321
RUN pip install --upgrade pip
# RUN python3 -m pip install 'cython<3'
# RUN python3 -m pip install --no-use-pep517 pyproj==3.4.1
RUN python3 -m pip install pyproj@git+https://github.com/pyproj4/pyproj.git@main

COPY . $APP_DIR/

Expand Down
129 changes: 67 additions & 62 deletions ckanext/geodatagov/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,9 @@ def datagovs3():


class Sitemap:
"""Sitemap object

Accepts file_num, start, page_size
"""

def __init__(self, file_num: str, start: int, page_size: int) -> None:
self.file_num = file_num
self.filename_s3 = f"sitemap/sitemap-{file_num}.xml"
self.start = start
self.page_size = page_size
self.xml = ""
Expand All @@ -64,9 +59,22 @@ def write_xml(self, some_xml, add_newline=True) -> None:
else:
self.xml += some_xml

def write_sitemap_header(self) -> None:
def to_json(self) -> str:
return json.dumps(self, default=lambda o: o.__dict__)

def write_sitemap_header(self, index=False) -> None:
self.write_xml('<?xml version="1.0" encoding="UTF-8"?>')
self.write_xml('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
if index:
self.write_xml('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
else:
self.write_xml('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')


class SitemapData(Sitemap):

def __init__(self, file_num: str, start: int, page_size: int) -> None:
super().__init__(file_num, start, page_size)
self.filename_s3 = f"sitemap/sitemap-{file_num}.xml"

def write_pkgs(self, package_query: GeoPackageSearchQuery) -> None:

Expand All @@ -86,8 +94,26 @@ def write_pkgs(self, package_query: GeoPackageSearchQuery) -> None:
def write_sitemap_footer(self) -> None:
self.write_xml("</urlset>")

def to_json(self) -> str:
return json.dumps(self, default=lambda o: o.__dict__)

class SitemapIndex(Sitemap):
    """Sitemap index document listing every data sitemap file.

    The index is always stored under the fixed key ``sitemap.xml``.
    """

    def __init__(self, file_num: str, start: int, page_size: int) -> None:
        super().__init__(file_num, start, page_size)
        self.filename_s3 = "sitemap.xml"

    def write_table_of_contents(self, number_of_sitemaps: int) -> None:
        """Append one ``<sitemap>`` entry per data sitemap and close the index.

        Entries are numbered ``1..number_of_sitemaps`` so they match the file
        numbers the data sitemaps are created with
        (``sitemap/sitemap-1.xml`` onwards). Iterating from 0 would reference
        a ``sitemap-0.xml`` that is never generated and would omit the last
        file.

        The XML header is not written here; the caller is expected to have
        written it already.

        :param number_of_sitemaps: how many data sitemap files exist.
        """
        current_time = datetime.datetime.now().strftime("%Y-%m-%d")
        # The site URL does not change between entries, so read it once.
        site_url = config.get('ckan.site_url')

        log.info("Creating sitemap index...")

        for file_num in range(1, number_of_sitemaps + 1):
            # add sitemaps to sitemap index file
            self.write_xml("<sitemap>")
            loc = f"{site_url}/sitemap/sitemap-{file_num}.xml"
            self.write_xml(f"<loc>{loc}</loc>")
            self.write_xml(f"<lastmod>{current_time}</lastmod>")
            self.write_xml("</sitemap>")
        self.write_xml("</sitemapindex>")


def get_s3() -> None:
Expand Down Expand Up @@ -174,49 +200,20 @@ def upload_to_key(upload_str: str, filename_on_s3: str) -> None:
else:
log.error(f"File {filename_on_s3} upload failed. Error: {resp_metadata}")

del temp_file

def upload_sitemap_index(sitemaps: list) -> None:
"""Creates and uploads sitemap index xml file"""

current_time = datetime.datetime.now().strftime("%Y-%m-%d")
sitemap_index = Sitemap("index", 0, 0)
sitemap_index.filename_s3 = "sitemap.xml"

log.info("Creating sitemap index...")
# write sitemap index
sitemap_index.write_xml('<?xml version="1.0" encoding="UTF-8"?>')
sitemap_index.write_xml(
'<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
)

for sitemap in sitemaps:
# add sitemaps to sitemap index file
sitemap_index.write_xml("<sitemap>")
loc = f"{CKAN_SITE_URL}/{sitemap.filename_s3}"
sitemap_index.write_xml(f"<loc>{loc}</loc>")
sitemap_index.write_xml(f"<lastmod>{current_time}</lastmod>")
sitemap_index.write_xml("</sitemap>")
sitemap_index.write_xml("</sitemapindex>")
def upload_sitemap_file(sitemap: list) -> None:
"""Handles uploading sitemap files to s3"""

upload_to_key(sitemap_index.xml, sitemap_index.filename_s3)
log.info("Uploading sitemap file...")
upload_to_key(sitemap.xml, sitemap.filename_s3)
log.info(
f"Sitemap index upload complete to: \
{S3_ENDPOINT_URL}/{BUCKET_NAME}/{sitemap_index.filename_s3}"
f"Sitemap file {sitemap.filename_s3} upload complete to: \
{S3_ENDPOINT_URL}/{BUCKET_NAME}/{sitemap.filename_s3}"
)


def upload_sitemap_files(sitemaps: list) -> None:
"""Handles uploading sitemap files to s3"""

log.info(f"Uploading {len(sitemaps)} sitemap files...")
for sitemap in sitemaps:
upload_to_key(sitemap.xml, sitemap.filename_s3)
log.info(
f"Sitemap file {sitemap.filename_s3} upload complete to: \
{S3_ENDPOINT_URL}/{BUCKET_NAME}/{sitemap.filename_s3}"
)


@geodatagov.command()
@click.option("--upload_to_s3", default=UPLOAD_TO_S3, type=click.BOOL)
@click.option("--page_size", default=PAGE_SIZE, type=click.INT)
Expand All @@ -233,12 +230,25 @@ def sitemap_to_s3(upload_to_s3: bool, page_size: int, max_per_page: int):
return

start = 0
file_num = 1
sitemaps = []

paginations = (count // page_size) + 1
for _ in range(paginations):
sitemap = Sitemap(str(file_num), start, page_size)
num_of_pages = (count // page_size) + 1

# Create + Upload Sitemap Index File
sitemap_index = SitemapIndex("index", 0, 0)
sitemap_index.write_sitemap_header(index=True)
sitemap_index.write_table_of_contents(num_of_pages)

if upload_to_s3:
# set global S3 object and vars
get_s3()
upload_to_key(sitemap_index.xml, sitemap_index.filename_s3)
log.info(
f"Sitemap index upload complete to: \
{S3_ENDPOINT_URL}/{BUCKET_NAME}/{sitemap_index.filename_s3}"
)

for file_num in range(1, num_of_pages + 1):
sitemap = SitemapData(str(file_num), start, page_size)
sitemap.write_sitemap_header()
sitemap.write_pkgs(package_query)
sitemap.write_sitemap_footer()
Expand All @@ -253,22 +263,17 @@ def sitemap_to_s3(upload_to_s3: bool, page_size: int, max_per_page: int):
# 597610699434bde9415a48ed0b1085bfa0e9720f/ckanext/geodatagov/cli.py#L183

log.info(f"done with {sitemap.filename_s3}.")
sitemaps.append(sitemap)

start += page_size
file_num += 1

if upload_to_s3:
log.info("Starting S3 uploads...")
# set global S3 object and vars
get_s3()
if upload_to_s3:
log.info(f"Uploading {sitemap.filename_s3}...")
upload_sitemap_file(sitemap)
else:
log.info(f"Skip upload and return local copy of sitemap {file_num}.")
print(json.dumps(sitemap.to_json(), indent=4))

upload_sitemap_index(sitemaps)
upload_sitemap_files(sitemaps)
else:
log.info("Skip upload and finish.")
dump = [sitemap.to_json() for sitemap in sitemaps]
print(f"Done locally: Sitemap list\n{json.dumps(dump, indent=4)}")
del sitemap


def _normalize_type(_type):
Expand Down
14 changes: 4 additions & 10 deletions ckanext/geodatagov/tests/test_sitemap_creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,20 +53,14 @@ def test_cli_output(cli_result: Result) -> None:
# the example output I have only has one element in it,
# this and _handle_cli_output will need to be updated for examples with more elements
# checks only one list element in output string
assert cli_result.output.count("[") == 1
assert cli_result.output.count("]") == 1
assert cli_result.output.count("file_num") == 1

@staticmethod
def _handle_cli_output(cli_result: Result) -> list:
"""Parses cli output Result to an iterable file_list"""

file_list = [
eval(
cli_result.output[
cli_result.output.index("[") + 1: cli_result.output.index("]") - 1
].strip()
)
]
file_list = cli_result.output.split("}\"\n")
file_list = list(set([f + "}\"" for f in file_list]) - {'}\"'})

return file_list

Expand All @@ -79,7 +73,7 @@ def test_create_sitemap(self, cli_result):
datasets = 0
for site_file in file_list:
# site_file is dumped as string
site_file = eval(site_file)
site_file = eval(eval(site_file))

files += 1
""" expected something like
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ pyOpenSSL>22.10 #pinning to fix error with crypto (https://levelup.gitconnected.
# ckantoolkit # included as dep of ckanext-harvest
GeoAlchemy2==0.5.0
Shapely>=1.2.13
pyproj==3.4.1
OWSLib==0.28.1
lxml>=2.3
argparse
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

setup(
name="ckanext-geodatagov",
version="0.2.0",
version="0.2.1",
description="",
long_description=long_description,
long_description_content_type='text/markdown',
Expand Down