diff --git a/Dockerfile b/Dockerfile index a1cab295..60b89baf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,11 @@ ADD \ ENV CLASSPATH=${CLASSPATH}:/usr/lib/jvm/java-11-openjdk/saxon/saxon.jar +# Pinned for build issue: https://github.com/pyproj4/pyproj/issues/1321 RUN pip install --upgrade pip +# RUN python3 -m pip install 'cython<3' +# RUN python3 -m pip install --no-use-pep517 pyproj==3.4.1 +RUN python3 -m pip install pyproj@git+https://github.com/pyproj4/pyproj.git@main COPY . $APP_DIR/ diff --git a/ckanext/geodatagov/cli.py b/ckanext/geodatagov/cli.py index f11479ac..f7ba6414 100644 --- a/ckanext/geodatagov/cli.py +++ b/ckanext/geodatagov/cli.py @@ -46,14 +46,9 @@ def datagovs3(): class Sitemap: - """Sitemap object - - Accepts file_num, start, page_size - """ def __init__(self, file_num: str, start: int, page_size: int) -> None: self.file_num = file_num - self.filename_s3 = f"sitemap/sitemap-{file_num}.xml" self.start = start self.page_size = page_size self.xml = "" @@ -64,9 +59,22 @@ def write_xml(self, some_xml, add_newline=True) -> None: else: self.xml += some_xml - def write_sitemap_header(self) -> None: + def to_json(self) -> str: + return json.dumps(self, default=lambda o: o.__dict__) + + def write_sitemap_header(self, index=False) -> None: self.write_xml('') - self.write_xml('') + if index: + self.write_xml('') + else: + self.write_xml('') + + +class SitemapData(Sitemap): + + def __init__(self, file_num: str, start: int, page_size: int) -> None: + super().__init__(file_num, start, page_size) + self.filename_s3 = f"sitemap/sitemap-{file_num}.xml" def write_pkgs(self, package_query: GeoPackageSearchQuery) -> None: @@ -86,8 +94,26 @@ def write_pkgs(self, package_query: GeoPackageSearchQuery) -> None: def write_sitemap_footer(self) -> None: self.write_xml("") - def to_json(self) -> str: - return json.dumps(self, default=lambda o: o.__dict__) + +class SitemapIndex(Sitemap): + + def __init__(self, file_num: str, start: int, page_size: int) -> None: + super().__init__(file_num, start, page_size) + self.filename_s3 = "sitemap.xml" + + def write_table_of_contents(self, number_of_sitemaps): + current_time = datetime.datetime.now().strftime("%Y-%m-%d") + + log.info("Creating sitemap index...") + + for file_num in range(number_of_sitemaps): + # add sitemaps to sitemap index file + self.write_xml("") + loc = f"{config.get('ckan.site_url')}/sitemap/sitemap-{file_num}.xml" + self.write_xml(f"{loc}") + self.write_xml(f"{current_time}") + self.write_xml("") + self.write_xml("") def get_s3() -> None: @@ -174,49 +200,20 @@ def upload_to_key(upload_str: str, filename_on_s3: str) -> None: else: log.error(f"File {filename_on_s3} upload failed. Error: {resp_metadata}") + del temp_file -def upload_sitemap_index(sitemaps: list) -> None: - """Creates and uploads sitemap index xml file""" - current_time = datetime.datetime.now().strftime("%Y-%m-%d") - sitemap_index = Sitemap("index", 0, 0) - sitemap_index.filename_s3 = "sitemap.xml" - - log.info("Creating sitemap index...") - # write sitemap index - sitemap_index.write_xml('') - sitemap_index.write_xml( - '' - ) - - for sitemap in sitemaps: - # add sitemaps to sitemap index file - sitemap_index.write_xml("") - loc = f"{CKAN_SITE_URL}/{sitemap.filename_s3}" - sitemap_index.write_xml(f"{loc}") - sitemap_index.write_xml(f"{current_time}") - sitemap_index.write_xml("") - sitemap_index.write_xml("") +def upload_sitemap_file(sitemap: list) -> None: + """Handles uploading sitemap files to s3""" - upload_to_key(sitemap_index.xml, sitemap_index.filename_s3) + log.info("Uploading sitemap file...") + upload_to_key(sitemap.xml, sitemap.filename_s3) log.info( - f"Sitemap index upload complete to: \ - {S3_ENDPOINT_URL}/{BUCKET_NAME}/{sitemap_index.filename_s3}" + f"Sitemap file {sitemap.filename_s3} upload complete to: \ + {S3_ENDPOINT_URL}/{BUCKET_NAME}/{sitemap.filename_s3}" ) -def upload_sitemap_files(sitemaps: list) -> None: - """Handles uploading sitemap files to s3""" - - log.info(f"Uploading {len(sitemaps)} sitemap files...") - for sitemap in sitemaps: - upload_to_key(sitemap.xml, sitemap.filename_s3) - log.info( - f"Sitemap file {sitemap.filename_s3} upload complete to: \ - {S3_ENDPOINT_URL}/{BUCKET_NAME}/{sitemap.filename_s3}" - ) - - @geodatagov.command() @click.option("--upload_to_s3", default=UPLOAD_TO_S3, type=click.BOOL) @click.option("--page_size", default=PAGE_SIZE, type=click.INT) @@ -233,12 +230,25 @@ def sitemap_to_s3(upload_to_s3: bool, page_size: int, max_per_page: int): return start = 0 - file_num = 1 - sitemaps = [] - paginations = (count // page_size) + 1 - for _ in range(paginations): - sitemap = Sitemap(str(file_num), start, page_size) + num_of_pages = (count // page_size) + 1 + + # Create + Upload Sitemap Index File + sitemap_index = SitemapIndex("index", 0, 0) + sitemap_index.write_sitemap_header(index=True) + sitemap_index.write_table_of_contents(num_of_pages) + + if upload_to_s3: + # set global S3 object and vars + get_s3() + upload_to_key(sitemap_index.xml, sitemap_index.filename_s3) + log.info( + f"Sitemap index upload complete to: \ + {S3_ENDPOINT_URL}/{BUCKET_NAME}/{sitemap_index.filename_s3}" + ) + + for file_num in range(1, num_of_pages + 1): + sitemap = SitemapData(str(file_num), start, page_size) sitemap.write_sitemap_header() sitemap.write_pkgs(package_query) sitemap.write_sitemap_footer() @@ -253,22 +263,17 @@ def sitemap_to_s3(upload_to_s3: bool, page_size: int, max_per_page: int): # 597610699434bde9415a48ed0b1085bfa0e9720f/ckanext/geodatagov/cli.py#L183 log.info(f"done with {sitemap.filename_s3}.") - sitemaps.append(sitemap) start += page_size - file_num += 1 - if upload_to_s3: - log.info("Starting S3 uploads...") - # set global S3 object and vars - get_s3() + if upload_to_s3: + log.info(f"Uploading {sitemap.filename_s3}...") + upload_sitemap_file(sitemap) + else: + log.info(f"Skip upload and return local copy of sitemap {file_num}.") + print(json.dumps(sitemap.to_json(), indent=4)) - upload_sitemap_index(sitemaps) - upload_sitemap_files(sitemaps) - else: - log.info("Skip upload and finish.") - dump = [sitemap.to_json() for sitemap in sitemaps] - print(f"Done locally: Sitemap list\n{json.dumps(dump, indent=4)}") + del sitemap def _normalize_type(_type): diff --git a/ckanext/geodatagov/tests/test_sitemap_creation.py b/ckanext/geodatagov/tests/test_sitemap_creation.py index d0442e9d..d81edd60 100644 --- a/ckanext/geodatagov/tests/test_sitemap_creation.py +++ b/ckanext/geodatagov/tests/test_sitemap_creation.py @@ -53,20 +53,14 @@ def test_cli_output(cli_result: Result) -> None: # the example output I have only has one element in it, # this and _handle_cli_output will need to be updated for examples with more elements # checks only one list element in output string - assert cli_result.output.count("[") == 1 - assert cli_result.output.count("]") == 1 + assert cli_result.output.count("file_num") == 1 @staticmethod def _handle_cli_output(cli_result: Result) -> list: """Parses cli output Result to an interable file_list""" - file_list = [ - eval( - cli_result.output[ - cli_result.output.index("[") + 1: cli_result.output.index("]") - 1 - ].strip() - ) - ] + file_list = cli_result.output.split("}\"\n") + file_list = list(set([f + "}\"" for f in file_list]) - {'}\"'}) return file_list @@ -79,7 +73,7 @@ def test_create_sitemap(self, cli_result): datasets = 0 for site_file in file_list: # site_file is dumped as string - site_file = eval(site_file) + site_file = eval(eval(site_file)) files += 1 """ expected something like diff --git a/requirements.txt b/requirements.txt index a2157720..cc84f36c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,6 @@ pyOpenSSL>22.10 #pinning to fix error with crypto (https://levelup.gitconnected. # ckantoolkit # included as dep of ckanext-harvest GeoAlchemy2==0.5.0 Shapely>=1.2.13 -pyproj==3.4.1 OWSLib==0.28.1 lxml>=2.3 argparse diff --git a/setup.py b/setup.py index 49d60a12..5557c52f 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="ckanext-geodatagov", - version="0.2.0", + version="0.2.1", description="", long_description=long_description, long_description_content_type='text/markdown',