From 752bf88d037322a5fab2a276e33386eca163c987 Mon Sep 17 00:00:00 2001 From: Fuhu Xia Date: Thu, 22 Aug 2024 12:48:55 -0400 Subject: [PATCH 1/5] db-solr-sync-next --- ckanext/geodatagov/cli.py | 116 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/ckanext/geodatagov/cli.py b/ckanext/geodatagov/cli.py index 58de08de..fabc0e16 100644 --- a/ckanext/geodatagov/cli.py +++ b/ckanext/geodatagov/cli.py @@ -306,6 +306,41 @@ def index_for(_type): return NoopSearchIndex() +def get_all_entity_ids_date(): + """ + Return a list of the IDs and metadata_modified of all indexed packages. + + Solr result is to be processed in batches at 10000 + pagination to avoid out-of-memory. + """ + query = "*:*" + fq = '+site_id:"%s" ' % config.get("ckan.site_id") + fq += "+state:active " + fq += "+type:dataset " + + ret_all = [] + + start = 0 + page_size = 10000 + conn = make_connection() + + log.info(f"Now loading SOLR packages using page size {page_size}...") + + while True: + log.info(f"loading packages starting from {start}") + data = conn.search(query, fq=fq, start=start, rows=page_size, fl="id, metadata_modified") + + if not data: + break + + for r in data.docs: + ret_all.append((r.get("id"), r.get("metadata_modified"))) + + start += page_size + + return ret_all + + def get_all_entity_ids_date_hoid(): """ Return a list of the IDs and metadata_modified of all indexed packages. @@ -484,6 +519,87 @@ def db_solr_sync(dryrun, cleanup_solr, update_solr): print(*active_package_id_wo_ho, sep='\n') +@geodatagov.command() +@click.option("--dryrun", is_flag=True, help="inspect what will be updated") +@click.option( + "--cleanup_solr", is_flag=True, help="Only remove orphaned entries in Solr" +) +@click.option( + "--update_solr", + is_flag=True, + help=( + "(Update solr entries with new data from DB) OR (Add DB data to Solr that is missing)" + ), +) +def db_solr_sync_next(dryrun, cleanup_solr, update_solr): + """db solr sync next""" + if dryrun: + log.info("Starting dryrun to update index.") + + package_index = index_for(model.Package) + + # get active packages from DB + active_package = { + (r[0], r[1].replace(microsecond=0)) + for r in model.Session.query( + model.Package.id, + model.Package.metadata_modified + ) + .filter( + model.Package.type == "dataset", + model.Package.state == "active" + ) + .all() + } + + log.info(f"total {len(active_package)} DB active_package") + + # get indexed packages from solr + indexed_package = set(get_all_entity_ids_date()) + log.info(f"total {len(indexed_package)} solr indexed_package") + + solr_package = indexed_package - active_package + db_package = active_package - indexed_package + + work_list = {} + for id, *_ in solr_package: + work_list[id] = "solr" + for id, *_ in db_package: + if id in work_list: + work_list[id] = "solr-db" + else: + work_list[id] = "db" + + both = cleanup_solr == update_solr + set_cleanup = {i if work_list[i] == "solr" else None for i in work_list} - {None} + set_update = work_list.keys() - set_cleanup + log.info(f"{len(set_cleanup)} packages need to be removed from Solr") + log.info(f"{len(set_update)} packages need to be updated/added to Solr") + + if not dryrun and set_cleanup and (cleanup_solr or both): + log.info("Deleting indexes") + delete_packages(set_cleanup) + package_index.commit() + log.info("Finished cleaning solr entries.") + + if not dryrun and set_update and (update_solr or both): + log.info("Rebuilding indexes") + try: + rebuild(package_ids=set_update, defer_commit=True) + except Exception as e: + log.error("Error while rebuild index %s: %s" % (id, repr(e))) + package_index.commit() + log.info("Finished updating solr entries.") + log.info("Here is the first a few dataset ids that are rebuilt:") + count = 0 + max = 10 + for id in set_update: + count = count + 1 + if count > max: + break + log.info(f"{count}: {id}") + + @geodatagov.command() def check_stuck_jobs(): """check stuck harvest jobs""" From dbd6679aff2e62f734416094451264b3b74266b2 Mon Sep 17 00:00:00 2001 From: Fuhu Xia Date: Thu, 22 Aug 2024 13:43:59 -0400 Subject: [PATCH 2/5] bump version --- ckanext/geodatagov/cli.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ckanext/geodatagov/cli.py b/ckanext/geodatagov/cli.py index fabc0e16..0665a351 100644 --- a/ckanext/geodatagov/cli.py +++ b/ckanext/geodatagov/cli.py @@ -532,7 +532,7 @@ def db_solr_sync(dryrun, cleanup_solr, update_solr): ), ) def db_solr_sync_next(dryrun, cleanup_solr, update_solr): - """db solr sync next""" + """db solr sync next for catalog-next""" if dryrun: log.info("Starting dryrun to update index.") diff --git a/setup.py b/setup.py index 424371b5..c1eb0682 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="ckanext-geodatagov", - version="0.2.8", + version="0.2.9", description="", long_description=long_description, long_description_content_type="text/markdown", From 3972830530ab58315a0c63d4d6a11aba0f7c5e4e Mon Sep 17 00:00:00 2001 From: Fuhu Xia Date: Thu, 22 Aug 2024 13:50:14 -0400 Subject: [PATCH 3/5] fix docker compose --- Makefile | 16 ++++++++-------- README.md | 8 ++++---- test.sh | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index bf06eda0..0168326a 100644 --- a/Makefile +++ b/Makefile @@ -2,28 +2,28 @@ CKAN_VERSION ?= 2.10 COMPOSE_FILE ?= docker-compose.yml build: ## Build the docker containers - CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) build + CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) build debug: - CKAN_VERSION=$(CKAN_VERSION) docker-compose run --service-ports app + CKAN_VERSION=$(CKAN_VERSION) docker compose run --service-ports app lint: ## Lint the code - CKAN_VERSION=$(CKAN_VERSION) docker-compose -f docker-compose.yml run --rm app flake8 /srv/app/ckanext/ --count --max-line-length=127 --show-source --statistics --exclude ckan + CKAN_VERSION=$(CKAN_VERSION) docker compose -f docker-compose.yml run --rm app flake8 /srv/app/ckanext/ --count --max-line-length=127 --show-source --statistics --exclude ckan clean: ## Clean workspace and containers find . -name *.pyc -delete - CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) down -v --remove-orphans + CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) down -v --remove-orphans test: ## Run tests in a new container - CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) run --rm app /srv/app/test.sh + CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) run --rm app /srv/app/test.sh java-test: ## Test java transformation command (java + saxon installed) - CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) run --rm app bash -c "java net.sf.saxon.Transform -s:/app/ckanext/geodatagov/tests/data-samples/waf-fgdc/fgdc-csdgm_sample.xml -xsl:/app/ckanext/geodatagov/harvesters/fgdcrse2iso19115-2.xslt" + CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) run --rm app bash -c "java net.sf.saxon.Transform -s:/app/ckanext/geodatagov/tests/data-samples/waf-fgdc/fgdc-csdgm_sample.xml -xsl:/app/ckanext/geodatagov/harvesters/fgdcrse2iso19115-2.xslt" up: ## Start the containers - CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) up + CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) up ci: ## Start the containers in the background - CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) up -d + CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) up -d .DEFAULT_GOAL := help .PHONY: build clean help lint test up diff --git a/README.md b/README.md index 8843a400..f7a35e8b 100644 --- a/README.md +++ b/README.md @@ -46,18 +46,18 @@ All the tests live in the [/ckanext/geodatagov/tests](/ckanext/geodatagov/tests) ### Build Environment To start environment, run: -```docker-compose build``` -```docker-compose up``` +```docker compose build``` +```docker compose up``` CKAN will start at localhost:5000 To shut down environment, run: -```docker-compose down``` +```docker compose down``` To docker exec into the CKAN image, run: -```docker-compose exec app /bin/bash``` +```docker compose exec app /bin/bash``` ### Testing diff --git a/test.sh b/test.sh index 2be43ec9..7049affe 100755 --- a/test.sh +++ b/test.sh @@ -2,7 +2,7 @@ # Setup and run extension tests. This script should be run in a _clean_ CKAN # environment. e.g.: # -# $ docker-compose run --rm app ./test.sh +# $ docker compose run --rm app ./test.sh # set -o errexit From 3f3576f4c632bc8a00d3e18ec5a28aec9f5eb6c5 Mon Sep 17 00:00:00 2001 From: Fuhu Xia Date: Thu, 22 Aug 2024 15:54:29 -0400 Subject: [PATCH 4/5] use latest release of proj --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 0edfa212..77264f18 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,7 +16,7 @@ ENV CLASSPATH=${CLASSPATH}:/usr/lib/jvm/java-11-openjdk/saxon/saxon.jar RUN pip install --upgrade pip # RUN python3 -m pip install 'cython<3' # RUN python3 -m pip install --no-use-pep517 pyproj==3.4.1 -RUN python3 -m pip install pyproj@git+https://github.com/pyproj4/pyproj.git@main +RUN python3 -m pip install pyproj@git+https://github.com/pyproj4/pyproj.git@3.6.1 COPY . $APP_DIR/ From ef55c5da379c0de70ca36a7f37dbaf2985e51113 Mon Sep 17 00:00:00 2001 From: Fuhu Xia Date: Fri, 23 Aug 2024 13:21:38 -0400 Subject: [PATCH 5/5] remove unnecessary code --- ckanext/geodatagov/cli.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/ckanext/geodatagov/cli.py b/ckanext/geodatagov/cli.py index 0665a351..097ae43c 100644 --- a/ckanext/geodatagov/cli.py +++ b/ckanext/geodatagov/cli.py @@ -478,10 +478,7 @@ def db_solr_sync(dryrun, cleanup_solr, update_solr): for id, *_ in solr_package: work_list[id] = "solr" for id, *_ in db_package: - if id in work_list: - work_list[id] = "solr-db" - else: - work_list[id] = "db" + work_list[id] = "db" both = cleanup_solr == update_solr set_cleanup = {i if work_list[i] == "solr" else None for i in work_list} - {None} @@ -565,10 +562,7 @@ def db_solr_sync_next(dryrun, cleanup_solr, update_solr): for id, *_ in solr_package: work_list[id] = "solr" for id, *_ in db_package: - if id in work_list: - work_list[id] = "solr-db" - else: - work_list[id] = "db" + work_list[id] = "db" both = cleanup_solr == update_solr set_cleanup = {i if work_list[i] == "solr" else None for i in work_list} - {None}