Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add db-solr-sync-next for catalog-next #281

Merged
merged 5 commits into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ ENV CLASSPATH=${CLASSPATH}:/usr/lib/jvm/java-11-openjdk/saxon/saxon.jar
RUN pip install --upgrade pip
# RUN python3 -m pip install 'cython<3'
# RUN python3 -m pip install --no-use-pep517 pyproj==3.4.1
RUN python3 -m pip install pyproj@git+https://github.com/pyproj4/pyproj.git@main
RUN python3 -m pip install pyproj@git+https://github.com/pyproj4/pyproj.git@3.6.1

COPY . $APP_DIR/

Expand Down
16 changes: 8 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,28 @@ CKAN_VERSION ?= 2.10
COMPOSE_FILE ?= docker-compose.yml

build: ## Build the docker containers
CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) build
CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) build
debug:
CKAN_VERSION=$(CKAN_VERSION) docker-compose run --service-ports app
CKAN_VERSION=$(CKAN_VERSION) docker compose run --service-ports app

lint: ## Lint the code
CKAN_VERSION=$(CKAN_VERSION) docker-compose -f docker-compose.yml run --rm app flake8 /srv/app/ckanext/ --count --max-line-length=127 --show-source --statistics --exclude ckan
CKAN_VERSION=$(CKAN_VERSION) docker compose -f docker-compose.yml run --rm app flake8 /srv/app/ckanext/ --count --max-line-length=127 --show-source --statistics --exclude ckan

clean: ## Clean workspace and containers
find . -name *.pyc -delete
CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) down -v --remove-orphans
CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) down -v --remove-orphans

test: ## Run tests in a new container
CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) run --rm app /srv/app/test.sh
CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) run --rm app /srv/app/test.sh

java-test: ## Test java transformation command (java + saxon installed)
CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) run --rm app bash -c "java net.sf.saxon.Transform -s:/app/ckanext/geodatagov/tests/data-samples/waf-fgdc/fgdc-csdgm_sample.xml -xsl:/app/ckanext/geodatagov/harvesters/fgdcrse2iso19115-2.xslt"
CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) run --rm app bash -c "java net.sf.saxon.Transform -s:/app/ckanext/geodatagov/tests/data-samples/waf-fgdc/fgdc-csdgm_sample.xml -xsl:/app/ckanext/geodatagov/harvesters/fgdcrse2iso19115-2.xslt"

up: ## Start the containers
CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) up
CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) up

ci: ## Start the containers in the background
CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) up -d
CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) up -d

.DEFAULT_GOAL := help
.PHONY: build clean help lint test up
Expand Down
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,18 @@ All the tests live in the [/ckanext/geodatagov/tests](/ckanext/geodatagov/tests)
### Build Environment

To start the environment, run:
```docker-compose build```
```docker-compose up```
```docker compose build```
```docker compose up```

CKAN will start at localhost:5000

To shut down the environment, run:

```docker-compose down```
```docker compose down```

To docker exec into the CKAN image, run:

```docker-compose exec app /bin/bash```
```docker compose exec app /bin/bash```

### Testing

Expand Down
116 changes: 116 additions & 0 deletions ckanext/geodatagov/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,41 @@ def index_for(_type):
return NoopSearchIndex()


def get_all_entity_ids_date():
    """
    Collect (id, metadata_modified) pairs for every indexed dataset in Solr.

    The index is read page by page (10000 documents per request) rather
    than in a single query, to avoid out-of-memory. Paging stops at the
    first request that comes back empty.
    """
    query = "*:*"
    # Restrict to active datasets that belong to this CKAN site.
    fq = "".join(
        [
            '+site_id:"%s" ' % config.get("ckan.site_id"),
            "+state:active ",
            "+type:dataset ",
        ]
    )

    collected = []

    page_size = 10000
    offset = 0
    conn = make_connection()

    log.info(f"Now loading SOLR packages using page size {page_size}...")

    while True:
        log.info(f"loading packages starting from {offset}")
        page = conn.search(
            query,
            fq=fq,
            start=offset,
            rows=page_size,
            fl="id, metadata_modified",
        )

        # A falsy result means there is nothing left to read.
        if not page:
            break

        collected.extend(
            (doc.get("id"), doc.get("metadata_modified")) for doc in page.docs
        )
        offset += page_size

    return collected


def get_all_entity_ids_date_hoid():
"""
Return a list of the IDs and metadata_modified of all indexed packages.
Expand Down Expand Up @@ -484,6 +519,87 @@ def db_solr_sync(dryrun, cleanup_solr, update_solr):
print(*active_package_id_wo_ho, sep='\n')


@geodatagov.command()
@click.option("--dryrun", is_flag=True, help="inspect what will be updated")
@click.option(
    "--cleanup_solr", is_flag=True, help="Only remove orphaned entries in Solr"
)
@click.option(
    "--update_solr",
    is_flag=True,
    help=(
        "(Update solr entries with new data from DB) OR (Add DB data to Solr that is missing)"
    ),
)
def db_solr_sync_next(dryrun, cleanup_solr, update_solr):
    """db solr sync next for catalog-next"""
    # NOTE: the docstring above is kept to one line on purpose; click shows it
    # as the command's help text.
    #
    # Overview:
    #   1. Read (id, metadata_modified) for every active dataset in the DB.
    #   2. Read the same pairs from the Solr index.
    #   3. Pairs found only in Solr are orphaned index entries (cleanup);
    #      pairs found only in the DB are missing or stale in Solr (update).
    #   4. Unless --dryrun is given, apply the cleanup and/or update step.
    #      Passing neither or both of --cleanup_solr / --update_solr runs both.
    if dryrun:
        log.info("Starting dryrun to update index.")

    package_index = index_for(model.Package)

    # get active packages from DB
    # Microseconds are dropped before comparing with the Solr values;
    # presumably the indexed timestamps only have second precision - confirm.
    active_package = {
        (pkg_id, modified.replace(microsecond=0))
        for pkg_id, modified in model.Session.query(
            model.Package.id,
            model.Package.metadata_modified
        )
        .filter(
            model.Package.type == "dataset",
            model.Package.state == "active"
        )
        .all()
    }

    log.info(f"total {len(active_package)} DB active_package")

    # get indexed packages from solr
    indexed_package = set(get_all_entity_ids_date())
    log.info(f"total {len(indexed_package)} solr indexed_package")

    # Pairs that exist on one side only. A package whose timestamp differs
    # between the two sides shows up in BOTH differences.
    solr_package = indexed_package - active_package
    db_package = active_package - indexed_package

    # Map each package id to where its mismatching pair was seen:
    #   "solr"    - only in Solr  -> orphan, remove from the index
    #   "db"      - only in DB    -> missing from the index, add it
    #   "solr-db" - in both, with different timestamps -> re-index
    work_list = {}
    for pkg_id, *_ in solr_package:
        work_list[pkg_id] = "solr"
    for pkg_id, *_ in db_package:
        work_list[pkg_id] = "solr-db" if pkg_id in work_list else "db"

    # Neither flag or both flags given means "do both steps".
    both = cleanup_solr == update_solr
    set_cleanup = {
        pkg_id for pkg_id, origin in work_list.items() if origin == "solr"
    }
    set_update = work_list.keys() - set_cleanup
    log.info(f"{len(set_cleanup)} packages need to be removed from Solr")
    log.info(f"{len(set_update)} packages need to be updated/added to Solr")

    if not dryrun and set_cleanup and (cleanup_solr or both):
        log.info("Deleting indexes")
        delete_packages(set_cleanup)
        package_index.commit()
        log.info("Finished cleaning solr entries.")

    if not dryrun and set_update and (update_solr or both):
        log.info("Rebuilding indexes")
        try:
            rebuild(package_ids=set_update, defer_commit=True)
        except Exception as e:
            # Best effort: report the failure and still commit whatever was
            # indexed. The whole batch is rebuilt in one call, so no single
            # package id can be blamed here; report the batch size instead.
            log.error(
                "Error while rebuild index (%d packages): %s",
                len(set_update),
                repr(e),
            )
        package_index.commit()
        log.info("Finished updating solr entries.")
        log.info("Here is the first a few dataset ids that are rebuilt:")
        max_listed = 10
        for count, pkg_id in enumerate(set_update, start=1):
            if count > max_listed:
                break
            log.info(f"{count}: {pkg_id}")


@geodatagov.command()
def check_stuck_jobs():
"""check stuck harvest jobs"""
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

setup(
name="ckanext-geodatagov",
version="0.2.8",
version="0.2.9",
description="",
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
2 changes: 1 addition & 1 deletion test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Setup and run extension tests. This script should be run in a _clean_ CKAN
# environment. e.g.:
#
# $ docker-compose run --rm app ./test.sh
# $ docker compose run --rm app ./test.sh
#

set -o errexit
Expand Down
Loading