
Merge pull request #281 from GSA/db-solr-sync-next
Add db-solr-sync-next for catalog-next
rshewitt authored Aug 23, 2024
2 parents ea5ba64 + ef55c5d commit 4510c5b
Showing 6 changed files with 129 additions and 19 deletions.
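For orientation, a minimal sketch of how the new command could be exercised in this repo's Docker setup; the `ckan geodatagov db-solr-sync-next` spelling is an assumption based on Click's default dash naming and the existing `db-solr-sync` command, not something confirmed by this diff:

```bash
# Dry run inside the app container: report what would be cleaned up or updated
# in Solr without changing anything. The CLI path is assumed, not confirmed here.
docker compose run --rm app ckan geodatagov db-solr-sync-next --dryrun
```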
2 changes: 1 addition & 1 deletion Dockerfile
@@ -16,7 +16,7 @@ ENV CLASSPATH=${CLASSPATH}:/usr/lib/jvm/java-11-openjdk/saxon/saxon.jar
RUN pip install --upgrade pip
# RUN python3 -m pip install 'cython<3'
# RUN python3 -m pip install --no-use-pep517 pyproj==3.4.1
RUN python3 -m pip install pyproj@git+https://github.com/pyproj4/pyproj.git@main
RUN python3 -m pip install pyproj@git+https://github.com/pyproj4/pyproj.git@3.6.1

COPY . $APP_DIR/

16 changes: 8 additions & 8 deletions Makefile
@@ -2,28 +2,28 @@ CKAN_VERSION ?= 2.10
COMPOSE_FILE ?= docker-compose.yml

build: ## Build the docker containers
CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) build
CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) build
debug:
CKAN_VERSION=$(CKAN_VERSION) docker-compose run --service-ports app
CKAN_VERSION=$(CKAN_VERSION) docker compose run --service-ports app

lint: ## Lint the code
CKAN_VERSION=$(CKAN_VERSION) docker-compose -f docker-compose.yml run --rm app flake8 /srv/app/ckanext/ --count --max-line-length=127 --show-source --statistics --exclude ckan
CKAN_VERSION=$(CKAN_VERSION) docker compose -f docker-compose.yml run --rm app flake8 /srv/app/ckanext/ --count --max-line-length=127 --show-source --statistics --exclude ckan

clean: ## Clean workspace and containers
find . -name *.pyc -delete
CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) down -v --remove-orphans
CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) down -v --remove-orphans

test: ## Run tests in a new container
CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) run --rm app /srv/app/test.sh
CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) run --rm app /srv/app/test.sh

java-test: ## Test java transformation command (java + saxon installed)
CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) run --rm app bash -c "java net.sf.saxon.Transform -s:/app/ckanext/geodatagov/tests/data-samples/waf-fgdc/fgdc-csdgm_sample.xml -xsl:/app/ckanext/geodatagov/harvesters/fgdcrse2iso19115-2.xslt"
CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) run --rm app bash -c "java net.sf.saxon.Transform -s:/app/ckanext/geodatagov/tests/data-samples/waf-fgdc/fgdc-csdgm_sample.xml -xsl:/app/ckanext/geodatagov/harvesters/fgdcrse2iso19115-2.xslt"

up: ## Start the containers
CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) up
CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) up

ci: ## Start the containers in the background
CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) up -d
CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) up -d

.DEFAULT_GOAL := help
.PHONY: build clean help lint test up
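For reference, the targets above are driven the usual way; `CKAN_VERSION` and `COMPOSE_FILE` are the two overridable variables defined at the top of this Makefile:

```bash
# Build and start the stack (defaults: CKAN_VERSION=2.10, COMPOSE_FILE=docker-compose.yml)
make build
make up

# Lint and run the test suite in throwaway containers
make lint
make test

# Tear everything down, including volumes and orphaned containers
make clean
```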
8 changes: 4 additions & 4 deletions README.md
@@ -46,18 +46,18 @@ All the tests live in the [/ckanext/geodatagov/tests](/ckanext/geodatagov/tests)
### Build Environment

To start environment, run:
```docker-compose build```
```docker-compose up```
```docker compose build```
```docker compose up```

CKAN will start at localhost:5000

To shut down environment, run:

```docker-compose down```
```docker compose down```

To docker exec into the CKAN image, run:

```docker-compose exec app /bin/bash```
```docker compose exec app /bin/bash```

### Testing

118 changes: 114 additions & 4 deletions ckanext/geodatagov/cli.py
@@ -306,6 +306,41 @@ def index_for(_type):
return NoopSearchIndex()


def get_all_entity_ids_date():
    """
    Return a list of the IDs and metadata_modified of all indexed packages.
    Solr results are fetched in pages of 10000 to avoid running out of memory.
    """
    query = "*:*"
    fq = '+site_id:"%s" ' % config.get("ckan.site_id")
    fq += "+state:active "
    fq += "+type:dataset "

    ret_all = []

    start = 0
    page_size = 10000
    conn = make_connection()

    log.info(f"Now loading SOLR packages using page size {page_size}...")

    while True:
        log.info(f"loading packages starting from {start}")
        data = conn.search(query, fq=fq, start=start, rows=page_size, fl="id, metadata_modified")

        if not data:
            break

        for r in data.docs:
            ret_all.append((r.get("id"), r.get("metadata_modified")))

        start += page_size

    return ret_all
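To make the Solr paging concrete, here is a rough hand-run equivalent of a single page of this query; the Solr URL and the `site_id` value are illustrative placeholders for whatever `ckan.site_id` and Solr core your instance actually uses:

```bash
# One page of the search issued by get_all_entity_ids_date(); URL and site_id
# are placeholders, not values taken from this repo.
curl -sG "http://127.0.0.1:8983/solr/ckan/select" \
  --data-urlencode 'q=*:*' \
  --data-urlencode 'fq=+site_id:"default" +state:active +type:dataset' \
  --data-urlencode 'fl=id,metadata_modified' \
  --data-urlencode 'start=0' \
  --data-urlencode 'rows=10000' \
  --data-urlencode 'wt=json'
# The helper pages through results by bumping start by 10000 until a page comes back empty.
```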


def get_all_entity_ids_date_hoid():
"""
Return a list of the IDs and metadata_modified of all indexed packages.
@@ -443,10 +478,7 @@ def db_solr_sync(dryrun, cleanup_solr, update_solr):
    for id, *_ in solr_package:
        work_list[id] = "solr"
    for id, *_ in db_package:
        if id in work_list:
            work_list[id] = "solr-db"
        else:
            work_list[id] = "db"
        work_list[id] = "db"

    both = cleanup_solr == update_solr
    set_cleanup = {i if work_list[i] == "solr" else None for i in work_list} - {None}
@@ -484,6 +516,84 @@ def db_solr_sync(dryrun, cleanup_solr, update_solr):
print(*active_package_id_wo_ho, sep='\n')


@geodatagov.command()
@click.option("--dryrun", is_flag=True, help="inspect what will be updated")
@click.option(
    "--cleanup_solr", is_flag=True, help="Only remove orphaned entries in Solr"
)
@click.option(
    "--update_solr",
    is_flag=True,
    help=(
        "(Update solr entries with new data from DB) OR (Add DB data to Solr that is missing)"
    ),
)
def db_solr_sync_next(dryrun, cleanup_solr, update_solr):
    """DB-Solr sync for catalog-next"""
    if dryrun:
        log.info("Starting dryrun to update index.")

    package_index = index_for(model.Package)

    # get active packages from DB
    active_package = {
        (r[0], r[1].replace(microsecond=0))
        for r in model.Session.query(
            model.Package.id,
            model.Package.metadata_modified
        )
        .filter(
            model.Package.type == "dataset",
            model.Package.state == "active"
        )
        .all()
    }

    log.info(f"total {len(active_package)} DB active_package")

    # get indexed packages from solr
    indexed_package = set(get_all_entity_ids_date())
    log.info(f"total {len(indexed_package)} solr indexed_package")

    solr_package = indexed_package - active_package
    db_package = active_package - indexed_package

    work_list = {}
    for id, *_ in solr_package:
        work_list[id] = "solr"
    for id, *_ in db_package:
        work_list[id] = "db"

    # passing neither flag (or both flags) runs the cleanup and the update together
    both = cleanup_solr == update_solr
    set_cleanup = {i if work_list[i] == "solr" else None for i in work_list} - {None}
    set_update = work_list.keys() - set_cleanup
    log.info(f"{len(set_cleanup)} packages need to be removed from Solr")
    log.info(f"{len(set_update)} packages need to be updated/added to Solr")

    if not dryrun and set_cleanup and (cleanup_solr or both):
        log.info("Deleting indexes")
        delete_packages(set_cleanup)
        package_index.commit()
        log.info("Finished cleaning solr entries.")

    if not dryrun and set_update and (update_solr or both):
        log.info("Rebuilding indexes")
        try:
            rebuild(package_ids=set_update, defer_commit=True)
        except Exception as e:
            log.error("Error while rebuilding index: %s" % repr(e))
        package_index.commit()
        log.info("Finished updating solr entries.")
        log.info("Here are the first few dataset ids that were rebuilt:")
        count = 0
        max = 10
        for id in set_update:
            count = count + 1
            if count > max:
                break
            log.info(f"{count}: {id}")


@geodatagov.command()
def check_stuck_jobs():
"""check stuck harvest jobs"""
2 changes: 1 addition & 1 deletion setup.py
@@ -10,7 +10,7 @@

setup(
name="ckanext-geodatagov",
version="0.2.8",
version="0.2.9",
description="",
long_description=long_description,
long_description_content_type="text/markdown",
2 changes: 1 addition & 1 deletion test.sh
@@ -2,7 +2,7 @@
# Setup and run extension tests. This script should be run in a _clean_ CKAN
# environment. e.g.:
#
# $ docker-compose run --rm app ./test.sh
# $ docker compose run --rm app ./test.sh
#

set -o errexit
