From c026261e3612d0be4d63b4f994616d969d0b24fd Mon Sep 17 00:00:00 2001
From: Vincent Fazio
Date: Mon, 22 Apr 2024 12:44:30 +0000
Subject: [PATCH] Changes:

- moved OUTPUT_DIR to within Extractor class
- added very simple tests for extraction classes
- added keyword tests

---
 src/ISO19115_3_extract.py           |  9 +------
 src/ISO19139_extract.py             | 15 +---------
 src/add_coords.py                   |  2 +-
 src/add_links.py                    |  2 +-
 src/add_model_keyw.py               |  2 +-
 src/bedrock_summary.py              |  2 --
 src/ckan_extract.py                 | 17 ++++---------
 src/config.py                       |  5 ++++
 src/constants.py                    |  3 ---
 src/extractor.py                    | 11 ++++++++
 src/keywords.py                     | 30 ++++++----------------
 src/oai_extract.py                  | 31 ++++++-----------------
 src/pdf_extract.py                  | 31 +++++++++++++++--------
 src/process.py                      |  3 +--
 src/summary.py                      |  2 +-
 tests/{namespaces.py => helpers.py} | 33 ++++++++++++++++++++++++
 tests/test_ISO19115_3.py            |  8 ++++++
 tests/test_ISO19139.py              | 12 +++++++++
 tests/test_ckan.py                  |  7 ++++++
 tests/test_oai.py                   | 22 ++++++++++++++++
 tests/test_pdf.py                   |  6 +++++
 tests/{test.py => unit_test.py}     | 39 +++++++++++------------------
 22 files changed, 166 insertions(+), 126 deletions(-)
 delete mode 100644 src/constants.py
 rename tests/{namespaces.py => helpers.py} (73%)
 create mode 100644 tests/test_ISO19115_3.py
 create mode 100644 tests/test_ISO19139.py
 create mode 100644 tests/test_ckan.py
 create mode 100644 tests/test_oai.py
 create mode 100644 tests/test_pdf.py
 rename tests/{test.py => unit_test.py} (81%)

diff --git a/src/ISO19115_3_extract.py b/src/ISO19115_3_extract.py
index a2b94b3..e7b2147 100755
--- a/src/ISO19115_3_extract.py
+++ b/src/ISO19115_3_extract.py
@@ -6,7 +6,6 @@
 import lxml
 
 from extractor import Extractor
-from constants import OUTPUT_DIR
 from add_model_keyw import add_models_keyword
 
 
@@ -180,16 +179,10 @@ def write_record(self, name, bbox, model_endpath, metadata_url, output_file):
             xml_string = add_models_keyword(str_result, 'utf-8', 'ISO19115-3')
 
             # Write to disk
-            with open(os.path.join(OUTPUT_DIR, output_file), 'w') as ff:
+            with open(os.path.join(self.output_dir, output_file), 'w') as ff:
                 ff.write(xml_string)
             return True
         return False
-
-
-# Used for testing only
-if __name__ == "__main__":
-    url = "https://catalog.sarig.sa.gov.au/geonetwork/srv/api/records/9c6ae754-291d-4100-afd9-478c3a9ddf42/formatters/xml"
-    name = 'ngawler'
-    ce = ISO19115_3Extractor()
-    ce.write_record(name, {'north': '0.0', 'south': '-45', 'east': '-145', 'west':'-100'}, name, url, f"test_19115_3_{name}.xml")
diff --git a/src/ISO19139_extract.py b/src/ISO19139_extract.py
index b4cf12c..1cdc2b1 100755
--- a/src/ISO19139_extract.py
+++ b/src/ISO19139_extract.py
@@ -6,7 +6,6 @@
 from lxml import etree
 
 from extractor import Extractor
-from constants import OUTPUT_DIR
 from add_model_keyw import add_models_keyword
 
 class ISO19139Extractor(Extractor):
@@ -139,20 +138,8 @@ def write_record(self, name, bbox, model_endpath, metadata_url, output_file):
         xml_string = add_models_keyword(str_result, 'utf-8', 'ISO19139')
 
         # Write to disk
-        with open(os.path.join(OUTPUT_DIR, output_file), 'w') as ff:
+        with open(os.path.join(self.output_dir, output_file), 'w') as ff:
             ff.write(xml_string)
         return True
     return False
-
-# This is used for testing only
-if __name__ == "__main__":
-
-    metadata_urls = [
-        ("mcarthur", "http://www.ntlis.nt.gov.au/metadata/export_data?type=xml&metadata_id=1080195AEBC6A054E050CD9B214436A1"),
-        ("windimurra", "https://warsydprdstadasc.blob.core.windows.net/downloads/Metadata_Statements/XML/3D_Windimurra_2015.xml"),
"https://warsydprdstadasc.blob.core.windows.net/downloads/Metadata_Statements/XML/3D_Sandstone_2015.xml") - ] - ce = ISO19139Extractor() - for name, url in metadata_urls: - ce.write_record(name, {'north': '0.0', 'south': '-45', 'east': '-145', 'west':'-100'}, name, url, f"test_19139_{name}.xml") diff --git a/src/add_coords.py b/src/add_coords.py index 2dfd55c..ee7f4a0 100755 --- a/src/add_coords.py +++ b/src/add_coords.py @@ -12,7 +12,7 @@ from copy import copy from lxml.builder import ElementMaker from add_model_keyw import insert -from constants import OUTPUT_DIR +from config import OUTPUT_DIR """ Utility functions used to add bounding box coordinates to ISO 19139 & 19115-3 XML diff --git a/src/add_links.py b/src/add_links.py index 3523d89..50a30ca 100755 --- a/src/add_links.py +++ b/src/add_links.py @@ -13,7 +13,7 @@ from lxml.builder import ElementMaker from add_model_keyw import insert -from constants import OUTPUT_DIR +from config import OUTPUT_DIR def add_model_link(model_endpath, text): diff --git a/src/add_model_keyw.py b/src/add_model_keyw.py index f00ba5d..3920852 100755 --- a/src/add_model_keyw.py +++ b/src/add_model_keyw.py @@ -12,7 +12,7 @@ from copy import copy from lxml.builder import ElementMaker -from constants import OUTPUT_DIR +from config import OUTPUT_DIR """ Adds keywords to ISO 19139 and ISO 19115-3 XML using XPATH insertion """ diff --git a/src/bedrock_summary.py b/src/bedrock_summary.py index 6da1b35..1bef067 100755 --- a/src/bedrock_summary.py +++ b/src/bedrock_summary.py @@ -7,8 +7,6 @@ import boto3 import botocore -from constants import OUTPUT_DIR - """ Use Claude V2.0 model to summarize text via AWS Bedrock and 'boto3' package """ diff --git a/src/ckan_extract.py b/src/ckan_extract.py index a4f9454..fd5ccac 100755 --- a/src/ckan_extract.py +++ b/src/ckan_extract.py @@ -13,7 +13,6 @@ from extractor import Extractor -from constants import OUTPUT_DIR class CkanExtractor(Extractor): """ Connects to CKAN repository @@ -143,11 +142,13 @@ def output_xml(self, ckan_dict, url, model_endpath, output_file): } } - xml_string = render_j2_template(mcf_dict, template_dir='../data/templates/ISO19115-3') + template_dir = os.path.join(os.path.dirname(__file__), '../data/templates/ISO19115-3') + xml_string = render_j2_template(mcf_dict, template_dir=template_dir) # write to disk - with open(os.path.join(OUTPUT_DIR, output_file), 'w') as ff: + with open(os.path.join(self.output_dir, output_file), 'w') as ff: ff.write(xml_string) + return True def write_record(self, name, bbox, model_endpath, ckan_url, package_id, output_file): @@ -172,13 +173,5 @@ def write_record(self, name, bbox, model_endpath, ckan_url, package_id, output_f except json.JSONDecodeError: return False if dict['success'] is True: - self.output_xml(dict['result'], r.url, model_endpath, output_file) - return True + return self.output_xml(dict['result'], r.url, model_endpath, output_file) return False - - -# Used for testing only -if __name__ == "__main__": - SITE__URL = 'https://geoscience.data.qld.gov.au' - ce = CkanExtractor() - ce.write_record('Mt Dore', 'mtdore', SITE__URL, 'ds000002', 'test_ckan.xml') diff --git a/src/config.py b/src/config.py index 94a0cc0..af0c1e8 100755 --- a/src/config.py +++ b/src/config.py @@ -1,3 +1,5 @@ +from pathlib import Path + """ Config for creation of ISO19139 or ISO19115-3 XML metadata records from PDF reports or online metadata services (e.g. 
CKAN, dSpace, geonetwork)
@@ -153,3 +155,6 @@
     ]
     }
 }
+
+# Output dir for generated XML records, created within the source directory
+OUTPUT_DIR = str(Path(__file__).parent / 'output')
diff --git a/src/constants.py b/src/constants.py
deleted file mode 100644
index 701227f..0000000
--- a/src/constants.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# General constants
-
-OUTPUT_DIR = "output"
diff --git a/src/extractor.py b/src/extractor.py
index 391566c..cffd9d5 100644
--- a/src/extractor.py
+++ b/src/extractor.py
@@ -1,8 +1,19 @@
+import os
+from config import OUTPUT_DIR
+
 """ Parent class for reading sources and writing out XML
     This is specialised for different data source types
 """
 class Extractor:
 
+    def __init__(self):
+        # Create the output directory, if it does not already exist
+        try:
+            os.mkdir(OUTPUT_DIR)
+        except FileExistsError:
+            pass
+        self.output_dir = OUTPUT_DIR
+
     def write_record(self, bbox, model_endpath):
         """ NB: The input parameters for this function should match the parameters defined in the configuration file
         """
diff --git a/src/keywords.py b/src/keywords.py
index 764762c..f8c2f7b 100755
--- a/src/keywords.py
+++ b/src/keywords.py
@@ -2,6 +2,7 @@
 
 import glob
 import sys
+import os
 import sqlite3
 from contextlib import closing
 
@@ -68,7 +69,8 @@ def extract_db_terms():
     name_dict = {}
     link_dict = {}
     # Connect to USGS Thesaurus DB (https://apps.usgs.gov/thesaurus/)
-    with closing(sqlite3.connect("../db/thesauri.db")) as con:
+    db_file = os.path.join(os.path.dirname(__file__), '../db/thesauri.db')
+    with closing(sqlite3.connect(db_file)) as con:
         with closing(con.cursor()) as cur:
             for row in cur.execute("SELECT code, name, parent FROM term"):
                 # print(row)
@@ -112,27 +114,9 @@ def get_keywords(text):
     Extracts keywords from text
 
     :param text: text
-    :returns:
+    :returns: set of geoscience keywords
     """
+    # Create a keyword lookup table from the USGS Thesaurus
    kw_dict = extract_db_terms()
-
-    yake_kwset = run_yake(kw_dict, text)
-
-    return yake_kwset
-
-
-# Used for testing only
-if __name__ == "__main__":
-    kw_dict = extract_db_terms()
-    for file in ['G107513_OtwayBasin_3D_notes.pdf',
-                 # 'G161893_VGP_TR35_3D-Geological-framework-Otway_low-res.pdf',
-                 #'G35615_3DVIC1_pt1.pdf'
-                ]:
-        text = parse_pdf(f'../data/reports/vic/{file}', False)
-
-        yake_kwset = run_yake(kw_dict, text)
-        print(f"{file}: usgs+yake: {yake_kwset}")
-
-        #usgs_kwset = run_usgs(kw_dict, text)
-        #print("pure usgs:", usgs_kwset)
-
+    # Run yake and match its keywords against the USGS Thesaurus
+    return run_yake(kw_dict, text)
diff --git a/src/oai_extract.py b/src/oai_extract.py
index 464b2ce..69f7cbc 100755
--- a/src/oai_extract.py
+++ b/src/oai_extract.py
@@ -7,14 +7,13 @@
 from sickle import Sickle
 
 from extractor import Extractor
-from constants import OUTPUT_DIR
 
 
 class OaiExtractor(Extractor):
 
-    def __init__(self, oai_url, output_dir):
+    def __init__(self, oai_url):
+        super().__init__()
         self.OAI_URL = oai_url
-        self.output_dir = output_dir
 
     def output_xml(self, oai_dict, oai_id, bbox, model_endpath, service_name, output_file):
         """
@@ -128,10 +127,11 @@ def output_xml(self, oai_dict, oai_id, bbox, model_endpath, service_name, output
             }
         }
 
-        xml_string = render_j2_template(mcf_dict, template_dir='../data/templates/ISO19115-3')
+        template_dir = os.path.join(os.path.dirname(__file__), '../data/templates/ISO19115-3')
+        xml_string = render_j2_template(mcf_dict, template_dir=template_dir)
 
         # write to disk
-        with open(os.path.join(OUTPUT_DIR, output_file), 'w') as ff:
+        with open(os.path.join(self.output_dir, output_file), 'w') as ff:
             ff.write(xml_string)
         return True
 
@@ -141,8 +141,9 @@ def write_record(self, name, bbox, model_endpath, oai_id, oai_prefix, service_na
         """
         Write an XML record to file using metadata from OAI-PMH service
 
+        :param name: model name
         :param bbox: bounding box dict, keys are 'north' 'south' 'east' 'west', values are decimals as strings, EPSG:4326 is assumed
-        :param model_endpath: path of model in website
+        :param model_endpath: path of model in website, used to create a link to website URL
         :param oai_id: OAI-PMH identifier e.g. 'oai:eprints.rclis.org:4088'
         :param oai_prefix: OAI-PMH prefix e.g. 'oai_dc'
         :param service_name: generic name of OAI-PMH service
@@ -158,21 +159,5 @@ def write_record(self, name, bbox, model_endpath, oai_id, oai_prefix, service_na
 
         #for k, v in oai_dict.items():
         #    print(k, '=>', v);
 
-        self.output_xml(oai_dict, oai_id, bbox, model_endpath, service_name, output_file)
-
-if __name__ == "__main__":
-    # Get records from Northern Territory Geological Service
-    # OAI-PMH URL
-    OAI__URL = 'https://geoscience.nt.gov.au/gemis/ntgsoai/request'
-
-    # GEMIS permanent link of McArthur 3D model
-    MODEL__URL = 'https://geoscience.nt.gov.au/gemis/ntgsjspui/handle/1/81751'
-    oe = OaiExtractor(OAI__URL, 'output')
-    # Convert perm link to OAI-PMH ID
-    handle_id = '/'.join(MODEL__URL.split('/')[-2:])
-    print(handle_id)
-    # NB: Some geological fields that are present in GEMIS website are missing from OAI output with 'oai_dc' prefix,
-    #     i.e. "Stratigraphy" The 'xoai' prefix will allow extraction of these missing fields but the XML output
-    #     would need to be parsed
-    oe.write_record([154.3, 109.1, -43.9, -10.6], 'mcarthur', 'oai:geoscience.nt.gov.au:'+handle_id, 'oai_dc', "NTGS GEMIS", 'test_oai.xml')
+        return self.output_xml(oai_dict, oai_id, bbox, model_endpath, service_name, output_file)
diff --git a/src/pdf_extract.py b/src/pdf_extract.py
index e786046..7e37323 100755
--- a/src/pdf_extract.py
+++ b/src/pdf_extract.py
@@ -12,18 +12,31 @@
 from summary import get_summary
 from add_links import add_model_link
 from add_coords import add_coords
-from constants import OUTPUT_DIR
+from config import OUTPUT_DIR
 
 class PDFExtractor(Extractor):
     """ Creates an ISO 19115 XML file by reading a PDF file
     """
 
     def write_record(self, name, model_endpath, pdf_file, pdf_url, organisation, title, bbox, cutoff, output_file):
+        """
+        Write XML record
+
+        :param name: model name used in download links in record
+        :param model_endpath: path of model in website, used to create a link to website URL
+        :param pdf_file: path to PDF file
+        :param pdf_url: URL for PDF file
+        :param organisation: name of organisation
+        :param title: title of the metadata record
+        :param bbox: bounding box coords, dict, keys are 'north', 'south' etc.
+        :param cutoff: skip pages containing less text than this threshold (typically set between 1000 and 3000), used to filter out pages with no useful text
+        :param output_file: output filename e.g. 'blah.xml'
+        :returns: True if the record was written successfully
+        """
         print(f"Converting: {model_endpath}")
-        print("bbox=", repr(bbox))
         if not os.path.exists(pdf_file):
             print(f"{pdf_file} does not exist")
-            sys.exit(1)
+            return False
         # Extract keywords from PDF text
         pdf_text = parse_pdf(pdf_file, False)
         kwset = get_keywords(pdf_text)
@@ -128,19 +141,15 @@ def write_record(self, name, model_endpath, pdf_file, pdf_url, organisation, tit
             "level": "dataset"
         },
         "lineage": {
-            "statement": f"This metadata record was reproduced from PDF report retrieved from {pdf_url} on {datetime.datetime.now():%d %b %Y}"
+            "statement": f"This metadata record was reproduced from the PDF report retrieved from {pdf_url} on {datetime.datetime.now():%d %b %Y}. The abstract was generated by Anthropic Claude V2.0 (https://www.anthropic.com/). Keywords were extracted by yake (https://pypi.org/project/yake) and matched against the USGS Thesaurus (https://apps.usgs.gov/thesaurus/)"
         }
     }
 
-        xml_string = render_j2_template(mcf_dict, template_dir='../data/templates/ISO19115-3')
+        template_dir = os.path.join(os.path.dirname(__file__), '../data/templates/ISO19115-3')
+        xml_string = render_j2_template(mcf_dict, template_dir=template_dir)
 
         # write to disk
         with open(os.path.join(OUTPUT_DIR, output_file), 'w') as ff:
             ff.write(xml_string)
-
-
-
-if __name__ == "__main__":
-    pe = PDFExtractor()
-    pe.write_record("test-pdf", "https://blah/blah.pdf")
+        return True
diff --git a/src/process.py b/src/process.py
index f526de2..a22c6e3 100755
--- a/src/process.py
+++ b/src/process.py
@@ -12,8 +12,7 @@
 from ISO19115_3_extract import ISO19115_3Extractor
 from pdf_extract import PDFExtractor
 
-from constants import OUTPUT_DIR
-from config import CONFIG
+from config import CONFIG, OUTPUT_DIR
 
 """ Create ISO19139 or ISO19115-3 XML metadata records from PDF reports or online metadata services
diff --git a/src/summary.py b/src/summary.py
index 679e7e2..553ed0e 100755
--- a/src/summary.py
+++ b/src/summary.py
@@ -7,7 +7,7 @@
 from bedrock_summary import run_claude
 from pdf_helper import parse_pdf
 
-from constants import OUTPUT_DIR
+from config import OUTPUT_DIR
 
 # os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
diff --git a/tests/namespaces.py b/tests/helpers.py
similarity index 73%
rename from tests/namespaces.py
rename to tests/helpers.py
index 1bdc2f6..3297452 100644
--- a/tests/namespaces.py
+++ b/tests/helpers.py
@@ -1,3 +1,4 @@
+import requests
 ns_19115_3 = {
     'mdb':"http://standards.iso.org/iso/19115/-3/mdb/2.0",
     'cat': "http://standards.iso.org/iso/19115/-3/cat/1.0",
@@ -38,3 +39,35 @@
     'xlink':"http://www.w3.org/1999/xlink"
 }
 
+def make_xpath(ns_dict, path_elems):
+    """
+    Makes an xpath with namespaces given a list of tags
+
+    :param ns_dict: namespace dictionary e.g. { 'cit': 'http://cit.org/1.0', 'dif': 'http://dif.org/2.0' }
+    :param path_elems: list of tags e.g. ['cit:citation', 'dif:differential']
+    :returns: xpath string
+    """
+    path = './/'
+    for ele in path_elems:
+        ns, dot, tag = ele.partition(':')
+        path += f"{{{ns_dict[ns]}}}{tag}/"
+    return path.rstrip('/')
+
+def get_metadata(metadata_url):
+    """
+    Fetches metadata from a URL
+
+    :param metadata_url: metadata URL
+    :returns: encoding, response object containing the metadata
+    """
+    meta = requests.get(metadata_url)
+    if meta.encoding is not None:
+        encoding = meta.encoding
+    else:
+        encoding = 'utf-8'
+
+    # Read XML from URL
+    return encoding, meta
+
+
+
diff --git a/tests/test_ISO19115_3.py b/tests/test_ISO19115_3.py
new file mode 100644
index 0000000..f0c736a
--- /dev/null
+++ b/tests/test_ISO19115_3.py
@@ -0,0 +1,8 @@
+from ISO19115_3_extract import ISO19115_3Extractor
+
+def test_ISO19115_3():
+    url = "https://catalog.sarig.sa.gov.au/geonetwork/srv/api/records/9c6ae754-291d-4100-afd9-478c3a9ddf42/formatters/xml"
+    name = 'ngawler'
+    ce = ISO19115_3Extractor()
+    assert ce.write_record(name, {'north': '0.0', 'south': '-45', 'east': '-145', 'west':'-100'}, name, url, f"test_19115_3_{name}.xml")
+
diff --git a/tests/test_ISO19139.py b/tests/test_ISO19139.py
new file mode 100644
index 0000000..e767699
--- /dev/null
+++ b/tests/test_ISO19139.py
@@ -0,0 +1,12 @@
+from ISO19139_extract import ISO19139Extractor
+
+def test_ISO19139():
+    metadata_urls = [
+        ("mcarthur", "http://www.ntlis.nt.gov.au/metadata/export_data?type=xml&metadata_id=1080195AEBC6A054E050CD9B214436A1"),
+        ("windimurra", "https://warsydprdstadasc.blob.core.windows.net/downloads/Metadata_Statements/XML/3D_Windimurra_2015.xml"),
+        ("sandstone", "https://warsydprdstadasc.blob.core.windows.net/downloads/Metadata_Statements/XML/3D_Sandstone_2015.xml")
+    ]
+    ce = ISO19139Extractor()
+    for name, url in metadata_urls:
+        assert ce.write_record(name, {'north': '0.0', 'south': '-45', 'east': '-145', 'west':'-100'}, name, url, f"test_19139_{name}.xml")
+
diff --git a/tests/test_ckan.py b/tests/test_ckan.py
new file mode 100644
index 0000000..ca2c905
--- /dev/null
+++ b/tests/test_ckan.py
@@ -0,0 +1,7 @@
+from ckan_extract import CkanExtractor
+
+def test_ckan():
+    SITE__URL = 'https://geoscience.data.qld.gov.au'
+    ce = CkanExtractor()
+    assert ce.write_record('Mt Dore', { 'north': -10.2, 'south': -45.0, 'east': 145.0, 'west': 90.0}, 'mtdore', SITE__URL, 'ds000002', 'test_ckan.xml')
+
diff --git a/tests/test_oai.py b/tests/test_oai.py
new file mode 100644
index 0000000..f0da9ca
--- /dev/null
+++ b/tests/test_oai.py
@@ -0,0 +1,22 @@
+import os
+from oai_extract import OaiExtractor
+
+def test_oai():
+    # Get records from Northern Territory Geological Service
+    # OAI-PMH URL
+    OAI__URL = 'https://geoscience.nt.gov.au/gemis/ntgsoai/request'
+    XML_FILE = 'test_oai.xml'
+
+    # GEMIS permanent link of McArthur 3D model
+    MODEL__URL = 'https://geoscience.nt.gov.au/gemis/ntgsjspui/handle/1/81751'
+    oe = OaiExtractor(OAI__URL)
+    # Convert perm link to OAI-PMH ID
+    handle_id = '/'.join(MODEL__URL.split('/')[-2:])
+    # NB: Some geological fields that are present in the GEMIS website are missing from OAI output with 'oai_dc' prefix,
"Stratigraphy" The 'xoai' prefix will allow extraction of these missing fields but the XML output + # would need to be parsed + assert oe.write_record('test', {'east': 154.3, 'west': 109.1, 'south': -43.9, 'north': -10.6}, 'mcarthur', 'oai:geoscience.nt.gov.au:'+handle_id, 'oai_dc', "NTGS GEMIS", XML_FILE) + + with open(os.path.join(oe.output_dir, XML_FILE), 'r') as fd: + lines = fd.readlines() + assert '154.3' in ''.join(lines) diff --git a/tests/test_pdf.py b/tests/test_pdf.py new file mode 100644 index 0000000..3f9af42 --- /dev/null +++ b/tests/test_pdf.py @@ -0,0 +1,6 @@ +from pdf_extract import PDFExtractor + +def test_pdf(): + pe = PDFExtractor() + # Test missing PDF file + assert not pe.write_record("Blah Blah", "blah", "blah.pdf", "https://blah.org/blah.pdf", "test org", "test title", {'north': -15.0, 'south': -40.4, 'east': 120.5, 'west': 100.3}, 3000, "pdf_test.xml") diff --git a/tests/test.py b/tests/unit_test.py similarity index 81% rename from tests/test.py rename to tests/unit_test.py index 3be6942..a309c3e 100644 --- a/tests/test.py +++ b/tests/unit_test.py @@ -1,37 +1,19 @@ import requests import xml.etree.ElementTree as etree +from pathlib import Path +import os from add_coords import add_coords from add_links import add_model_link from add_model_keyw import add_models_keyword -from namespaces import ns_19115_3, ns_19139 +from helpers import ns_19115_3, ns_19139, get_metadata, make_xpath +from keywords import extract_db_terms, run_yake +from pdf_helper import parse_pdf ISO19139_URL = "http://52.65.91.200/geonetwork/srv/api/records/97ed8560c193e0c1855445cec4e812d4c59654ff/formatters/xml" ISO19115_3_URL = "https://catalog.sarig.sa.gov.au/geonetwork/srv/api/records/9c6ae754-291d-4100-afd9-478c3a9ddf42/formatters/xml" -def make_xpath(ns_dict, path_elems): - """ - Makes an xpath with namespaces given a list of tags - - :param ns_dict: namespace dictionary e.g. { 'cit': 'http://cit.org/1.0', 'dif': 'http://dif.org/2.0' } - :param path_elems: list of tags e.g. ['cit:citation', 'dif:differential'] - :returns: xpath string - """ - path = './/' - for ele in path_elems: - ns, dot, tag = ele.partition(':') - path += f"{{{ns_dict[ns]}}}{tag}/" - return path.rstrip('/') - -def get_metadata(metadata_url): - meta = requests.get(metadata_url) - if meta.encoding is not None: - encoding = meta.encoding - else: - encoding = 'utf-8' - - # Read XML from URL - return encoding, meta +DATA_DIR = str(Path(__file__).parent.parent / 'data') def test_add_coords(): """ @@ -106,3 +88,12 @@ def test_add_keyw(): xp = make_xpath(ns_19115_3, keywpath_list) xp += f"[.='AuScope 3D Geological Models']" assert root.findall(xp, namespaces=ns_19115_3) != [] + +def test_keywords(): + kw_dict = extract_db_terms() + print(f"{DATA_DIR=}") + text = parse_pdf(os.path.join(DATA_DIR, 'reports/vic/G107513_OtwayBasin_3D_notes.pdf'), False) + keywords = run_yake(kw_dict, text) + assert 'field inventory and monitoring' in keywords + assert 'topography' in keywords + assert 'Precambrian' in keywords