From c026261e3612d0be4d63b4f994616d969d0b24fd Mon Sep 17 00:00:00 2001
From: Vincent Fazio
Date: Mon, 22 Apr 2024 12:44:30 +0000
Subject: [PATCH] Changes:

- moved OUTPUT_DIR to within Extractor class
- added very simple tests for extraction classes
- added keyword tests

---
 src/ISO19115_3_extract.py           |  9 +------
 src/ISO19139_extract.py             | 15 +---------
 src/add_coords.py                   |  2 +-
 src/add_links.py                    |  2 +-
 src/add_model_keyw.py               |  2 +-
 src/bedrock_summary.py              |  2 --
 src/ckan_extract.py                 | 17 ++++---------
 src/config.py                       |  5 ++++
 src/constants.py                    |  3 ---
 src/extractor.py                    | 11 ++++++++
 src/keywords.py                     | 30 ++++++----------------
 src/oai_extract.py                  | 31 ++++++-----------------
 src/pdf_extract.py                  | 31 +++++++++++++++--------
 src/process.py                      |  3 +--
 src/summary.py                      |  2 +-
 tests/{namespaces.py => helpers.py} | 33 ++++++++++++++++++++++++
 tests/test_ISO19115_3.py            |  8 ++++++
 tests/test_ISO19139.py              | 12 +++++++++
 tests/test_ckan.py                  |  7 ++++++
 tests/test_oai.py                   | 22 ++++++++++++++++
 tests/test_pdf.py                   |  6 +++++
 tests/{test.py => unit_test.py}     | 39 +++++++++++------------------
 22 files changed, 166 insertions(+), 126 deletions(-)
 delete mode 100644 src/constants.py
 rename tests/{namespaces.py => helpers.py} (73%)
 create mode 100644 tests/test_ISO19115_3.py
 create mode 100644 tests/test_ISO19139.py
 create mode 100644 tests/test_ckan.py
 create mode 100644 tests/test_oai.py
 create mode 100644 tests/test_pdf.py
 rename tests/{test.py => unit_test.py} (81%)

diff --git a/src/ISO19115_3_extract.py b/src/ISO19115_3_extract.py
index a2b94b3..e7b2147 100755
--- a/src/ISO19115_3_extract.py
+++ b/src/ISO19115_3_extract.py
@@ -6,7 +6,6 @@
 import lxml
 
 from extractor import Extractor
-from constants import OUTPUT_DIR
 from add_model_keyw import add_models_keyword
 
 
@@ -180,16 +179,10 @@ def write_record(self, name, bbox, model_endpath, metadata_url, output_file):
             xml_string = add_models_keyword(str_result, 'utf-8', 'ISO19115-3')
 
             # Write to disk
-            with open(os.path.join(OUTPUT_DIR, output_file), 'w') as ff:
+            with open(os.path.join(self.output_dir, output_file), 'w') as ff:
                 ff.write(xml_string)
             return True
         return False
-
-
-# Used for testing only
-if __name__ == "__main__":
-    url = "https://catalog.sarig.sa.gov.au/geonetwork/srv/api/records/9c6ae754-291d-4100-afd9-478c3a9ddf42/formatters/xml"
-    name = 'ngawler'
-    ce = ISO19115_3Extractor()
-    ce.write_record(name, {'north': '0.0', 'south': '-45', 'east': '-145', 'west':'-100'}, name, url, f"test_19115_3_{name}.xml")
diff --git a/src/ISO19139_extract.py b/src/ISO19139_extract.py
index b4cf12c..1cdc2b1 100755
--- a/src/ISO19139_extract.py
+++ b/src/ISO19139_extract.py
@@ -6,7 +6,6 @@
 from lxml import etree
 
 from extractor import Extractor
-from constants import OUTPUT_DIR
 from add_model_keyw import add_models_keyword
 
 class ISO19139Extractor(Extractor):
@@ -139,20 +138,8 @@ def write_record(self, name, bbox, model_endpath, metadata_url, output_file):
         xml_string = add_models_keyword(str_result, 'utf-8', 'ISO19139')
 
         # Write to disk
-        with open(os.path.join(OUTPUT_DIR, output_file), 'w') as ff:
+        with open(os.path.join(self.output_dir, output_file), 'w') as ff:
             ff.write(xml_string)
         return True
     return False
-
-# This is used for testing only
-if __name__ == "__main__":
-
-    metadata_urls = [
-        ("mcarthur", "http://www.ntlis.nt.gov.au/metadata/export_data?type=xml&metadata_id=1080195AEBC6A054E050CD9B214436A1"),
-        ("windimurra", "https://warsydprdstadasc.blob.core.windows.net/downloads/Metadata_Statements/XML/3D_Windimurra_2015.xml"),
"https://warsydprdstadasc.blob.core.windows.net/downloads/Metadata_Statements/XML/3D_Sandstone_2015.xml") - ] - ce = ISO19139Extractor() - for name, url in metadata_urls: - ce.write_record(name, {'north': '0.0', 'south': '-45', 'east': '-145', 'west':'-100'}, name, url, f"test_19139_{name}.xml") diff --git a/src/add_coords.py b/src/add_coords.py index 2dfd55c..ee7f4a0 100755 --- a/src/add_coords.py +++ b/src/add_coords.py @@ -12,7 +12,7 @@ from copy import copy from lxml.builder import ElementMaker from add_model_keyw import insert -from constants import OUTPUT_DIR +from config import OUTPUT_DIR """ Utility functions used to add bounding box coordinates to ISO 19139 & 19115-3 XML diff --git a/src/add_links.py b/src/add_links.py index 3523d89..50a30ca 100755 --- a/src/add_links.py +++ b/src/add_links.py @@ -13,7 +13,7 @@ from lxml.builder import ElementMaker from add_model_keyw import insert -from constants import OUTPUT_DIR +from config import OUTPUT_DIR def add_model_link(model_endpath, text): diff --git a/src/add_model_keyw.py b/src/add_model_keyw.py index f00ba5d..3920852 100755 --- a/src/add_model_keyw.py +++ b/src/add_model_keyw.py @@ -12,7 +12,7 @@ from copy import copy from lxml.builder import ElementMaker -from constants import OUTPUT_DIR +from config import OUTPUT_DIR """ Adds keywords to ISO 19139 and ISO 19115-3 XML using XPATH insertion """ diff --git a/src/bedrock_summary.py b/src/bedrock_summary.py index 6da1b35..1bef067 100755 --- a/src/bedrock_summary.py +++ b/src/bedrock_summary.py @@ -7,8 +7,6 @@ import boto3 import botocore -from constants import OUTPUT_DIR - """ Use Claude V2.0 model to summarize text via AWS Bedrock and 'boto3' package """ diff --git a/src/ckan_extract.py b/src/ckan_extract.py index a4f9454..fd5ccac 100755 --- a/src/ckan_extract.py +++ b/src/ckan_extract.py @@ -13,7 +13,6 @@ from extractor import Extractor -from constants import OUTPUT_DIR class CkanExtractor(Extractor): """ Connects to CKAN repository @@ -143,11 +142,13 @@ def output_xml(self, ckan_dict, url, model_endpath, output_file): } } - xml_string = render_j2_template(mcf_dict, template_dir='../data/templates/ISO19115-3') + template_dir = os.path.join(os.path.dirname(__file__), '../data/templates/ISO19115-3') + xml_string = render_j2_template(mcf_dict, template_dir=template_dir) # write to disk - with open(os.path.join(OUTPUT_DIR, output_file), 'w') as ff: + with open(os.path.join(self.output_dir, output_file), 'w') as ff: ff.write(xml_string) + return True def write_record(self, name, bbox, model_endpath, ckan_url, package_id, output_file): @@ -172,13 +173,5 @@ def write_record(self, name, bbox, model_endpath, ckan_url, package_id, output_f except json.JSONDecodeError: return False if dict['success'] is True: - self.output_xml(dict['result'], r.url, model_endpath, output_file) - return True + return self.output_xml(dict['result'], r.url, model_endpath, output_file) return False - - -# Used for testing only -if __name__ == "__main__": - SITE__URL = 'https://geoscience.data.qld.gov.au' - ce = CkanExtractor() - ce.write_record('Mt Dore', 'mtdore', SITE__URL, 'ds000002', 'test_ckan.xml') diff --git a/src/config.py b/src/config.py index 94a0cc0..af0c1e8 100755 --- a/src/config.py +++ b/src/config.py @@ -1,3 +1,5 @@ +from pathlib import Path + """ Config for creation of ISO19139 or ISO19115-3 XML metadata records from PDF reports or online metadata services (e.g. 
CKAN, dSpace, geonetwork)
@@ -153,3 +155,6 @@
     ]
     }
 }
+
+# Output dir for generated XML records, created within the source directory
+OUTPUT_DIR = str(Path(__file__).parent / 'output')
diff --git a/src/constants.py b/src/constants.py
deleted file mode 100644
index 701227f..0000000
--- a/src/constants.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# General constants
-
-OUTPUT_DIR = "output"
diff --git a/src/extractor.py b/src/extractor.py
index 391566c..cffd9d5 100644
--- a/src/extractor.py
+++ b/src/extractor.py
@@ -1,8 +1,19 @@
+import os
+from config import OUTPUT_DIR
+
 """ Parent class for reading sources and writing out XML
     This is specialised for different data source types
 """
 class Extractor:
 
+    def __init__(self):
+        # Create the output directory, if it does not already exist
+        try:
+            os.mkdir(OUTPUT_DIR)
+        except FileExistsError:
+            pass
+        self.output_dir = OUTPUT_DIR
+
     def write_record(self, bbox, model_endpath):
         """ NB: The input parameters for this function should match the parameters defined in the configuration file
         """
diff --git a/src/keywords.py b/src/keywords.py
index 764762c..f8c2f7b 100755
--- a/src/keywords.py
+++ b/src/keywords.py
@@ -2,6 +2,7 @@
 
 import glob
 import sys
+import os
 import sqlite3
 from contextlib import closing
 
@@ -68,7 +69,8 @@ def extract_db_terms():
     name_dict = {}
     link_dict = {}
     # Connect to USGS Thesaurus DB (https://apps.usgs.gov/thesaurus/)
-    with closing(sqlite3.connect("../db/thesauri.db")) as con:
+    db_file = os.path.join(os.path.dirname(__file__), '../db/thesauri.db')
+    with closing(sqlite3.connect(db_file)) as con:
         with closing(con.cursor()) as cur:
             for row in cur.execute("SELECT code, name, parent FROM term"):
                 # print(row)
@@ -112,27 +114,9 @@ def get_keywords(text):
     Extracts keywords from text
 
     :param text: text
-    :returns:
+    :returns: set of geoscience keywords
     """
+    # Create a keyword lookup table from the USGS Thesaurus
    kw_dict = extract_db_terms()
-
-    yake_kwset = run_yake(kw_dict, text)
-
-    return yake_kwset
-
-
-# Used for testing only
-if __name__ == "__main__":
-    kw_dict = extract_db_terms()
-    for file in ['G107513_OtwayBasin_3D_notes.pdf',
-                 # 'G161893_VGP_TR35_3D-Geological-framework-Otway_low-res.pdf',
-                 #'G35615_3DVIC1_pt1.pdf'
-                ]:
-        text = parse_pdf(f'../data/reports/vic/{file}', False)
-
-        yake_kwset = run_yake(kw_dict, text)
-        print(f"{file}: usgs+yake: {yake_kwset}")
-
-        #usgs_kwset = run_usgs(kw_dict, text)
-        #print("pure usgs:", usgs_kwset)
-
+    # Run yake and match its keywords against the USGS Thesaurus
+    return run_yake(kw_dict, text)
diff --git a/src/oai_extract.py b/src/oai_extract.py
index 464b2ce..69f7cbc 100755
--- a/src/oai_extract.py
+++ b/src/oai_extract.py
@@ -7,14 +7,13 @@
 from sickle import Sickle
 
 from extractor import Extractor
-from constants import OUTPUT_DIR
 
 
 class OaiExtractor(Extractor):
 
-    def __init__(self, oai_url, output_dir):
+    def __init__(self, oai_url):
+        super().__init__()
         self.OAI_URL = oai_url
-        self.output_dir = output_dir
 
     def output_xml(self, oai_dict, oai_id, bbox, model_endpath, service_name, output_file):
         """
@@ -128,10 +127,11 @@ def output_xml(self, oai_dict, oai_id, bbox, model_endpath, service_name, output
             }
         }
 
-        xml_string = render_j2_template(mcf_dict, template_dir='../data/templates/ISO19115-3')
+        template_dir = os.path.join(os.path.dirname(__file__), '../data/templates/ISO19115-3')
+        xml_string = render_j2_template(mcf_dict, template_dir=template_dir)
 
         # write to disk
-        with open(os.path.join(OUTPUT_DIR, output_file), 'w') as ff:
+        with open(os.path.join(self.output_dir, output_file), 'w') as ff:
             ff.write(xml_string)
         return True
 
@@ -141,8 +141,9 @@ def write_record(self, name, bbox, model_endpath, oai_id, oai_prefix, service_na
         """
         Write an XML record to file using metadata from OAI-PMH service
 
+        :param name: model name
         :param bbox: bounding box dict, keys are 'north' 'south' 'east' 'west', values are decimals as strings, EPSG:4326 is assumed
-        :param model_endpath: path of model in website
+        :param model_endpath: path of model in website, used to create a link to website URL
         :param oai_id: OAI-PMH identifier e.g. 'oai:eprints.rclis.org:4088'
         :param oai_prefix: OAI-PMH prefix e.g. 'oai_dc'
         :param service_name: generic name of OAI-PMH service
@@ -158,21 +159,5 @@ def write_record(self, name, bbox, model_endpath, oai_id, oai_prefix, service_na
 
         #for k, v in oai_dict.items():
         #    print(k, '=>', v);
 
-        self.output_xml(oai_dict, oai_id, bbox, model_endpath, service_name, output_file)
-
-if __name__ == "__main__":
-    # Get records from Northern Territory Geological Service
-    # OAI-PMH URL
-    OAI__URL = 'https://geoscience.nt.gov.au/gemis/ntgsoai/request'
-
-    # GEMIS permanent link of McArthur 3D model
-    MODEL__URL = 'https://geoscience.nt.gov.au/gemis/ntgsjspui/handle/1/81751'
-    oe = OaiExtractor(OAI__URL, 'output')
-    # Convert perm link to OAI-PMH ID
-    handle_id = '/'.join(MODEL__URL.split('/')[-2:])
-    print(handle_id)
-    # NB: Some geological fields that are present in GEMIS website are missing from OAI output with 'oai_dc' prefix,
-    #     i.e. "Stratigraphy" The 'xoai' prefix will allow extraction of these missing fields but the XML output
-    #     would need to be parsed
-    oe.write_record([154.3, 109.1, -43.9, -10.6], 'mcarthur', 'oai:geoscience.nt.gov.au:'+handle_id, 'oai_dc', "NTGS GEMIS", 'test_oai.xml')
+        return self.output_xml(oai_dict, oai_id, bbox, model_endpath, service_name, output_file)
diff --git a/src/pdf_extract.py b/src/pdf_extract.py
index e786046..7e37323 100755
--- a/src/pdf_extract.py
+++ b/src/pdf_extract.py
@@ -12,18 +12,31 @@
 from summary import get_summary
 from add_links import add_model_link
 from add_coords import add_coords
-from constants import OUTPUT_DIR
+from config import OUTPUT_DIR
 
 class PDFExtractor(Extractor):
     """ Creates an ISO 19115 XML file by reading a PDF file
     """
 
     def write_record(self, name, model_endpath, pdf_file, pdf_url, organisation, title, bbox, cutoff, output_file):
+        """
+        Write XML record
+
+        :param name: model name used in download links in record
+        :param model_endpath: path of model in website, used to create a link to website URL
+        :param pdf_file: path to PDF file
+        :param pdf_url: URL for PDF file
+        :param organisation: name of organisation
+        :param title: title of the metadata record
+        :param bbox: bounding box coords, dict, keys are 'north', 'south' etc.
+        :param cutoff: skip pages containing less text than this threshold (typically set between 1000 and 3000), used to filter out pages with no useful text
+        :param output_file: output filename e.g. 'blah.xml'
+        :returns: True if the record was written successfully
+        """
         print(f"Converting: {model_endpath}")
-        print("bbox=", repr(bbox))
         if not os.path.exists(pdf_file):
             print(f"{pdf_file} does not exist")
-            sys.exit(1)
+            return False
         # Extract keywords from PDF text
         pdf_text = parse_pdf(pdf_file, False)
         kwset = get_keywords(pdf_text)
@@ -128,19 +141,15 @@ def write_record(self, name, model_endpath, pdf_file, pdf_url, organisation, tit
             "level": "dataset"
         },
         "lineage": {
-            "statement": f"This metadata record was reproduced from PDF report retrieved from {pdf_url} on {datetime.datetime.now():%d %b %Y}"
+            "statement": f"This metadata record was reproduced from the PDF report retrieved from {pdf_url} on {datetime.datetime.now():%d %b %Y}. The abstract was generated by Anthropic Claude V2.0 (https://www.anthropic.com/). Keywords were extracted by yake (https://pypi.org/project/yake) and matched against the USGS Thesaurus (https://apps.usgs.gov/thesaurus/)"
         }
     }
 
-        xml_string = render_j2_template(mcf_dict, template_dir='../data/templates/ISO19115-3')
+        template_dir = os.path.join(os.path.dirname(__file__), '../data/templates/ISO19115-3')
+        xml_string = render_j2_template(mcf_dict, template_dir=template_dir)
 
         # write to disk
         with open(os.path.join(OUTPUT_DIR, output_file), 'w') as ff:
             ff.write(xml_string)
-
-
-
-if __name__ == "__main__":
-    pe = PDFExtractor()
-    pe.write_record("test-pdf", "https://blah/blah.pdf")
+        return True
diff --git a/src/process.py b/src/process.py
index f526de2..a22c6e3 100755
--- a/src/process.py
+++ b/src/process.py
@@ -12,8 +12,7 @@
 from ISO19115_3_extract import ISO19115_3Extractor
 from pdf_extract import PDFExtractor
 
-from constants import OUTPUT_DIR
-from config import CONFIG
+from config import CONFIG, OUTPUT_DIR
 
 """ Create ISO19139 or ISO19115-3 XML metadata records from PDF reports or online metadata services
diff --git a/src/summary.py b/src/summary.py
index 679e7e2..553ed0e 100755
--- a/src/summary.py
+++ b/src/summary.py
@@ -7,7 +7,7 @@
 from bedrock_summary import run_claude
 from pdf_helper import parse_pdf
 
-from constants import OUTPUT_DIR
+from config import OUTPUT_DIR
 
 # os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
diff --git a/tests/namespaces.py b/tests/helpers.py
similarity index 73%
rename from tests/namespaces.py
rename to tests/helpers.py
index 1bdc2f6..3297452 100644
--- a/tests/namespaces.py
+++ b/tests/helpers.py
@@ -1,3 +1,4 @@
+import requests
 ns_19115_3 = {
     'mdb':"http://standards.iso.org/iso/19115/-3/mdb/2.0",
     'cat': "http://standards.iso.org/iso/19115/-3/cat/1.0",
@@ -38,3 +39,35 @@
     'xlink':"http://www.w3.org/1999/xlink"
 }
 
+def make_xpath(ns_dict, path_elems):
+    """
+    Makes an xpath with namespaces given a list of tags
+
+    :param ns_dict: namespace dictionary e.g. { 'cit': 'http://cit.org/1.0', 'dif': 'http://dif.org/2.0' }
+    :param path_elems: list of tags e.g. ['cit:citation', 'dif:differential']
+    :returns: xpath string
+    """
+    path = './/'
+    for ele in path_elems:
+        ns, dot, tag = ele.partition(':')
+        path += f"{{{ns_dict[ns]}}}{tag}/"
+    return path.rstrip('/')
+
+def get_metadata(metadata_url):
+    """
+    Fetches metadata from a URL
+
+    :param metadata_url: metadata URL
+    :returns: encoding, response object containing the metadata
+    """
+    meta = requests.get(metadata_url)
+    if meta.encoding is not None:
+        encoding = meta.encoding
+    else:
+        encoding = 'utf-8'
+
+    # Read XML from URL
+    return encoding, meta
+
+
+
diff --git a/tests/test_ISO19115_3.py b/tests/test_ISO19115_3.py
new file mode 100644
index 0000000..f0c736a
--- /dev/null
+++ b/tests/test_ISO19115_3.py
@@ -0,0 +1,8 @@
+from ISO19115_3_extract import ISO19115_3Extractor
+
+def test_ISO19115_3():
+    url = "https://catalog.sarig.sa.gov.au/geonetwork/srv/api/records/9c6ae754-291d-4100-afd9-478c3a9ddf42/formatters/xml"
+    name = 'ngawler'
+    ce = ISO19115_3Extractor()
+    assert ce.write_record(name, {'north': '0.0', 'south': '-45', 'east': '-145', 'west':'-100'}, name, url, f"test_19115_3_{name}.xml")
+
diff --git a/tests/test_ISO19139.py b/tests/test_ISO19139.py
new file mode 100644
index 0000000..e767699
--- /dev/null
+++ b/tests/test_ISO19139.py
@@ -0,0 +1,12 @@
+from ISO19139_extract import ISO19139Extractor
+
+def test_ISO19139():
+    metadata_urls = [
+        ("mcarthur", "http://www.ntlis.nt.gov.au/metadata/export_data?type=xml&metadata_id=1080195AEBC6A054E050CD9B214436A1"),
+        ("windimurra", "https://warsydprdstadasc.blob.core.windows.net/downloads/Metadata_Statements/XML/3D_Windimurra_2015.xml"),
+        ("sandstone", "https://warsydprdstadasc.blob.core.windows.net/downloads/Metadata_Statements/XML/3D_Sandstone_2015.xml")
+    ]
+    ce = ISO19139Extractor()
+    for name, url in metadata_urls:
+        assert ce.write_record(name, {'north': '0.0', 'south': '-45', 'east': '-145', 'west':'-100'}, name, url, f"test_19139_{name}.xml")
+
diff --git a/tests/test_ckan.py b/tests/test_ckan.py
new file mode 100644
index 0000000..ca2c905
--- /dev/null
+++ b/tests/test_ckan.py
@@ -0,0 +1,7 @@
+from ckan_extract import CkanExtractor
+
+def test_ckan():
+    SITE__URL = 'https://geoscience.data.qld.gov.au'
+    ce = CkanExtractor()
+    assert ce.write_record('Mt Dore', { 'north': -10.2, 'south': -45.0, 'east': 145.0, 'west': 90.0}, 'mtdore', SITE__URL, 'ds000002', 'test_ckan.xml')
+
diff --git a/tests/test_oai.py b/tests/test_oai.py
new file mode 100644
index 0000000..f0da9ca
--- /dev/null
+++ b/tests/test_oai.py
@@ -0,0 +1,22 @@
+import os
+from oai_extract import OaiExtractor
+
+def test_oai():
+    # Get records from Northern Territory Geological Service
+    # OAI-PMH URL
+    OAI__URL = 'https://geoscience.nt.gov.au/gemis/ntgsoai/request'
+    XML_FILE = 'test_oai.xml'
+
+    # GEMIS permanent link of McArthur 3D model
+    MODEL__URL = 'https://geoscience.nt.gov.au/gemis/ntgsjspui/handle/1/81751'
+    oe = OaiExtractor(OAI__URL)
+    # Convert perm link to OAI-PMH ID
+    handle_id = '/'.join(MODEL__URL.split('/')[-2:])
+    # NB: Some geological fields that are present in the GEMIS website are missing from OAI output with 'oai_dc' prefix,
"Stratigraphy" The 'xoai' prefix will allow extraction of these missing fields but the XML output + # would need to be parsed + assert oe.write_record('test', {'east': 154.3, 'west': 109.1, 'south': -43.9, 'north': -10.6}, 'mcarthur', 'oai:geoscience.nt.gov.au:'+handle_id, 'oai_dc', "NTGS GEMIS", XML_FILE) + + with open(os.path.join(oe.output_dir, XML_FILE), 'r') as fd: + lines = fd.readlines() + assert '154.3' in ''.join(lines) diff --git a/tests/test_pdf.py b/tests/test_pdf.py new file mode 100644 index 0000000..3f9af42 --- /dev/null +++ b/tests/test_pdf.py @@ -0,0 +1,6 @@ +from pdf_extract import PDFExtractor + +def test_pdf(): + pe = PDFExtractor() + # Test missing PDF file + assert not pe.write_record("Blah Blah", "blah", "blah.pdf", "https://blah.org/blah.pdf", "test org", "test title", {'north': -15.0, 'south': -40.4, 'east': 120.5, 'west': 100.3}, 3000, "pdf_test.xml") diff --git a/tests/test.py b/tests/unit_test.py similarity index 81% rename from tests/test.py rename to tests/unit_test.py index 3be6942..a309c3e 100644 --- a/tests/test.py +++ b/tests/unit_test.py @@ -1,37 +1,19 @@ import requests import xml.etree.ElementTree as etree +from pathlib import Path +import os from add_coords import add_coords from add_links import add_model_link from add_model_keyw import add_models_keyword -from namespaces import ns_19115_3, ns_19139 +from helpers import ns_19115_3, ns_19139, get_metadata, make_xpath +from keywords import extract_db_terms, run_yake +from pdf_helper import parse_pdf ISO19139_URL = "http://52.65.91.200/geonetwork/srv/api/records/97ed8560c193e0c1855445cec4e812d4c59654ff/formatters/xml" ISO19115_3_URL = "https://catalog.sarig.sa.gov.au/geonetwork/srv/api/records/9c6ae754-291d-4100-afd9-478c3a9ddf42/formatters/xml" -def make_xpath(ns_dict, path_elems): - """ - Makes an xpath with namespaces given a list of tags - - :param ns_dict: namespace dictionary e.g. { 'cit': 'http://cit.org/1.0', 'dif': 'http://dif.org/2.0' } - :param path_elems: list of tags e.g. ['cit:citation', 'dif:differential'] - :returns: xpath string - """ - path = './/' - for ele in path_elems: - ns, dot, tag = ele.partition(':') - path += f"{{{ns_dict[ns]}}}{tag}/" - return path.rstrip('/') - -def get_metadata(metadata_url): - meta = requests.get(metadata_url) - if meta.encoding is not None: - encoding = meta.encoding - else: - encoding = 'utf-8' - - # Read XML from URL - return encoding, meta +DATA_DIR = str(Path(__file__).parent.parent / 'data') def test_add_coords(): """ @@ -106,3 +88,12 @@ def test_add_keyw(): xp = make_xpath(ns_19115_3, keywpath_list) xp += f"[.='AuScope 3D Geological Models']" assert root.findall(xp, namespaces=ns_19115_3) != [] + +def test_keywords(): + kw_dict = extract_db_terms() + print(f"{DATA_DIR=}") + text = parse_pdf(os.path.join(DATA_DIR, 'reports/vic/G107513_OtwayBasin_3D_notes.pdf'), False) + keywords = run_yake(kw_dict, text) + assert 'field inventory and monitoring' in keywords + assert 'topography' in keywords + assert 'Precambrian' in keywords