Skip to content

Commit

Permalink
Changes:
Browse files Browse the repository at this point in the history
- moved OUTPUT_DIR to within Extractor class
- added very simple tests for extraction classes
- added keyword tests
  • Loading branch information
vjf committed Apr 22, 2024
1 parent 89a1041 commit c026261
Show file tree
Hide file tree
Showing 22 changed files with 166 additions and 126 deletions.
9 changes: 1 addition & 8 deletions src/ISO19115_3_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import lxml

from extractor import Extractor
from constants import OUTPUT_DIR

from add_model_keyw import add_models_keyword

Expand Down Expand Up @@ -180,16 +179,10 @@ def write_record(self, name, bbox, model_endpath, metadata_url, output_file):
xml_string = add_models_keyword(str_result, 'utf-8', 'ISO19115-3')

# Write to disk
with open(os.path.join(OUTPUT_DIR, output_file), 'w') as ff:
with open(os.path.join(self.output_dir, output_file), 'w') as ff:
ff.write(xml_string)

return True
return False


# Used for testing only
if __name__ == "__main__":
url = "https://catalog.sarig.sa.gov.au/geonetwork/srv/api/records/9c6ae754-291d-4100-afd9-478c3a9ddf42/formatters/xml"
name = 'ngawler'
ce = ISO19115_3Extractor()
ce.write_record(name, {'north': '0.0', 'south': '-45', 'east': '-145', 'west':'-100'}, name, url, f"test_19115_3_{name}.xml")
15 changes: 1 addition & 14 deletions src/ISO19139_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from lxml import etree

from extractor import Extractor
from constants import OUTPUT_DIR
from add_model_keyw import add_models_keyword

class ISO19139Extractor(Extractor):
Expand Down Expand Up @@ -139,20 +138,8 @@ def write_record(self, name, bbox, model_endpath, metadata_url, output_file):
xml_string = add_models_keyword(str_result, 'utf-8', 'ISO19139')

# Write to disk
with open(os.path.join(OUTPUT_DIR, output_file), 'w') as ff:
with open(os.path.join(self.output_dir, output_file), 'w') as ff:
ff.write(xml_string)
return True
return False


# This is used for testing only
if __name__ == "__main__":

metadata_urls = [
("mcarthur", "http://www.ntlis.nt.gov.au/metadata/export_data?type=xml&metadata_id=1080195AEBC6A054E050CD9B214436A1"),
("windimurra", "https://warsydprdstadasc.blob.core.windows.net/downloads/Metadata_Statements/XML/3D_Windimurra_2015.xml"),
("sandstone", "https://warsydprdstadasc.blob.core.windows.net/downloads/Metadata_Statements/XML/3D_Sandstone_2015.xml")
]
ce = ISO19139Extractor()
for name, url in metadata_urls:
ce.write_record(name, {'north': '0.0', 'south': '-45', 'east': '-145', 'west':'-100'}, name, url, f"test_19139_{name}.xml")
2 changes: 1 addition & 1 deletion src/add_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from copy import copy
from lxml.builder import ElementMaker
from add_model_keyw import insert
from constants import OUTPUT_DIR
from config import OUTPUT_DIR

"""
Utility functions used to add bounding box coordinates to ISO 19139 & 19115-3 XML
Expand Down
2 changes: 1 addition & 1 deletion src/add_links.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from lxml.builder import ElementMaker

from add_model_keyw import insert
from constants import OUTPUT_DIR
from config import OUTPUT_DIR


def add_model_link(model_endpath, text):
Expand Down
2 changes: 1 addition & 1 deletion src/add_model_keyw.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from copy import copy
from lxml.builder import ElementMaker

from constants import OUTPUT_DIR
from config import OUTPUT_DIR

""" Adds keywords to ISO 19139 and ISO 19115-3 XML using XPATH insertion
"""
Expand Down
2 changes: 0 additions & 2 deletions src/bedrock_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
import boto3
import botocore

from constants import OUTPUT_DIR

"""
Use Claude V2.0 model to summarize text via AWS Bedrock and 'boto3' package
"""
Expand Down
17 changes: 5 additions & 12 deletions src/ckan_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

from extractor import Extractor

from constants import OUTPUT_DIR

class CkanExtractor(Extractor):
""" Connects to CKAN repository
Expand Down Expand Up @@ -143,11 +142,13 @@ def output_xml(self, ckan_dict, url, model_endpath, output_file):
}
}

xml_string = render_j2_template(mcf_dict, template_dir='../data/templates/ISO19115-3')
template_dir = os.path.join(os.path.dirname(__file__), '../data/templates/ISO19115-3')
xml_string = render_j2_template(mcf_dict, template_dir=template_dir)

# write to disk
with open(os.path.join(OUTPUT_DIR, output_file), 'w') as ff:
with open(os.path.join(self.output_dir, output_file), 'w') as ff:
ff.write(xml_string)
return True


def write_record(self, name, bbox, model_endpath, ckan_url, package_id, output_file):
Expand All @@ -172,13 +173,5 @@ def write_record(self, name, bbox, model_endpath, ckan_url, package_id, output_f
except json.JSONDecodeError:
return False
if dict['success'] is True:
self.output_xml(dict['result'], r.url, model_endpath, output_file)
return True
return self.output_xml(dict['result'], r.url, model_endpath, output_file)
return False


# Used for testing only
if __name__ == "__main__":
SITE__URL = 'https://geoscience.data.qld.gov.au'
ce = CkanExtractor()
ce.write_record('Mt Dore', 'mtdore', SITE__URL, 'ds000002', 'test_ckan.xml')
5 changes: 5 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from pathlib import Path

"""
Config for creation of ISO19139 or ISO19115-3 XML metadata records from PDF reports or online metadata services
(e.g. CKAN, dSpace, geonetwork)
Expand Down Expand Up @@ -153,3 +155,6 @@
]
}
}

# Directory where generated XML metadata records are written.
# Resolves to an 'output' folder alongside this config module (i.e. src/output),
# created on demand by Extractor.__init__
OUTPUT_DIR = str(Path(__file__).parent / 'output')
3 changes: 0 additions & 3 deletions src/constants.py

This file was deleted.

11 changes: 11 additions & 0 deletions src/extractor.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,19 @@
import os
from config import OUTPUT_DIR

"""
Parent class for reading sources and writing out XML
This is specialised for different data source types
"""
class Extractor:
def __init__(self):
    """ Ensures the shared output directory exists and records it on the
        instance as ``self.output_dir``, where subclasses write their
        XML records (value comes from config.OUTPUT_DIR)
    """
    print(f"{OUTPUT_DIR=}")  # NOTE(review): debug output — consider logging instead
    # exist_ok replaces the try/except FileExistsError dance (which also
    # bound an unused name) and is race-free if two extractors start at once
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    self.output_dir = OUTPUT_DIR

def write_record(self, bbox, model_endpath):
""" NB: The input parameters for this function should match the parameters defined in the configuration file
"""
Expand Down
30 changes: 7 additions & 23 deletions src/keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import glob
import sys
import os
import sqlite3
from contextlib import closing

Expand Down Expand Up @@ -68,7 +69,8 @@ def extract_db_terms():
name_dict = {}
link_dict = {}
# Connect to USGS Thesaurus DB (https://apps.usgs.gov/thesaurus/)
with closing(sqlite3.connect("../db/thesauri.db")) as con:
db_file = os.path.join(os.path.dirname(__file__), '../db/thesauri.db')
with closing(sqlite3.connect(db_file)) as con:
with closing(con.cursor()) as cur:
for row in cur.execute("SELECT code, name, parent FROM term"):
# print(row)
Expand Down Expand Up @@ -112,27 +114,9 @@ def get_keywords(text):
Extracts keywords from text
:param text: text
:returns:
:returns: set of geoscience keywords
"""
# Creates lookup table using USGS Thesaurus
kw_dict = extract_db_terms()

yake_kwset = run_yake(kw_dict, text)

return yake_kwset


# Used for testing only
if __name__ == "__main__":
kw_dict = extract_db_terms()
for file in ['G107513_OtwayBasin_3D_notes.pdf',
# 'G161893_VGP_TR35_3D-Geological-framework-Otway_low-res.pdf',
#'G35615_3DVIC1_pt1.pdf'
]:
text = parse_pdf(f'../data/reports/vic/{file}', False)

yake_kwset = run_yake(kw_dict, text)
print(f"{file}: usgs+yake: {yake_kwset}")

#usgs_kwset = run_usgs(kw_dict, text)
#print("pure usgs:", usgs_kwset)

# Runs yake and matches yake's keywords with USGS Thesaurus
return run_yake(kw_dict, text)
31 changes: 8 additions & 23 deletions src/oai_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,13 @@
from sickle import Sickle

from extractor import Extractor
from constants import OUTPUT_DIR


class OaiExtractor(Extractor):

def __init__(self, oai_url, output_dir):
def __init__(self, oai_url):
    """ Initialises base Extractor (creates output dir) and stores the service URL

    :param oai_url: base URL of the OAI-PMH service endpoint,
                    e.g. 'https://geoscience.nt.gov.au/gemis/ntgsoai/request'
    """
    super().__init__()
    # Endpoint queried by write_record() via the 'sickle' OAI-PMH client
    self.OAI_URL = oai_url
self.output_dir = output_dir

def output_xml(self, oai_dict, oai_id, bbox, model_endpath, service_name, output_file):
"""
Expand Down Expand Up @@ -128,10 +127,11 @@ def output_xml(self, oai_dict, oai_id, bbox, model_endpath, service_name, output
}
}

xml_string = render_j2_template(mcf_dict, template_dir='../data/templates/ISO19115-3')
template_dir = os.path.join(os.path.dirname(__file__), '../data/templates/ISO19115-3')
xml_string = render_j2_template(mcf_dict, template_dir=template_dir)

# write to disk
with open(os.path.join(OUTPUT_DIR, output_file), 'w') as ff:
with open(os.path.join(self.output_dir, output_file), 'w') as ff:
ff.write(xml_string)
return True

Expand All @@ -141,8 +141,9 @@ def write_record(self, name, bbox, model_endpath, oai_id, oai_prefix, service_na
"""
Write an XML record to file using metadata from OAI-PMH service
:param name: model name
:param bbox: bounding box dict, keys are 'north' 'south' 'east' 'west', values are decimals as strings, EPSG:4326 is assumed
:param model_endpath: path of model in website
:param model_endpath: path of model in website, used to create a link to website URL
:param oai_id: OAI-PMH identifier e.g. 'oai:eprints.rclis.org:4088'
:param oai_prefix: OAI-PMH prefix e.g. 'oai_dc'
:param service_name: generic name of OAI-PMH service
Expand All @@ -158,21 +159,5 @@ def write_record(self, name, bbox, model_endpath, oai_id, oai_prefix, service_na
#for k, v in oai_dict.items():
# print(k, '=>', v);

self.output_xml(oai_dict, oai_id, bbox, model_endpath, service_name, output_file)

if __name__ == "__main__":
# Get records from Northern Territory Geological Service
# OAI-PMH URL
OAI__URL = 'https://geoscience.nt.gov.au/gemis/ntgsoai/request'

# GEMIS permanent link of McArthur 3D model
MODEL__URL = 'https://geoscience.nt.gov.au/gemis/ntgsjspui/handle/1/81751'
oe = OaiExtractor(OAI__URL, 'output')
# Convert perm link to OAI-PMH ID
handle_id = '/'.join(MODEL__URL.split('/')[-2:])
print(handle_id)
# NB: Some geological fields that are present in GEMIS website are missing from OAI output with 'oai_dc' prefix,
# i.e. "Stratigraphy" The 'xoai' prefix will allow extraction of these missing fields but the XML output
# would need to be parsed
oe.write_record([154.3, 109.1, -43.9, -10.6], 'mcarthur', 'oai:geoscience.nt.gov.au:'+handle_id, 'oai_dc', "NTGS GEMIS", 'test_oai.xml')
return self.output_xml(oai_dict, oai_id, bbox, model_endpath, service_name, output_file)

31 changes: 20 additions & 11 deletions src/pdf_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,31 @@
from summary import get_summary
from add_links import add_model_link
from add_coords import add_coords
from constants import OUTPUT_DIR
from config import OUTPUT_DIR

class PDFExtractor(Extractor):
""" Creates an ISO 19115 XML file by reading a PDF file
"""

def write_record(self, name, model_endpath, pdf_file, pdf_url, organisation, title, bbox, cutoff, output_file):
"""
Write XML record
:param name: model name used in download links in record
:param model_endpath: path of model in website, used to create a link to website URL
:param pdf_file: path to PDF file
:param pdf_url: URL for PDF file
:param organisation: name of organisation
:param title: title
:param bbox: bounding box coords, dict, keys are 'north', 'south' etc.
:param cutoff: skip pages that have less than this amount of text, set to between 1000 and 3000, used to filter out pages with no useful text
:param output_file: output filename e.g. 'blah.xml'
:returns: boolean
"""
print(f"Converting: {model_endpath}")
print("bbox=", repr(bbox))
if not os.path.exists(pdf_file):
print(f"{pdf_file} does not exist")
sys.exit(1)
return False
# Extract keywords from PDF text
pdf_text = parse_pdf(pdf_file, False)
kwset = get_keywords(pdf_text)
Expand Down Expand Up @@ -128,19 +141,15 @@ def write_record(self, name, model_endpath, pdf_file, pdf_url, organisation, tit
"level": "dataset"
},
"lineage": {
"statement": f"This metadata record was reproduced from PDF report retrieved from {pdf_url} on {datetime.datetime.now():%d %b %Y}"
"statement": f"This metadata record was reproduced from the PDF report retrieved from {pdf_url} on {datetime.datetime.now():%d %b %Y}. The abstract was generated by Antrhopic Claude V2.0 (https://www.anthropic.com/). Keywords were taken from USGS Thesaurus (https://apps.usgs.gov/thesaurus/) and extracted by yake (https://pypi.org/project/yake)"
}
}
}

xml_string = render_j2_template(mcf_dict, template_dir='../data/templates/ISO19115-3')
template_dir = os.path.join(os.path.dirname(__file__), '../data/templates/ISO19115-3')
xml_string = render_j2_template(mcf_dict, template_dir=template_dir)

# write to disk
with open(os.path.join(OUTPUT_DIR, output_file), 'w') as ff:
ff.write(xml_string)



if __name__ == "__main__":
pe = PDFExtractor()
pe.write_record("test-pdf", "https://blah/blah.pdf")
return True
3 changes: 1 addition & 2 deletions src/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@
from ISO19115_3_extract import ISO19115_3Extractor
from pdf_extract import PDFExtractor

from constants import OUTPUT_DIR
from config import CONFIG
from config import CONFIG, OUTPUT_DIR

"""
Create ISO19139 or ISO19115-3 XML metadata records from PDF reports or online metadata services
Expand Down
2 changes: 1 addition & 1 deletion src/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from bedrock_summary import run_claude

from pdf_helper import parse_pdf
from constants import OUTPUT_DIR
from config import OUTPUT_DIR

# os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

Expand Down
Loading

0 comments on commit c026261

Please sign in to comment.