Enhance CSW harvester and DCAT-AP parsers
- Improve frequency parser
- Improve resource rights parser
- Add reference serializer
mjanez committed Jan 28, 2025
1 parent de3d7d4 commit 0439f88
Showing 6 changed files with 293 additions and 70 deletions.
1 change: 1 addition & 0 deletions ckanext/schemingdcat/config/__init__.py
@@ -83,6 +83,7 @@
    'FORMAT_MAPPING',
    'RESOURCE_TYPES',
    'FILE_TYPES',
    'FORMAT_STANDARDIZATION',
    ## CSW processor configuration
    'CSW_DEFAULT_LIMIT',
    'CQL_QUERY_DEFAULT',
48 changes: 47 additions & 1 deletion ckanext/schemingdcat/config/harvest_csw.py
@@ -5,7 +5,8 @@

# XSLTTransformer configuration
XLST_MAPPINGS_DIR = os.path.join(BASE_DIR, 'ckanext/schemingdcat/lib/csw_mapper/xslt/mappings')
-DEFAULT_XSLT_FILE = 'https://raw.githubusercontent.com/mjanez/iso-19139-to-dcat-ap/refs/heads/main/iso-19139-to-dcat-ap.xsl'
+# GeoDCAT-AP official XSLT
+DEFAULT_XSLT_FILE = 'https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/refs/heads/geodcat-ap-2.0.0/iso-19139-to-dcat-ap.xsl'
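
This stylesheet drives the ISO 19139 to DCAT-AP transformation the harvester applies to each CSW record (via the XSLTTransformer imported in harvesters/csw.py below). A standalone sketch of the same transformation with lxml, assuming a local copy of the stylesheet and a hypothetical sample record record_gmd.xml:

    import lxml.etree as ET

    xslt_tree = ET.parse('iso-19139-to-dcat-ap.xsl')    # local copy of DEFAULT_XSLT_FILE
    transform = ET.XSLT(xslt_tree)
    dcat_rdf = transform(ET.parse('record_gmd.xml'))    # an ISO 19139 (gmd) metadata record
    print(ET.tostring(dcat_rdf, pretty_print=True).decode())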

# CSW processor configuration
CSW_DEFAULT_LIMIT = None
@@ -64,9 +65,54 @@
    'www:link-1.0-http-rss': 'HTTP',  # rss news feed (url)
    'www:link-1.0-http-samples': 'HTTP',  # showcase product (url)
    'www:link-1.0-http-opendap': 'HTTP',  # opendap url
    'web map service': 'WMS',
    'web feature service': 'WFS',
    'web coverage service': 'WCS',
    'web map tile service': 'WMTS',
    'html': 'HTML',
    'htm': 'HTML',
    'octet-stream': 'BINARY',
    'aspx': 'HTML',
    'www:download': 'HTTP',
    'www:link': 'HTTP',
    'application/octet-stream': 'BINARY',
    'application/html': 'HTML',
    'text/html': 'HTML',
    # Add other mappings as needed
}

FORMAT_STANDARDIZATION = {
    'format_patterns': {
        'wms': 'WMS',
        'web map service': 'WMS',
        'ogc:wms': 'WMS',
        'wfs': 'WFS',
        'web feature service': 'WFS',
        'ogc:wfs': 'WFS',
        'wmts': 'WMTS',
        'web map tile service': 'WMTS',
        'html': 'HTML',
        'htm': 'HTML',
        'binary': 'BINARY',
        'octet-stream': 'BINARY',
        'application/octet-stream': 'BINARY',
        'visor': 'HTML',
        'viewer': 'HTML',
        'enlace': 'HTML',
        'link': 'HTML',
        'web': 'HTML',
    },
    'mimetype_mapping': {
        'WMS': 'application/vnd.ogc.wms_xml',
        'WFS': 'application/gml+xml',
        'WMTS': 'application/vnd.ogc.wmts+xml',
        'HTML': 'text/html',
        'BINARY': 'application/octet-stream',
    }
}
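
The harvester's _clean_format helper (added below in harvesters/csw.py) consults these tables in two steps: the raw format string is matched against format_patterns, and the standardized name is then mapped to a MIME type. A minimal sanity check:

    assert FORMAT_STANDARDIZATION['format_patterns']['ogc:wms'] == 'WMS'
    assert FORMAT_STANDARDIZATION['mimetype_mapping']['WMS'] == 'application/vnd.ogc.wms_xml'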

SERVICE_FORMAT = 'SERVICE'
API_FORMAT = 'API'
LINKED_DATA_SERVICE = "Linked Data Service"
153 changes: 138 additions & 15 deletions ckanext/schemingdcat/harvesters/csw.py
@@ -3,6 +3,7 @@
import traceback
import uuid
import dateutil
import time
import urllib.parse

import ckan.plugins as p
from ckan import model
@@ -27,7 +28,9 @@
    DEFAULT_XSLT_FILE,
    CQL_QUERY_DEFAULT,
    INSPIRE_HVD_CATEGORY,
-    INSPIRE_HVD_APPLICABLE_LEGISLATION
+    INSPIRE_HVD_APPLICABLE_LEGISLATION,
+    PROTOCOL_MAPPING,
+    FORMAT_STANDARDIZATION
)
from ckanext.schemingdcat.lib.csw_mapper.xslt_transformer import XSLTTransformer
from ckanext.schemingdcat.interfaces import ISchemingDCATHarvester
@@ -152,6 +155,12 @@ def validate_config(self, config):
                if field_name in dataset_field_names:
                    raise KeyError(f"Field name '{field_name}' in default_extras already exists in the schema")

        if 'private_datasets' in config_obj:
            if not isinstance(config_obj['private_datasets'], bool):
                raise ValueError('private_datasets must be boolean')
        else:
            config_obj['private_datasets'] = True  # default value

        return config
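
For reference, a harvest source configuration exercising these options might look like the sketch below; private_datasets and default_extras are validated here, cql_search_term and cql_use_like are read later in gather_stage, and the harvest_note extra is a hypothetical key:

    {
        "private_datasets": false,
        "default_extras": {"harvest_note": "Harvested via CSW"},
        "cql_search_term": "inspire",
        "cql_use_like": true
    }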

    def _set_config(self, config_str, harvest_source_id):
@@ -169,33 +178,34 @@ def _set_config(self, config_str, harvest_source_id):
        self.config['ssl_verify'] = p.toolkit.asbool(p.toolkit.config.get('ckanext.schemingdcat.csw.ssl_verify', True))

        log.debug('Using config: %r' % self.config)

    def modify_package_dict(self, package_dict, harvest_object):
        '''
        Allows custom harvesters to modify the package dict before
        creating or updating the actual package.
        '''
        log.debug('In SchemingDCATCSWHarvester modify_package_dict')

        # Assign HVD category
        package_dict = self.normalize_inspire_hvd_category(package_dict)

        # Standardize resources (dcat:Distribution)
        for resource in package_dict.get("resources", []):
            # Process the format field
            format_value = resource.get('format')
            #log.debug('format_value:%s', format_value)

            if format_value:
-                # Extract the last part of the URL and convert to uppercase
-                resource['format'] = format_value.rsplit('/', 1)[-1].upper()
-            else:
-                resource.pop('format', None)
+                format_name, mimetype = self._clean_format(format_value)
+                if format_name:
+                    resource['format'] = format_name
+                    if mimetype:
+                        resource['mimetype'] = mimetype
+            else:
+                resource.pop('format', None)
+                resource.pop('mimetype', None)

        # Apply default values if required fields are empty
        self._apply_default_values(package_dict)

        #log.debug('package_dict after modify_package_dict: %s', package_dict)

        return package_dict

    def gather_stage(self, harvest_job):
@@ -229,6 +239,12 @@ def gather_stage(self, harvest_job):
                cql_search_term=self.config.get('cql_search_term', None),
                cql_use_like=self.config.get('cql_use_like', False)
            )

            # In debug mode, limit to the first 25 records for testing
            if DEBUG_MODE:
                gathered_identifiers = gathered_identifiers[:25]
                log.debug('Limited to first 25 records for testing')

        except KeyError as e:
            # Handle the case of a missing key in self.config
            missing_key = e.args[0]
@@ -336,6 +352,9 @@
                # If the dataset has no identifier, use a UUID
                if not dataset.get('identifier'):
                    dataset['identifier'] = str(uuid.uuid4())
                else:
                    dataset['identifier'] = self._clean_identifier(dataset['identifier'])

            except Exception as e:
                skipped_datasets += 1
@@ -347,6 +366,24 @@
                self._save_gather_error('Missing identifier for dataset with title: %s' % dataset.get('title'), harvest_job)
                continue

            if not dataset.get('reference') and dataset.get('identifier'):
                # Build GetRecordById URL
                getrecord_params = {
                    'service': 'CSW',
                    'version': '2.0.2',
                    'request': 'GetRecordById',
                    'id': dataset['identifier'],
                    'elementSetName': 'full',
                    'outputSchema': 'http://www.isotc211.org/2005/gmd',
                    'outputFormat': 'application/xml'
                }

                # Convert params to a URL-encoded query string
                query_string = urllib.parse.urlencode(getrecord_params)
                dataset['reference'] = f"{csw_url}?{query_string}"
                log.debug(f"Added CSW reference URL: {dataset['reference']}")


            # The check for an existing dataset with the same identifier can be re-enabled here if necessary
            #existing_dataset = self._check_existing_package_by_ids(dataset)
            #log.debug('existing_dataset: %s', existing_dataset)
@@ -771,4 +808,90 @@ def normalize_inspire_hvd_category(self, package_dict):
            return package_dict

        except Exception as e:
            raise ValueError(f'Error updating the package dictionary: {e}') from e

    def _clean_identifier(self, identifier):
        """
        Cleans an identifier by removing or replacing reserved characters.

        Args:
            identifier (str): The identifier to clean

        Returns:
            str: The cleaned identifier
        """
        if not identifier:
            return identifier

        # Characters to replace with an underscore
        chars_to_replace = ['/', ':', '\\', ' ', '?', '#', '[', ']', '@', '!', '$', '&', "'",
                            '(', ')', '*', '+', ',', ';', '=']

        clean_id = identifier
        for char in chars_to_replace:
            clean_id = clean_id.replace(char, '_')

        # Collapse multiple consecutive underscores
        while '__' in clean_id:
            clean_id = clean_id.replace('__', '_')

        # Remove leading/trailing underscores
        clean_id = clean_id.strip('_')

        return clean_id
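
A couple of worked examples (inputs hypothetical): reserved characters collapse to single underscores and edge underscores are stripped:

    _clean_identifier('spa:dataset/1234 (v2)')   # -> 'spa_dataset_1234_v2'
    _clean_identifier('a//b::c')                 # -> 'a_b_c'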

    def _clean_format(self, format_value):
        """
        Clean and standardize format values.

        Takes a raw format value, lowercases it, strips IANA media-type URL
        prefixes, and then matches it against known format patterns and
        protocol mappings to standardize it.

        Args:
            format_value (str): The format value to be cleaned and standardized.

        Returns:
            tuple: The standardized format and its corresponding MIME type,
                or (None, None) if no match is found.
        """
        if not format_value:
            return None, None

        # Convert to lowercase and strip whitespace
        format_lower = format_value.lower().strip()

        # Remove IANA URL prefix if present
        if 'www.iana.org/assignments/media-types/' in format_lower:
            format_lower = format_lower.split('media-types/')[-1]
            if format_lower.startswith('application/'):
                format_lower = format_lower[12:]  # drop the 12-character 'application/' prefix

        # Split by common separators
        parts = format_lower.replace('-', ' ').replace('/', ' ').replace('_', ' ').split()

        # First try a direct match of any part against format_patterns
        for part in parts:
            if part in FORMAT_STANDARDIZATION['format_patterns']:
                std_format = FORMAT_STANDARDIZATION['format_patterns'][part]
                return std_format, FORMAT_STANDARDIZATION['mimetype_mapping'].get(std_format)

        # If no direct match was found, try the patterns against the complete string
        for pattern, std_format in FORMAT_STANDARDIZATION['format_patterns'].items():
            if pattern in format_lower:
                return std_format, FORMAT_STANDARDIZATION['mimetype_mapping'].get(std_format)

        # Fall back to the protocol mapping
        if format_lower in PROTOCOL_MAPPING:
            protocol = PROTOCOL_MAPPING[format_lower]
            if protocol in FORMAT_STANDARDIZATION['mimetype_mapping']:
                return protocol, FORMAT_STANDARDIZATION['mimetype_mapping'][protocol]

        # As a last resort, look for OGC service keywords
        for service_type in ['wms', 'wfs', 'wmts', 'wcs']:
            if service_type in format_lower:
                std_format = service_type.upper()
                return std_format, FORMAT_STANDARDIZATION['mimetype_mapping'].get(std_format)

        return None, None
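
Traced against the mapping tables in config/harvest_csw.py, the helper behaves like this (inputs hypothetical):

    _clean_format('OGC:WMS')     # -> ('WMS', 'application/vnd.ogc.wms_xml'), direct match in format_patterns
    _clean_format('text/html')   # -> ('HTML', 'text/html'), via the 'html' part
    _clean_format('https://www.iana.org/assignments/media-types/application/octet-stream')
                                 # -> ('BINARY', 'application/octet-stream'), after IANA prefix stripping
    _clean_format(None)          # -> (None, None)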