From e7aa229ccb7c4e0cbeda6f2e2542fb5a9b3e3656 Mon Sep 17 00:00:00 2001 From: mjanez <96422458+mjanez@users.noreply.github.com> Date: Fri, 26 Jul 2024 17:57:28 +0200 Subject: [PATCH 1/8] First approach --- ckanext/schemingdcat/config.py | 545 ++-- ckanext/schemingdcat/harvesters/base.py | 199 +- ckanext/schemingdcat/harvesters/ckan.py | 116 +- ckanext/schemingdcat/helpers.py | 2706 ++++++++++---------- ckanext/schemingdcat/lib/field_mapping.py | 4 +- ckanext/schemingdcat/package_controller.py | 316 +-- ckanext/schemingdcat/utils.py | 581 +++-- 7 files changed, 2308 insertions(+), 2159 deletions(-) diff --git a/ckanext/schemingdcat/config.py b/ckanext/schemingdcat/config.py index 3752efc3..90f555ba 100644 --- a/ckanext/schemingdcat/config.py +++ b/ckanext/schemingdcat/config.py @@ -1,267 +1,280 @@ -import typing -import re - -# Default values -default_facet_operator = 'OR' -icons_dir = 'images/icons' -default_locale = 'en' -organization_custom_facets = False -group_custom_facets = False -debug = False -linkeddata_links = None -geometadata_links = None -endpoints = None -endpoints_yaml = 'endpoints.yaml' -facet_list_limit = 6 -default_package_item_icon = 'theme' -default_package_item_show_spatial = True -show_metadata_templates_toolbar = True -metadata_templates_search_identifier = 'schemingdcat_xls-template' -mimetype_base_uri = 'http://www.iana.org/assignments/media-types' - -# Default DCAT metadata configuration -OGC2CKAN_HARVESTER_MD_CONFIG = { - 'access_rights': 'http://inspire.ec.europa.eu/metadata-codelist/LimitationsOnPublicAccess/noLimitations', - 'conformance': [ - 'http://inspire.ec.europa.eu/documents/inspire-metadata-regulation','http://inspire.ec.europa.eu/documents/commission-regulation-eu-no-13122014-10-december-2014-amending-regulation-eu-no-10892010-0' - ], - 'author': 'ckanext-schemingdcat', - 'author_email': 'admin@{ckan_instance}', - 'author_url': '{ckan_instance}/organization/test', - 'author_uri': '{ckan_instance}/organization/test', - 
'contact_name': 'ckanext-schemingdcat', - 'contact_email': 'admin@{ckan_instance}', - 'contact_url': '{ckan_instance}/organization/test', - 'contact_uri': '{ckan_instance}/organization/test', - 'dcat_type': { - 'series': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/series', - 'dataset': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/dataset', - 'spatial_data_service': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/service', - 'default': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/dataset', - 'collection': 'http://purl.org/dc/dcmitype/Collection', - 'event': 'http://purl.org/dc/dcmitype/Event', - 'image': 'http://purl.org/dc/dcmitype/Image', - 'still_image': 'http://purl.org/dc/dcmitype/StillImage', - 'moving_image': 'http://purl.org/dc/dcmitype/MovingImage', - 'physical_object': 'http://purl.org/dc/dcmitype/PhysicalObject', - 'interactive_resource': 'http://purl.org/dc/dcmitype/InteractiveResource', - 'service': 'http://purl.org/dc/dcmitype/Service', - 'sound': 'http://purl.org/dc/dcmitype/Sound', - 'software': 'http://purl.org/dc/dcmitype/Software', - 'text': 'http://purl.org/dc/dcmitype/Text', - }, - 'encoding': 'UTF-8', - 'frequency' : 'http://publications.europa.eu/resource/authority/frequency/UNKNOWN', - 'inspireid_theme': 'HB', - 'language': 'http://publications.europa.eu/resource/authority/language/ENG', - 'license': 'http://creativecommons.org/licenses/by/4.0/', - 'license_id': 'cc-by', - 'lineage_process_steps': 'ckanext-schemingdcat lineage process steps.', - 'maintainer': 'ckanext-schemingdcat', - 'maintainer_email': 'admin@{ckan_instance}', - 'maintainer_url': '{ckan_instance}/organization/test', - 'maintainer_uri': '{ckan_instance}/organization/test', - 'metadata_profile': [ - "http://semiceu.github.io/GeoDCAT-AP/releases/2.0.0","http://inspire.ec.europa.eu/document-tags/metadata" - ], - 'provenance': 'ckanext-schemingdcat provenance statement.', - 'publisher_name': 'ckanext-schemingdcat', - 
'publisher_email': 'admin@{ckan_instance}', - 'publisher_url': '{ckan_instance}/organization/test', - 'publisher_identifier': '{ckan_instance}/organization/test', - 'publisher_uri': '{ckan_instance}/organization/test', - 'publisher_type': 'http://purl.org/adms/publishertype/NonProfitOrganisation', - 'reference_system': 'http://www.opengis.net/def/crs/EPSG/0/4258', - 'representation_type': { - 'wfs': 'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/vector', - 'wcs': 'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/grid', - 'default': 'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/vector', - 'grid': 'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/grid', - 'vector': 'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/vector', - 'textTable': 'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/textTable', - 'tin': 'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/tin', - 'stereoModel': 'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/stereoModel', - 'video': 'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/video', - }, - 'resources': { - 'availability': 'http://publications.europa.eu/resource/authority/planned-availability/AVAILABLE', - 'name': { - 'es': 'Distribución {format}', - 'en': 'Distribution {format}' - }, - }, - 'rights': 'http://inspire.ec.europa.eu/metadata-codelist/LimitationsOnPublicAccess/noLimitations', - 'spatial': None, - 'spatial_uri': 'http://datos.gob.es/recurso/sector-publico/territorio/Pais/España', - 'status': 'http://purl.org/adms/status/UnderDevelopment', - 'temporal_start': None, - 'temporal_end': None, - 'theme': 'http://inspire.ec.europa.eu/theme/hb', - 'theme_es': 'http://datos.gob.es/kos/sector-publico/sector/medio-ambiente', - 'theme_eu': 'http://publications.europa.eu/resource/authority/data-theme/ENVI', - 'topic': 
'http://inspire.ec.europa.eu/metadata-codelist/TopicCategory/biota', - 'valid': None -} - -OGC2CKAN_MD_FORMATS = { - 'api': ('API', 'http://www.iana.org/assignments/media-types/application/vnd.api+json', None, 'Application Programming Interface'), - 'api feature': ('OGCFeat', 'http://www.opengis.net/def/interface/ogcapi-features', 'http://www.opengeospatial.org/standards/features', 'OGC API - Features'), - 'wms': ('WMS', 'http://www.opengis.net/def/serviceType/ogc/wms', 'http://www.opengeospatial.org/standards/wms', 'Web Map Service'), - 'zip': ('ZIP', 'http://www.iana.org/assignments/media-types/application/zip', 'http://www.iso.org/standard/60101.html', 'ZIP File'), - 'rar': ('RAR', 'http://www.iana.org/assignments/media-types/application/vnd.rar', 'http://www.rarlab.com/technote.htm', 'RAR File'), - 'wfs': ('WFS', 'http://www.opengis.net/def/serviceType/ogc/wfs', 'http://www.opengeospatial.org/standards/wfs', 'Web Feature Service'), - 'wcs': ('WCS', 'http://www.opengis.net/def/serviceType/ogc/wcs', 'http://www.opengeospatial.org/standards/wcs', 'Web Coverage Service'), - 'tms': ('TMS', 'http://wiki.osgeo.org/wiki/Tile_Map_Service_Specification', 'http://www.opengeospatial.org/standards/tms', 'Tile Map Service'), - 'wmts': ('WMTS', 'http://www.opengis.net/def/serviceType/ogc/wmts', 'http://www.opengeospatial.org/standards/wmts', 'Web Map Tile Service'), - 'kml': ('KML', 'http://www.iana.org/assignments/media-types/application/vnd.google-earth.kml+xml', 'http://www.opengeospatial.org/standards/kml', 'Keyhole Markup Language'), - 'kmz': ('KMZ', 'http://www.iana.org/assignments/media-types/application/vnd.google-earth.kmz+xml', 'http://www.opengeospatial.org/standards/kml', 'Compressed Keyhole Markup Language'), - 'gml': ('GML', 'http://www.iana.org/assignments/media-types/application/gml+xml', 'http://www.opengeospatial.org/standards/gml', 'Geography Markup Language'), - 'geojson': ('GeoJSON', 'http://www.iana.org/assignments/media-types/application/geo+json', 
'http://www.rfc-editor.org/rfc/rfc7946', 'GeoJSON'), - 'json': ('JSON', 'http://www.iana.org/assignments/media-types/application/json', 'http://www.ecma-international.org/publications/standards/Ecma-404.htm', 'JavaScript Object Notation'), - 'atom': ('ATOM', 'http://www.iana.org/assignments/media-types/application/atom+xml', 'http://validator.w3.org/feed/docs/atom.html', 'Atom Syndication Format'), - 'xml': ('XML', 'http://www.iana.org/assignments/media-types/application/xml', 'http://www.w3.org/TR/REC-xml/', 'Extensible Markup Language'), - 'arcgis_rest': ('ESRI Rest', None, None, 'ESRI Rest Service'), - 'shp': ('SHP', 'http://www.iana.org/assignments/media-types/application/vnd.shp', 'http://www.esri.com/library/whitepapers/pdfs/shapefile.pdf', 'ESRI Shapefile'), - 'shapefile': ('SHP', 'http://www.iana.org/assignments/media-types/application/vnd.shp', 'http://www.esri.com/library/whitepapers/pdfs/shapefile.pdf', 'ESRI Shapefile'), - 'esri': ('SHP', 'http://www.iana.org/assignments/media-types/application/vnd.shp', 'http://www.esri.com/library/whitepapers/pdfs/shapefile.pdf', 'ESRI Shapefile'), - 'html': ('HTML', 'http://www.iana.org/assignments/media-types/text/html', 'http://www.w3.org/TR/2011/WD-html5-20110405/', 'HyperText Markup Language'), - 'html5': ('HTML', 'http://www.iana.org/assignments/media-types/text/html', 'http://www.w3.org/TR/2011/WD-html5-20110405/', 'HyperText Markup Language'), - 'visor': ('HTML', 'http://www.iana.org/assignments/media-types/text/html', 'http://www.w3.org/TR/2011/WD-html5-20110405/', 'Map Viewer'), - 'enlace': ('HTML', 'http://www.iana.org/assignments/media-types/text/html', 'http://www.w3.org/TR/2011/WD-html5-20110405/', 'Map Viewer'), - 'pdf': ('PDF', 'http://www.iana.org/assignments/media-types/application/pdf', 'http://www.iso.org/standard/75839.html', 'Portable Document Format'), - 'csv': ('CSV', 'http://www.iana.org/assignments/media-types/text/csv', 'http://www.rfc-editor.org/rfc/rfc4180', 'Comma-Separated Values'), - 
'netcdf': ('NetCDF', 'http://www.iana.org/assignments/media-types/text/csv', 'http://www.opengeospatial.org/standards/netcdf', 'Network Common Data Form'), - 'csw': ('CSW', 'http://www.opengis.net/def/serviceType/ogc/csw', 'http://www.opengeospatial.org/standards/cat', 'Catalog Service for the Web'), - 'geodcatap': ('RDF', 'http://www.iana.org/assignments/media-types/application/rdf+xml', 'http://semiceu.github.io/GeoDCAT-AP/releases/2.0.0/', 'GeoDCAT-AP 2.0 Metadata') - , - 'inspire': ('XML', 'http://www.iana.org/assignments/media-types/application/xml', ['http://inspire.ec.europa.eu/documents/inspire-metadata-regulation','http://inspire.ec.europa.eu/documents/commission-regulation-eu-no-13122014-10-december-2014-amending-regulation-eu-no-10892010-0', 'http://www.isotc211.org/2005/gmd/'], 'INSPIRE ISO 19139 Metadata') -} - -OGC2CKAN_ISO_MD_ELEMENTS = { - 'lineage_source': 'gmd:dataQualityInfo/gmd:DQ_DataQuality/gmd:lineage/gmd:LI_Lineage/gmd:source/gmd:LI_Source/gmd:description/gco:CharacterString', - 'lineage_process_steps': 'gmd:dataQualityInfo/gmd:DQ_DataQuality/gmd:lineage/gmd:LI_Lineage/gmd:processStep' -} - -# loose definition of BCP47-like strings -BCP_47_LANGUAGE = u'^[a-z]{2,8}(-[0-9a-zA-Z]{1,8})*$' - -DATE_FIELDS = [ - {'field_name': 'created', 'fallback': 'issued', 'default_value': None, 'override': True, 'dtype': str}, - {'field_name': 'issued', 'fallback': None, 'default_value': None, 'override': True, 'dtype': str}, - {'field_name': 'modified', 'fallback': 'issued', 'default_value': None, 'override': True, 'dtype': str}, - {'field_name': 'valid', 'fallback': None, 'default_value': None, 'override': True, 'dtype': str}, - {'field_name': 'temporal_start', 'fallback': None, 'default_value': None, 'override': True, 'dtype': str}, - {'field_name': 'temporal_end', 'fallback': None, 'default_value': None, 'override': True, 'dtype': str} -] - -DATASET_DEFAULT_FIELDS = [ - {'field_name': 'id', 'fallback': None, 'default_value': None, 'override': False, 
'dtype': str}, - {'field_name': 'name', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, - {'field_name': 'title', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, - {'field_name': 'notes', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, - {'field_name': 'description', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, - {'field_name': 'access_rights', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['access_rights'], 'override': True, 'dtype': str}, - {'field_name': 'license', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['license'], 'override': True, 'dtype': str}, - {'field_name': 'license_id', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['license_id'], 'override': True, 'dtype': str}, - {'field_name': 'topic', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['topic'], 'override': True, 'dtype': str}, - {'field_name': 'theme', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['theme'], 'override': True, 'dtype': str}, - {'field_name': 'theme_eu', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['theme_eu'], 'override': True, 'dtype': str}, - {'field_name': 'status', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['status'], 'override': True, 'dtype': str}, - {'field_name': 'hvd_category', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, -] - -RESOURCE_DEFAULT_FIELDS = [ - {'field_name': 'url', 'fallback': None, 'default_value': "", 'override': False, 'dtype': str}, - {'field_name': 'name', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, - {'field_name': 'format', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, - {'field_name': 'protocol', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, - {'field_name': 'mimetype', 'fallback': None, 
'default_value': None, 'override': False, 'dtype': str}, - {'field_name': 'description', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, - {'field_name': 'license', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['license'], 'override': True, 'dtype': str}, - {'field_name': 'license_id', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['license_id'], 'override': True, 'dtype': str}, - {'field_name': 'rights', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['rights'], 'override': True, 'dtype': str}, - {'field_name': 'language', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['language'], 'override': False, 'dtype': str}, - {'field_name': 'conforms_to', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, - {'field_name': 'size', 'fallback': None, 'default_value': 0, 'override': True, 'dtype': int}, -] - -# Custom rules for harvesters.base._update_custom_format() -CUSTOM_FORMAT_RULES = [ - { - 'format_strings': ['esri', 'arcgis'], - 'url_string': 'viewer.html?url=', - 'format': 'HTML', - 'mimetype': 'https://www.iana.org/assignments/media-types/text/html' - }, - { - 'format_strings': ['html', 'html5'], - 'url_string': None, - 'format': 'HTML', - 'mimetype': 'https://www.iana.org/assignments/media-types/text/html' - }, - { - 'format_strings': None, - 'url_string': 'getrecordbyid', - 'format': 'XML', - 'mimetype': 'https://www.iana.org/assignments/media-types/application/xml' - } - # Add more rules here as needed -] - -DATADICTIONARY_DEFAULT_SCHEMA = [ - 'id', - 'type', - 'label', - 'notes', - 'type_override' - ] - -# Common date formats for parsing. 
https://docs.python.org/es/3/library/datetime.html#strftime-and-strptime-format-codes -COMMON_DATE_FORMATS = [ - '%Y-%m-%d', - '%d-%m-%Y', - '%m-%d-%Y', - '%Y/%m/%d', - '%d/%m/%Y', - '%m/%d/%Y', - '%Y-%m-%d %H:%M:%S', # Date with time - '%d-%m-%Y %H:%M:%S', # Date with time - '%m-%d-%Y %H:%M:%S', # Date with time - '%Y/%m/%d %H:%M:%S', # Date with time - '%d/%m/%Y %H:%M:%S', # Date with time - '%m/%d/%Y %H:%M:%S', # Date with time - '%Y-%m-%dT%H:%M:%S', # ISO 8601 format - '%Y-%m-%dT%H:%M:%SZ', # ISO 8601 format with Zulu time indicator -] -# Vocabs -SCHEMINGDCAT_DEFAULT_DATASET_SCHEMA_NAME: typing.Final[str] = "dataset" -SCHEMINGDCAT_INSPIRE_THEMES_VOCAB: typing.Final[str] = "theme" -SCHEMINGDCAT_DCAT_THEMES_VOCAB: typing.Final[list] = ["theme_es", "theme_eu"] -SCHEMINGDCAT_ISO19115_TOPICS_VOCAB: typing.Final[list] = "topic" - - -# Clean ckan names -URL_REGEX = re.compile( - r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' -) - -# Compile the regular expression -INVALID_CHARS = re.compile(r"[^a-zñ0-9_.-]") - -# Define a dictionary to map accented characters to their unaccented equivalents except ñ -ACCENT_MAP = str.maketrans({ - "á": "a", "à": "a", "ä": "a", "â": "a", "ã": "a", - "é": "e", "è": "e", "ë": "e", "ê": "e", - "í": "i", "ì": "i", "ï": "i", "î": "i", - "ó": "o", "ò": "o", "ö": "o", "ô": "o", "õ": "o", - "ú": "u", "ù": "u", "ü": "u", "û": "u", - "ñ": "ñ", -}) - -URL_FIELD_NAMES = { - 'dataset': - ['dcat_type', 'theme_es', 'language', 'topic', 'maintainer_url', 'tag_uri', 'contact_uri', 'contact_url', 'publisher_identifier', 'publisher_uri', 'publisher_url', 'publisher_type', 'maintainer_uri', 'maintainer_url', 'author_uri', 'author_url', 'conforms_to', 'theme', 'reference_system', 'spatial_uri', 'representation_type', 'license_id', 'access_rights', 'graphic_overview', 'frequency', 'hvd_category'], - 'resource': - ['url', 'availability', 'mimetype', 'status', 'resource_relation', 'license', 'rights', 'conforms_to', 
'reference_system'] - } +import typing +import re + +# Default values +default_facet_operator = 'OR' +icons_dir = 'images/icons' +default_locale = 'en' +organization_custom_facets = False +group_custom_facets = False +debug = False +linkeddata_links = None +geometadata_links = None +endpoints = None +endpoints_yaml = 'endpoints.yaml' +facet_list_limit = 6 +default_package_item_icon = 'theme' +default_package_item_show_spatial = True +show_metadata_templates_toolbar = True +metadata_templates_search_identifier = 'schemingdcat_xls-template' +mimetype_base_uri = 'http://www.iana.org/assignments/media-types' +slugify_pat = re.compile('[^a-zA-Z0-9]') + +# Default DCAT metadata configuration +OGC2CKAN_HARVESTER_MD_CONFIG = { + 'access_rights': 'http://inspire.ec.europa.eu/metadata-codelist/LimitationsOnPublicAccess/noLimitations', + 'conformance': [ + 'http://inspire.ec.europa.eu/documents/inspire-metadata-regulation','http://inspire.ec.europa.eu/documents/commission-regulation-eu-no-13122014-10-december-2014-amending-regulation-eu-no-10892010-0' + ], + 'author': 'ckanext-schemingdcat', + 'author_email': 'admin@{ckan_instance}', + 'author_url': '{ckan_instance}/organization/test', + 'author_uri': '{ckan_instance}/organization/test', + 'contact_name': 'ckanext-schemingdcat', + 'contact_email': 'admin@{ckan_instance}', + 'contact_url': '{ckan_instance}/organization/test', + 'contact_uri': '{ckan_instance}/organization/test', + 'dcat_type': { + 'series': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/series', + 'dataset': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/dataset', + 'spatial_data_service': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/service', + 'default': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/dataset', + 'collection': 'http://purl.org/dc/dcmitype/Collection', + 'event': 'http://purl.org/dc/dcmitype/Event', + 'image': 'http://purl.org/dc/dcmitype/Image', + 'still_image': 
'http://purl.org/dc/dcmitype/StillImage', + 'moving_image': 'http://purl.org/dc/dcmitype/MovingImage', + 'physical_object': 'http://purl.org/dc/dcmitype/PhysicalObject', + 'interactive_resource': 'http://purl.org/dc/dcmitype/InteractiveResource', + 'service': 'http://purl.org/dc/dcmitype/Service', + 'sound': 'http://purl.org/dc/dcmitype/Sound', + 'software': 'http://purl.org/dc/dcmitype/Software', + 'text': 'http://purl.org/dc/dcmitype/Text', + }, + 'encoding': 'UTF-8', + 'frequency' : 'http://publications.europa.eu/resource/authority/frequency/UNKNOWN', + 'inspireid_theme': 'HB', + 'language': 'http://publications.europa.eu/resource/authority/language/ENG', + 'license': 'http://creativecommons.org/licenses/by/4.0/', + 'license_id': 'cc-by', + 'lineage_process_steps': 'ckanext-schemingdcat lineage process steps.', + 'maintainer': 'ckanext-schemingdcat', + 'maintainer_email': 'admin@{ckan_instance}', + 'maintainer_url': '{ckan_instance}/organization/test', + 'maintainer_uri': '{ckan_instance}/organization/test', + 'metadata_profile': [ + "http://semiceu.github.io/GeoDCAT-AP/releases/2.0.0","http://inspire.ec.europa.eu/document-tags/metadata" + ], + 'provenance': 'ckanext-schemingdcat provenance statement.', + 'publisher_name': 'ckanext-schemingdcat', + 'publisher_email': 'admin@{ckan_instance}', + 'publisher_url': '{ckan_instance}/organization/test', + 'publisher_identifier': '{ckan_instance}/organization/test', + 'publisher_uri': '{ckan_instance}/organization/test', + 'publisher_type': 'http://purl.org/adms/publishertype/NonProfitOrganisation', + 'reference_system': 'http://www.opengis.net/def/crs/EPSG/0/4258', + 'representation_type': { + 'wfs': 'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/vector', + 'wcs': 'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/grid', + 'default': 'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/vector', + 'grid': 
'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/grid', + 'vector': 'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/vector', + 'textTable': 'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/textTable', + 'tin': 'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/tin', + 'stereoModel': 'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/stereoModel', + 'video': 'http://inspire.ec.europa.eu/metadata-codelist/SpatialRepresentationType/video', + }, + 'resources': { + 'availability': 'http://publications.europa.eu/resource/authority/planned-availability/AVAILABLE', + 'name': { + 'es': 'Distribución {format}', + 'en': 'Distribution {format}' + }, + }, + 'rights': 'http://inspire.ec.europa.eu/metadata-codelist/LimitationsOnPublicAccess/noLimitations', + 'spatial': None, + 'spatial_uri': 'http://datos.gob.es/recurso/sector-publico/territorio/Pais/España', + 'status': 'http://purl.org/adms/status/UnderDevelopment', + 'temporal_start': None, + 'temporal_end': None, + 'theme': 'http://inspire.ec.europa.eu/theme/hb', + 'theme_es': 'http://datos.gob.es/kos/sector-publico/sector/medio-ambiente', + 'theme_eu': 'http://publications.europa.eu/resource/authority/data-theme/ENVI', + 'topic': 'http://inspire.ec.europa.eu/metadata-codelist/TopicCategory/biota', + 'valid': None +} + +OGC2CKAN_MD_FORMATS = { + 'api': ('API', 'http://www.iana.org/assignments/media-types/application/vnd.api+json', None, 'Application Programming Interface'), + 'api feature': ('OGCFeat', 'http://www.opengis.net/def/interface/ogcapi-features', 'http://www.opengeospatial.org/standards/features', 'OGC API - Features'), + 'wms': ('WMS', 'http://www.opengis.net/def/serviceType/ogc/wms', 'http://www.opengeospatial.org/standards/wms', 'Web Map Service'), + 'zip': ('ZIP', 'http://www.iana.org/assignments/media-types/application/zip', 'http://www.iso.org/standard/60101.html', 'ZIP File'), + 'rar': ('RAR', 
'http://www.iana.org/assignments/media-types/application/vnd.rar', 'http://www.rarlab.com/technote.htm', 'RAR File'), + 'wfs': ('WFS', 'http://www.opengis.net/def/serviceType/ogc/wfs', 'http://www.opengeospatial.org/standards/wfs', 'Web Feature Service'), + 'wcs': ('WCS', 'http://www.opengis.net/def/serviceType/ogc/wcs', 'http://www.opengeospatial.org/standards/wcs', 'Web Coverage Service'), + 'tms': ('TMS', 'http://wiki.osgeo.org/wiki/Tile_Map_Service_Specification', 'http://www.opengeospatial.org/standards/tms', 'Tile Map Service'), + 'wmts': ('WMTS', 'http://www.opengis.net/def/serviceType/ogc/wmts', 'http://www.opengeospatial.org/standards/wmts', 'Web Map Tile Service'), + 'kml': ('KML', 'http://www.iana.org/assignments/media-types/application/vnd.google-earth.kml+xml', 'http://www.opengeospatial.org/standards/kml', 'Keyhole Markup Language'), + 'kmz': ('KMZ', 'http://www.iana.org/assignments/media-types/application/vnd.google-earth.kmz+xml', 'http://www.opengeospatial.org/standards/kml', 'Compressed Keyhole Markup Language'), + 'gml': ('GML', 'http://www.iana.org/assignments/media-types/application/gml+xml', 'http://www.opengeospatial.org/standards/gml', 'Geography Markup Language'), + 'geojson': ('GeoJSON', 'http://www.iana.org/assignments/media-types/application/geo+json', 'http://www.rfc-editor.org/rfc/rfc7946', 'GeoJSON'), + 'json': ('JSON', 'http://www.iana.org/assignments/media-types/application/json', 'http://www.ecma-international.org/publications/standards/Ecma-404.htm', 'JavaScript Object Notation'), + 'atom': ('ATOM', 'http://www.iana.org/assignments/media-types/application/atom+xml', 'http://validator.w3.org/feed/docs/atom.html', 'Atom Syndication Format'), + 'xml': ('XML', 'http://www.iana.org/assignments/media-types/application/xml', 'http://www.w3.org/TR/REC-xml/', 'Extensible Markup Language'), + 'arcgis_rest': ('ESRI Rest', None, None, 'ESRI Rest Service'), + 'shp': ('SHP', 'http://www.iana.org/assignments/media-types/application/vnd.shp', 
'http://www.esri.com/library/whitepapers/pdfs/shapefile.pdf', 'ESRI Shapefile'), + 'shapefile': ('SHP', 'http://www.iana.org/assignments/media-types/application/vnd.shp', 'http://www.esri.com/library/whitepapers/pdfs/shapefile.pdf', 'ESRI Shapefile'), + 'esri': ('SHP', 'http://www.iana.org/assignments/media-types/application/vnd.shp', 'http://www.esri.com/library/whitepapers/pdfs/shapefile.pdf', 'ESRI Shapefile'), + 'html': ('HTML', 'http://www.iana.org/assignments/media-types/text/html', 'http://www.w3.org/TR/2011/WD-html5-20110405/', 'HyperText Markup Language'), + 'html5': ('HTML', 'http://www.iana.org/assignments/media-types/text/html', 'http://www.w3.org/TR/2011/WD-html5-20110405/', 'HyperText Markup Language'), + 'visor': ('HTML', 'http://www.iana.org/assignments/media-types/text/html', 'http://www.w3.org/TR/2011/WD-html5-20110405/', 'Map Viewer'), + 'enlace': ('HTML', 'http://www.iana.org/assignments/media-types/text/html', 'http://www.w3.org/TR/2011/WD-html5-20110405/', 'Map Viewer'), + 'pdf': ('PDF', 'http://www.iana.org/assignments/media-types/application/pdf', 'http://www.iso.org/standard/75839.html', 'Portable Document Format'), + 'csv': ('CSV', 'http://www.iana.org/assignments/media-types/text/csv', 'http://www.rfc-editor.org/rfc/rfc4180', 'Comma-Separated Values'), + 'netcdf': ('NetCDF', 'http://www.iana.org/assignments/media-types/text/csv', 'http://www.opengeospatial.org/standards/netcdf', 'Network Common Data Form'), + 'csw': ('CSW', 'http://www.opengis.net/def/serviceType/ogc/csw', 'http://www.opengeospatial.org/standards/cat', 'Catalog Service for the Web'), + 'geodcatap': ('RDF', 'http://www.iana.org/assignments/media-types/application/rdf+xml', 'http://semiceu.github.io/GeoDCAT-AP/releases/2.0.0/', 'GeoDCAT-AP 2.0 Metadata') + , + 'inspire': ('XML', 'http://www.iana.org/assignments/media-types/application/xml', 
['http://inspire.ec.europa.eu/documents/inspire-metadata-regulation','http://inspire.ec.europa.eu/documents/commission-regulation-eu-no-13122014-10-december-2014-amending-regulation-eu-no-10892010-0', 'http://www.isotc211.org/2005/gmd/'], 'INSPIRE ISO 19139 Metadata') +} + +OGC2CKAN_ISO_MD_ELEMENTS = { + 'lineage_source': 'gmd:dataQualityInfo/gmd:DQ_DataQuality/gmd:lineage/gmd:LI_Lineage/gmd:source/gmd:LI_Source/gmd:description/gco:CharacterString', + 'lineage_process_steps': 'gmd:dataQualityInfo/gmd:DQ_DataQuality/gmd:lineage/gmd:LI_Lineage/gmd:processStep' +} + +# loose definition of BCP47-like strings +BCP_47_LANGUAGE = u'^[a-z]{2,8}(-[0-9a-zA-Z]{1,8})*$' + +DATASET_DEFAULT_SCHEMA = [ + 'id', + 'type', + 'isopen', + ] + +RESOURCE_DEFAULT_SCHEMA = [ + 'url', + 'name', + ] + + +DATE_FIELDS = [ + {'field_name': 'created', 'fallback': 'issued', 'default_value': None, 'override': True, 'dtype': str}, + {'field_name': 'issued', 'fallback': None, 'default_value': None, 'override': True, 'dtype': str}, + {'field_name': 'modified', 'fallback': 'issued', 'default_value': None, 'override': True, 'dtype': str}, + {'field_name': 'valid', 'fallback': None, 'default_value': None, 'override': True, 'dtype': str}, + {'field_name': 'temporal_start', 'fallback': None, 'default_value': None, 'override': True, 'dtype': str}, + {'field_name': 'temporal_end', 'fallback': None, 'default_value': None, 'override': True, 'dtype': str} +] + +DATASET_DEFAULT_FIELDS = [ + {'field_name': 'id', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, + {'field_name': 'name', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, + {'field_name': 'title', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, + {'field_name': 'notes', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, + {'field_name': 'description', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, + {'field_name': 
'access_rights', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['access_rights'], 'override': True, 'dtype': str}, + {'field_name': 'license', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['license'], 'override': True, 'dtype': str}, + {'field_name': 'license_id', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['license_id'], 'override': True, 'dtype': str}, + {'field_name': 'topic', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['topic'], 'override': True, 'dtype': str}, + {'field_name': 'theme', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['theme'], 'override': True, 'dtype': str}, + {'field_name': 'theme_eu', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['theme_eu'], 'override': True, 'dtype': str}, + {'field_name': 'status', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['status'], 'override': True, 'dtype': str}, + {'field_name': 'hvd_category', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, +] + +RESOURCE_DEFAULT_FIELDS = [ + {'field_name': 'url', 'fallback': None, 'default_value': "", 'override': False, 'dtype': str}, + {'field_name': 'name', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, + {'field_name': 'format', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, + {'field_name': 'protocol', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, + {'field_name': 'mimetype', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, + {'field_name': 'description', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, + {'field_name': 'license', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['license'], 'override': True, 'dtype': str}, + {'field_name': 'license_id', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['license_id'], 'override': True, 'dtype': str}, + 
{'field_name': 'rights', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['rights'], 'override': True, 'dtype': str}, + {'field_name': 'language', 'fallback': None, 'default_value': OGC2CKAN_HARVESTER_MD_CONFIG['language'], 'override': False, 'dtype': str}, + {'field_name': 'conforms_to', 'fallback': None, 'default_value': None, 'override': False, 'dtype': str}, + {'field_name': 'size', 'fallback': None, 'default_value': 0, 'override': True, 'dtype': int}, +] + +# Custom rules for harvesters.base._update_custom_format() +CUSTOM_FORMAT_RULES = [ + { + 'format_strings': ['esri', 'arcgis'], + 'url_string': 'viewer.html?url=', + 'format': 'HTML', + 'mimetype': 'https://www.iana.org/assignments/media-types/text/html' + }, + { + 'format_strings': ['html', 'html5'], + 'url_string': None, + 'format': 'HTML', + 'mimetype': 'https://www.iana.org/assignments/media-types/text/html' + }, + { + 'format_strings': None, + 'url_string': 'getrecordbyid', + 'format': 'XML', + 'mimetype': 'https://www.iana.org/assignments/media-types/application/xml' + } + # Add more rules here as needed +] + +DATADICTIONARY_DEFAULT_SCHEMA = [ + 'id', + 'type', + 'label', + 'notes', + 'type_override' + ] + +# Common date formats for parsing. 
https://docs.python.org/es/3/library/datetime.html#strftime-and-strptime-format-codes +COMMON_DATE_FORMATS = [ + '%Y-%m-%d', + '%d-%m-%Y', + '%m-%d-%Y', + '%Y/%m/%d', + '%d/%m/%Y', + '%m/%d/%Y', + '%Y-%m-%d %H:%M:%S', # Date with time + '%d-%m-%Y %H:%M:%S', # Date with time + '%m-%d-%Y %H:%M:%S', # Date with time + '%Y/%m/%d %H:%M:%S', # Date with time + '%d/%m/%Y %H:%M:%S', # Date with time + '%m/%d/%Y %H:%M:%S', # Date with time + '%Y-%m-%dT%H:%M:%S', # ISO 8601 format + '%Y-%m-%dT%H:%M:%SZ', # ISO 8601 format with Zulu time indicator +] +# Vocabs +SCHEMINGDCAT_DEFAULT_DATASET_SCHEMA_NAME: typing.Final[str] = "dataset" +SCHEMINGDCAT_INSPIRE_THEMES_VOCAB: typing.Final[str] = "theme" +SCHEMINGDCAT_DCAT_THEMES_VOCAB: typing.Final[list] = ["theme_es", "theme_eu"] +SCHEMINGDCAT_ISO19115_TOPICS_VOCAB: typing.Final[list] = "topic" + + +# Clean ckan names +URL_REGEX = re.compile( + r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' +) + +# Compile the regular expression +INVALID_CHARS = re.compile(r"[^a-zñ0-9_.-]") + +# Define a dictionary to map accented characters to their unaccented equivalents except ñ +ACCENT_MAP = str.maketrans({ + "á": "a", "à": "a", "ä": "a", "â": "a", "ã": "a", + "é": "e", "è": "e", "ë": "e", "ê": "e", + "í": "i", "ì": "i", "ï": "i", "î": "i", + "ó": "o", "ò": "o", "ö": "o", "ô": "o", "õ": "o", + "ú": "u", "ù": "u", "ü": "u", "û": "u", + "ñ": "ñ", +}) + +URL_FIELD_NAMES = { + 'dataset': + ['dcat_type', 'theme_es', 'language', 'topic', 'maintainer_url', 'tag_uri', 'contact_uri', 'contact_url', 'publisher_identifier', 'publisher_uri', 'publisher_url', 'publisher_type', 'maintainer_uri', 'maintainer_url', 'author_uri', 'author_url', 'conforms_to', 'theme', 'reference_system', 'spatial_uri', 'representation_type', 'license_id', 'access_rights', 'graphic_overview', 'frequency', 'hvd_category'], + 'resource': + ['url', 'availability', 'mimetype', 'status', 'resource_relation', 'license', 'rights', 'conforms_to', 
'reference_system'] + } EMAIL_FIELD_NAMES = ['publisher_email', 'maintainer_email', 'author_email', ] \ No newline at end of file diff --git a/ckanext/schemingdcat/harvesters/base.py b/ckanext/schemingdcat/harvesters/base.py index 0241645b..a4c87863 100644 --- a/ckanext/schemingdcat/harvesters/base.py +++ b/ckanext/schemingdcat/harvesters/base.py @@ -30,6 +30,8 @@ from ckanext.schemingdcat.lib.field_mapping import FieldMappingValidator from ckanext.schemingdcat.config import ( + DATASET_DEFAULT_SCHEMA, + RESOURCE_DEFAULT_SCHEMA, mimetype_base_uri, OGC2CKAN_HARVESTER_MD_CONFIG, OGC2CKAN_MD_FORMATS, @@ -40,7 +42,8 @@ DATADICTIONARY_DEFAULT_SCHEMA, URL_REGEX, INVALID_CHARS, - ACCENT_MAP + ACCENT_MAP, + slugify_pat ) log = logging.getLogger(__name__) @@ -241,19 +244,14 @@ def _get_local_schema(self, schema_type="dataset"): def _get_remote_schema(self, base_url, schema_type="dataset"): """ Fetches the remote schema for a given base URL and schema type. - + Args: base_url (str): The base URL of the remote server. schema_type (str, optional): The type of schema to fetch. Defaults to 'dataset'. - + Returns: - dict: The remote schema as a dictionary. - - Raises: - HarvesterBase.ContentFetchError: If there is an error fetching the remote schema content. - ValueError: If there is an error decoding the remote schema content. - KeyError: If the remote schema content does not contain the expected result. - + dict: The remote schema as a dictionary, or None if there is an error. 
+ """ url = ( base_url @@ -264,12 +262,16 @@ def _get_remote_schema(self, base_url, schema_type="dataset"): try: content = self._get_content(url) content_dict = json.loads(content) - return content_dict["result"] - except (HarvesterBase.ContentFetchError, ValueError, KeyError): - log.debug("Could not fetch/decode remote schema") - raise HarvesterBase.RemoteResourceError( - "Could not fetch/decode remote schema" - ) + log.debug('content_dict: %s', content_dict) + + # Check if content_dict is a dictionary and contains 'result'. + if isinstance(content_dict, dict) and "result" in content_dict: + return content_dict["result"] + else: + return None + except (ContentFetchError, ValueError, KeyError) as e: + log.debug("Could not fetch/decode remote schema: %s", e) + return None def _get_local_required_lang(self): """ @@ -480,6 +482,97 @@ def _standardize_field_mapping_v1(self, field_mapping): # If the value is not a dictionary, it is a single-language field standardized_mapping[key] = {'field_name': value} return standardized_mapping + + def _standardize_ckan_dict_from_field_mapping(self, dataset, field_mapping): + """ + Standardizes a CKAN dataset dictionary according to the provided field mapping. + + Args: + dataset (dict): The CKAN dataset dictionary. + field_mapping (dict): The mapping of local field names to remote field names or values. + + Returns: + dict: The standardized CKAN dataset dictionary. + """ + def normalize_key(key): + """ + Helper function to normalize the key by converting to lowercase and replacing non-alphanumeric characters with underscores. + """ + return slugify_pat.sub('_', key.lower()) + + def get_extra_value(extras, key): + """ + Helper function to get the value from the extras list where the key matches (case insensitive and normalized). 
+ """ + normalized_key = normalize_key(key) + for item in extras: + if normalize_key(item['key']) == normalized_key: + return item['value'] + return None + + def apply_field_mapping(d, mapping): + new_dict = {} + for local_field, remote_info in mapping.items(): + if 'field_name' in remote_info: + remote_field = remote_info['field_name'] + if remote_field and remote_field.startswith('extras.'): + extra_key = remote_field.split('.', 1)[1] + extra_value = get_extra_value(d.get('extras', []), extra_key) + if extra_value is not None: + new_dict[local_field] = extra_value + elif remote_field in d: + new_dict[local_field] = d[remote_field] + if 'field_value' in remote_info: + new_dict[local_field] = remote_info['field_value'] + if 'languages' in remote_info: + for lang, lang_info in remote_info['languages'].items(): + if 'field_name' in lang_info: + remote_field = lang_info['field_name'] + if remote_field and remote_field.startswith('extras.'): + extra_key = remote_field.split('.', 1)[1] + extra_value = get_extra_value(d.get('extras', []), extra_key) + if extra_value is not None: + if local_field not in new_dict: + new_dict[local_field] = {} + new_dict[local_field][lang] = extra_value + elif remote_field in d: + if local_field not in new_dict: + new_dict[local_field] = {} + new_dict[local_field][lang] = d[remote_field] + if 'field_value' in lang_info: + if local_field not in new_dict: + new_dict[local_field] = {} + new_dict[local_field][lang] = lang_info['field_value'] + return new_dict + + # Apply dataset field mapping + dataset_field_mapping = field_mapping.get('dataset_field_mapping', {}) + standardized_dataset = apply_field_mapping(dataset, dataset_field_mapping) + + # Ensure default schema fields are included in the dataset + for field in DATASET_DEFAULT_SCHEMA: + if field in dataset: + standardized_dataset[field] = dataset[field] + + # Maintain the tags list + standardized_dataset['tags'] = dataset.get('tags', []) + + # Apply distribution field mapping to each 
resource + distribution_field_mapping = field_mapping.get('distribution_field_mapping', {}) + standardized_resources = [] + for resource in dataset.get('resources', []): + standardized_resource = apply_field_mapping(resource, distribution_field_mapping) + + # Ensure default schema fields are included in each resource + for field in RESOURCE_DEFAULT_SCHEMA: + if field in resource: + standardized_resource[field] = resource[field] + + standardized_resources.append(standardized_resource) + + standardized_dataset['resources'] = standardized_resources + + return standardized_dataset def _standardize_df_fields_from_field_mapping(self, df, field_mapping): """ @@ -712,16 +805,21 @@ def get_mapped_fields(fields, field_mapping): log.debug("Validating remote schema from: %s", remote_ckan_base_url) if self._remote_schema is None: self._remote_schema = self._get_remote_schema(remote_ckan_base_url) - - remote_datasets_colnames = set( - field["field_name"] - for field in self._remote_schema["dataset_fields"] - ) - remote_distributions_colnames = set( - field["field_name"] - for field in self._remote_schema["resource_fields"] - ) - + + if self._remote_schema is not None: + remote_datasets_colnames = set( + field["field_name"] + for field in self._remote_schema["dataset_fields"] + ) + remote_distributions_colnames = set( + field["field_name"] + for field in self._remote_schema["resource_fields"] + ) + else: + log.warning("Failed to retrieve remote schema from: %s. Using local schema by default.", remote_ckan_base_url) + remote_datasets_colnames = set() + remote_distributions_colnames = set() + elif remote_dataset_field_names is not None: log.debug( "Validating remote schema using field names from package dict" @@ -969,21 +1067,6 @@ def _set_translated_fields(self, package_dict): ReadError: If there is an error translating the dataset. 
""" - basic_fields = [ - "id", - "name", - "title", - "title_translated", - "notes_translated", - "provenance", - "notes", - "provenance", - "private", - "groups", - "tags", - "tag_string", - "owner_org", - ] if ( not hasattr(self, "_mapped_schema") or "dataset_fields" not in self._mapped_schema @@ -1688,6 +1771,36 @@ def _clean_name(self, name): return name + def _fill_translated_properties(self, package_dict): + """ + Fills properties without the _translated suffix using the default language or the first available translation. + + Args: + package_dict (dict): The package dictionary to be modified. + default_language (str): The default language of the instance. + + Returns: + dict: The modified package dictionary. + """ + default_lang = self._get_local_required_lang() + + for key in list(package_dict.keys()): + if key.endswith('_translated'): + base_key = key[:-11] # Remove '_translated' suffix + translations = package_dict[key] + + # Use the default language if available + if default_lang and default_lang in translations: + package_dict[base_key] = translations[default_lang] + else: + # Use the first available translation with a value + for lang, value in translations.items(): + if value: + package_dict[base_key] = value + break + + return package_dict + def _create_or_update_package( self, package_dict, harvest_object, package_dict_form="rest" ): @@ -2019,19 +2132,15 @@ def _log_export_clean_datasets_and_ids(self, harvest_source_title, clean_dataset class ContentFetchError(Exception): pass - class ContentNotFoundError(ContentFetchError): pass - class RemoteResourceError(Exception): pass - class SearchError(Exception): pass - class ReadError(Exception): pass diff --git a/ckanext/schemingdcat/harvesters/ckan.py b/ckanext/schemingdcat/harvesters/ckan.py index 88b53363..c070c205 100644 --- a/ckanext/schemingdcat/harvesters/ckan.py +++ b/ckanext/schemingdcat/harvesters/ckan.py @@ -15,18 +15,16 @@ import ckan.logic as logic import uuid -from 
ckanext.harvest.harvesters.ckanharvester import ( - CKANHarvester, - ContentFetchError, - ContentNotFoundError, - RemoteResourceError, - SearchError, -) from ckanext.schemingdcat.harvesters.base import ( SchemingDCATHarvester, RemoteSchemaError, + ReadError, + ContentFetchError, + SearchError, + RemoteResourceError ) +from ckanext.schemingdcat.lib.field_mapping import FieldMappingValidator log = logging.getLogger(__name__) @@ -92,6 +90,9 @@ def validate_config(self, config): # Check basic validation config self._set_basic_validate_config(config) + # Instance field_mapping validator + field_mapping_validator = FieldMappingValidator() + # Check if the schema is specified if "schema" in config_obj: schema = config_obj["schema"] @@ -135,37 +136,33 @@ def validate_config(self, config): ): config = json.dumps({**config_obj, "remote_orgs": "only_local"}) - # Validate if exists a JSON contained the mapping field_names between the remote schema and the local schema - for mapping_name in ["dataset_field_mapping", "distribution_field_mapping"]: + # Check if 'field_mapping_schema_version' exists in the config + field_mapping_schema_version_error_message = f'Insert the schema version: "field_mapping_schema_version: ", one of: {", ".join(map(str, self._field_mapping_validator_versions))} . 
More info: https://github.com/mjanez/ckanext-schemingdcat?tab=readme-ov-file#remote-google-sheetonedrive-excel-metadata-upload-harvester' + if 'field_mapping_schema_version' not in config_obj and 'dataset_field_mapping' in config_obj: + raise ValueError(field_mapping_schema_version_error_message) + else: + # Check if is an integer and if it is in the versions + if not isinstance(config_obj['field_mapping_schema_version'], int) or config_obj['field_mapping_schema_version'] not in self._field_mapping_validator_versions: + raise ValueError(field_mapping_schema_version_error_message) + + # Validate if exists a JSON contained the mapping field_names between the remote schema and the local schema + for mapping_name in self._field_mapping_info.keys(): if mapping_name in config: field_mapping = config_obj[mapping_name] if not isinstance(field_mapping, dict): - raise ValueError(f"{mapping_name} must be a dictionary") - - # Check if the config is a valid mapping - for local_field, remote_field in field_mapping.items(): - if not isinstance(local_field, basestring): - raise ValueError('"local_field_name" must be a string') - if not isinstance(remote_field, (basestring, dict)): - raise ValueError( - '"remote_field_name" must be a string or a dictionary' - ) - if isinstance(remote_field, dict): - for lang, remote_field_name in remote_field.items(): - if not isinstance(lang, basestring) or not isinstance( - remote_field_name, basestring - ): - raise ValueError( - 'In translated fields, both language and remote_field_name must be strings. e.g. 
"notes_translated": {"es": "notes-es"}' - ) - if not re.match("^[a-z]{2}$", lang): - raise ValueError( - "Language code must be a 2-letter ISO 639-1 code" - ) + raise ValueError(f'{mapping_name} must be a dictionary') + + schema_version = config_obj['field_mapping_schema_version'] + + try: + # Validate field_mappings acordin schema versions + field_mapping = field_mapping_validator.validate(field_mapping, schema_version) + except ValueError as e: + raise ValueError(f"The field mapping is invalid: {e}") from e config = json.dumps({**config_obj, mapping_name: field_mapping}) - return config + return config def gather_stage(self, harvest_job): """ @@ -181,7 +178,7 @@ def gather_stage(self, harvest_job): harvest_source_title = harvest_job.source.title remote_ckan_base_url = harvest_job.source.url.rstrip("/") - log.debug('In SchemingDCATCKANHarvester gather_stage with harvest source: %s and database URL: %s', harvest_source_title, remote_ckan_base_url) + log.debug('In SchemingDCATCKANHarvester gather_stage with harvest source: %s and URL: %s', harvest_source_title, remote_ckan_base_url) # Get config options toolkit.requires_ckan_version(min_version="2.0") @@ -274,6 +271,23 @@ def gather_stage(self, harvest_job): ) return [] + + # Check if the content_dicts colnames correspond to the local schema + try: + # Standardizes the field_mapping + field_mappings = { + 'dataset_field_mapping': self._standardize_field_mapping(self.config.get("dataset_field_mapping")), + 'distribution_field_mapping': self._standardize_field_mapping(self.config.get("distribution_field_mapping")), + 'datadictionary_field_mapping': None + } + + except RemoteSchemaError as e: + self._save_gather_error('Error standardize field mapping: {0}'.format(e), harvest_job) + return [] + + except ReadError as e: + self._save_gather_error('Error generating default values for dataset/distribution config field mappings: {0}'.format(e), harvest_job) + # Create harvest objects for each dataset try: package_ids = set() 
@@ -284,18 +298,15 @@ def gather_stage(self, harvest_job): if self.config.get("dataset_field_mapping") is None and self.config.get("distribution_field_mapping") is None: log.warning('If no *_field_mapping is provided in the configuration for validation, fields are automatically mapped to the local schema.') else: - # Standardizes the field_mapping - remote_dataset_field_mapping = self._standardize_field_mapping(self.config.get("dataset_field_mapping")) - remote_distribution_field_mapping = self._standardize_field_mapping(self.config.get("distribution_field_mapping")) - - log.debug('remote_dataset_field_mapping: %s', remote_dataset_field_mapping) - log.debug('remote_distribution_field_mapping: %s', remote_distribution_field_mapping) + # Standardizes the field_mapping + log.debug('remote_dataset_field_mapping: %s', field_mappings.get('dataset_field_mapping')) + log.debug('remote_distribution_field_mapping: %s', field_mappings.get('distribution_field_mapping')) self._validate_remote_schema( remote_dataset_field_names=None, remote_ckan_base_url=remote_ckan_base_url, remote_resource_field_names=None, - remote_dataset_field_mapping=remote_dataset_field_mapping, - remote_distribution_field_mapping=remote_distribution_field_mapping, + remote_dataset_field_mapping=field_mappings.get('dataset_field_mapping'), + remote_distribution_field_mapping=field_mappings.get('distribution_field_mapping'), ) except RemoteSchemaError as e: self._save_gather_error( @@ -312,6 +323,17 @@ def gather_stage(self, harvest_job): pkg_dict["id"], ) continue + + # Check if the content_dicts colnames correspond to the local schema + try: + #log.debug('content_dicts: %s', content_dicts) + # Standardizes the field names + pkg_dict = self._standardize_ckan_dict_from_field_mapping(pkg_dict, field_mappings) + log.debug('Standardized package dict: %s', pkg_dict) + except RemoteSchemaError as e: + self._save_gather_error('Error standarize remote dataset: {0}'.format(e), harvest_job) + return [] + 
package_ids.add(pkg_dict["id"]) # Set translated fields @@ -319,6 +341,8 @@ def gather_stage(self, harvest_job): log.debug( "Creating HarvestObject for %s %s", pkg_dict["name"], pkg_dict["id"] ) + log.debug('Translated package dict: %s', pkg_dict) + obj = HarvestObject( guid=pkg_dict["id"], job=harvest_job, content=json.dumps(pkg_dict) ) @@ -427,6 +451,9 @@ def modify_package_dict(self, package_dict, harvest_object): """ # Clean up any existing extras already in package_dict package_dict = self._remove_duplicate_keys_in_extras(package_dict) + + # Check basic fields without translations + package_dict = self._fill_translated_properties(package_dict) return package_dict @@ -548,11 +575,13 @@ def import_stage(self, harvest_object): # key. resource.pop("revision_id", None) + log.debug('package_dict BEFORE MODIFY: %s', package_dict) package_dict = self.modify_package_dict(package_dict, harvest_object) - result = self._create_or_update_package( package_dict, harvest_object, package_dict_form="package_show" ) + log.debug('package_dict AFTER MODIFY: %s', package_dict) + # Log package_dict, package dict is a dict log.debug("Package create or update: %s", result) @@ -590,4 +619,5 @@ def get_package_dict(self, harvest_object, context, package_dict=None): resource['id'] = str(uuid.uuid4()) resource.pop('dataset_id', None) - return package_dict \ No newline at end of file + return package_dict + diff --git a/ckanext/schemingdcat/helpers.py b/ckanext/schemingdcat/helpers.py index 55ab0491..5e61bfc1 100644 --- a/ckanext/schemingdcat/helpers.py +++ b/ckanext/schemingdcat/helpers.py @@ -1,1353 +1,1353 @@ -from ckan.common import json, c, request, is_flask_request -from ckan.lib import helpers as ckan_helpers -import ckan.logic as logic -from ckan import model -from ckan.lib.i18n import get_available_locales, get_lang -import ckan.plugins as p -import six -import re -import yaml -from yaml.loader import SafeLoader -from pathlib import Path -from functools import lru_cache -import 
datetime -import typing -from urllib.parse import urlparse -from urllib.error import URLError - -from six.moves.urllib.parse import urlencode - -from ckanext.scheming.helpers import ( - scheming_choices_label, - scheming_language_text, - scheming_dataset_schemas, - scheming_get_schema -) - -from ckanext.harvest.helpers import ( - get_harvest_source -) -from ckanext.harvest.utils import ( - DATASET_TYPE_NAME -) - -import ckanext.schemingdcat.config as sdct_config -from ckanext.schemingdcat.utils import ( - get_facets_dict, - public_file_exists, - public_dir_exists, -) -from ckanext.dcat.utils import CONTENT_TYPES, get_endpoint -from ckanext.fluent.validators import LANG_SUFFIX -import logging - -log = logging.getLogger(__name__) - -all_helpers = {} -prettify_cache = {} -DEFAULT_LANG = None - -@lru_cache(maxsize=None) -def get_scheming_dataset_schemas(): - """ - Retrieves the dataset schemas using the scheming_dataset_schemas function. - Caches the result using the LRU cache decorator for efficient retrieval. - """ - return scheming_dataset_schemas() - - -def helper(fn): - """Collect helper functions into the ckanext.schemingdcat.all_helpers dictionary. - - Args: - fn (function): The helper function to add to the dictionary. - - Returns: - function: The helper function. - """ - all_helpers[fn.__name__] = fn - return fn - - -@helper -def schemingdcat_get_schema_names(): - """ - Get the names of all the schemas defined for the Scheming DCAT extension. - - Returns: - list: A list of schema names. - """ - schemas = get_scheming_dataset_schemas() - - return [schema["schema_name"] for schema in schemas.values()] - - -@helper -def schemingdcat_default_facet_search_operator(): - """Return the default facet search operator: AND/OR. - - Returns: - str: The default facet search operator. 
- """ - facet_operator = sdct_config.default_facet_operator - if facet_operator and ( - facet_operator.upper() == "AND" or facet_operator.upper() == "OR" - ): - facet_operator = facet_operator.upper() - else: - facet_operator = "AND" - return facet_operator - - -@helper -def schemingdcat_decode_json(json_text): - """Convert a JSON string to a Python object. - - Args: - json_text (str): The JSON string to convert. - - Returns: - object: A Python object representing the JSON data. - """ - return json.loads(json_text) - - -@helper -def schemingdcat_organization_name(org_id): - """Return the name of the organization from its ID. - - Args: - org_id (dict): A dictionary containing the ID of the organization. - - Returns: - str: The name of the organization, or None if the organization cannot be found. - """ - org_name = None - try: - org_dic = ckan_helpers.get_organization(org_id["display_name"]) - if org_dic is not None: - org_name = org_dic["name"] - else: - log.warning( - "Could not find the name of the organization with ID {0}".format( - org_id["display_name"] - ) - ) - except Exception as e: - log.error( - "Exception while trying to find the name of the organization: {0}".format(e) - ) - return org_name - - -@helper -def schemingdcat_get_facet_label(facet): - """Return the label for a given facet. - - Args: - facet (str): The name of the facet. - - Returns: - str: The label for the given facet. - """ - return get_facets_dict[facet] - - -@helper -def schemingdcat_get_facet_items_dict( - facet, search_facets=None, limit=None, exclude_active=False, scheming_choices=None -): - """Return the list of unselected facet items for the given facet, sorted - by count. - - Returns the list of unselected facet contraints or facet items (e.g. tag - names like "russian" or "tolstoy") for the given search facet (e.g. - "tags"), sorted by facet item count (i.e. the number of search results that - match each facet item). 
- - Reads the complete list of facet items for the given facet from - c.search_facets, and filters out the facet items that the user has already - selected. - - List of facet items are ordered acording the faccet_sort parameter - - Arguments: - facet -- the name of the facet to filter. - search_facets -- dict with search facets(c.search_facets in Pylons) - limit -- the max. number of facet items to return. - exclude_active -- only return unselected facets. - scheming_choices -- scheming choices to use to get label from value. - - """ - - # log.debug("Returning facets for: {0}".format(facet)) - - order = "default" - items = [] - - search_facets = search_facets or getattr(c, "search_facets", None) - - if ( - search_facets - and isinstance(search_facets, dict) - and search_facets.get(facet, {}).get("items") - ): - for facet_item in search_facets.get(facet)["items"]: - if scheming_choices: - facet_item["label"] = scheming_choices_label( - scheming_choices, facet_item["name"] - ) - else: - facet_item["label"] = facet_item["display_name"] - - if not len(facet_item["name"].strip()): - continue - - params_items = ( - request.params.items(multi=True) - if is_flask_request() - else request.params.items() - ) - - if not (facet, facet_item["name"]) in params_items: - items.append(dict(active=False, **facet_item)) - elif not exclude_active: - items.append(dict(active=True, **facet_item)) - - # log.debug("params: {0}:{1}".format( - # facet,request.params.getlist("_%s_sort" % facet))) - order_lst = request.params.getlist("_%s_sort" % facet) - if len(order_lst): - order = order_lst[0] - # Sort descendingly by count and ascendingly by case-sensitive display name - # items.sort(key=lambda it: (-it['count'], it['display_name'].lower())) - sorts = { - "name": ("label", False), - "name_r": ("label", True), - "count": ("count", False), - "count_r": ("count", True), - } - if sorts.get(order): - items.sort( - key=lambda it: (it[sorts.get(order)[0]]), reverse=sorts.get(order)[1] - ) - 
else: - items.sort(key=lambda it: (-it["count"], it["label"].lower())) - - if hasattr(c, "search_facets_limits"): - if c.search_facets_limits and limit is None: - limit = c.search_facets_limits.get(facet) - # zero treated as infinite for hysterical raisins - if limit is not None and limit > 0: - return items[:limit] - - return items - - -@helper -def schemingdcat_new_order_url(facet_name, order_concept, extras=None): - """Return a URL with the order parameter for the given facet and concept to use. - - Based on the actual order, it rotates cyclically from no order -> direct order -> inverse order over the given concept. - - Args: - facet_name (str): The name of the facet to order. - order_concept (str): The concept (name or count) that will be used to order. - extras (dict, optional): Extra parameters to include in the URL. - - Returns: - str: The URL with the order parameter for the given facet and concept. - """ - old_order = None - order_param = "_%s_sort" % facet_name - order_lst = request.params.getlist(order_param) - if not extras: - extras = {} - - controller = getattr(c, "controller", False) or request.blueprint - action = getattr(c, "action", False) or p.toolkit.get_endpoint()[1] - url = ckan_helpers.url_for(controller=controller, action=action, **extras) - - if len(order_lst): - old_order = order_lst[0] - - order_mapping = { - "name": {"name": "name_r", "name_r": None, None: "name"}, - "count": {"count": "count_r", "count_r": None, None: "count"}, - } - - new_order = order_mapping.get(order_concept, {}).get(old_order) - - params_items = ( - request.params.items(multi=True) - if is_flask_request() - else request.params.items() - ) - params_nopage = [(k, v) for k, v in params_items if k != order_param] - - if new_order: - params_nopage.append((order_param, new_order)) - - if params_nopage: - url = url + "?" 
+ urlencode(params_nopage) - - return url - -@helper -def schemingdcat_get_facet_list_limit(): - """ - Retrieves the limit for the facet list from the scheming DCAT configuration. - - Returns: - int: The limit for the facet list. - """ - return sdct_config.facet_list_limit - -@helper -def schemingdcat_get_icons_dir(field=None, field_name=None): - """ - Returns the defined icons directory for a given scheming field definition or field name. - - This function is used to retrieve the icons directory associated with a - specific field in a scheming dataset or directly by field name. If no icons directory is defined, - the function will return None. - - Args: - field (dict, optional): A dictionary representing the scheming field definition. - This should include all the properties of the field, - including the icons directory if one is defined. - field_name (str, optional): The name of the field. If provided, the function will - look for an icons directory with this name. - - Returns: - str: A string representing the icons directory for the field or field name. - If no icons directory is defined or found, the function will return None. - """ - if field: - if "icons_dir" in field: - return field["icons_dir"] - - if "field_name" in field: - dir = sdct_config.icons_dir + "/" + field["field_name"] - if public_dir_exists(dir): - return dir - - elif field_name: - dir = sdct_config.icons_dir + "/" + field_name - if public_dir_exists(dir): - return dir - - return None - -@helper -def schemingdcat_get_default_icon(field): - """Return the defined default icon for a scheming field definition. - - Args: - field (dict): The scheming field definition. - - Returns: - str: The defined default icon, or None if not found. - """ - if "default_icon" in field: - return field["default_icon"] - -@helper -def schemingdcat_get_default_package_item_icon(): - """ - Returns the default icon defined for a given scheming field definition. 
- - This function is used to retrieve the default icon associated with a - specific field in a scheming dataset. If no default icon is defined, - the function will return None. - - Args: - field (dict): A dictionary representing the scheming field definition. - This should include all the properties of the field, - including the default icon if one is defined. - - Returns: - str: A string representing the default icon for the field. This could - be a URL, a data URI, or any other string format used to represent - images. If no default icon is defined for the field, the function - will return None. - """ - return sdct_config.default_package_item_icon - -@helper -def schemingdcat_get_default_package_item_show_spatial(): - """ - Returns the configuration value for showing spatial information in the default package item. - - This function is used to retrieve the configuration value that determines - whether the spatial information should be shown in the default package item. - If no value is defined in the configuration, the function will return None. - - Returns: - bool: A boolean value representing whether the spatial information should - be shown in the default package item. If no value is defined in the - configuration, the function will return None. - """ - return sdct_config.default_package_item_show_spatial - -@helper -def schemingdcat_get_show_metadata_templates_toolbar(): - """ - Returns the configuration value for showing the metadata templates toolbar. - - This function is used to retrieve the configuration value that determines - whether the metadata templates toolbar should be shown or not. If the configuration - value is not set, the function will return False. - - Returns: - bool: A boolean value representing whether the metadata templates toolbar - should be shown. If the configuration value is not set, the function - will return False. 
- """ - return sdct_config.show_metadata_templates_toolbar - -@helper -def schemingdcat_get_metadata_templates_search_identifier(): - """ - Returns the default icon defined for a given scheming field definition. - - This function is used to retrieve the default value to retrieve metadata templates. If no default value is defined, - the function will return None. - - Args: - field (dict): A dictionary representing the scheming field definition. - This should include all the properties of the field, - including the default icon if one is defined. - - Returns: - str: A string representing the default icon for the field. This could - be a URL, a data URI, or any other string format used to represent - images. If no default icon is defined for the field, the function - will return None. - """ - return sdct_config.metadata_templates_search_identifier - -@helper -def schemingdcat_get_schemingdcat_xls_harvest_templates(search_identifier=sdct_config.metadata_templates_search_identifier, count=10): - """ - This helper function retrieves the schemingdcat_xls templates from the CKAN instance. - It uses the 'package_search' action of the CKAN logic layer to perform a search with specific parameters. - - Parameters: - search_identifier (str): The text to search in the identifier. Default is sdct_config.metadata_templates_search_identifier. - count (int): The number of featured datasets to retrieve. Default is 10. - - Returns: - list: A list of dictionaries, each representing a featured dataset. If no results are found, returns None. 
- """ - fq = f'+extras_schemingdcat_xls_metadata_template:{True}' - search_dict = { - 'fq': fq, - 'fl': 'name,extras_identifier,title,notes,metadata_modified,extras_title_translated,extras_notes_translated', - 'rows': count - } - context = {'model': model, 'session': model.Session} - result = logic.get_action('package_search')(context, search_dict) - - if not result['results']: - fq = f'+extras_schemingdcat_xls_metadata_template:*{search_identifier}*' - search_dict['fq'] = fq - result = logic.get_action('package_search')(context, search_dict) - - return result['results'] if result['results'] else None - -@helper -def schemingdcat_get_icon( - choice=None, icons_dir=None, default="/images/default/no_icon.svg", choice_value=None -): - """Return the relative URL to the icon for the item. - - Args: - choice (dict, optional): The choice selected for the field. - icons_dir (str, optional): The path to search for the icon. Usually the common path for icons for this field. - default (str, optional): The default value to return if no icon is found. - choice_value (str, optional): The value of the choice selected for the field. If provided, it will be used instead of choice['value']. - - Returns: - str: The relative URL to the icon, or the default value if not found. 
- """ - extensions = [".svg", ".png", ".jpg", ".jpeg", ".gif"] - icon_name = None - - if choice_value is None and choice: - choice_value = choice.get("icon") or choice.get("value") - - if choice_value: - if ckan_helpers.is_url(choice_value): - url_parts = choice_value.split("/") - - if len(url_parts) == 1: - icon_name = url_parts[-1].lower() - else: - icon_name = url_parts[-2].lower() + "/" + url_parts[-1].lower() - else: - icon_name = choice_value - - url_path = (icons_dir + "/" if icons_dir else "") + icon_name - - for extension in extensions: - if public_file_exists(url_path + extension): - return url_path + extension - - return default - -@helper -def schemingdcat_get_choice_item(field, value): - """Return the whole choice item for the given value in the scheming field. - - Args: - field (dict): The scheming field to look for the choice item in. - value (str): The option item value. - - Returns: - dict: The whole option item in scheming, or None if not found. - """ - if field and ("choices" in field): - # log.debug("Searching: {0} en {1}".format(value,field['choices'])) - for choice in field["choices"]: - if choice["value"] == value: - return choice - - return None - -@helper -def schemingdcat_get_choice_property(choices, value, property): - """ - Retrieve a specific property from a choice dictionary based on the given value. - - Args: - choices (list): List of dictionaries containing "label" and "value" keys. - value (str): The value to match against the choices. - property (str): The property to retrieve from the matching choice dictionary. - - Returns: - str or None: The property value from the matching choice dictionary, or None if not found. - """ - for c in choices: - if c['value'] == value: - return c.get(property, None) - return None - - -@helper -def scheming_display_json_list(value): - """Return the object passed serialized as a JSON list. - - Args: - value (any): The object to serialize. 
- - Returns: - str: The serialized object as a JSON list, or the original value if it cannot be serialized. - """ - if isinstance(value, six.string_types): - return value - try: - return json.loads(value) - except (TypeError, ValueError): - return value - -@helper -def scheming_clean_json_value(value): - """Clean a JSON list value to avoid errors with: '"' and spaces. - - Args: - value (str): The object to serialize. - - Returns: - str: The cleaned value, or the original value if it cannot be cleaned. - """ - try: - value = value.strip(" ").replace('\\"', "%%%@#") - value = value.replace('"', "") - value = value.replace("%%%@#", '"') - return value - except (TypeError, ValueError): - return value - -def format_eli_label(parsed_url): - """ - Formats the label for a parsed URL with 'eli' segment. - - Args: - parsed_url (ParseResult): The parsed URL. - - Returns: - str: The formatted label. - """ - segments = parsed_url.path.split('/') - eli_index = next(i for i, segment in enumerate(segments) if segment == 'eli') - return '/'.join(segments[eli_index + 1:]).upper() - -@helper -def schemingdcat_prettify_url(url): - """ - Prettifies a URL by removing the protocol and trailing slash. - - Args: - url (str): The URL to prettify. - - Returns: - str: The prettified URL, or the original URL if an error occurred. - """ - if url in prettify_cache: - return prettify_cache[url] - - try: - prettified_url = re.sub(r"^https?://(?:www\.)?", "", url).rstrip("/") - prettify_cache[url] = prettified_url - return prettified_url - except (TypeError, AttributeError): - return url - -@helper -def schemingdcat_prettify_url_name(url): - """ - Prettifies a URL name by extracting the last segment and cleaning it. - - Args: - url (str): The URL to extract the name from. - - Returns: - str: The prettified URL name, or the original URL if an error occurred. 
- """ - if url is None: - return url - - if url in prettify_cache: - return prettify_cache[url] - - try: - parsed_url = urlparse(url) - - if '/eli/' in url: - prettified_url_name = format_eli_label(parsed_url) - else: - url_name = parsed_url.path.split("/")[-1].split('.')[0].replace('_', '-') - prettified_url_name = ' '.join(url_name.split(' ')[:4]) - - prettify_cache[url] = prettified_url_name - return prettified_url_name - - except (URLError, ValueError) as e: - print(f"Error while prettifying URL: {e}") - return url - -@helper -def schemingdcat_listify_str(values): - """Converts a string or list/tuple of strings to a list of strings. - - If `values` is already a list or tuple, it is returned as is. If `values` is a string, - it is split into a list of strings using commas as the delimiter. Each string in the - resulting list is stripped of leading/trailing whitespace and quotes. - - Args: - values (str or list or tuple): The value(s) to convert to a list of strings. - - Returns: - list: A list of strings. - """ - if isinstance(values, str): - values = values.strip("][").split(",") - values = [item.strip().strip('"') for item in values] - elif not isinstance(values, (list, tuple)): - log.debug("Not a list or string: {0}".format(values)) - values = [""] - - return values - -@helper -def schemingdcat_load_yaml(file, folder="codelists"): - """Load a YAML file from the folder, by default 'codelists' directory. - - Args: - file (str): The name of the YAML file to load. - - Returns: - dict: A dictionary containing the data from the YAML file. 
- """ - source_path = Path(__file__).resolve(True) - yaml_data = {} - try: - p = source_path.parent.joinpath(folder, file) - with open(p, "r") as f: - yaml_data = yaml.load(f, Loader=SafeLoader) - except FileNotFoundError: - log.error("The file {0} does not exist".format(file)) - except Exception as e: - log.error("Could not read configuration from {0}: {1}".format(file, e)) - - return yaml_data - -@helper -def schemingdcat_get_linked_data(id): - """Get linked data for a given identifier. - - Args: - id (str): The identifier to get linked data for. - - Returns: - list: A list of dictionaries containing linked data for the identifier. - """ - return [ - { - "name": name, - "display_name": sdct_config.linkeddata_links.get(name, {"display_name": content_type})[ - "display_name" - ], - "format": sdct_config.linkeddata_links.get(name, {}).get("format"), - "image_display_url": sdct_config.linkeddata_links.get(name, {}).get( - "image_display_url" - ), - "endpoint_icon": sdct_config.linkeddata_links.get(name, {}).get( - "endpoint_icon" - ), - "description": sdct_config.linkeddata_links.get(name, {}).get("description") - or f"Formats {content_type}", - "description_url": sdct_config.linkeddata_links.get(name, {}).get("description_url"), - "endpoint": "dcat.read_dataset", - "endpoint_data": { - "_id": id, - "_format": name, - }, - } - for name, content_type in CONTENT_TYPES.items() - ] - -@helper -def schemingdcat_get_catalog_endpoints(): - """Get the catalog endpoints. - - Returns: - list: A list of dictionaries containing linked data for the identifier. 
- """ - csw_uri = schemingdcat_get_geospatial_endpoint("catalog") - - return [ - { - "name": item["name"], - "display_name": item["display_name"], - "format": item["format"], - "image_display_url": item["image_display_url"], - "endpoint_icon": item["endpoint_icon"], - "fa_icon": item["fa_icon"], - "description": item["description"], - "type": item["type"], - "profile": item["profile"], - "profile_label": item["profile_label"], - "endpoint": get_endpoint("catalog") - if item.get("type").lower() == "lod" - else csw_uri.format(version=item["version"]) - if item.get("type").lower() == "ogc" - else None, - "endpoint_data": { - "_format": item["format"], - "_external": True, - "profiles": item["profile"], - }, - } - for item in sdct_config.endpoints["catalog_endpoints"] - ] - -@helper -def schemingdcat_get_geospatial_endpoint(type="dataset"): - """Get geospatial base URI for CSW Endpoint. - - Args: - type (str): The type of endpoint to return. Can be 'catalog' or 'dataset'. - - Returns: - str: The base URI of the CSW Endpoint with the appropriate format. - """ - try: - if sdct_config.geometadata_base_uri: - csw_uri = sdct_config.geometadata_base_uri - - if ( - sdct_config.geometadata_base_uri - and "/csw" not in sdct_config.geometadata_base_uri - ): - csw_uri = sdct_config.geometadata_base_uri.rstrip("/") + "/csw" - elif sdct_config.geometadata_base_uri == "": - csw_uri = "/csw" - else: - csw_uri = sdct_config.geometadata_base_uri.rstrip("/") - except: - csw_uri = "/csw" - - if type == "catalog": - return csw_uri + "?service=CSW&version={version}&request=GetCapabilities" - else: - return ( - csw_uri - + "?service=CSW&version={version}&request=GetRecordById&id={id}&elementSetName={element_set_name}&outputSchema={output_schema}&OutputFormat={output_format}" - ) - -@helper -def schemingdcat_get_geospatial_metadata(): - """Get geospatial metadata for CSW formats. - - Returns: - list: A list of dictionaries containing geospatial metadata for CSW formats. 
- """ - csw_uri = schemingdcat_get_geospatial_endpoint("dataset") - - return [ - { - "name": item["name"], - "display_name": item["display_name"], - "format": item["format"], - "image_display_url": item["image_display_url"], - "endpoint_icon": item["endpoint_icon"], - "description": item["description"], - "description_url": item["description_url"], - "url": csw_uri.format( - output_format=item["output_format"], - version=item["version"], - element_set_name=item["element_set_name"], - output_schema=item["output_schema"], - id="{id}", - ), - } - for item in sdct_config.geometadata_links["csw_formats"] - ] - -@helper -def schemingdcat_get_all_metadata(id): - """Get linked data and geospatial metadata for a given identifier. - - Args: - id (str): The identifier to get linked data and geospatial metadata for. - - Returns: - list: A list of dictionaries containing linked data and geospatial metadata for the identifier. - """ - geospatial_metadata = schemingdcat_get_geospatial_metadata() - linked_data = schemingdcat_get_linked_data(id) - - for metadata in geospatial_metadata: - metadata["endpoint_type"] = "csw" - - for data in linked_data: - data["endpoint_type"] = "dcat" - - return geospatial_metadata + linked_data - -@helper -def fluent_form_languages(field=None, entity_type=None, object_type=None, schema=None): - """ - Return a list of language codes for this form (or form field) - - 1. return field['form_languages'] if it is defined - 2. return schema['form_languages'] if it is defined - 3. get schema from entity_type + object_type then - return schema['form_languages'] if they are defined - 4. 
return languages from site configuration - """ - if field and "form_languages" in field: - return field["form_languages"] - if schema and "form_languages" in schema: - return schema["form_languages"] - if entity_type and object_type: - # late import for compatibility with older ckanext-scheming - from ckanext.scheming.helpers import scheming_get_schema - - schema = scheming_get_schema(entity_type, object_type) - if schema and "form_languages" in schema: - return schema["form_languages"] - - langs = [] - for l in get_available_locales(): - if l.language not in langs: - langs.append(l.language) - return langs - -@helper -def schemingdcat_fluent_form_label(field, lang): - """Returns a label for the input field in the specified language. - - If the field has a `fluent_form_label` defined, the label will be taken from there. - If a matching label cannot be found, this helper will return the standard label - with the language code in uppercase. - - Args: - field (dict): A dictionary representing the input field. - lang (str): A string representing the language code. - - Returns: - str: A string representing the label for the input field in the specified language. - """ - form_label = field.get("fluent_form_label", {}) - label = scheming_language_text(form_label.get(lang, field["label"])) - return f"{label} ({lang.upper()})" - -@helper -def schemingdcat_multiple_field_required(field, lang): - """ - Returns whether a field is required or not based on the field definition and language. - - Args: - field (dict): The field definition. - lang (str): The language to check for required fields. - - Returns: - bool: True if the field is required, False otherwise. 
- """ - if "required" in field: - return field["required"] - if "required_language" in field and field["required_language"] == lang: - return True - return "not_empty" in field.get("validators", "").split() - -def parse_json(value, default_value=None): - try: - return json.loads(value) - except (ValueError, TypeError, AttributeError): - if default_value is not None: - return default_value - return value - -@helper -def schemingdcat_get_default_lang(): - global DEFAULT_LANG - if DEFAULT_LANG is None: - DEFAULT_LANG = p.toolkit.config.get("ckan.locale_default", "en") - return DEFAULT_LANG - -@helper -def schemingdcat_get_current_lang(): - """ - Returns the current language of the CKAN instance. - - Returns: - str: The current language of the CKAN instance. If the language cannot be determined, the default language 'en' is returned. - """ - try: - return get_lang() - except TypeError: - return p.toolkit.config.get("ckan.locale_default", "en") - -@helper -def schemingdcat_extract_lang_text(text, current_lang): - """ - Extracts the text content for a specified language from a string. - - Args: - text (str): The string to extract the language content from. - Example: "[#en#]Welcome to the CKAN Open Data Portal.[#es#]Bienvenido al portal de datos abiertos CKAN." - current_lang (str): The language code to extract the content for. - Example: "es" - - Returns: - str: The extracted language content, or the original string if no content is found. - Example: "Bienvenido al portal de datos abiertos CKAN." - - """ - - @lru_cache(maxsize=30) - def process_language_content(language_label, text): - """Helper function to process the content for a specific language label. - - Args: - language_label (str): The language label to process. - text (str): The text to process. - - Returns: - str: The text corresponding to the specified language label. 
- - """ - pattern = re.compile(r'\[#(.*?)#\](.*?)(?=\[#|$)', re.DOTALL) - matches = pattern.findall(text) - - for lang, content in matches: - if lang == language_label.replace('[#', '').replace('#]', ''): - return content.strip() - - return '' - - lang_label = f"[#{current_lang}#]" - default_lang = schemingdcat_get_default_lang() - default_lang_label = f"[#{default_lang}#]" - - lang_text = process_language_content(lang_label, text) - - if not lang_text and lang_label != default_lang_label: - lang_text = process_language_content(default_lang_label, text) - - if not lang_text: - return text - - return lang_text - -@helper -def dataset_display_name(package_or_package_dict): - """ - Returns the localized value of the dataset name by extracting the correct translation. - - Args: - - package_or_package_dict: A dictionary containing the package information. - - Returns: - - The localized value of the dataset name. - """ - field_name = "title" if "title" in package_or_package_dict else "name" - - return schemingdcat_get_localized_value_from_dict( - package_or_package_dict, field_name - ) - - -@helper -def dataset_display_field_value(package_or_package_dict, field_name): - """ - Extracts the correct translation of the dataset field. - - Args: - package_or_package_dict (dict): The package or package dictionary to extract the value from. - field_name (str): The name of the field to extract the value for. - - Returns: - str: The localized value for the given field name. - """ - return schemingdcat_get_localized_value_from_dict( - package_or_package_dict, field_name - ) - -@helper -def schemingdcat_get_localized_value_from_dict( - package_or_package_dict, field_name, default="" -): - """ - Get the localized value from a dictionary. - - This function tries to get the value of a field in a specific language. - If the value is not available in the specific language, it tries to get it in the default language. 
- If the value is not available in the default language, it tries to get the untranslated value. - If the untranslated value is not available, it returns a default value. - - Args: - package_or_package_dict (dict or str): The package or dictionary to get the value from. - If it's a string, it tries to convert it to a dictionary using json.loads. - field_name (str): The name of the field to get the value from. - default (str, optional): The default value to return if the value is not available. Defaults to "". - - Returns: - str: The localized value, or the default value if the localized value is not available. - """ - if isinstance(package_or_package_dict, str): - try: - package_or_package_dict = json.loads(package_or_package_dict) - except ValueError: - return default - - lang_code = schemingdcat_get_current_lang().split("_")[0] - schemingdcat_get_default_lang() - - translated_field = package_or_package_dict.get(field_name + "_translated", {}) - if isinstance(translated_field, str): - try: - translated_field = json.loads(translated_field) - except ValueError: - translated_field = {} - - # Check the lang_code, if not check the default_lang, if not check the field without translation - return translated_field.get(lang_code) or translated_field.get(DEFAULT_LANG) or package_or_package_dict.get(field_name, default) - -@helper -def schemingdcat_get_readable_file_size(num, suffix="B"): - if not num: - return False - try: - for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]: - num = float(num) - if abs(num) < 1024.0: - return "%3.1f%s%s" % (num, unit, suffix) - num /= 1024.0 - return "%.1f%s%s" % (num, "Y", suffix) - except ValueError: - return False - - -@helper -def schemingdcat_get_group_or_org(id, type="group"): - """ - Retrieve information about a group or organization in CKAN. - - Args: - id (str): The ID of the group or organization. - type (str, optional): The type of the entity to retrieve. Defaults to 'group'. 
- - Returns: - dict: A dictionary containing information about the group or organization. - """ - return logic.get_action(f"{type}_show")({}, {"id": id}) - -@helper -def schemingdcat_package_list_for_source(source_id): - ''' - Creates a dataset list with the ones belonging to a particular harvest - source. - - It calls the package_list snippet and the pager. - ''' - limit = 20 - page = int(request.args.get('page', 1)) - fq = '+harvest_source_id:"{0}"'.format(source_id) - search_dict = { - 'fq': fq, - 'rows': limit, - 'sort': 'metadata_modified desc', - 'start': (page - 1) * limit, - 'include_private': True - } - - context = {'model': model, 'session': model.Session} - harvest_source = get_harvest_source(source_id) - owner_org = harvest_source.get('owner_org', '') - if owner_org: - user_member_of_orgs = [org['id'] for org - in ckan_helpers.organizations_available('read')] - if (harvest_source and owner_org in user_member_of_orgs): - context['ignore_capacity_check'] = True - - query = logic.get_action('package_search')(context, search_dict) - - base_url = ckan_helpers.url_for( - '{0}.read'.format(DATASET_TYPE_NAME), - id=harvest_source['name'] - ) - - def pager_url(q=None, page=None): - url = base_url - if page: - url += '?page={0}'.format(page) - return url - - pager = ckan_helpers.Page( - collection=query['results'], - page=page, - url=pager_url, - item_count=query['count'], - items_per_page=limit - ) - pager.items = query['results'] - - if query['results']: - out = ckan_helpers.snippet('snippets/package_list.html', packages=query['results']) - out += pager.pager() - else: - out = ckan_helpers.snippet('snippets/package_list_empty.html') - - return out -@helper -def schemingdcat_package_count_for_source(source_id): - ''' - Returns the current package count for datasets associated with the given - source id - ''' - fq = '+harvest_source_id:"{0}"'.format(source_id) - search_dict = {'fq': fq, 'include_private': True} - context = {'model': model, 'session': 
model.Session} - result = logic.get_action('package_search')(context, search_dict) - return result.get('count', 0) - -@helper -def schemingdcat_parse_localised_date(date_=None): - '''Parse a datetime object or timestamp string as a localised date. - If timestamp is badly formatted, then None is returned. - - :param date_: the date - :type date_: datetime or date or ISO string format - :rtype: date - ''' - if not date_: - return None - if isinstance(date_, str): - try: - date_ = ckan_helpers.date_str_to_datetime(date_) - except (TypeError, ValueError): - return None - # check we are now a datetime or date - if isinstance(date_, datetime.datetime): - date_ = date_.date() - elif not isinstance(date_, datetime.date): - return None - - # Format date based on locale - locale = schemingdcat_get_current_lang() - if locale == 'es': - return date_.strftime('%d-%m-%Y') - else: - return date_.strftime('%Y-%m-%d') - -@lru_cache(maxsize=None) -@helper -def schemingdcat_get_dataset_schema(schema_type="dataset"): - """ - Retrieves the schema for the dataset instance and caches it using the LRU cache decorator for efficient retrieval. - - Args: - schema_type (str, optional): The type of schema to retrieve. Defaults to 'dataset'. - - Returns: - dict: The schema of the dataset instance. - """ - return logic.get_action("scheming_dataset_schema_show")( - {}, {"type": schema_type} - ) - -@helper -def schemingdcat_get_schema_form_groups(entity_type=None, object_type=None, schema=None): - """ - Return a list of schema metadata groups for this form. - - 1. return schema['schema_form_groups'] if it is defined - 2. 
get schema from entity_type + object_type then - return schema['schema_form_groups'] if they are defined - """ - if schema and "schema_form_groups" in schema: - return schema["schema_form_groups"] - elif entity_type and object_type: - schema = scheming_get_schema(entity_type, object_type) - return schema["schema_form_groups"] if schema and "schema_form_groups" in schema else None - else: - return None - -# Vocabs -@helper -def get_inspire_themes(*args, **kwargs) -> typing.List[typing.Dict[str, str]]: - log.debug(f"inside get_inspire_themes {args=} {kwargs=}") - try: - inspire_themes = p.toolkit.get_action("tag_list")( - data_dict={"vocabulary_id": sdct_config.SCHEMINGDCAT_INSPIRE_THEMES_VOCAB} - ) - except p.toolkit.ObjectNotFound: - inspire_themes = [] - return [{"value": t, "label": t} for t in inspire_themes] - -@helper -def get_ckan_cleaned_name(name): - """ - Cleans a name by removing accents, special characters, and spaces. - - Args: - name (str): The name to clean. - - Returns: - str: The cleaned name. 
- """ - MAX_TAG_LENGTH = 100 - MIN_TAG_LENGTH = 2 - # Define a dictionary to map accented characters to their unaccented equivalents except ñ - accent_map = { - "á": "a", "à": "a", "ä": "a", "â": "a", "ã": "a", - "é": "e", "è": "e", "ë": "e", "ê": "e", - "í": "i", "ì": "i", "ï": "i", "î": "i", - "ó": "o", "ò": "o", "ö": "o", "ô": "o", "õ": "o", - "ú": "u", "ù": "u", "ü": "u", "û": "u", - "ñ": "ñ", - } - - # Convert the name to lowercase - name = name.lower() - - # Replace accented and special characters with their unaccented equivalents or - - name = "".join(accent_map.get(c, c) for c in name) - name = re.sub(r"[^a-zñ0-9_.-]", "-", name.strip()) - - # Truncate the name to MAX_TAG_LENGTH characters - name = name[:MAX_TAG_LENGTH] - - # If the name is shorter than MIN_TAG_LENGTH, pad it with underscores - if len(name) < MIN_TAG_LENGTH: - name = name.ljust(MIN_TAG_LENGTH, '_') - - return name - -@helper -def get_featured_datasets(count=1): - """ - This helper function retrieves a specified number of featured datasets from the CKAN instance. - It uses the 'package_search' action of the CKAN logic layer to perform a search with specific parameters. - - Parameters: - count (int): The number of featured datasets to retrieve. Default is 1. - - Returns: - list: A list of dictionaries, each representing a featured dataset. - """ - fq = '+featured:true' - search_dict = { - 'fq': fq, - 'sort': 'metadata_modified desc', - 'fl': 'id,name,title,notes,state,metadata_modified,type,extras_featured,extras_graphic_overview', - 'rows': count - } - context = {'model': model, 'session': model.Session} - result = logic.get_action('package_search')(context, search_dict) - - return result['results'] - -@helper -def get_spatial_datasets(count=10): - """ - This helper function retrieves a specified number of featured datasets from the CKAN instance. - It uses the 'package_search' action of the CKAN logic layer to perform a search with specific parameters. 
- - Parameters: - count (int): The number of featured datasets to retrieve. Default is 1. - - Returns: - list: A list of dictionaries, each representing a featured dataset. - """ - fq = '+dcat_type:*inspire*' - search_dict = { - 'fq': fq, - 'fl': 'extras_dcat_type', - 'rows': count - } - context = {'model': model, 'session': model.Session} - result = logic.get_action('package_search')(context, search_dict) - - return result['results'] - -@lru_cache(maxsize=None) -@helper -def get_header_endpoint_url(endpoint, site_protocol_and_host): - url_for = ckan_helpers.url_for - endpoint_type = endpoint['type'] - endpoint_value = endpoint['endpoint'] - - if endpoint_type == 'ogc': - if ckan_helpers.is_url(endpoint_value): - return ckan_helpers.url_for_static_or_external(endpoint_value) - else: - protocol, host = site_protocol_and_host - return f"{protocol}://{host}/{endpoint_value}" - elif endpoint_type == 'ckan': - return url_for('api.action', ver=3, logic_function='package_list', qualified=True) - elif endpoint_type == 'lod': - return url_for(endpoint_value, **endpoint['endpoint_data']) - elif endpoint_type == 'sparql': - return url_for('/sparql') - -@helper -def schemingdcat_check_valid_url(url): - """ - Check if a string is a valid URL. - - Args: - url (str): The string to check. - - Returns: - bool: True if the string is a valid URL, False otherwise. 
- """ - try: - result = urlparse(url) - return all([result.scheme, result.netloc]) - except ValueError: - return False +from ckan.common import json, c, request, is_flask_request +from ckan.lib import helpers as ckan_helpers +import ckan.logic as logic +from ckan import model +from ckan.lib.i18n import get_available_locales, get_lang +import ckan.plugins as p +import six +import re +import yaml +from yaml.loader import SafeLoader +from pathlib import Path +from functools import lru_cache +import datetime +import typing +from urllib.parse import urlparse +from urllib.error import URLError + +from six.moves.urllib.parse import urlencode + +from ckanext.scheming.helpers import ( + scheming_choices_label, + scheming_language_text, + scheming_dataset_schemas, + scheming_get_schema +) + +from ckanext.harvest.helpers import ( + get_harvest_source +) +from ckanext.harvest.utils import ( + DATASET_TYPE_NAME +) + +import ckanext.schemingdcat.config as sdct_config +from ckanext.schemingdcat.utils import ( + get_facets_dict, + public_file_exists, + public_dir_exists, +) +from ckanext.dcat.utils import CONTENT_TYPES, get_endpoint +from ckanext.fluent.validators import LANG_SUFFIX +import logging + +log = logging.getLogger(__name__) + +all_helpers = {} +prettify_cache = {} +DEFAULT_LANG = None + +@lru_cache(maxsize=None) +def get_scheming_dataset_schemas(): + """ + Retrieves the dataset schemas using the scheming_dataset_schemas function. + Caches the result using the LRU cache decorator for efficient retrieval. + """ + return scheming_dataset_schemas() + + +def helper(fn): + """Collect helper functions into the ckanext.schemingdcat.all_helpers dictionary. + + Args: + fn (function): The helper function to add to the dictionary. + + Returns: + function: The helper function. + """ + all_helpers[fn.__name__] = fn + return fn + + +@helper +def schemingdcat_get_schema_names(): + """ + Get the names of all the schemas defined for the Scheming DCAT extension. 
+ + Returns: + list: A list of schema names. + """ + schemas = get_scheming_dataset_schemas() + + return [schema["schema_name"] for schema in schemas.values()] + + +@helper +def schemingdcat_default_facet_search_operator(): + """Return the default facet search operator: AND/OR. + + Returns: + str: The default facet search operator. + """ + facet_operator = sdct_config.default_facet_operator + if facet_operator and ( + facet_operator.upper() == "AND" or facet_operator.upper() == "OR" + ): + facet_operator = facet_operator.upper() + else: + facet_operator = "AND" + return facet_operator + + +@helper +def schemingdcat_decode_json(json_text): + """Convert a JSON string to a Python object. + + Args: + json_text (str): The JSON string to convert. + + Returns: + object: A Python object representing the JSON data. + """ + return json.loads(json_text) + + +@helper +def schemingdcat_organization_name(org_id): + """Return the name of the organization from its ID. + + Args: + org_id (dict): A dictionary containing the ID of the organization. + + Returns: + str: The name of the organization, or None if the organization cannot be found. + """ + org_name = None + try: + org_dic = ckan_helpers.get_organization(org_id["display_name"]) + if org_dic is not None: + org_name = org_dic["name"] + else: + log.warning( + "Could not find the name of the organization with ID {0}".format( + org_id["display_name"] + ) + ) + except Exception as e: + log.error( + "Exception while trying to find the name of the organization: {0}".format(e) + ) + return org_name + + +@helper +def schemingdcat_get_facet_label(facet): + """Return the label for a given facet. + + Args: + facet (str): The name of the facet. + + Returns: + str: The label for the given facet. 
+ """ + return get_facets_dict[facet] + + +@helper +def schemingdcat_get_facet_items_dict( + facet, search_facets=None, limit=None, exclude_active=False, scheming_choices=None +): + """Return the list of unselected facet items for the given facet, sorted + by count. + + Returns the list of unselected facet contraints or facet items (e.g. tag + names like "russian" or "tolstoy") for the given search facet (e.g. + "tags"), sorted by facet item count (i.e. the number of search results that + match each facet item). + + Reads the complete list of facet items for the given facet from + c.search_facets, and filters out the facet items that the user has already + selected. + + List of facet items are ordered acording the faccet_sort parameter + + Arguments: + facet -- the name of the facet to filter. + search_facets -- dict with search facets(c.search_facets in Pylons) + limit -- the max. number of facet items to return. + exclude_active -- only return unselected facets. + scheming_choices -- scheming choices to use to get label from value. 
+ + """ + + # log.debug("Returning facets for: {0}".format(facet)) + + order = "default" + items = [] + + search_facets = search_facets or getattr(c, "search_facets", None) + + if ( + search_facets + and isinstance(search_facets, dict) + and search_facets.get(facet, {}).get("items") + ): + for facet_item in search_facets.get(facet)["items"]: + if scheming_choices: + facet_item["label"] = scheming_choices_label( + scheming_choices, facet_item["name"] + ) + else: + facet_item["label"] = facet_item["display_name"] + + if not len(facet_item["name"].strip()): + continue + + params_items = ( + request.params.items(multi=True) + if is_flask_request() + else request.params.items() + ) + + if not (facet, facet_item["name"]) in params_items: + items.append(dict(active=False, **facet_item)) + elif not exclude_active: + items.append(dict(active=True, **facet_item)) + + # log.debug("params: {0}:{1}".format( + # facet,request.params.getlist("_%s_sort" % facet))) + order_lst = request.params.getlist("_%s_sort" % facet) + if len(order_lst): + order = order_lst[0] + # Sort descendingly by count and ascendingly by case-sensitive display name + # items.sort(key=lambda it: (-it['count'], it['display_name'].lower())) + sorts = { + "name": ("label", False), + "name_r": ("label", True), + "count": ("count", False), + "count_r": ("count", True), + } + if sorts.get(order): + items.sort( + key=lambda it: (it[sorts.get(order)[0]]), reverse=sorts.get(order)[1] + ) + else: + items.sort(key=lambda it: (-it["count"], it["label"].lower())) + + if hasattr(c, "search_facets_limits"): + if c.search_facets_limits and limit is None: + limit = c.search_facets_limits.get(facet) + # zero treated as infinite for hysterical raisins + if limit is not None and limit > 0: + return items[:limit] + + return items + + +@helper +def schemingdcat_new_order_url(facet_name, order_concept, extras=None): + """Return a URL with the order parameter for the given facet and concept to use. 
+ + Based on the actual order, it rotates cyclically from no order -> direct order -> inverse order over the given concept. + + Args: + facet_name (str): The name of the facet to order. + order_concept (str): The concept (name or count) that will be used to order. + extras (dict, optional): Extra parameters to include in the URL. + + Returns: + str: The URL with the order parameter for the given facet and concept. + """ + old_order = None + order_param = "_%s_sort" % facet_name + order_lst = request.params.getlist(order_param) + if not extras: + extras = {} + + controller = getattr(c, "controller", False) or request.blueprint + action = getattr(c, "action", False) or p.toolkit.get_endpoint()[1] + url = ckan_helpers.url_for(controller=controller, action=action, **extras) + + if len(order_lst): + old_order = order_lst[0] + + order_mapping = { + "name": {"name": "name_r", "name_r": None, None: "name"}, + "count": {"count": "count_r", "count_r": None, None: "count"}, + } + + new_order = order_mapping.get(order_concept, {}).get(old_order) + + params_items = ( + request.params.items(multi=True) + if is_flask_request() + else request.params.items() + ) + params_nopage = [(k, v) for k, v in params_items if k != order_param] + + if new_order: + params_nopage.append((order_param, new_order)) + + if params_nopage: + url = url + "?" + urlencode(params_nopage) + + return url + +@helper +def schemingdcat_get_facet_list_limit(): + """ + Retrieves the limit for the facet list from the scheming DCAT configuration. + + Returns: + int: The limit for the facet list. + """ + return sdct_config.facet_list_limit + +@helper +def schemingdcat_get_icons_dir(field=None, field_name=None): + """ + Returns the defined icons directory for a given scheming field definition or field name. + + This function is used to retrieve the icons directory associated with a + specific field in a scheming dataset or directly by field name. If no icons directory is defined, + the function will return None. 
+
+    Args:
+        field (dict, optional): A dictionary representing the scheming field definition.
+                                This should include all the properties of the field,
+                                including the icons directory if one is defined.
+        field_name (str, optional): The name of the field. If provided, the function will
+                                    look for an icons directory with this name.
+
+    Returns:
+        str: A string representing the icons directory for the field or field name.
+             If no icons directory is defined or found, the function will return None.
+    """
+    if field:
+        if "icons_dir" in field:
+            return field["icons_dir"]
+
+        if "field_name" in field:
+            dir = sdct_config.icons_dir + "/" + field["field_name"]
+            if public_dir_exists(dir):
+                return dir
+
+    elif field_name:
+        dir = sdct_config.icons_dir + "/" + field_name
+        if public_dir_exists(dir):
+            return dir
+
+    return None
+
+@helper
+def schemingdcat_get_default_icon(field):
+    """Return the defined default icon for a scheming field definition.
+
+    Args:
+        field (dict): The scheming field definition.
+
+    Returns:
+        str: The defined default icon, or None if not found.
+    """
+    if "default_icon" in field:
+        return field["default_icon"]
+
+@helper
+def schemingdcat_get_default_package_item_icon():
+    """
+    Returns the default package item icon defined in the configuration.
+
+    This function is used to retrieve the metadata field that the
+    schemingdcat configuration designates as the default icon source for
+    dataset items (sdct_config.default_package_item_icon). It takes no
+    arguments.
+
+    Returns:
+        str: The name of the metadata field used as the default package
+             item icon (e.g. 'theme'). This could be a field name, a URL,
+             or any other string format used to select or represent
+             images. If no default icon is defined in the configuration,
+             the function will return None.
+ """ + return sdct_config.default_package_item_icon + +@helper +def schemingdcat_get_default_package_item_show_spatial(): + """ + Returns the configuration value for showing spatial information in the default package item. + + This function is used to retrieve the configuration value that determines + whether the spatial information should be shown in the default package item. + If no value is defined in the configuration, the function will return None. + + Returns: + bool: A boolean value representing whether the spatial information should + be shown in the default package item. If no value is defined in the + configuration, the function will return None. + """ + return sdct_config.default_package_item_show_spatial + +@helper +def schemingdcat_get_show_metadata_templates_toolbar(): + """ + Returns the configuration value for showing the metadata templates toolbar. + + This function is used to retrieve the configuration value that determines + whether the metadata templates toolbar should be shown or not. If the configuration + value is not set, the function will return False. + + Returns: + bool: A boolean value representing whether the metadata templates toolbar + should be shown. If the configuration value is not set, the function + will return False. + """ + return sdct_config.show_metadata_templates_toolbar + +@helper +def schemingdcat_get_metadata_templates_search_identifier(): + """ + Returns the default icon defined for a given scheming field definition. + + This function is used to retrieve the default value to retrieve metadata templates. If no default value is defined, + the function will return None. + + Args: + field (dict): A dictionary representing the scheming field definition. + This should include all the properties of the field, + including the default icon if one is defined. + + Returns: + str: A string representing the default icon for the field. This could + be a URL, a data URI, or any other string format used to represent + images. 
If no default icon is defined for the field, the function + will return None. + """ + return sdct_config.metadata_templates_search_identifier + +@helper +def schemingdcat_get_schemingdcat_xls_harvest_templates(search_identifier=sdct_config.metadata_templates_search_identifier, count=10): + """ + This helper function retrieves the schemingdcat_xls templates from the CKAN instance. + It uses the 'package_search' action of the CKAN logic layer to perform a search with specific parameters. + + Parameters: + search_identifier (str): The text to search in the identifier. Default is sdct_config.metadata_templates_search_identifier. + count (int): The number of featured datasets to retrieve. Default is 10. + + Returns: + list: A list of dictionaries, each representing a featured dataset. If no results are found, returns None. + """ + fq = f'+extras_schemingdcat_xls_metadata_template:{True}' + search_dict = { + 'fq': fq, + 'fl': 'name,extras_identifier,title,notes,metadata_modified,extras_title_translated,extras_notes_translated', + 'rows': count + } + context = {'model': model, 'session': model.Session} + result = logic.get_action('package_search')(context, search_dict) + + if not result['results']: + fq = f'+extras_schemingdcat_xls_metadata_template:*{search_identifier}*' + search_dict['fq'] = fq + result = logic.get_action('package_search')(context, search_dict) + + return result['results'] if result['results'] else None + +@helper +def schemingdcat_get_icon( + choice=None, icons_dir=None, default="/images/default/no_icon.svg", choice_value=None +): + """Return the relative URL to the icon for the item. + + Args: + choice (dict, optional): The choice selected for the field. + icons_dir (str, optional): The path to search for the icon. Usually the common path for icons for this field. + default (str, optional): The default value to return if no icon is found. + choice_value (str, optional): The value of the choice selected for the field. 
If provided, it will be used instead of choice['value']. + + Returns: + str: The relative URL to the icon, or the default value if not found. + """ + extensions = [".svg", ".png", ".jpg", ".jpeg", ".gif"] + icon_name = None + + if choice_value is None and choice: + choice_value = choice.get("icon") or choice.get("value") + + if choice_value: + if ckan_helpers.is_url(choice_value): + url_parts = choice_value.split("/") + + if len(url_parts) == 1: + icon_name = url_parts[-1].lower() + else: + icon_name = url_parts[-2].lower() + "/" + url_parts[-1].lower() + else: + icon_name = choice_value + + url_path = (icons_dir + "/" if icons_dir else "") + icon_name + + for extension in extensions: + if public_file_exists(url_path + extension): + return url_path + extension + + return default + +@helper +def schemingdcat_get_choice_item(field, value): + """Return the whole choice item for the given value in the scheming field. + + Args: + field (dict): The scheming field to look for the choice item in. + value (str): The option item value. + + Returns: + dict: The whole option item in scheming, or None if not found. + """ + if field and ("choices" in field): + # log.debug("Searching: {0} en {1}".format(value,field['choices'])) + for choice in field["choices"]: + if choice["value"] == value: + return choice + + return None + +@helper +def schemingdcat_get_choice_property(choices, value, property): + """ + Retrieve a specific property from a choice dictionary based on the given value. + + Args: + choices (list): List of dictionaries containing "label" and "value" keys. + value (str): The value to match against the choices. + property (str): The property to retrieve from the matching choice dictionary. + + Returns: + str or None: The property value from the matching choice dictionary, or None if not found. 
+ """ + for c in choices: + if c['value'] == value: + return c.get(property, None) + return None + + +@helper +def scheming_display_json_list(value): + """Return the object passed serialized as a JSON list. + + Args: + value (any): The object to serialize. + + Returns: + str: The serialized object as a JSON list, or the original value if it cannot be serialized. + """ + if isinstance(value, six.string_types): + return value + try: + return json.loads(value) + except (TypeError, ValueError): + return value + +@helper +def scheming_clean_json_value(value): + """Clean a JSON list value to avoid errors with: '"' and spaces. + + Args: + value (str): The object to serialize. + + Returns: + str: The cleaned value, or the original value if it cannot be cleaned. + """ + try: + value = value.strip(" ").replace('\\"', "%%%@#") + value = value.replace('"', "") + value = value.replace("%%%@#", '"') + return value + except (TypeError, ValueError): + return value + +def format_eli_label(parsed_url): + """ + Formats the label for a parsed URL with 'eli' segment. + + Args: + parsed_url (ParseResult): The parsed URL. + + Returns: + str: The formatted label. + """ + segments = parsed_url.path.split('/') + eli_index = next(i for i, segment in enumerate(segments) if segment == 'eli') + return '/'.join(segments[eli_index + 1:]).upper() + +@helper +def schemingdcat_prettify_url(url): + """ + Prettifies a URL by removing the protocol and trailing slash. + + Args: + url (str): The URL to prettify. + + Returns: + str: The prettified URL, or the original URL if an error occurred. + """ + if url in prettify_cache: + return prettify_cache[url] + + try: + prettified_url = re.sub(r"^https?://(?:www\.)?", "", url).rstrip("/") + prettify_cache[url] = prettified_url + return prettified_url + except (TypeError, AttributeError): + return url + +@helper +def schemingdcat_prettify_url_name(url): + """ + Prettifies a URL name by extracting the last segment and cleaning it. 
+ + Args: + url (str): The URL to extract the name from. + + Returns: + str: The prettified URL name, or the original URL if an error occurred. + """ + if url is None: + return url + + if url in prettify_cache: + return prettify_cache[url] + + try: + parsed_url = urlparse(url) + + if '/eli/' in url: + prettified_url_name = format_eli_label(parsed_url) + else: + url_name = parsed_url.path.split("/")[-1].split('.')[0].replace('_', '-') + prettified_url_name = ' '.join(url_name.split(' ')[:4]) + + prettify_cache[url] = prettified_url_name + return prettified_url_name + + except (URLError, ValueError) as e: + print(f"Error while prettifying URL: {e}") + return url + +@helper +def schemingdcat_listify_str(values): + """Converts a string or list/tuple of strings to a list of strings. + + If `values` is already a list or tuple, it is returned as is. If `values` is a string, + it is split into a list of strings using commas as the delimiter. Each string in the + resulting list is stripped of leading/trailing whitespace and quotes. + + Args: + values (str or list or tuple): The value(s) to convert to a list of strings. + + Returns: + list: A list of strings. + """ + if isinstance(values, str): + values = values.strip("][").split(",") + values = [item.strip().strip('"') for item in values] + elif not isinstance(values, (list, tuple)): + log.debug("Not a list or string: {0}".format(values)) + values = [""] + + return values + +@helper +def schemingdcat_load_yaml(file, folder="codelists"): + """Load a YAML file from the folder, by default 'codelists' directory. + + Args: + file (str): The name of the YAML file to load. + + Returns: + dict: A dictionary containing the data from the YAML file. 
+ """ + source_path = Path(__file__).resolve(True) + yaml_data = {} + try: + p = source_path.parent.joinpath(folder, file) + with open(p, "r") as f: + yaml_data = yaml.load(f, Loader=SafeLoader) + except FileNotFoundError: + log.error("The file {0} does not exist".format(file)) + except Exception as e: + log.error("Could not read configuration from {0}: {1}".format(file, e)) + + return yaml_data + +@helper +def schemingdcat_get_linked_data(id): + """Get linked data for a given identifier. + + Args: + id (str): The identifier to get linked data for. + + Returns: + list: A list of dictionaries containing linked data for the identifier. + """ + return [ + { + "name": name, + "display_name": sdct_config.linkeddata_links.get(name, {"display_name": content_type})[ + "display_name" + ], + "format": sdct_config.linkeddata_links.get(name, {}).get("format"), + "image_display_url": sdct_config.linkeddata_links.get(name, {}).get( + "image_display_url" + ), + "endpoint_icon": sdct_config.linkeddata_links.get(name, {}).get( + "endpoint_icon" + ), + "description": sdct_config.linkeddata_links.get(name, {}).get("description") + or f"Formats {content_type}", + "description_url": sdct_config.linkeddata_links.get(name, {}).get("description_url"), + "endpoint": "dcat.read_dataset", + "endpoint_data": { + "_id": id, + "_format": name, + }, + } + for name, content_type in CONTENT_TYPES.items() + ] + +@helper +def schemingdcat_get_catalog_endpoints(): + """Get the catalog endpoints. + + Returns: + list: A list of dictionaries containing linked data for the identifier. 
+ """ + csw_uri = schemingdcat_get_geospatial_endpoint("catalog") + + return [ + { + "name": item["name"], + "display_name": item["display_name"], + "format": item["format"], + "image_display_url": item["image_display_url"], + "endpoint_icon": item["endpoint_icon"], + "fa_icon": item["fa_icon"], + "description": item["description"], + "type": item["type"], + "profile": item["profile"], + "profile_label": item["profile_label"], + "endpoint": get_endpoint("catalog") + if item.get("type").lower() == "lod" + else csw_uri.format(version=item["version"]) + if item.get("type").lower() == "ogc" + else None, + "endpoint_data": { + "_format": item["format"], + "_external": True, + "profiles": item["profile"], + }, + } + for item in sdct_config.endpoints["catalog_endpoints"] + ] + +@helper +def schemingdcat_get_geospatial_endpoint(type="dataset"): + """Get geospatial base URI for CSW Endpoint. + + Args: + type (str): The type of endpoint to return. Can be 'catalog' or 'dataset'. + + Returns: + str: The base URI of the CSW Endpoint with the appropriate format. + """ + try: + if sdct_config.geometadata_base_uri: + csw_uri = sdct_config.geometadata_base_uri + + if ( + sdct_config.geometadata_base_uri + and "/csw" not in sdct_config.geometadata_base_uri + ): + csw_uri = sdct_config.geometadata_base_uri.rstrip("/") + "/csw" + elif sdct_config.geometadata_base_uri == "": + csw_uri = "/csw" + else: + csw_uri = sdct_config.geometadata_base_uri.rstrip("/") + except: + csw_uri = "/csw" + + if type == "catalog": + return csw_uri + "?service=CSW&version={version}&request=GetCapabilities" + else: + return ( + csw_uri + + "?service=CSW&version={version}&request=GetRecordById&id={id}&elementSetName={element_set_name}&outputSchema={output_schema}&OutputFormat={output_format}" + ) + +@helper +def schemingdcat_get_geospatial_metadata(): + """Get geospatial metadata for CSW formats. + + Returns: + list: A list of dictionaries containing geospatial metadata for CSW formats. 
+ """ + csw_uri = schemingdcat_get_geospatial_endpoint("dataset") + + return [ + { + "name": item["name"], + "display_name": item["display_name"], + "format": item["format"], + "image_display_url": item["image_display_url"], + "endpoint_icon": item["endpoint_icon"], + "description": item["description"], + "description_url": item["description_url"], + "url": csw_uri.format( + output_format=item["output_format"], + version=item["version"], + element_set_name=item["element_set_name"], + output_schema=item["output_schema"], + id="{id}", + ), + } + for item in sdct_config.geometadata_links["csw_formats"] + ] + +@helper +def schemingdcat_get_all_metadata(id): + """Get linked data and geospatial metadata for a given identifier. + + Args: + id (str): The identifier to get linked data and geospatial metadata for. + + Returns: + list: A list of dictionaries containing linked data and geospatial metadata for the identifier. + """ + geospatial_metadata = schemingdcat_get_geospatial_metadata() + linked_data = schemingdcat_get_linked_data(id) + + for metadata in geospatial_metadata: + metadata["endpoint_type"] = "csw" + + for data in linked_data: + data["endpoint_type"] = "dcat" + + return geospatial_metadata + linked_data + +@helper +def fluent_form_languages(field=None, entity_type=None, object_type=None, schema=None): + """ + Return a list of language codes for this form (or form field) + + 1. return field['form_languages'] if it is defined + 2. return schema['form_languages'] if it is defined + 3. get schema from entity_type + object_type then + return schema['form_languages'] if they are defined + 4. 
return languages from site configuration + """ + if field and "form_languages" in field: + return field["form_languages"] + if schema and "form_languages" in schema: + return schema["form_languages"] + if entity_type and object_type: + # late import for compatibility with older ckanext-scheming + from ckanext.scheming.helpers import scheming_get_schema + + schema = scheming_get_schema(entity_type, object_type) + if schema and "form_languages" in schema: + return schema["form_languages"] + + langs = [] + for l in get_available_locales(): + if l.language not in langs: + langs.append(l.language) + return langs + +@helper +def schemingdcat_fluent_form_label(field, lang): + """Returns a label for the input field in the specified language. + + If the field has a `fluent_form_label` defined, the label will be taken from there. + If a matching label cannot be found, this helper will return the standard label + with the language code in uppercase. + + Args: + field (dict): A dictionary representing the input field. + lang (str): A string representing the language code. + + Returns: + str: A string representing the label for the input field in the specified language. + """ + form_label = field.get("fluent_form_label", {}) + label = scheming_language_text(form_label.get(lang, field["label"])) + return f"{label} ({lang.upper()})" + +@helper +def schemingdcat_multiple_field_required(field, lang): + """ + Returns whether a field is required or not based on the field definition and language. + + Args: + field (dict): The field definition. + lang (str): The language to check for required fields. + + Returns: + bool: True if the field is required, False otherwise. 
+ """ + if "required" in field: + return field["required"] + if "required_language" in field and field["required_language"] == lang: + return True + return "not_empty" in field.get("validators", "").split() + +def parse_json(value, default_value=None): + try: + return json.loads(value) + except (ValueError, TypeError, AttributeError): + if default_value is not None: + return default_value + return value + +@helper +def schemingdcat_get_default_lang(): + global DEFAULT_LANG + if DEFAULT_LANG is None: + DEFAULT_LANG = p.toolkit.config.get("ckan.locale_default", "en") + return DEFAULT_LANG + +@helper +def schemingdcat_get_current_lang(): + """ + Returns the current language of the CKAN instance. + + Returns: + str: The current language of the CKAN instance. If the language cannot be determined, the default language 'en' is returned. + """ + try: + return get_lang() + except TypeError: + return p.toolkit.config.get("ckan.locale_default", "en") + +@helper +def schemingdcat_extract_lang_text(text, current_lang): + """ + Extracts the text content for a specified language from a string. + + Args: + text (str): The string to extract the language content from. + Example: "[#en#]Welcome to the CKAN Open Data Portal.[#es#]Bienvenido al portal de datos abiertos CKAN." + current_lang (str): The language code to extract the content for. + Example: "es" + + Returns: + str: The extracted language content, or the original string if no content is found. + Example: "Bienvenido al portal de datos abiertos CKAN." + + """ + + @lru_cache(maxsize=30) + def process_language_content(language_label, text): + """Helper function to process the content for a specific language label. + + Args: + language_label (str): The language label to process. + text (str): The text to process. + + Returns: + str: The text corresponding to the specified language label. 
+ + """ + pattern = re.compile(r'\[#(.*?)#\](.*?)(?=\[#|$)', re.DOTALL) + matches = pattern.findall(text) + + for lang, content in matches: + if lang == language_label.replace('[#', '').replace('#]', ''): + return content.strip() + + return '' + + lang_label = f"[#{current_lang}#]" + default_lang = schemingdcat_get_default_lang() + default_lang_label = f"[#{default_lang}#]" + + lang_text = process_language_content(lang_label, text) + + if not lang_text and lang_label != default_lang_label: + lang_text = process_language_content(default_lang_label, text) + + if not lang_text: + return text + + return lang_text + +@helper +def dataset_display_name(package_or_package_dict): + """ + Returns the localized value of the dataset name by extracting the correct translation. + + Args: + - package_or_package_dict: A dictionary containing the package information. + + Returns: + - The localized value of the dataset name. + """ + field_name = "title" if "title" in package_or_package_dict else "name" + + return schemingdcat_get_localized_value_from_dict( + package_or_package_dict, field_name + ) + + +@helper +def dataset_display_field_value(package_or_package_dict, field_name): + """ + Extracts the correct translation of the dataset field. + + Args: + package_or_package_dict (dict): The package or package dictionary to extract the value from. + field_name (str): The name of the field to extract the value for. + + Returns: + str: The localized value for the given field name. + """ + return schemingdcat_get_localized_value_from_dict( + package_or_package_dict, field_name + ) + +@helper +def schemingdcat_get_localized_value_from_dict( + package_or_package_dict, field_name, default="" +): + """ + Get the localized value from a dictionary. + + This function tries to get the value of a field in a specific language. + If the value is not available in the specific language, it tries to get it in the default language. 
+ If the value is not available in the default language, it tries to get the untranslated value. + If the untranslated value is not available, it returns a default value. + + Args: + package_or_package_dict (dict or str): The package or dictionary to get the value from. + If it's a string, it tries to convert it to a dictionary using json.loads. + field_name (str): The name of the field to get the value from. + default (str, optional): The default value to return if the value is not available. Defaults to "". + + Returns: + str: The localized value, or the default value if the localized value is not available. + """ + if isinstance(package_or_package_dict, str): + try: + package_or_package_dict = json.loads(package_or_package_dict) + except ValueError: + return default + + lang_code = schemingdcat_get_current_lang().split("_")[0] + schemingdcat_get_default_lang() + + translated_field = package_or_package_dict.get(field_name + "_translated", {}) + if isinstance(translated_field, str): + try: + translated_field = json.loads(translated_field) + except ValueError: + translated_field = {} + + # Check the lang_code, if not check the default_lang, if not check the field without translation + return translated_field.get(lang_code) or translated_field.get(DEFAULT_LANG) or package_or_package_dict.get(field_name, default) + +@helper +def schemingdcat_get_readable_file_size(num, suffix="B"): + if not num: + return False + try: + for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]: + num = float(num) + if abs(num) < 1024.0: + return "%3.1f%s%s" % (num, unit, suffix) + num /= 1024.0 + return "%.1f%s%s" % (num, "Y", suffix) + except ValueError: + return False + + +@helper +def schemingdcat_get_group_or_org(id, type="group"): + """ + Retrieve information about a group or organization in CKAN. + + Args: + id (str): The ID of the group or organization. + type (str, optional): The type of the entity to retrieve. Defaults to 'group'. 
+ + Returns: + dict: A dictionary containing information about the group or organization. + """ + return logic.get_action(f"{type}_show")({}, {"id": id}) + +@helper +def schemingdcat_package_list_for_source(source_id): + ''' + Creates a dataset list with the ones belonging to a particular harvest + source. + + It calls the package_list snippet and the pager. + ''' + limit = 20 + page = int(request.args.get('page', 1)) + fq = '+harvest_source_id:"{0}"'.format(source_id) + search_dict = { + 'fq': fq, + 'rows': limit, + 'sort': 'metadata_modified desc', + 'start': (page - 1) * limit, + 'include_private': True + } + + context = {'model': model, 'session': model.Session} + harvest_source = get_harvest_source(source_id) + owner_org = harvest_source.get('owner_org', '') + if owner_org: + user_member_of_orgs = [org['id'] for org + in ckan_helpers.organizations_available('read')] + if (harvest_source and owner_org in user_member_of_orgs): + context['ignore_capacity_check'] = True + + query = logic.get_action('package_search')(context, search_dict) + + base_url = ckan_helpers.url_for( + '{0}.read'.format(DATASET_TYPE_NAME), + id=harvest_source['name'] + ) + + def pager_url(q=None, page=None): + url = base_url + if page: + url += '?page={0}'.format(page) + return url + + pager = ckan_helpers.Page( + collection=query['results'], + page=page, + url=pager_url, + item_count=query['count'], + items_per_page=limit + ) + pager.items = query['results'] + + if query['results']: + out = ckan_helpers.snippet('snippets/package_list.html', packages=query['results']) + out += pager.pager() + else: + out = ckan_helpers.snippet('snippets/package_list_empty.html') + + return out +@helper +def schemingdcat_package_count_for_source(source_id): + ''' + Returns the current package count for datasets associated with the given + source id + ''' + fq = '+harvest_source_id:"{0}"'.format(source_id) + search_dict = {'fq': fq, 'include_private': True} + context = {'model': model, 'session': 
model.Session} + result = logic.get_action('package_search')(context, search_dict) + return result.get('count', 0) + +@helper +def schemingdcat_parse_localised_date(date_=None): + '''Parse a datetime object or timestamp string as a localised date. + If timestamp is badly formatted, then None is returned. + + :param date_: the date + :type date_: datetime or date or ISO string format + :rtype: date + ''' + if not date_: + return None + if isinstance(date_, str): + try: + date_ = ckan_helpers.date_str_to_datetime(date_) + except (TypeError, ValueError): + return None + # check we are now a datetime or date + if isinstance(date_, datetime.datetime): + date_ = date_.date() + elif not isinstance(date_, datetime.date): + return None + + # Format date based on locale + locale = schemingdcat_get_current_lang() + if locale == 'es': + return date_.strftime('%d-%m-%Y') + else: + return date_.strftime('%Y-%m-%d') + +@lru_cache(maxsize=None) +@helper +def schemingdcat_get_dataset_schema(schema_type="dataset"): + """ + Retrieves the schema for the dataset instance and caches it using the LRU cache decorator for efficient retrieval. + + Args: + schema_type (str, optional): The type of schema to retrieve. Defaults to 'dataset'. + + Returns: + dict: The schema of the dataset instance. + """ + return logic.get_action("scheming_dataset_schema_show")( + {}, {"type": schema_type} + ) + +@helper +def schemingdcat_get_schema_form_groups(entity_type=None, object_type=None, schema=None): + """ + Return a list of schema metadata groups for this form. + + 1. return schema['schema_form_groups'] if it is defined + 2. 
get schema from entity_type + object_type then + return schema['schema_form_groups'] if they are defined + """ + if schema and "schema_form_groups" in schema: + return schema["schema_form_groups"] + elif entity_type and object_type: + schema = scheming_get_schema(entity_type, object_type) + return schema["schema_form_groups"] if schema and "schema_form_groups" in schema else None + else: + return None + +# Vocabs +@helper +def get_inspire_themes(*args, **kwargs) -> typing.List[typing.Dict[str, str]]: + log.debug(f"inside get_inspire_themes {args=} {kwargs=}") + try: + inspire_themes = p.toolkit.get_action("tag_list")( + data_dict={"vocabulary_id": sdct_config.SCHEMINGDCAT_INSPIRE_THEMES_VOCAB} + ) + except p.toolkit.ObjectNotFound: + inspire_themes = [] + return [{"value": t, "label": t} for t in inspire_themes] + +@helper +def get_ckan_cleaned_name(name): + """ + Cleans a name by removing accents, special characters, and spaces. + + Args: + name (str): The name to clean. + + Returns: + str: The cleaned name. 
+ """ + MAX_TAG_LENGTH = 100 + MIN_TAG_LENGTH = 2 + # Define a dictionary to map accented characters to their unaccented equivalents except ñ + accent_map = { + "á": "a", "à": "a", "ä": "a", "â": "a", "ã": "a", + "é": "e", "è": "e", "ë": "e", "ê": "e", + "í": "i", "ì": "i", "ï": "i", "î": "i", + "ó": "o", "ò": "o", "ö": "o", "ô": "o", "õ": "o", + "ú": "u", "ù": "u", "ü": "u", "û": "u", + "ñ": "ñ", + } + + # Convert the name to lowercase + name = name.lower() + + # Replace accented and special characters with their unaccented equivalents or - + name = "".join(accent_map.get(c, c) for c in name) + name = re.sub(r"[^a-zñ0-9_.-]", "-", name.strip()) + + # Truncate the name to MAX_TAG_LENGTH characters + name = name[:MAX_TAG_LENGTH] + + # If the name is shorter than MIN_TAG_LENGTH, pad it with underscores + if len(name) < MIN_TAG_LENGTH: + name = name.ljust(MIN_TAG_LENGTH, '_') + + return name + +@helper +def get_featured_datasets(count=1): + """ + This helper function retrieves a specified number of featured datasets from the CKAN instance. + It uses the 'package_search' action of the CKAN logic layer to perform a search with specific parameters. + + Parameters: + count (int): The number of featured datasets to retrieve. Default is 1. + + Returns: + list: A list of dictionaries, each representing a featured dataset. + """ + fq = '+featured:true' + search_dict = { + 'fq': fq, + 'sort': 'metadata_modified desc', + 'fl': 'id,name,title,notes,state,metadata_modified,type,extras_featured,extras_graphic_overview', + 'rows': count + } + context = {'model': model, 'session': model.Session} + result = logic.get_action('package_search')(context, search_dict) + + return result['results'] + +@helper +def get_spatial_datasets(count=10): + """ + This helper function retrieves a specified number of featured datasets from the CKAN instance. + It uses the 'package_search' action of the CKAN logic layer to perform a search with specific parameters. 
+
+    Parameters:
+    count (int): The number of spatial datasets to retrieve. Default is 10.
+
+    Returns:
+    list: A list of dictionaries, each representing a spatial dataset.
+    """
+    fq = '+dcat_type:*inspire*'
+    search_dict = {
+        'fq': fq,
+        'fl': 'extras_dcat_type',
+        'rows': count
+    }
+    context = {'model': model, 'session': model.Session}
+    result = logic.get_action('package_search')(context, search_dict)
+
+    return result['results']
+
+@lru_cache(maxsize=None)
+@helper
+def get_header_endpoint_url(endpoint, site_protocol_and_host):
+    url_for = ckan_helpers.url_for
+    endpoint_type = endpoint['type']
+    endpoint_value = endpoint['endpoint']
+
+    if endpoint_type == 'ogc':
+        if ckan_helpers.is_url(endpoint_value):
+            return ckan_helpers.url_for_static_or_external(endpoint_value)
+        else:
+            protocol, host = site_protocol_and_host
+            return f"{protocol}://{host}/{endpoint_value}"
+    elif endpoint_type == 'ckan':
+        return url_for('api.action', ver=3, logic_function='package_list', qualified=True)
+    elif endpoint_type == 'lod':
+        return url_for(endpoint_value, **endpoint['endpoint_data'])
+    elif endpoint_type == 'sparql':
+        return url_for('/sparql')
+
+@helper
+def schemingdcat_check_valid_url(url):
+    """
+    Check if a string is a valid URL.
+
+    Args:
+        url (str): The string to check.
+
+    Returns:
+        bool: True if the string is a valid URL, False otherwise.
+    """
+    try:
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])
+    except ValueError:
+        return False
diff --git a/ckanext/schemingdcat/lib/field_mapping.py b/ckanext/schemingdcat/lib/field_mapping.py
index 3babce2c..34fe3f8b 100644
--- a/ckanext/schemingdcat/lib/field_mapping.py
+++ b/ckanext/schemingdcat/lib/field_mapping.py
@@ -119,7 +119,7 @@ def validate_v1(self, field_mapping):
                     if not isinstance(lang, str) or not isinstance(remote_field_name, str):
                         raise ValueError('In translated fields, both language and remote_field_name must be strings. e.g. 
"notes_translated": {"es": "notes-es"}') if not re.match("^[a-z]{2}$", lang): - raise ValueError('Language code must be a 2-letter ISO 639-1 code') + raise ValueError(f'Invalid field "{lang}". Language code must be a 2-letter ISO 639-1 code') def validate_v2(self, field_mapping): """ @@ -162,7 +162,7 @@ def validate_v2(self, field_mapping): raise ValueError('%s must be a dictionary', self.language_field) for lang, lang_config in value.items(): if not isinstance(lang, str) or not re.match("^[a-z]{2}$", lang): - raise ValueError('Language code must be a 2-letter ISO 639-1 code') + raise ValueError(f'Invalid field "{lang}". Language code must be a 2-letter ISO 639-1 code') if not isinstance(lang_config, dict): raise ValueError('Language config must be a dictionary') for lang_prop, lang_value in lang_config.items(): diff --git a/ckanext/schemingdcat/package_controller.py b/ckanext/schemingdcat/package_controller.py index 1c1b8fd4..99db3d5b 100644 --- a/ckanext/schemingdcat/package_controller.py +++ b/ckanext/schemingdcat/package_controller.py @@ -1,159 +1,159 @@ -from ckan.common import request -import json -import ckan.plugins as plugins -import ckanext.schemingdcat.config as sdct_config -import ckanext.schemingdcat.utils as utils - -import logging -import sys - -FACET_OPERATOR_PARAM_NAME = '_facet_operator' -FACET_SORT_PARAM_NAME = '_%s_sort' - -log = logging.getLogger(__name__) - - -class PackageController(): - - plugins.implements(plugins.IPackageController) - - default_facet_operator = sdct_config.default_facet_operator - - def read(self, entity): - pass - - def create(self, entity): - pass - - def edit(self, entity): - pass - - def authz_add_role(self, object_role): - pass - - def authz_remove_role(self, object_role): - pass - - def delete(self, entity): - pass - - def before_search(self, search_params): - """Modifies search parameters before executing a search. 
- - This method adjusts the 'fq' (filter query) parameter based on the 'facet.field' value in the search parameters. If 'facet.field' is a list, it iterates through each field, applying the '_facet_search_operator' to modify 'fq'. If 'facet.field' is a string, it directly applies the '_facet_search_operator'. If 'facet.field' is not present or is invalid, no modification is made. - - Args: - search_params (dict): The search parameters to be modified. Expected to contain 'facet.field' and 'fq'. - - Returns: - dict: The modified search parameters. - - Raises: - Exception: Captures and logs any exception that occurs during the modification of search parameters. - """ - try: - facet_field = search_params.get('facet.field', '') - if not facet_field: - return search_params - elif isinstance(facet_field, list): - for field in facet_field: - new_fq = self._facet_search_operator(search_params.get('fq', ''), field) - if new_fq and isinstance(new_fq, str): - search_params.update({'fq': new_fq}) - elif isinstance(facet_field, str): - new_fq = self._facet_search_operator(search_params.get('fq', ''), facet_field) - if new_fq and isinstance(new_fq, str): - search_params.update({'fq': new_fq}) - except Exception as e: - log.error("[before_search] Error: %s", e) - return search_params - - def after_search(self, search_results, search_params): - return search_results - - def before_index(self, data_dict): - """Processes the data dictionary before indexing. - - Iterates through each facet defined in the system's facets dictionary. For each facet present in the data dictionary, it attempts to parse its value as JSON. If the value is a valid JSON string, it replaces the original string value with the parsed JSON object. If the value cannot be parsed as JSON (e.g., because it's not a valid JSON string), it leaves the value unchanged. Facets present in the data dictionary but not containing any data are removed. - - Args: - data_dict (dict): The data dictionary to be processed. 
It's expected to contain keys corresponding to facet names with their associated data as values. - - Returns: - dict: The processed data dictionary with JSON strings parsed into objects where applicable and empty facets removed. - """ - for facet, label in utils.get_facets_dict().items(): - data = data_dict.get(facet) - #log.debug("[before_index] Data ({1}) in facet: {0}".format(data, facet)) - if data: - if isinstance(data, str): - try: - data_dict[facet] = json.loads(data) - except json.decoder.JSONDecodeError: - data_dict[facet] = data - else: - if facet in data_dict: - del data_dict[facet] - - return data_dict - - def before_view(self, pkg_dict): - return pkg_dict - - def after_create(self, context, data_dict): - return data_dict - - def after_update(self, context, data_dict): - return data_dict - - def after_delete(self, context, data_dict): - return data_dict - - def after_show(self, context, data_dict): - return data_dict - - def update_facet_titles(self, facet_titles): - return facet_titles - - def package_controller_config(self, default_facet_operator): - self.default_facet_operator = default_facet_operator - - def _facet_search_operator(self, fq, facet_field): - """Modifies the query filter (fq) to use the OR operator among the specified facet filters. - - Args: - fq (str): The current query filter. - facet_field (list): List of facet fields to consider for the OR operation. - - Returns: - str: The modified query filter. 
- """ - new_fq = fq - try: - facet_operator = self.default_facet_operator - # Determine the facet operator based on request parameters - if request.params.get(FACET_OPERATOR_PARAM_NAME) == 'OR': - facet_operator = 'OR' - elif request.params.get(FACET_OPERATOR_PARAM_NAME) == 'AND': - facet_operator = 'AND' - - if facet_operator == 'OR' and facet_field: - # Split the original fq into conditions, assuming they are separated by " AND " - conditions = fq.split(' AND ') - # Filter and group conditions that correspond to facet fields - facet_conditions = [cond for cond in conditions if any(fld in cond for fld in facet_field)] - non_facet_conditions = [cond for cond in conditions if not any(fld in cond for fld in facet_field)] - # Reconstruct fq using " OR " to join facet conditions and " AND " for the rest - if facet_conditions: - new_fq = ' OR '.join(facet_conditions) - if non_facet_conditions: - new_fq = f"({new_fq}) AND {' AND '.join(non_facet_conditions)}" - else: - new_fq = ' AND '.join(non_facet_conditions) - - except Exception as e: - log.error("[_facet_search_operator] Error modifying the query filter: %s", e) - # In case of error, return the original fq - new_fq = fq - +from ckan.common import request +import json +import ckan.plugins as plugins +import ckanext.schemingdcat.config as sdct_config +import ckanext.schemingdcat.utils as utils + +import logging +import sys + +FACET_OPERATOR_PARAM_NAME = '_facet_operator' +FACET_SORT_PARAM_NAME = '_%s_sort' + +log = logging.getLogger(__name__) + + +class PackageController(): + + plugins.implements(plugins.IPackageController) + + default_facet_operator = sdct_config.default_facet_operator + + def read(self, entity): + pass + + def create(self, entity): + pass + + def edit(self, entity): + pass + + def authz_add_role(self, object_role): + pass + + def authz_remove_role(self, object_role): + pass + + def delete(self, entity): + pass + + def before_search(self, search_params): + """Modifies search parameters before 
executing a search. + + This method adjusts the 'fq' (filter query) parameter based on the 'facet.field' value in the search parameters. If 'facet.field' is a list, it iterates through each field, applying the '_facet_search_operator' to modify 'fq'. If 'facet.field' is a string, it directly applies the '_facet_search_operator'. If 'facet.field' is not present or is invalid, no modification is made. + + Args: + search_params (dict): The search parameters to be modified. Expected to contain 'facet.field' and 'fq'. + + Returns: + dict: The modified search parameters. + + Raises: + Exception: Captures and logs any exception that occurs during the modification of search parameters. + """ + try: + facet_field = search_params.get('facet.field', '') + if not facet_field: + return search_params + elif isinstance(facet_field, list): + for field in facet_field: + new_fq = self._facet_search_operator(search_params.get('fq', ''), field) + if new_fq and isinstance(new_fq, str): + search_params.update({'fq': new_fq}) + elif isinstance(facet_field, str): + new_fq = self._facet_search_operator(search_params.get('fq', ''), facet_field) + if new_fq and isinstance(new_fq, str): + search_params.update({'fq': new_fq}) + except Exception as e: + log.error("[before_search] Error: %s", e) + return search_params + + def after_search(self, search_results, search_params): + return search_results + + def before_index(self, data_dict): + """Processes the data dictionary before indexing. + + Iterates through each facet defined in the system's facets dictionary. For each facet present in the data dictionary, it attempts to parse its value as JSON. If the value is a valid JSON string, it replaces the original string value with the parsed JSON object. If the value cannot be parsed as JSON (e.g., because it's not a valid JSON string), it leaves the value unchanged. Facets present in the data dictionary but not containing any data are removed. 
+ + Args: + data_dict (dict): The data dictionary to be processed. It's expected to contain keys corresponding to facet names with their associated data as values. + + Returns: + dict: The processed data dictionary with JSON strings parsed into objects where applicable and empty facets removed. + """ + for facet, label in utils.get_facets_dict().items(): + data = data_dict.get(facet) + #log.debug("[before_index] Data ({1}) in facet: {0}".format(data, facet)) + if data: + if isinstance(data, str): + try: + data_dict[facet] = json.loads(data) + except json.decoder.JSONDecodeError: + data_dict[facet] = data + else: + if facet in data_dict: + del data_dict[facet] + + return data_dict + + def before_view(self, pkg_dict): + return pkg_dict + + def after_create(self, context, data_dict): + return data_dict + + def after_update(self, context, data_dict): + return data_dict + + def after_delete(self, context, data_dict): + return data_dict + + def after_show(self, context, data_dict): + return data_dict + + def update_facet_titles(self, facet_titles): + return facet_titles + + def package_controller_config(self, default_facet_operator): + self.default_facet_operator = default_facet_operator + + def _facet_search_operator(self, fq, facet_field): + """Modifies the query filter (fq) to use the OR operator among the specified facet filters. + + Args: + fq (str): The current query filter. + facet_field (list): List of facet fields to consider for the OR operation. + + Returns: + str: The modified query filter. 
+ """ + new_fq = fq + try: + facet_operator = self.default_facet_operator + # Determine the facet operator based on request parameters + if request.params.get(FACET_OPERATOR_PARAM_NAME) == 'OR': + facet_operator = 'OR' + elif request.params.get(FACET_OPERATOR_PARAM_NAME) == 'AND': + facet_operator = 'AND' + + if facet_operator == 'OR' and facet_field: + # Split the original fq into conditions, assuming they are separated by " AND " + conditions = fq.split(' AND ') + # Filter and group conditions that correspond to facet fields + facet_conditions = [cond for cond in conditions if any(fld in cond for fld in facet_field)] + non_facet_conditions = [cond for cond in conditions if not any(fld in cond for fld in facet_field)] + # Reconstruct fq using " OR " to join facet conditions and " AND " for the rest + if facet_conditions: + new_fq = ' OR '.join(facet_conditions) + if non_facet_conditions: + new_fq = f"({new_fq}) AND {' AND '.join(non_facet_conditions)}" + else: + new_fq = ' AND '.join(non_facet_conditions) + + except Exception as e: + log.error("[_facet_search_operator] Error modifying the query filter: %s", e) + # In case of error, return the original fq + new_fq = fq + return new_fq \ No newline at end of file diff --git a/ckanext/schemingdcat/utils.py b/ckanext/schemingdcat/utils.py index 90fb731e..071a4f4a 100644 --- a/ckanext/schemingdcat/utils.py +++ b/ckanext/schemingdcat/utils.py @@ -1,293 +1,290 @@ -from ckan.common import config -import ckan.logic as logic -from ckanext.schemingdcat import config as sdct_config -import logging -import os -import inspect -import json -import hashlib -from threading import Lock -from ckanext.dcat.utils import CONTENT_TYPES -import yaml -from yaml.loader import SafeLoader -from pathlib import Path - -try: - from paste.reloader import watch_file -except ImportError: - watch_file = None - -log = logging.getLogger(__name__) - -_facets_dict = None -_public_dirs = None -_files_hash = [] -_dirs_hash = [] - -_facets_dict_lock = 
Lock() -_public_dirs_lock = Lock() - - -def get_facets_dict(): - """Get the labels for all fields defined in the scheming file. - - Returns: - dict: A dictionary containing the labels for all fields defined in the scheming file. - """ - global _facets_dict - if not _facets_dict: - with _facets_dict_lock: - if not _facets_dict: - _facets_dict = {} - - schema = logic.get_action('scheming_dataset_schema_show')( - {}, - {'type': 'dataset'} - ) - - for item in schema['dataset_fields']: - _facets_dict[item['field_name']] = item['label'] - - for item in schema['resource_fields']: - _facets_dict[item['field_name']] = item['label'] - - return _facets_dict - -def get_public_dirs(): - """Get the list of public directories specified in the configuration file. - - Returns: - list: A list of public directories specified in the configuration file. - """ - global _public_dirs - - if not _public_dirs: - with _public_dirs_lock: - if not _public_dirs: - _public_dirs = config.get('extra_public_paths', '').split(',') - - return _public_dirs - -def public_file_exists(path): - """Check if a file exists in the public directories specified in the configuration file. - - Args: - path (str): The path of the file to check. - - Returns: - bool: True if the file exists in one of the public directories, False otherwise. - """ - #log.debug("Check if exists: {0}".format(path)) - file_hash = hashlib.sha512(path.encode('utf-8')).hexdigest() - - if file_hash in _files_hash: - return True - - public_dirs = get_public_dirs() - for i in range(len(public_dirs)): - public_path = os.path.join(public_dirs[i], path) - if os.path.isfile(public_path): - _files_hash.append(file_hash) - return True - - return False - -def public_dir_exists(path): - """Check if a directory exists in the public directories specified in the configuration file. - - Args: - path (str): The path of the directory to check. - - Returns: - bool: True if the directory exists in one of the public directories, False otherwise. 
- """ - dir_hash = hashlib.sha512(path.encode('utf-8')).hexdigest() - - if dir_hash in _dirs_hash: - return True - - public_dirs = get_public_dirs() - for i in range(len(public_dirs)): - public_path = os.path.join(public_dirs[i], path) - if os.path.isdir(public_path): - _dirs_hash.append(dir_hash) - return True - - return False - -def init_config(): - sdct_config.linkeddata_links = _load_yaml('linkeddata_links.yaml') - sdct_config.geometadata_links = _load_yaml('geometadata_links.yaml') - sdct_config.endpoints = _load_yaml(sdct_config.endpoints_yaml) - -def is_yaml(file): - """Check if a file has a YAML extension. - - Args: - file (str): The file name or path. - - Returns: - bool: True if the file has a .yaml or .yml extension, False otherwise. - """ - return file.lower().endswith(('.yaml', '.yml')) - -def _load_yaml(file): - """Load a YAML file, either from a module path or a default directory. - - Args: - file (str): The name of the YAML file to load. Can be a module path like "module:file.yaml". - - Returns: - dict: A dictionary containing the data from the YAML file, or an empty dictionary if the file is invalid or cannot be loaded. - """ - if not is_yaml(file): - log.error("The file {0} is not a valid YAML file".format(file)) - return {} - - yaml_data = _load_yaml_module_path(file) - if not yaml_data: - yaml_data = _load_default_yaml(file) - return yaml_data - -def _load_yaml_module_path(file): - """Load a YAML file from a module path. - - Given a path like "module:file.yaml", find the file relative to the import path of the module. - - Args: - file (str): The module path of the YAML file. - - Returns: - dict or None: A dictionary containing the data from the YAML file, or None if the module cannot be imported or the file cannot be loaded. 
- """ - log.debug('file: %s', file) - - if ':' not in file: - return None - - module, file_name = file.split(':', 1) - try: - m = __import__(module, fromlist=['']) - log.debug('m: %s', m) - log.debug('file_name: %s', os.path.join(os.path.dirname(inspect.getfile(m)), file_name)) - except ImportError: - log.error("Module {0} could not be imported".format(module)) - return None - - return _load_yaml_file(os.path.join(os.path.dirname(inspect.getfile(m)), file_name)) - -def _load_default_yaml(file): - """Load a YAML file from the 'codelists' directory of the schemingdcat extension. - - Args: - file (str): The name of the YAML file to load. - - Returns: - dict: A dictionary containing the data from the YAML file, or an empty dictionary if the file cannot be loaded. - """ - source_path = Path(__file__).resolve(True) - log.debug('source_path: %s', source_path) - return _load_yaml_file(source_path.parent.joinpath('codelists', file)) - -def _load_yaml_file(path): - """Load a YAML file from a given path. - - Args: - path (str): The file path of the YAML file. - - Returns: - dict: A dictionary containing the data from the YAML file, or an empty dictionary if the file cannot be loaded. - """ - yaml_data = {} - try: - if os.path.exists(path): - if watch_file: - watch_file(path) - with open(path, 'r') as f: - yaml_data = yaml.load(f, Loader=SafeLoader) - else: - log.error("The file {0} does not exist".format(path)) - except Exception as e: - log.error("Could not read configuration from {0}: {1}".format(path, e)) - return yaml_data - -def get_linked_data(id): - """Get linked data for a given identifier. - - Args: - id (str): The identifier to get linked data for. - - Returns: - list: A list of dictionaries containing linked data for the identifier. 
- """ - if sdct_config.debug: - linkeddata_links = _load_yaml('linkeddata_links.yaml') - else: - linkeddata_links = sdct_config.linkeddata_links - - data=[] - for name in CONTENT_TYPES: - data.append({ - 'name': name, - 'display_name': linkeddata_links.get(name,{}).get('display_name',CONTENT_TYPES[name]), - 'image_display_url': linkeddata_links.get(name,{}).get('image_display_url', None), - 'description': linkeddata_links.get(name,{}).get('description','Formats '+ CONTENT_TYPES[name]), - 'description_url': linkeddata_links.get(name,{}).get('description_url', None), - 'endpoint_data':{ - '_id': id, - '_format': name, - } - }) - - return data - -def get_geospatial_metadata(): - """Get geospatial metadata for CSW formats. - - Returns: - list: A list of dictionaries containing geospatial metadata for CSW formats. - """ - if sdct_config.debug: - geometadata_links = _load_yaml('geometadata_links.yaml') - else: - geometadata_links = sdct_config.geometadata_links - data=[] - for item in geometadata_links.get('csw_formats',{}): - data.append({ - 'name': item['name'], - 'display_name': item['display_name'], - 'image_display_url': item['image_display_url'], - 'description': item['description'], - 'description_url': item['description_url'], - 'url': (sdct_config.geometadata_link_domain or '') + geometadata_links['csw_url'].format(output_format=item['output_format'], schema=item['output_schema'], id='{id}') - }) - - return data - -def parse_json(value, default_value=None): - """ - Parses a JSON string and returns the resulting object. - If the input value is not a valid JSON string, returns the default value. - If the default value is not provided, returns the input value. - - Args: - value (str): The JSON string to parse. - default_value (any, optional): The default value to return if the input value is not a valid JSON string. - Defaults to None. - - Returns: - any: The parsed JSON object, or the default value if the input value is not a valid JSON string. 
- """ - try: - return json.loads(value) - except (ValueError, TypeError, AttributeError): - if default_value is not None: - return default_value - - # The json may already have been parsed and we have the value for the - # language already. - if isinstance(value, int): - # If the value is a number, it has been converted into an int - but - # we want a string here. - return str(value) +from ckan.common import config +import ckan.logic as logic +from ckanext.schemingdcat import config as sdct_config +import logging +import os +import inspect +import json +import hashlib +from threading import Lock +from ckanext.dcat.utils import CONTENT_TYPES +import yaml +from yaml.loader import SafeLoader +from pathlib import Path + +try: + from paste.reloader import watch_file +except ImportError: + watch_file = None + +log = logging.getLogger(__name__) + +_facets_dict = None +_public_dirs = None +_files_hash = [] +_dirs_hash = [] + +_facets_dict_lock = Lock() +_public_dirs_lock = Lock() + + +def get_facets_dict(): + """Get the labels for all fields defined in the scheming file. + + Returns: + dict: A dictionary containing the labels for all fields defined in the scheming file. + """ + global _facets_dict + if not _facets_dict: + with _facets_dict_lock: + if not _facets_dict: + _facets_dict = {} + + schema = logic.get_action('scheming_dataset_schema_show')( + {}, + {'type': 'dataset'} + ) + + for item in schema['dataset_fields']: + _facets_dict[item['field_name']] = item['label'] + + for item in schema['resource_fields']: + _facets_dict[item['field_name']] = item['label'] + + return _facets_dict + +def get_public_dirs(): + """Get the list of public directories specified in the configuration file. + + Returns: + list: A list of public directories specified in the configuration file. 
+ """ + global _public_dirs + + if not _public_dirs: + with _public_dirs_lock: + if not _public_dirs: + _public_dirs = config.get('extra_public_paths', '').split(',') + + return _public_dirs + +def public_file_exists(path): + """Check if a file exists in the public directories specified in the configuration file. + + Args: + path (str): The path of the file to check. + + Returns: + bool: True if the file exists in one of the public directories, False otherwise. + """ + #log.debug("Check if exists: {0}".format(path)) + file_hash = hashlib.sha512(path.encode('utf-8')).hexdigest() + + if file_hash in _files_hash: + return True + + public_dirs = get_public_dirs() + for i in range(len(public_dirs)): + public_path = os.path.join(public_dirs[i], path) + if os.path.isfile(public_path): + _files_hash.append(file_hash) + return True + + return False + +def public_dir_exists(path): + """Check if a directory exists in the public directories specified in the configuration file. + + Args: + path (str): The path of the directory to check. + + Returns: + bool: True if the directory exists in one of the public directories, False otherwise. + """ + dir_hash = hashlib.sha512(path.encode('utf-8')).hexdigest() + + if dir_hash in _dirs_hash: + return True + + public_dirs = get_public_dirs() + for i in range(len(public_dirs)): + public_path = os.path.join(public_dirs[i], path) + if os.path.isdir(public_path): + _dirs_hash.append(dir_hash) + return True + + return False + +def init_config(): + sdct_config.linkeddata_links = _load_yaml('linkeddata_links.yaml') + sdct_config.geometadata_links = _load_yaml('geometadata_links.yaml') + sdct_config.endpoints = _load_yaml(sdct_config.endpoints_yaml) + +def is_yaml(file): + """Check if a file has a YAML extension. + + Args: + file (str): The file name or path. + + Returns: + bool: True if the file has a .yaml or .yml extension, False otherwise. 
+ """ + return file.lower().endswith(('.yaml', '.yml')) + +def _load_yaml(file): + """Load a YAML file, either from a module path or a default directory. + + Args: + file (str): The name of the YAML file to load. Can be a module path like "module:file.yaml". + + Returns: + dict: A dictionary containing the data from the YAML file, or an empty dictionary if the file is invalid or cannot be loaded. + """ + if not is_yaml(file): + log.error("The file {0} is not a valid YAML file".format(file)) + return {} + + yaml_data = _load_yaml_module_path(file) + if not yaml_data: + yaml_data = _load_default_yaml(file) + return yaml_data + +def _load_yaml_module_path(file): + """Load a YAML file from a module path. + + Given a path like "module:file.yaml", find the file relative to the import path of the module. + + Args: + file (str): The module path of the YAML file. + + Returns: + dict or None: A dictionary containing the data from the YAML file, or None if the module cannot be imported or the file cannot be loaded. + """ + + if ':' not in file: + return None + + module, file_name = file.split(':', 1) + try: + m = __import__(module, fromlist=['']) + except ImportError: + log.error("Module {0} could not be imported".format(module)) + return None + + return _load_yaml_file(os.path.join(os.path.dirname(inspect.getfile(m)), file_name)) + +def _load_default_yaml(file): + """Load a YAML file from the 'codelists' directory of the schemingdcat extension. + + Args: + file (str): The name of the YAML file to load. + + Returns: + dict: A dictionary containing the data from the YAML file, or an empty dictionary if the file cannot be loaded. + """ + source_path = Path(__file__).resolve(True) + log.debug('source_path: %s', source_path) + return _load_yaml_file(source_path.parent.joinpath('codelists', file)) + +def _load_yaml_file(path): + """Load a YAML file from a given path. + + Args: + path (str): The file path of the YAML file. 
+ + Returns: + dict: A dictionary containing the data from the YAML file, or an empty dictionary if the file cannot be loaded. + """ + yaml_data = {} + try: + if os.path.exists(path): + if watch_file: + watch_file(path) + with open(path, 'r') as f: + yaml_data = yaml.load(f, Loader=SafeLoader) + else: + log.error("The file {0} does not exist".format(path)) + except Exception as e: + log.error("Could not read configuration from {0}: {1}".format(path, e)) + return yaml_data + +def get_linked_data(id): + """Get linked data for a given identifier. + + Args: + id (str): The identifier to get linked data for. + + Returns: + list: A list of dictionaries containing linked data for the identifier. + """ + if sdct_config.debug: + linkeddata_links = _load_yaml('linkeddata_links.yaml') + else: + linkeddata_links = sdct_config.linkeddata_links + + data=[] + for name in CONTENT_TYPES: + data.append({ + 'name': name, + 'display_name': linkeddata_links.get(name,{}).get('display_name',CONTENT_TYPES[name]), + 'image_display_url': linkeddata_links.get(name,{}).get('image_display_url', None), + 'description': linkeddata_links.get(name,{}).get('description','Formats '+ CONTENT_TYPES[name]), + 'description_url': linkeddata_links.get(name,{}).get('description_url', None), + 'endpoint_data':{ + '_id': id, + '_format': name, + } + }) + + return data + +def get_geospatial_metadata(): + """Get geospatial metadata for CSW formats. + + Returns: + list: A list of dictionaries containing geospatial metadata for CSW formats. 
+ """ + if sdct_config.debug: + geometadata_links = _load_yaml('geometadata_links.yaml') + else: + geometadata_links = sdct_config.geometadata_links + data=[] + for item in geometadata_links.get('csw_formats',{}): + data.append({ + 'name': item['name'], + 'display_name': item['display_name'], + 'image_display_url': item['image_display_url'], + 'description': item['description'], + 'description_url': item['description_url'], + 'url': (sdct_config.geometadata_link_domain or '') + geometadata_links['csw_url'].format(output_format=item['output_format'], schema=item['output_schema'], id='{id}') + }) + + return data + +def parse_json(value, default_value=None): + """ + Parses a JSON string and returns the resulting object. + If the input value is not a valid JSON string, returns the default value. + If the default value is not provided, returns the input value. + + Args: + value (str): The JSON string to parse. + default_value (any, optional): The default value to return if the input value is not a valid JSON string. + Defaults to None. + + Returns: + any: The parsed JSON object, or the default value if the input value is not a valid JSON string. + """ + try: + return json.loads(value) + except (ValueError, TypeError, AttributeError): + if default_value is not None: + return default_value + + # The json may already have been parsed and we have the value for the + # language already. + if isinstance(value, int): + # If the value is a number, it has been converted into an int - but + # we want a string here. + return str(value) return value \ No newline at end of file From 8ad1c7447f17c89317d5e26c3c08cf4af380608b Mon Sep 17 00:00:00 2001 From: mjanez <96422458+mjanez@users.noreply.github.com> Date: Mon, 29 Jul 2024 15:03:28 +0200 Subject: [PATCH 2/8] Improve ckan harvester - Add field_mapping for ckan harvester. - Add field_mapping for extras fields. 
- Add interfaces --- README.md | 128 +++++++++++++++++++++++- ckanext/schemingdcat/config.py | 6 ++ ckanext/schemingdcat/harvesters/base.py | 115 +++++++++++++-------- ckanext/schemingdcat/harvesters/ckan.py | 52 +++++++--- ckanext/schemingdcat/interfaces.py | 32 +++++- 5 files changed, 268 insertions(+), 65 deletions(-) diff --git a/README.md b/README.md index 526db978..b9845dd6 100644 --- a/README.md +++ b/README.md @@ -346,9 +346,9 @@ To use it, you need to add the `schemingdcat_ckan_harvester` plugin to your opti The Scheming DCAT CKAN Harvester supports the same configuration options as the [CKAN Harvester](https://github.com/ckan/ckanext-harvest#the-ckan-harvester), plus the following additional options: -* `dataset_field_mapping/distribution_field_mapping` (Optional): Mapping field names from local to remote instance, all info at: [Field mapping structure](#field-mapping-structure) +* `dataset_field_mapping/distribution_field_mapping` (Optional): Mapping field names from local to remote instance, all info at: [CKAN Harvester Field mapping structure](#field-mapping-structure) * `field_mapping_schema_version` (**Mandatory if exists** `dataset_field_mapping/distribution_field_mapping`): Schema version of the field_mapping to ensure compatibility with older schemas. The default is `2`. -* `schema` (Optional): The name of the schema to use for the harvested datasets. This is the `schema_name` as defined in the scheming file. The remote and local instances must have the same dataset schema. If not provided, the local instance schema will be used. +* `schema` (Optional): The name of the schema to use for the harvested datasets. This is the `schema_name` as defined in the scheming file. The remote and local instances must have the same dataset schema. If not provided, the `dataset_field_mapping/distribution_field_mapping` is needed to mapping fields. 
* `allow_harvest_datasets` (Optional): If `true`, the harvester will create new records even if the package type is from the harvest source. If `false`, the harvester will only create records that originate from the instance. Default is `false`. * `remote_orgs` (Optional): [WIP]. Only `only_local`. * `remote_groups` (Optional): [WIP]. Only `only_local`. @@ -400,10 +400,130 @@ And example configuration might look like this: // "field_value" extends the original list of values retrieved from the remote file for all records. "field_value": ["https://www.example.org/codelist/a","https://www.example.org/codelist/b", "https://www.example.org/codelist/c"] }, + "my_custom_field": { + // If you need to map a field in a remote dict to the "extras" dict, use the "extras_" prefix to indicate that the field is there. + "field_name": "extras_remote_custom_field" + }, }, } ``` +#### Field mapping structure +The `dataset_field_mapping`/`distribution_field_mapping` is structured as follows (multilingual version): + +```json +{ + ... + "field_mapping_schema_version": 2, + "/": { + "": { + "languages": { + "": { + <"field_value": "/">,/<"field_name": "/"> + }, + ... + }, + ... + }, + ... + } +} +``` + +* ``: The name of the field in the CKAN schema. + * ``: (Optional) The language code for multilingual fields. This should be a valid [ISO 639-1 language code](https://localizely.com/iso-639-1-list/). This is now nested under the `languages` key. +* `/`: (Optional) A fixed value or a list of fixed values that will be assigned to the field for all records. +* **Field labels**: Field name: + * `/`: (Optional) The name of the field in the remote file or a list of field names. + +For fields that are not multilingual, you can directly use `field_name` without the `languages` key. For example: + +```json +{ + ... + "field_mapping_schema_version": 2, + "/": { + "": { + <"field_value": "/">,/<"field_name": "/"> + }, + ... 
+ } +} +``` + +>[!IMPORTANT] +>The field mapping can be done either at the dataset level using `dataset_field_mapping` or at the resource level using `distribution_field_mapping`. The structure and options are the same for both. The `field_mapping_schema_version` is `2` by default, but needs to be set to avoid errors. + +#### Field Types +There are two types of fields that can be defined in the configuration: + +1. **Regular fields**: These fields have a field label to define the mapping or a fixed value for all its records. + - **Properties**: A field can have one of these three properties: + - **Fixed value fields (`field_value`)**: These fields have a fixed value that is assigned to all records. This is defined using the `field_value` property. If `field_value` is a list, `field_name` could be set at the same time, and the `field_value` extends the list obtained from the remote field. + - **Field labels**: Field name: + - **Name based fields (`field_name`)**: These fields are defined by their name in the Excel file. This is defined using the `field_name` property, or if you need to map a field in a remote dict to the `extras` dict, use the `extras_` prefix to indicate that the field is there. +2. **Multilingual Fields (`languages`)**: These fields have different values for different languages. Each language is represented as a separate object within the field object (`es`, `en`, ...). The language object can have `field_value` and `field_name` properties, just like a normal field. + + +**Example** +Here are some examples of configuration files: + + * *Field names*: With `field_name` to define the mapping based on names of attributes in the remote sheet (`my_title`, `org_identifier`, `keywords`). + ```json + { + "storage_type": "gspread", + "dataset_sheet": "Dataset", + "distribution_sheet": "Distribution", + + ... + # other properties + ... 
+ + "field_mapping_schema_version": 2, + "dataset_field_mapping": { + "title": { + "field_name": "my_title" + }, + "title_translated": { + "languages": { + "en": { + "field_name": "my_title-en" + }, + "de": { + "field_value": "" + }, + "es": { + "field_name": "my_title" + } + } + }, + "private": { + "field_name": "private" + }, + "theme": { + "field_name": ["theme", "theme_eu"] + }, + "tag_custom": { + "field_name": "keywords" + }, + "tag_string": { + "field_name": ["theme_a", "theme_b", "theme_c"] + }, + "theme_es": { + "field_value": "http://datos.gob.es/kos/sector-publico/sector/medio-ambiente" + }, + "tag_uri": { + "field_name": "keyword_uri", + // "field_value" extends the original list of values retrieved from the remote file for all records. + "field_value": ["https://www.example.org/codelist/a","https://www.example.org/codelist/b", "https://www.example.org/codelist/c"] + }, + "my_custom_field": { + // If you need to map a field in a remote dict to the "extras" dict, use the "extras_" prefix to indicate that the field is there. + "field_name": "extras_remote_custom_field" + } + } + } + ``` ###TODO: Scheming DCAT CSW INSPIRE Harvester A harvester for remote CSW catalogues using the INSPIRE ISO 19139 metadata profile. This harvester is a subclass of the CSW Harvester provided by `ckanext-spatial` and is designed to work with the `schemingdcat` plugin to provide a more versatile and customizable harvester for CSW endpoints and GeoDCAT-AP CKAN instances. @@ -429,7 +549,7 @@ Remote Google Sheet/Onedrive Excel metadata upload Harvester supports the follow * `storage_type` - **Mandatory**: The type of storage to use for the harvested datasets as `onedrive` or `gspread`. Default is `onedrive`. * `dataset_sheet` - **Mandatory**: The name of the sheet in the Excel file that contains the dataset records. * `field_mapping_schema_version`: Schema version of the field_mapping to ensure compatibility with older schemas. The default is `2`. 
-* `dataset_field_mapping/distribution_field_mapping`: Mapping field names from local to remote instance, all info at: [Field mapping structure](#field-mapping-structure) +* `dataset_field_mapping/distribution_field_mapping`: Mapping field names from local to remote instance, all info at: [Field mapping structure](#field-mapping-structure-sheets-harvester) * `credentials`: The `credentials` parameter should be used to provide the authentication credentials. The credentials depends on the `storage_type` used. * For `onedrive`: The credentials parameter should be a dictionary with the following keys: `username`: A string representing the username. `password`: A string representing the password. * For `gspread` or `gdrive`: The credentials parameter should be a string containing the credentials in `JSON` format. You can obtain the credentials by following the instructions provided in the [Google Workspace documentation.](https://developers.google.com/workspace/guides/create-credentials?hl=es-419) @@ -452,7 +572,7 @@ Remote Google Sheet/Onedrive Excel metadata upload Harvester supports the follow * `clean_tags`: By default, tags are stripped of accent characters, spaces and capital letters for display. Setting this option to `False` will keep the original tag names. Default is `True`. * `source_date_format`: By default the harvester uses [`dateutil`](https://dateutil.readthedocs.io/en/stable/parser.html) to parse the date, but if the date format of the strings is particularly different you can use this parameter to specify the format, e.g. `%d/%m/%Y`. 
Accepted formats are: [COMMON_DATE_FORMATS](https://github.com/mjanez/ckanext-schemingdcat/blob/main/ckanext/schemingdcat/config.py#L185-L200) -#### Field mapping structure +#### Field mapping structure (Sheets harvester) The `dataset_field_mapping`/`distribution_field_mapping` is structured as follows (multilingual version): ```json diff --git a/ckanext/schemingdcat/config.py b/ckanext/schemingdcat/config.py index 90f555ba..c00b1b43 100644 --- a/ckanext/schemingdcat/config.py +++ b/ckanext/schemingdcat/config.py @@ -20,6 +20,12 @@ mimetype_base_uri = 'http://www.iana.org/assignments/media-types' slugify_pat = re.compile('[^a-zA-Z0-9]') +# schemingdcat field_mapping extras field_names +field_mapping_extras_prefix_symbol = '_' +field_mapping_extras_prefix_list = 'extras' +field_mapping_extras_prefix = field_mapping_extras_prefix_list + field_mapping_extras_prefix_symbol + + # Default DCAT metadata configuration OGC2CKAN_HARVESTER_MD_CONFIG = { 'access_rights': 'http://inspire.ec.europa.eu/metadata-codelist/LimitationsOnPublicAccess/noLimitations', diff --git a/ckanext/schemingdcat/harvesters/base.py b/ckanext/schemingdcat/harvesters/base.py index a4c87863..eaa5d3d5 100644 --- a/ckanext/schemingdcat/harvesters/base.py +++ b/ckanext/schemingdcat/harvesters/base.py @@ -43,7 +43,10 @@ URL_REGEX, INVALID_CHARS, ACCENT_MAP, - slugify_pat + slugify_pat, + field_mapping_extras_prefix, + field_mapping_extras_prefix_symbol, + field_mapping_extras_prefix_list ) log = logging.getLogger(__name__) @@ -496,18 +499,39 @@ def _standardize_ckan_dict_from_field_mapping(self, dataset, field_mapping): """ def normalize_key(key): """ - Helper function to normalize the key by converting to lowercase and replacing non-alphanumeric characters with underscores. + Helper function to normalize the key by removing accents, converting to lowercase, replacing non-alphanumeric characters with '-', and trimming spaces. 
""" - return slugify_pat.sub('_', key.lower()) + try: + key = key.strip() + + # Remove accents + norm_key = key.translate(ACCENT_MAP) + + # Replace non-alphanumeric characters with underscores + normalized_key = slugify_pat.sub('-', norm_key.lower()) + + #log.debug('key: %s normalize key: %s', key, normalized_key) + + return normalized_key + + except AttributeError: + # Manejar el caso donde 'key' no es una cadena + raise ValueError("The provided key must be a string") + + except Exception as e: + # Manejar cualquier otra excepción + raise RuntimeError(f"An unexpected error occurred: {e}") def get_extra_value(extras, key): """ Helper function to get the value from the extras list where the key matches (case insensitive and normalized). - """ + """ normalized_key = normalize_key(key) for item in extras: if normalize_key(item['key']) == normalized_key: + #log.debug('"extras" dict key: %s - normalized: %s', key, normalized_key) return item['value'] + return None def apply_field_mapping(d, mapping): @@ -515,9 +539,9 @@ def apply_field_mapping(d, mapping): for local_field, remote_info in mapping.items(): if 'field_name' in remote_info: remote_field = remote_info['field_name'] - if remote_field and remote_field.startswith('extras.'): - extra_key = remote_field.split('.', 1)[1] - extra_value = get_extra_value(d.get('extras', []), extra_key) + if remote_field and remote_field.startswith(field_mapping_extras_prefix): + extra_key = remote_field.split(field_mapping_extras_prefix_symbol, 1)[1] + extra_value = get_extra_value(d.get(field_mapping_extras_prefix_list, []), extra_key) if extra_value is not None: new_dict[local_field] = extra_value elif remote_field in d: @@ -528,9 +552,9 @@ def apply_field_mapping(d, mapping): for lang, lang_info in remote_info['languages'].items(): if 'field_name' in lang_info: remote_field = lang_info['field_name'] - if remote_field and remote_field.startswith('extras.'): - extra_key = remote_field.split('.', 1)[1] - extra_value = 
get_extra_value(d.get('extras', []), extra_key) + if remote_field and remote_field.startswith(field_mapping_extras_prefix): + extra_key = remote_field.split(field_mapping_extras_prefix_symbol, 1)[1] + extra_value = get_extra_value(d.get(field_mapping_extras_prefix_list, []), extra_key) if extra_value is not None: if local_field not in new_dict: new_dict[local_field] = {} @@ -544,7 +568,7 @@ def apply_field_mapping(d, mapping): new_dict[local_field] = {} new_dict[local_field][lang] = lang_info['field_value'] return new_dict - + # Apply dataset field mapping dataset_field_mapping = field_mapping.get('dataset_field_mapping', {}) standardized_dataset = apply_field_mapping(dataset, dataset_field_mapping) @@ -816,9 +840,9 @@ def get_mapped_fields(fields, field_mapping): for field in self._remote_schema["resource_fields"] ) else: - log.warning("Failed to retrieve remote schema from: %s. Using local schema by default.", remote_ckan_base_url) - remote_datasets_colnames = set() - remote_distributions_colnames = set() + log.warning("Failed to retrieve remote schema from: %s. Using local schema and config field_mapping by default.", remote_ckan_base_url) + remote_datasets_colnames = set(remote_dataset_field_mapping.keys()) + remote_distributions_colnames = set(remote_distribution_field_mapping.keys()) elif remote_dataset_field_names is not None: log.debug( @@ -1056,16 +1080,16 @@ def _check_existing_package_by_ids(self, package_dict): def _set_translated_fields(self, package_dict): """ Sets translated fields in the package dictionary based on the mapped schema. - + Args: package_dict (dict): The package dictionary to update with translated fields. - + Returns: dict: The updated package dictionary. - + Raises: ReadError: If there is an error translating the dataset. 
- + """ if ( not hasattr(self, "_mapped_schema") @@ -1079,26 +1103,24 @@ def _set_translated_fields(self, package_dict): if field.get("modified", True): local_field_name = field["local_field_name"] remote_field_name = field["remote_field_name"] - - translated_fields["dataset_fields"].append( - local_field_name - ) - + + translated_fields["dataset_fields"].append(local_field_name) + if isinstance(remote_field_name, dict): package_dict[local_field_name] = { - lang: package_dict.get(name, None) + lang: package_dict.get(name, package_dict.get(local_field_name, {}).get(lang)) for lang, name in remote_field_name.items() } if local_field_name.endswith('_translated'): if self._local_required_lang in remote_field_name: - package_dict[local_field_name.replace('_translated', '')] = package_dict.get(remote_field_name[self._local_required_lang], None) + package_dict[local_field_name.replace('_translated', '')] = package_dict.get(remote_field_name[self._local_required_lang], package_dict.get(local_field_name.replace('_translated', ''))) else: raise ValueError("Missing translated field: %s for required language: %s" % (remote_field_name, self._local_required_lang)) else: if remote_field_name not in package_dict: raise KeyError(f"Field {remote_field_name} does not exist in the local schema") - package_dict[local_field_name] = package_dict.get(remote_field_name, None) - + package_dict[local_field_name] = package_dict.get(remote_field_name, package_dict.get(local_field_name)) + if package_dict["resources"]: for i, resource in enumerate(package_dict["resources"]): if self._mapped_schema and "resource_fields" in self._mapped_schema and self._mapped_schema["resource_fields"] is not None: @@ -1106,36 +1128,38 @@ def _set_translated_fields(self, package_dict): if field.get("modified", True): local_field_name = field["local_field_name"] remote_field_name = field["remote_field_name"] - - translated_fields["resource_fields"].append( - local_field_name - ) - + + 
translated_fields["resource_fields"].append(local_field_name) + if isinstance(remote_field_name, dict): - package_dict[local_field_name] = { - lang: package_dict.get(name, None) + resource[local_field_name] = { + lang: resource.get(name, resource.get(local_field_name, {}).get(lang)) for lang, name in remote_field_name.items() } if local_field_name.endswith('_translated'): if self._local_required_lang in remote_field_name: - package_dict[local_field_name.replace('_translated', '')] = package_dict.get(remote_field_name[self._local_required_lang], None) + resource[local_field_name.replace('_translated', '')] = resource.get(remote_field_name[self._local_required_lang], resource.get(local_field_name.replace('_translated', ''))) else: raise ValueError("Missing translated field: %s for required language: %s" % (remote_field_name, self._local_required_lang)) - + else: + if remote_field_name not in resource: + raise KeyError(f"Field {remote_field_name} does not exist in the local schema") + resource[local_field_name] = resource.get(remote_field_name, resource.get(local_field_name)) + else: log.warning("self._mapped_schema['resource_fields'] is None, skipping resource fields translation.") - + # Update the resource in package_dict package_dict["resources"][i] = resource - + #log.debug('Translated fields: %s', translated_fields) - + except Exception as e: raise ReadError( "Error translating dataset: %s. Error: %s" % (package_dict["title"], str(e)) ) - + return package_dict # TODO: Fix this method @@ -1361,9 +1385,12 @@ def _update_package_dict_with_config_mapping_default_values(self, package_dict): # Create default values dict from config mappings. try: self.create_default_values(field_mappings) - - except ReadError as e: - self._save_gather_error('Error generating default values for dataset/distribution config field mappings: {0}'.format(e), harvest_job) + + except Exception as e: + raise ReadError( + "Error generating default values from config field mappings. 
Error: %s" + % (str(e)) + ) def update_dict_with_defaults(target_dict, default_values): for key, default_value in default_values.items(): @@ -1430,7 +1457,7 @@ def _set_package_dict_default_values(self, package_dict, harvest_object, context if default_extras: override_extras = self.config.get('override_extras',False) for key,value in default_extras.items(): - log.debug('Processing extra %s', key) + #log.debug('Processing extra %s', key) if not key in extras or override_extras: # Look for replacement strings if isinstance(value,six.string_types): diff --git a/ckanext/schemingdcat/harvesters/ckan.py b/ckanext/schemingdcat/harvesters/ckan.py index c070c205..9f6cbb2a 100644 --- a/ckanext/schemingdcat/harvesters/ckan.py +++ b/ckanext/schemingdcat/harvesters/ckan.py @@ -7,7 +7,7 @@ from urllib.parse import urlencode from ckanext.harvest.model import HarvestObject import datetime -from ckan.plugins import toolkit +import ckan.plugins as p import requests from requests.exceptions import HTTPError, RequestException @@ -15,7 +15,6 @@ import ckan.logic as logic import uuid - from ckanext.schemingdcat.harvesters.base import ( SchemingDCATHarvester, RemoteSchemaError, @@ -25,6 +24,7 @@ RemoteResourceError ) from ckanext.schemingdcat.lib.field_mapping import FieldMappingValidator +from ckanext.schemingdcat.interfaces import ISchemingDCATHarvester log = logging.getLogger(__name__) @@ -107,7 +107,9 @@ def validate_config(self, config): ) else: raise ValueError( - f"schema should match the local schema: {self._local_schema_name}" + f"Config schema should match the local schema: '{self._local_schema_name}'. " + f"Check the remote schema with CKAN API: {{ckan_site_url}}/api/3/action/scheming_dataset_schema_show?type=dataset, " + f"or specify the local schema, and the harvester will try to map the fields." 
) config = json.dumps({**config_obj, "schema": schema.lower().strip()}) @@ -181,7 +183,7 @@ def gather_stage(self, harvest_job): log.debug('In SchemingDCATCKANHarvester gather_stage with harvest source: %s and URL: %s', harvest_source_title, remote_ckan_base_url) # Get config options - toolkit.requires_ckan_version(min_version="2.0") + p.toolkit.requires_ckan_version(min_version="2.0") get_all_packages = True self._set_config(harvest_job.source.config) @@ -326,23 +328,20 @@ def gather_stage(self, harvest_job): # Check if the content_dicts colnames correspond to the local schema try: + + #log.debug('RAW package_dict: %s', pkg_dict) + #log.debug('content_dicts: %s', content_dicts) # Standardizes the field names pkg_dict = self._standardize_ckan_dict_from_field_mapping(pkg_dict, field_mappings) - log.debug('Standardized package dict: %s', pkg_dict) + + #log.debug('Standardized package dict: %s', pkg_dict) except RemoteSchemaError as e: self._save_gather_error('Error standarize remote dataset: {0}'.format(e), harvest_job) return [] package_ids.add(pkg_dict["id"]) - # Set translated fields - pkg_dict = self._set_translated_fields(pkg_dict) - log.debug( - "Creating HarvestObject for %s %s", pkg_dict["name"], pkg_dict["id"] - ) - log.debug('Translated package dict: %s', pkg_dict) - obj = HarvestObject( guid=pkg_dict["id"], job=harvest_job, content=json.dumps(pkg_dict) ) @@ -451,10 +450,16 @@ def modify_package_dict(self, package_dict, harvest_object): """ # Clean up any existing extras already in package_dict package_dict = self._remove_duplicate_keys_in_extras(package_dict) - + + # Set translated fields + package_dict = self._set_translated_fields(package_dict) + # Check basic fields without translations package_dict = self._fill_translated_properties(package_dict) + # Using self._dataset_default_values and self._distribution_default_values based on config mappings + package_dict = self._update_package_dict_with_config_mapping_default_values(package_dict) + return 
package_dict def import_stage(self, harvest_object): @@ -490,7 +495,7 @@ def import_stage(self, harvest_object): try: package_dict = json.loads(harvest_object.content) - + # Add default values: tags, groups, etc. package_dict = self._set_package_dict_default_values( package_dict, harvest_object, base_context @@ -575,13 +580,28 @@ def import_stage(self, harvest_object): # key. resource.pop("revision_id", None) - log.debug('package_dict BEFORE MODIFY: %s', package_dict) + # before_cleaning interface + for harvester in p.PluginImplementations(ISchemingDCATHarvester): + if hasattr(harvester, 'before_modify_package_dict'): + package_dict, before_modify_package_dict_errors = harvester.before_modify_package_dict(package_dict) + + for err in before_modify_package_dict_errors: + self._save_object_error(f'before_modify_package_dict error: {err}', harvest_object, 'Import') + return False + package_dict = self.modify_package_dict(package_dict, harvest_object) result = self._create_or_update_package( package_dict, harvest_object, package_dict_form="package_show" ) - log.debug('package_dict AFTER MODIFY: %s', package_dict) + # after_modify_package_dict interface + for harvester in p.PluginImplementations(ISchemingDCATHarvester): + if hasattr(harvester, 'after_modify_package_dict'): + package_dict, after_modify_package_dict_errors = harvester.after_modify_package_dict(package_dict) + + for err in after_modify_package_dict_errors: + self._save_object_error(f'after_modify_package_dict error: {err}', harvest_object, 'Import') + return False # Log package_dict, package dict is a dict log.debug("Package create or update: %s", result) diff --git a/ckanext/schemingdcat/interfaces.py b/ckanext/schemingdcat/interfaces.py index af54ad39..c318efd7 100644 --- a/ckanext/schemingdcat/interfaces.py +++ b/ckanext/schemingdcat/interfaces.py @@ -236,4 +236,34 @@ def update_package_schema_for_update(self, package_schema): Returns: object: The updated package_schema object """ - return 
package_schema \ No newline at end of file + return package_schema + + def before_modify_package_dict(self, package_dict): + """ + Interface called just before modifying the package_dict in the CKAN harvester. + + Args: + package_dict (dict): The package dictionary that is about to be updated. + + Returns: + tuple: A tuple with two items: + * The updated package dictionary. + * A list of error messages. These will get stored as import + errors by the harvester + """ + return package_dict, [] + + def after_modify_package_dict(self, package_dict): + """ + Interface called just after modifying the package_dict in the CKAN harvester. + + Args: + package_dict (dict): The package dictionary that has been updated. + + Returns: + tuple: A tuple with two items: + * The updated package dictionary. + * A list of error messages. These will get stored as import + errors by the harvester + """ + return package_dict, [] From 3beccce0cecf2d680e17f14b06adaaf87b9388ce Mon Sep 17 00:00:00 2001 From: mjanez <96422458+mjanez@users.noreply.github.com> Date: Mon, 29 Jul 2024 18:07:31 +0200 Subject: [PATCH 3/8] Improve ckan harvester - Fixduplicate list default_values from config. --- ckanext/schemingdcat/config.py | 7 ++----- ckanext/schemingdcat/harvesters/base.py | 22 +++++++++++++++------- ckanext/schemingdcat/harvesters/ckan.py | 3 --- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/ckanext/schemingdcat/config.py b/ckanext/schemingdcat/config.py index c00b1b43..01a5a0e9 100644 --- a/ckanext/schemingdcat/config.py +++ b/ckanext/schemingdcat/config.py @@ -19,12 +19,9 @@ metadata_templates_search_identifier = 'schemingdcat_xls-template' mimetype_base_uri = 'http://www.iana.org/assignments/media-types' slugify_pat = re.compile('[^a-zA-Z0-9]') - -# schemingdcat field_mapping extras field_names +# schemingdcat field_mapping extras prefix, e.g. 
custom_field = extras_custom_field +field_mapping_extras_prefix = 'extras' field_mapping_extras_prefix_symbol = '_' -field_mapping_extras_prefix_list = 'extras' -field_mapping_extras_prefix = field_mapping_extras_prefix_list + field_mapping_extras_prefix_symbol - # Default DCAT metadata configuration OGC2CKAN_HARVESTER_MD_CONFIG = { diff --git a/ckanext/schemingdcat/harvesters/base.py b/ckanext/schemingdcat/harvesters/base.py index eaa5d3d5..475a4745 100644 --- a/ckanext/schemingdcat/harvesters/base.py +++ b/ckanext/schemingdcat/harvesters/base.py @@ -46,7 +46,6 @@ slugify_pat, field_mapping_extras_prefix, field_mapping_extras_prefix_symbol, - field_mapping_extras_prefix_list ) log = logging.getLogger(__name__) @@ -540,8 +539,8 @@ def apply_field_mapping(d, mapping): if 'field_name' in remote_info: remote_field = remote_info['field_name'] if remote_field and remote_field.startswith(field_mapping_extras_prefix): - extra_key = remote_field.split(field_mapping_extras_prefix_symbol, 1)[1] - extra_value = get_extra_value(d.get(field_mapping_extras_prefix_list, []), extra_key) + extra_key = remote_field.split(field_mapping_extras_prefix + field_mapping_extras_prefix_symbol, 1)[1] + extra_value = get_extra_value(d.get(field_mapping_extras_prefix, []), extra_key) if extra_value is not None: new_dict[local_field] = extra_value elif remote_field in d: @@ -553,8 +552,8 @@ def apply_field_mapping(d, mapping): if 'field_name' in lang_info: remote_field = lang_info['field_name'] if remote_field and remote_field.startswith(field_mapping_extras_prefix): - extra_key = remote_field.split(field_mapping_extras_prefix_symbol, 1)[1] - extra_value = get_extra_value(d.get(field_mapping_extras_prefix_list, []), extra_key) + extra_key = remote_field.split(field_mapping_extras_prefix + field_mapping_extras_prefix_symbol, 1)[1] + extra_value = get_extra_value(d.get(field_mapping_extras_prefix, []), extra_key) if extra_value is not None: if local_field not in new_dict: new_dict[local_field] = 
{} @@ -1398,6 +1397,7 @@ def update_dict_with_defaults(target_dict, default_values): target_dict[key] = default_value elif isinstance(target_dict[key], list) and isinstance(default_value, list): target_dict[key].extend(default_value) + target_dict[key] = list(set(target_dict[key])) elif isinstance(default_value, dict): target_dict[key] = target_dict.get(key, {}) for subkey, subvalue in default_value.items(): @@ -1492,8 +1492,8 @@ def _set_package_dict_default_values(self, package_dict, harvest_object, context # Prepare tags package_dict, existing_tags_ids = self._set_ckan_tags(package_dict) - #TODO: Fix existing_tags_ids - log.debug('TODO:existing_tags_ids: %s', existing_tags_ids) + # Existing_tags_ids + log.debug('existing_tags_ids: %s', existing_tags_ids) # Set default tags if needed default_tags = self.config.get("default_tags", []) @@ -1516,6 +1516,14 @@ def _set_package_dict_default_values(self, package_dict, harvest_object, context package_dict["groups"] = cleaned_groups + # Remove duplicates in list or dictionary fields + for key, value in package_dict.items(): + if key not in ['groups', 'resources', 'tags']: + if isinstance(value, list): + package_dict[key] = list({json.dumps(item): item for item in value}.values()) + elif isinstance(value, dict): + package_dict[key] = {k: v for k, v in value.items()} + # log.debug('package_dict default values: %s', package_dict) return package_dict diff --git a/ckanext/schemingdcat/harvesters/ckan.py b/ckanext/schemingdcat/harvesters/ckan.py index 9f6cbb2a..39410827 100644 --- a/ckanext/schemingdcat/harvesters/ckan.py +++ b/ckanext/schemingdcat/harvesters/ckan.py @@ -630,9 +630,6 @@ def get_package_dict(self, harvest_object, context, package_dict=None): Returns: dict: The package dictionary with translated fields and default values set. """ - # Add default values: tags, groups, etc. 
- package_dict = self._set_package_dict_default_values(package_dict, harvest_object, context) - # Update unique ids for resource in package_dict['resources']: resource['alternate_identifier'] = resource['id'] From 36a298e9b3f2671c477dbf02da6280d61284aa07 Mon Sep 17 00:00:00 2001 From: mjanez <96422458+mjanez@users.noreply.github.com> Date: Tue, 30 Jul 2024 10:15:40 +0200 Subject: [PATCH 4/8] Fix bug when schemingdcat.endpoints_yaml is None --- ckanext/schemingdcat/plugin.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ckanext/schemingdcat/plugin.py b/ckanext/schemingdcat/plugin.py index dab1f085..1b6605b3 100644 --- a/ckanext/schemingdcat/plugin.py +++ b/ckanext/schemingdcat/plugin.py @@ -70,7 +70,7 @@ def update_config(self, config_): sdct_config.default_package_item_icon = config_.get( "schemingdcat.default_package_item_icon", sdct_config.default_package_item_icon - ) + ) or sdct_config.default_package_item_icon sdct_config.default_package_item_show_spatial = toolkit.asbool( config_.get( @@ -86,11 +86,11 @@ def update_config(self, config_): sdct_config.metadata_templates_search_identifier = config_.get( "schemingdcat.metadata_templates_search_identifier", sdct_config.metadata_templates_search_identifier - ) + ) or sdct_config.metadata_templates_search_identifier sdct_config.endpoints_yaml = config_.get( - "schemingdcat.endpoints_yaml", sdct_config.endpoints_yaml - ) + "schemingdcat.endpoints_yaml", sdct_config.endpoints_yaml + ) or sdct_config.endpoints_yaml sdct_config.debug = toolkit.asbool(config_.get("debug", sdct_config.debug)) From 32d790181001f92036183cf1607f865c4d9c5ce5 Mon Sep 17 00:00:00 2001 From: mjanez <96422458+mjanez@users.noreply.github.com> Date: Tue, 30 Jul 2024 13:13:31 +0200 Subject: [PATCH 5/8] Fix file_size in resource metadata info --- ckanext/schemingdcat/helpers.py | 1 - .../templates/schemingdcat/display_snippets/file_size.html | 2 +- .../schemingdcat/package/snippets/resource_extended_info.html | 2 +- 3 
files changed, 2 insertions(+), 3 deletions(-) diff --git a/ckanext/schemingdcat/helpers.py b/ckanext/schemingdcat/helpers.py index 5e61bfc1..f6b3162d 100644 --- a/ckanext/schemingdcat/helpers.py +++ b/ckanext/schemingdcat/helpers.py @@ -1071,7 +1071,6 @@ def schemingdcat_get_readable_file_size(num, suffix="B"): except ValueError: return False - @helper def schemingdcat_get_group_or_org(id, type="group"): """ diff --git a/ckanext/schemingdcat/templates/schemingdcat/display_snippets/file_size.html b/ckanext/schemingdcat/templates/schemingdcat/display_snippets/file_size.html index 33214654..595bf400 100644 --- a/ckanext/schemingdcat/templates/schemingdcat/display_snippets/file_size.html +++ b/ckanext/schemingdcat/templates/schemingdcat/display_snippets/file_size.html @@ -1 +1 @@ -{{ h.schemingdcat_get_readable_file_size(data[field.field_name]) or '-' }} \ No newline at end of file +{{ h.localised_filesize(data[field.field_name]) or '-' }} \ No newline at end of file diff --git a/ckanext/schemingdcat/templates/schemingdcat/package/snippets/resource_extended_info.html b/ckanext/schemingdcat/templates/schemingdcat/package/snippets/resource_extended_info.html index 1d06d733..fa20f205 100644 --- a/ckanext/schemingdcat/templates/schemingdcat/package/snippets/resource_extended_info.html +++ b/ckanext/schemingdcat/templates/schemingdcat/package/snippets/resource_extended_info.html @@ -102,7 +102,7 @@ {{ h.scheming_language_text(field.label) }} - {{ res[field_name] }} + {{ h.localised_filesize(res[field_name]) }} {% endblock %} From 3952322c036b7eeb4d3b2407448c943a842cf521 Mon Sep 17 00:00:00 2001 From: mjanez <96422458+mjanez@users.noreply.github.com> Date: Wed, 31 Jul 2024 10:20:15 +0200 Subject: [PATCH 6/8] Fix CKAN harvester search functionality - Now return results of all pages, not only the first x rows. 
--- ckanext/schemingdcat/harvesters/ckan.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ckanext/schemingdcat/harvesters/ckan.py b/ckanext/schemingdcat/harvesters/ckan.py index 39410827..de1b1922 100644 --- a/ckanext/schemingdcat/harvesters/ckan.py +++ b/ckanext/schemingdcat/harvesters/ckan.py @@ -383,10 +383,11 @@ def _search_for_datasets(self, remote_ckan_base_url, fq_terms=None): pkg_dicts = [] pkg_ids = set() previous_content = None - url = base_search_url + "?" + urlencode(params) - log.debug("Searching for CKAN datasets: %s", url) while True: + url = base_search_url + "?" + urlencode(params) + log.debug("Searching for CKAN datasets: %s", url) + try: content = self._get_content(url) except ContentFetchError as e: @@ -429,6 +430,8 @@ def _search_for_datasets(self, remote_ckan_base_url, fq_terms=None): params["start"] = str(int(params["start"]) + int(params["rows"])) + log.debug('Number of elements in remote CKAN: %s', len(pkg_dicts)) + return pkg_dicts def fetch_stage(self, harvest_object): From a226240480408ed7464d5b1b4e455a23d13ef8a0 Mon Sep 17 00:00:00 2001 From: mjanez <96422458+mjanez@users.noreply.github.com> Date: Wed, 31 Jul 2024 13:08:46 +0200 Subject: [PATCH 7/8] Improve clean_tags The clean_tags option is added to the configuration file, allowing users to control whether tags should be stripped of accent characters, spaces, and capital letters for display. --- README.md | 7 ++- ckanext/schemingdcat/config.py | 6 ++ ckanext/schemingdcat/harvesters/base.py | 80 +++++++++++++++++++------ 3 files changed, 72 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index b9845dd6..cb403ccd 100644 --- a/README.md +++ b/README.md @@ -352,12 +352,14 @@ The Scheming DCAT CKAN Harvester supports the same configuration options as the * `allow_harvest_datasets` (Optional): If `true`, the harvester will create new records even if the package type is from the harvest source. 
If `false`, the harvester will only create records that originate from the instance. Default is `false`. * `remote_orgs` (Optional): [WIP]. Only `only_local`. * `remote_groups` (Optional): [WIP]. Only `only_local`. +* `clean_tags`: By default, tags are stripped of accent characters, spaces and capital letters for display. Setting this option to `False` will keep the original tag names. Default is `True`. And example configuration might look like this: ```json { "api_version": 2, + "clean_tags": false, "default_tags": [{"name": "inspire"}, {"name": "geodcatap"}], "default_groups": ["transportation", "hb"], "default_extras": {"encoding":"utf8", "harvest_description":"Harvesting from Sample Catalog", "harvest_url": "{harvest_source_url}/dataset/{dataset_id}"}, @@ -471,9 +473,8 @@ Here are some examples of configuration files: ```json { - "storage_type": "gspread", - "dataset_sheet": "Dataset", - "distribution_sheet": "Distribution", + "api_version": 2, + "clean_tags": false, ... # other properties diff --git a/ckanext/schemingdcat/config.py b/ckanext/schemingdcat/config.py index 01a5a0e9..fa6987d4 100644 --- a/ckanext/schemingdcat/config.py +++ b/ckanext/schemingdcat/config.py @@ -274,6 +274,12 @@ "ñ": "ñ", }) +# CKAN tags fields to be searched in the harvester +AUX_TAG_FIELDS = [ + 'tag_string', + 'keywords' +] + URL_FIELD_NAMES = { 'dataset': ['dcat_type', 'theme_es', 'language', 'topic', 'maintainer_url', 'tag_uri', 'contact_uri', 'contact_url', 'publisher_identifier', 'publisher_uri', 'publisher_url', 'publisher_type', 'maintainer_uri', 'maintainer_url', 'author_uri', 'author_url', 'conforms_to', 'theme', 'reference_system', 'spatial_uri', 'representation_type', 'license_id', 'access_rights', 'graphic_overview', 'frequency', 'hvd_category'], diff --git a/ckanext/schemingdcat/harvesters/base.py b/ckanext/schemingdcat/harvesters/base.py index 475a4745..2861beac 100644 --- a/ckanext/schemingdcat/harvesters/base.py +++ b/ckanext/schemingdcat/harvesters/base.py @@ 
-43,6 +43,7 @@ URL_REGEX, INVALID_CHARS, ACCENT_MAP, + AUX_TAG_FIELDS, slugify_pat, field_mapping_extras_prefix, field_mapping_extras_prefix_symbol, @@ -1489,11 +1490,11 @@ def _set_package_dict_default_values(self, package_dict, harvest_object, context # Using self._dataset_default_values and self._distribution_default_values based on config mappings package_dict = self._update_package_dict_with_config_mapping_default_values(package_dict) - # Prepare tags - package_dict, existing_tags_ids = self._set_ckan_tags(package_dict) + # Prepare tags + package_dict, existing_tags_ids = self._set_ckan_tags(package_dict, clean_tags=self.config.get("clean_tags", True)) # Existing_tags_ids - log.debug('existing_tags_ids: %s', existing_tags_ids) + #log.debug('existing_tags_ids: %s', existing_tags_ids) # Set default tags if needed default_tags = self.config.get("default_tags", []) @@ -1559,13 +1560,14 @@ def _update_resource_dict(self, resource): return self._get_ckan_format(resource) - def _set_ckan_tags(self, package_dict, tag_fields=["tag_string", "keywords"]): + def _set_ckan_tags(self, package_dict, tag_fields=AUX_TAG_FIELDS, clean_tags=True): """ Process the tags from the provided sources. Args: package_dict (dict): The package dictionary containing the information. tag_fields (list): The list of sources to check for tags. Default: ['tag_string', 'keywords'] + clean_tags (bool): By default, tags are stripped of accent characters, spaces and capital letters for display. Setting this option to `False` will keep the original tag names. Default is `True`. Returns: list: A list of processed tags. 
@@ -1586,7 +1588,9 @@ def _set_ckan_tags(self, package_dict, tag_fields=["tag_string", "keywords"]): tags = [{"name": tags}] else: raise ValueError("Unsupported type for tags") - cleaned_tags = self._clean_tags(tags) + + # Clean tags + cleaned_tags = self._clean_tags(tags=tags, clean_tag_names=clean_tags, existing_dataset=True) for tag in cleaned_tags: if tag["name"] not in existing_tags_ids: @@ -1728,31 +1732,53 @@ def _get_ckan_format(self, resource): #log.debug('resource: %s', resource) return resource - def _clean_tags(self, tags): + def _clean_tags(self, tags, clean_tag_names=True, existing_dataset=False): """ Cleans the names of tags. - + Each keyword is cleaned by removing non-alphanumeric characters, allowing only: a-z, ñ, 0-9, _, -, ., and spaces, and truncating to a maximum length of 100 characters. If the name of the keyword is a URL, it is converted into a standard CKAN name using the _url_to_ckan_name function. - + Args: - tags (list): The tags to be cleaned. Each keyword is a - dictionary with a 'name' key. - + tags (list): The tags to be cleaned. Each keyword is a dictionary with a `name` key. + + clean_tag_names (bool): By default, tags are stripped of accent characters, spaces and capital letters for display. Setting this harvester config option `clean_tags` to `False` will keep the original tag names. Default is `True`. + + existing_dataset (bool): If the tags are from a dataset from the local CKAN instance. + Returns: list: A list of dictionaries with cleaned keyword names. 
""" cleaned_tags = [] + seen_names = set() + for k in tags: if k and "name" in k: name = k["name"] + vocabulary_id = k.get("vocabulary_id") or None if self._is_url(name): name = self._url_to_ckan_name(name) - cleaned_tags.append({"name": self._clean_name(name), "display_name": k["name"]}) - return cleaned_tags + + normalized_name = self._clean_name(name) + + if normalized_name in seen_names: + continue + + seen_names.add(normalized_name) + + tag = { + "name": normalized_name if clean_tag_names else name, + "display_name": k["name"] + } + + if vocabulary_id and existing_dataset: + tag["vocabulary_id"] = vocabulary_id + + cleaned_tags.append(tag) + return cleaned_tags def _is_url(self, name): """ @@ -1976,6 +2002,18 @@ def _create_or_update_package( package_dict["resources"] = new_resources + # Clean tags before update existing dataset + tags = package_dict.get("tags", []) + + if hasattr(self, 'config') and self.config: + package_dict["tags"] = self._clean_tags(tags=tags, clean_tag_names=self.config.get("clean_tags", True), existing_dataset=False) + else: + package_dict["tags"] = self._clean_tags(tags=tags, clean_tag_names=True, existing_dataset=True) + + # Remove tag_fields from package_dict + for field in AUX_TAG_FIELDS: + package_dict.pop(field, None) + for field in p.toolkit.aslist( config.get("ckan.harvest.not_overwrite_fields") ): @@ -2035,11 +2073,17 @@ def _create_or_update_package( "Import", ) - log.info( - "Created new package ID: %s with GUID: %s", - package_dict["id"], - harvest_object.guid, - ) + # Clean tags before create. 
Not existing_dataset + tags = package_dict.get("tags", []) + + if hasattr(self, 'config') and self.config: + package_dict["tags"] = self._clean_tags(tags=tags, clean_tag_names=self.config.get("clean_tags", True), existing_dataset=False) + else: + package_dict["tags"] = self._clean_tags(tags=tags, clean_tag_names=True, existing_dataset=False) + + # Remove tag_fields from package_dict + for field in AUX_TAG_FIELDS: + package_dict.pop(field, None) #log.debug('Package: %s', package_dict) harvest_object.package_id = package_dict["id"] From ea133a3ee40a574744c070db871f7d27b257f8c5 Mon Sep 17 00:00:00 2001 From: mjanez <96422458+mjanez@users.noreply.github.com> Date: Thu, 1 Aug 2024 02:27:18 +0200 Subject: [PATCH 8/8] Add licenses.json Add CC-BY 4.0 and more from https://licenses.opendefinition.org/licenses/groups/ckan.json --- .../schemingdcat/public/static/licenses.json | 220 ++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 ckanext/schemingdcat/public/static/licenses.json diff --git a/ckanext/schemingdcat/public/static/licenses.json b/ckanext/schemingdcat/public/static/licenses.json new file mode 100644 index 00000000..74fbe1e3 --- /dev/null +++ b/ckanext/schemingdcat/public/static/licenses.json @@ -0,0 +1,220 @@ +[ + { + "domain_content": false, + "domain_data": false, + "domain_software": false, + "family": "", + "id": "notspecified", + "is_generic": true, + "maintainer": "", + "od_conformance": "not reviewed", + "osd_conformance": "not reviewed", + "status": "active", + "title": "License Not Specified", + "url": "" + }, + { + "domain_content": false, + "domain_data": true, + "domain_software": false, + "family": "", + "id": "PDDL-1.0", + "legacy_ids": [ + "ODC-PDDL-1.0" + ], + "maintainer": "", + "od_conformance": "approved", + "osd_conformance": "not reviewed", + "status": "active", + "title": "Open Data Commons Public Domain Dedication and Licence 1.0", + "url": "https://opendefinition.org/licenses/odc-pddl" + }, + { + "domain_content": 
false, + "domain_data": true, + "domain_software": false, + "family": "", + "id": "ODbL-1.0", + "maintainer": "", + "od_conformance": "approved", + "osd_conformance": "not reviewed", + "status": "active", + "title": "Open Data Commons Open Database License 1.0", + "url": "https://opendefinition.org/licenses/odc-odbl" + }, + { + "domain_content": false, + "domain_data": true, + "domain_software": false, + "family": "", + "id": "ODC-BY-1.0", + "maintainer": "Open Data Commons", + "od_conformance": "approved", + "osd_conformance": "not reviewed", + "status": "active", + "title": "Open Data Commons Attribution License 1.0", + "url": "https://opendefinition.org/licenses/odc-by" + }, + { + "domain_content": true, + "domain_data": true, + "domain_software": true, + "family": "", + "id": "CC0-1.0", + "maintainer": "Creative Commons", + "od_conformance": "approved", + "osd_conformance": "not reviewed", + "status": "active", + "title": "CC0 1.0", + "url": "https://creativecommons.org/publicdomain/zero/1.0/" + }, + { + "domain_content": false, + "domain_data": false, + "domain_software": false, + "family": "", + "id": "cc-by", + "maintainer": "", + "od_conformance": "approved", + "osd_conformance": "not reviewed", + "status": "active", + "title": "Creative Commons Attribution", + "url": "http://www.opendefinition.org/licenses/cc-by" + }, + { + "domain_content": true, + "domain_data": true, + "domain_software": false, + "family": "", + "id": "CC-BY-4.0", + "maintainer": "Creative Commons", + "od_conformance": "approved", + "osd_conformance": "not reviewed", + "status": "active", + "title": "Creative Commons Attribution 4.0", + "url": "https://creativecommons.org/licenses/by/4.0/" + }, + { + "domain_content": true, + "domain_data": true, + "domain_software": false, + "family": "", + "id": "CC-BY-SA-4.0", + "maintainer": "Creative Commons", + "od_conformance": "approved", + "osd_conformance": "not reviewed", + "status": "active", + "title": "Creative Commons Attribution
Share-Alike 4.0", + "url": "https://creativecommons.org/licenses/by-sa/4.0/" + }, + { + "domain_content": true, + "domain_data": false, + "domain_software": false, + "family": "", + "id": "GFDL-1.3-no-cover-texts-no-invariant-sections", + "maintainer": "Free Software Foundation", + "od_conformance": "approved", + "osd_conformance": "not reviewed", + "status": "active", + "title": "GNU Free Documentation License 1.3 with no cover texts and no invariant sections", + "url": "https://opendefinition.org/licenses/gfdl" + }, + { + "domain_content": true, + "domain_data": false, + "domain_software": false, + "family": "", + "id": "other-open", + "is_generic": true, + "maintainer": "", + "od_conformance": "approved", + "osd_conformance": "not reviewed", + "status": "active", + "title": "Other (Open)", + "url": "" + }, + { + "domain_content": true, + "domain_data": false, + "domain_software": false, + "family": "", + "id": "other-pd", + "is_generic": true, + "maintainer": "", + "od_conformance": "approved", + "osd_conformance": "not reviewed", + "status": "active", + "title": "Other (Public Domain)", + "url": "" + }, + { + "domain_content": true, + "domain_data": false, + "domain_software": false, + "family": "", + "id": "other-at", + "is_generic": true, + "maintainer": "", + "od_conformance": "approved", + "osd_conformance": "not reviewed", + "status": "active", + "title": "Other (Attribution)", + "url": "" + }, + { + "domain_content": true, + "domain_data": true, + "domain_software": true, + "family": "", + "id": "OGL-UK-2.0", + "is_generic": false, + "maintainer": "UK Government", + "od_conformance": "approved", + "osd_conformance": "not reviewed", + "status": "active", + "title": "Open Government Licence 2.0 (United Kingdom)", + "url": "https://www.nationalarchives.gov.uk/doc/open-government-licence/version/2/" + }, + { + "domain_content": true, + "domain_data": true, + "domain_software": false, + "family": "Creative Commons", + "id": "CC-BY-NC-4.0", + "maintainer": 
"Creative Commons", + "od_conformance": "rejected", + "osd_conformance": "not reviewed", + "status": "active", + "title": "Creative Commons Attribution-NonCommercial 4.0", + "url": "https://creativecommons.org/licenses/by-nc/4.0/" + }, + { + "domain_content": false, + "domain_data": false, + "domain_software": false, + "family": "", + "id": "other-nc", + "is_generic": true, + "maintainer": "", + "od_conformance": "not reviewed", + "osd_conformance": "not reviewed", + "status": "active", + "title": "Other (Non-Commercial)", + "url": "" + }, + { + "domain_content": false, + "domain_data": false, + "domain_software": false, + "family": "", + "id": "other-closed", + "is_generic": true, + "maintainer": "", + "od_conformance": "not reviewed", + "osd_conformance": "not reviewed", + "status": "active", + "title": "Other (Not Open)", + "url": "" + } +] \ No newline at end of file