From c5c4320ef8de9fd8534c0236dd9a2947b17afdc3 Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Thu, 11 Jul 2024 15:17:13 -0400 Subject: [PATCH] add support for OGC TrainingDML-AI --- pygeometa/schemas/__init__.py | 3 +- pygeometa/schemas/tdml_ai/__init__.py | 291 ++++++++++++++++++++++++++ sample-tdml-ai.yml | 164 +++++++++++++++ 3 files changed, 457 insertions(+), 1 deletion(-) create mode 100644 pygeometa/schemas/tdml_ai/__init__.py create mode 100644 sample-tdml-ai.yml diff --git a/pygeometa/schemas/__init__.py b/pygeometa/schemas/__init__.py index a545c0d..ba0065b 100644 --- a/pygeometa/schemas/__init__.py +++ b/pygeometa/schemas/__init__.py @@ -53,12 +53,13 @@ THISDIR = os.path.dirname(os.path.realpath(__file__)) SCHEMAS = { + 'dcat': 'pygeometa.schemas.dcat.DCATOutputSchema', 'iso19139': 'pygeometa.schemas.iso19139.ISO19139OutputSchema', 'iso19139-2': 'pygeometa.schemas.iso19139_2.ISO19139_2OutputSchema', 'iso19139-hnap': 'pygeometa.schemas.iso19139_hnap.ISO19139HNAPOutputSchema', # noqa 'oarec-record': 'pygeometa.schemas.ogcapi_records.OGCAPIRecordOutputSchema', # noqa 'stac-item': 'pygeometa.schemas.stac.STACItemOutputSchema', - 'dcat': 'pygeometa.schemas.dcat.DCATOutputSchema', + 'tdml-ai': 'pygeometa.schemas.tdml_ai.TDML_AIOutputSchema', 'wmo-cmp': 'pygeometa.schemas.wmo_cmp.WMOCMPOutputSchema', 'wmo-wcmp2': 'pygeometa.schemas.wmo_wcmp2.WMOWCMP2OutputSchema', 'wmo-wigos': 'pygeometa.schemas.wmo_wigos.WMOWIGOSOutputSchema' diff --git a/pygeometa/schemas/tdml_ai/__init__.py b/pygeometa/schemas/tdml_ai/__init__.py new file mode 100644 index 0000000..5720ff9 --- /dev/null +++ b/pygeometa/schemas/tdml_ai/__init__.py @@ -0,0 +1,291 @@ +# ================================================================= +# +# Terms and Conditions of Use +# +# Unless otherwise noted, computer program source code of this +# distribution # is covered under Crown Copyright, Government of +# Canada, and is distributed under the MIT License. 
+# +# The Canada wordmark and related graphics associated with this +# distribution are protected under trademark law and copyright law. +# No permission is granted to use them outside the parameters of +# the Government of Canada's corporate identity program. For +# more information, see +# http://www.tbs-sct.gc.ca/fip-pcim/index-eng.asp +# +# Copyright title to all 3rd party software distributed with this +# software is held by the respective copyright holders as noted in +# those files. Users are asked to read the 3rd Party Licenses +# referenced with those assets. +# +# Copyright (c) 2024 Tom Kralidis +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
import json
import logging
import os
from typing import Union

from pygeometa.core import get_charstring
from pygeometa.helpers import json_serial
from pygeometa.schemas.base import BaseOutputSchema

THISDIR = os.path.dirname(os.path.realpath(__file__))

LOGGER = logging.getLogger(__name__)


class TDML_AIOutputSchema(BaseOutputSchema):
    """OGC Training Data Markup Language for Artificial Intelligence"""

    def __init__(self):
        """
        Initialize object

        :returns: pygeometa.schemas.base.BaseOutputSchema
        """

        description = 'OGC Training Data Markup Language for Artificial Intelligence'  # noqa

        super().__init__('tdml-ai', description, 'json', THISDIR)

    def write(self, mcf: dict, stringify: bool = True) -> Union[dict, str]:
        """
        Write outputschema to JSON string buffer

        :param mcf: dict of MCF content model
        :param stringify: whether to return a string representation (default)
                          else native (dict)

        :returns: `dict` or `str` of MCF as a TrainingDML-AI training
                  dataset representation
        """

        self.lang1 = mcf['metadata'].get('language')
        self.lang2 = mcf['metadata'].get('language_alternate')

        identification = mcf['identification']
        extents = identification['extents']

        minx, miny, maxx, maxy = extents['spatial'][0]['bbox']

        # NOTE(review): the result of get_charstring is indexed with [0] in
        # the keyword handling below, so name/description may be emitted as
        # multi-language sequences rather than plain strings; confirm that
        # this is the intended output shape
        title = get_charstring(identification.get('title'),
                               self.lang1, self.lang2)

        description = get_charstring(identification.get('abstract'),
                                     self.lang1, self.lang2)

        dataset = {
            'version': '1.0',
            'id': mcf['metadata']['identifier'],
            'type': 'AI_EOTrainingDataset',
            'name': title,
            'description': description,
            'extent': {
                'geographicElement': {
                    'geographicBoundingBox': {
                        'westBoundLongitude': minx,
                        'eastBoundLongitude': maxx,
                        'southBoundLatitude': miny,
                        'northBoundLatitude': maxy
                    }
                }
            }
        }

        LOGGER.debug('Checking for temporal')
        temporal_element = self._generate_temporal_element(extents)
        if temporal_element is not None:
            dataset['extent']['temporalElement'] = temporal_element

        dataset['license'] = identification['license']['name']

        LOGGER.debug('Checking for dates')
        dates = identification.get('dates') or {}
        if 'creation' in dates:
            dataset['createdTime'] = str(dates['creation'])
        if 'revision' in dates:
            dataset['updatedTime'] = str(dates['revision'])

        LOGGER.debug('Checking for contacts')
        dataset['providers'] = self.generate_providers(mcf['contact'])

        LOGGER.debug('Checking for tasks')
        dataset['tasks'] = self.generate_tasks(mcf['tasks'])

        LOGGER.debug('Checking for classes')
        dataset['classes'] = self.generate_classes(mcf['classes'])
        dataset['numberOfClasses'] = len(dataset['classes'])

        LOGGER.debug('Checking for bands')
        dataset['variables'] = self.generate_variables(mcf['attributes'])

        LOGGER.debug('Checking for doi')
        if 'doi' in identification:
            dataset['doi'] = identification['doi']

        all_keywords = []

        LOGGER.debug('Checking for keywords')
        for value in identification['keywords'].values():
            keywords = get_charstring(value.get('keywords'), self.lang1,
                                      self.lang2)

            # only the first (primary language) entry is used; a keyword
            # group without a 'keywords' entry contributes nothing instead
            # of raising TypeError
            all_keywords.extend(keywords[0] or [])

        if all_keywords:
            dataset['keywords'] = all_keywords

        LOGGER.debug('Checking for data')
        dataset['data'] = self.generate_data(mcf['training-data'])

        if stringify:
            return json.dumps(dataset, default=json_serial, indent=4)

        return dataset

    def _generate_temporal_element(self, extents: dict) -> Union[dict, None]:
        """
        Generates a temporal element from the first MCF temporal extent

        :param extents: `dict` of MCF identification.extents

        :returns: `dict` of temporal element, or `None` when no bounded
                  begin or end position is available
        """

        # values that denote an open-ended / unknown position
        unbounded = ('now', 'None', None)

        # evaluated lazily so that a missing 'temporal' key is skipped
        # rather than raising KeyError
        temporal = extents.get('temporal')
        if not temporal or temporal == [{}]:
            return None

        period = temporal[0]
        positions = {
            'beginPosition': period.get('begin'),
            'endPosition': period.get('end')
        }

        time_period = {}
        for name, position in positions.items():
            if position not in unbounded:
                time_period[name] = position

        if not time_period:
            return None

        return {'TimePeriod': time_period}

    def generate_variables(self, attributes: list) -> list:
        """
        Generates 1..n variables

        :param attributes: `list` of attributes

        :returns: `list` of variable objects
        """

        variables = []

        for attribute in attributes:
            variable = {
                'name': attribute['name'],
            }
            if 'units' in attribute:
                variable['unit'] = attribute['units']
            if 'abstract' in attribute:
                variable['description'] = attribute['abstract']

            variables.append(variable)

        return variables

    def generate_classes(self, classes: list) -> list:
        """
        Generates 1..n classes

        :param classes: `list` of classes

        :returns: `list` of class objects, each mapping the class name
                  (`key`) to its zero-based position (`value`)
        """

        return [
            {'key': value, 'value': count}
            for count, value in enumerate(classes)
        ]

    def generate_tasks(self, tasks: dict) -> list:
        """
        Generates 1..n tasks

        :param tasks: `dict` of tasks

        :returns: `list` of tasks
        """

        return [
            {
                'id': key,
                'type': 'AI_EOTask',
                'description': value['description'],
                'taskType': value['type']
            }
            for key, value in tasks.items()
        ]

    def generate_providers(self, contact: dict) -> list:
        """
        Generates 1..n providers

        :param contact: `dict` of contacts

        :returns: `list` of providers (organization name of each contact)
        """

        return [value['organization'] for value in contact.values()]

    def generate_data(self, training_data: dict) -> list:
        """
        Generates training data objects from MCF training-data object

        :param training_data: `dict` of MCF training-data

        :returns: `list` of training data objects
        """

        datas = []

        for key, value in training_data.items():
            # NOTE(review): the dataset type emitted by write() is spelled
            # 'AI_EOTrainingDataset' (no underscore after EO) while this one
            # is 'AI_EO_TrainingData'; confirm the spelling against the
            # TrainingDML-AI JSON encoding
            data = {
                'type': 'AI_EO_TrainingData',
                'id': key,
                'dataURL': [value['url']],
                'labels': []
            }
            for label in value['labels']:
                # label property names are derived from the label type,
                # e.g. type 'Point' yields PointLabelURL / PointLabelField
                data['labels'].append({
                    'type': f"AI_{label['type']}Label",
                    f"{label['type']}LabelURL": label['url'],
                    f"{label['type']}LabelField": label['field'],
                })

            datas.append(data)

        return datas
metadata, applicable to iso19139-2 output +acquisition: + platforms: + - identifier: LANDSAT_8 + description: Landsat 8 + instruments: + - identifier: OLI_TIRS + type: INS-NOBS + +contact: + pointOfContact: &contact_poc + organization: Environment Canada + url: https://www.ec.gc.ca/ + individualname: Tom Kralidis + positionname: Senior Systems Scientist + phone: +01-123-456-7890 + fax: +01-123-456-7890 + address: 4905 Dufferin Street + city: Toronto + administrativearea: Ontario + postalcode: M3H 5T4 + country: Canada + email: foo@bar.tld + hoursofservice: 0700h - 1500h EST + contactinstructions: email + + distributor: *contact_poc + +distribution: + waf: + url: https://example.org/data + type: WWW:LINK + rel: canonical + name: my waf + description: + en: description in English + fr: description in French + function: download + + wms: + url: https://example.org/wms + type: OGC:WMS + rel: service + name: + en: roads + fr: routes + description: + en: description in English + fr: description in French + function: download + +dataquality: + scope: + level: dataset + lineage: + statement: this dataset was derived from a custom process against dataset xyz + +attributes: + - name: red + - name: green + - name: blue + +tasks: + task1: + type: http://demo#point_cloud_semantic_segmentation + description: Point Cloud Semantic Segmentation + +classes: + - Ground + - Pole + - Fence + +training-data: + L001: + url: data/L001.ply + labels: + - url: data/L001.ply2 + type: Point + field: scalar_Label