From c5c4320ef8de9fd8534c0236dd9a2947b17afdc3 Mon Sep 17 00:00:00 2001
From: Tom Kralidis <tomkralidis@gmail.com>
Date: Thu, 11 Jul 2024 15:17:13 -0400
Subject: [PATCH] add support for OGC TrainingDML-AI

---
 pygeometa/schemas/__init__.py         |   3 +-
 pygeometa/schemas/tdml_ai/__init__.py | 291 ++++++++++++++++++++++++++
 sample-tdml-ai.yml                    | 164 +++++++++++++++
 3 files changed, 457 insertions(+), 1 deletion(-)
 create mode 100644 pygeometa/schemas/tdml_ai/__init__.py
 create mode 100644 sample-tdml-ai.yml

diff --git a/pygeometa/schemas/__init__.py b/pygeometa/schemas/__init__.py
index a545c0d..ba0065b 100644
--- a/pygeometa/schemas/__init__.py
+++ b/pygeometa/schemas/__init__.py
@@ -53,12 +53,13 @@
 THISDIR = os.path.dirname(os.path.realpath(__file__))
 
 SCHEMAS = {
+    'dcat': 'pygeometa.schemas.dcat.DCATOutputSchema',
     'iso19139': 'pygeometa.schemas.iso19139.ISO19139OutputSchema',
     'iso19139-2': 'pygeometa.schemas.iso19139_2.ISO19139_2OutputSchema',
     'iso19139-hnap': 'pygeometa.schemas.iso19139_hnap.ISO19139HNAPOutputSchema',  # noqa
     'oarec-record': 'pygeometa.schemas.ogcapi_records.OGCAPIRecordOutputSchema',  # noqa
     'stac-item': 'pygeometa.schemas.stac.STACItemOutputSchema',
-    'dcat': 'pygeometa.schemas.dcat.DCATOutputSchema',
+    'tdml-ai': 'pygeometa.schemas.tdml_ai.TDML_AIOutputSchema',
     'wmo-cmp': 'pygeometa.schemas.wmo_cmp.WMOCMPOutputSchema',
     'wmo-wcmp2': 'pygeometa.schemas.wmo_wcmp2.WMOWCMP2OutputSchema',
     'wmo-wigos': 'pygeometa.schemas.wmo_wigos.WMOWIGOSOutputSchema'
diff --git a/pygeometa/schemas/tdml_ai/__init__.py b/pygeometa/schemas/tdml_ai/__init__.py
new file mode 100644
index 0000000..5720ff9
--- /dev/null
+++ b/pygeometa/schemas/tdml_ai/__init__.py
@@ -0,0 +1,291 @@
+# =================================================================
+#
+# Terms and Conditions of Use
+#
+# Unless otherwise noted, computer program source code of this
+# distribution is covered under Crown Copyright, Government of
+# Canada, and is distributed under the MIT License.
+#
+# The Canada wordmark and related graphics associated with this
+# distribution are protected under trademark law and copyright law.
+# No permission is granted to use them outside the parameters of
+# the Government of Canada's corporate identity program. For
+# more information, see
+# http://www.tbs-sct.gc.ca/fip-pcim/index-eng.asp
+#
+# Copyright title to all 3rd party software distributed with this
+# software is held by the respective copyright holders as noted in
+# those files. Users are asked to read the 3rd Party Licenses
+# referenced with those assets.
+#
+# Copyright (c) 2024 Tom Kralidis
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# =================================================================
+
+import json
+import logging
+import os
+from typing import Union
+
+from pygeometa.core import get_charstring
+from pygeometa.helpers import json_serial
+from pygeometa.schemas.base import BaseOutputSchema
+
+THISDIR = os.path.dirname(os.path.realpath(__file__))
+
+LOGGER = logging.getLogger(__name__)
+
+
+class TDML_AIOutputSchema(BaseOutputSchema):
+    """OGC Training Data Markup Language for Artificial Intelligence"""
+
+    def __init__(self):
+        """
+        Initialize the TrainingDML-AI output schema
+
+        :returns: pygeometa.schemas.base.BaseOutputSchema
+        """
+
+        super().__init__(
+            'tdml-ai', 'OGC Training Data Markup Language for Artificial Intelligence',  # noqa
+            'json', THISDIR)
+
+    def write(self, mcf: dict, stringify: bool = True) -> Union[dict, str]:
+        """
+        Write outputschema to JSON string buffer
+
+        :param mcf: dict of MCF content model
+        :param stringify: whether to return a string representation (default)
+                          else native (dict)
+
+        :raises KeyError: if a section read unconditionally is missing
+        :returns: `dict` or `str` of MCF as a TrainingDML-AI dataset
+        """
+
+        self.lang1 = mcf['metadata'].get('language')
+        self.lang2 = mcf['metadata'].get('language_alternate')
+
+        minx, miny, maxx, maxy = (mcf['identification']['extents']
+                                  ['spatial'][0]['bbox'])
+
+        title = get_charstring(mcf['identification'].get('title'),
+                               self.lang1, self.lang2)
+
+        description = get_charstring(mcf['identification'].get('abstract'),
+                                     self.lang1, self.lang2)
+        # NOTE(review): [0] selects the lang1 value, as for keywords below
+        dataset = {
+            'version': '1.0',
+            'id': mcf['metadata']['identifier'],
+            'type': 'AI_EOTrainingDataset',
+            'name': title[0],
+            'description': description[0],
+            'extent': {
+                'geographicElement': {
+                    'geographicBoundingBox': {
+                        'westBoundLongitude': minx,
+                        'eastBoundLongitude': maxx,
+                        'southBoundLatitude': miny,
+                        'northBoundLatitude': maxy
+                    }
+                }
+            }
+        }
+
+        LOGGER.debug('Checking for temporal')
+        # an absent or empty 'temporal' falls back to [{}] and is skipped
+        temporal = mcf['identification']['extents'].get('temporal') or [{}]
+        if temporal != [{}]:
+            begin = temporal[0].get('begin')
+            end = temporal[0].get('end')
+
+            # normalise the 'now'/'None' placeholders to None (bound omitted)
+            if begin in ['now', 'None', None]:
+                begin = None
+            if end in ['now', 'None', None]:
+                end = None
+
+            # emit a TimePeriod only when at least one bound is known
+            period = {}
+            if begin is not None:
+                period['beginPosition'] = begin
+            if end is not None:
+                period['endPosition'] = end
+            if period:
+                dataset['extent']['temporalElement'] = {'TimePeriod': period}
+
+        dataset['license'] = mcf['identification']['license']['name']
+
+        LOGGER.debug('Checking for dates')
+        if 'dates' in mcf['identification']:
+            if 'creation' in mcf['identification']['dates']:
+                dataset['createdTime'] = str(mcf['identification']['dates']['creation'])  # noqa
+            if 'revision' in mcf['identification']['dates']:
+                dataset['updatedTime'] = str(mcf['identification']['dates']['revision'])  # noqa
+
+        LOGGER.debug('Checking for contacts')
+        dataset['providers'] = self.generate_providers(mcf['contact'])
+
+        LOGGER.debug('Checking for tasks')
+        dataset['tasks'] = self.generate_tasks(mcf['tasks'])
+
+        LOGGER.debug('Checking for classes')
+        dataset['classes'] = self.generate_classes(mcf['classes'])
+        dataset['numberOfClasses'] = len(dataset['classes'])
+
+        LOGGER.debug('Checking for bands')
+        dataset['variables'] = self.generate_variables(mcf['attributes'])
+
+        LOGGER.debug('Checking for doi')
+        if 'doi' in mcf['identification']:
+            dataset['doi'] = mcf['identification']['doi']
+
+        all_keywords = []
+
+        LOGGER.debug('Checking for keywords')
+        for value in mcf['identification']['keywords'].values():
+            keywords = get_charstring(value.get('keywords'), self.lang1,
+                                      self.lang2)
+
+            # a keyword group without a lang1 value contributes nothing
+            all_keywords.extend(keywords[0] or [])
+
+        if all_keywords:
+            dataset['keywords'] = all_keywords
+
+        LOGGER.debug('Checking for data')
+        dataset['data'] = self.generate_data(mcf['training-data'])
+
+        if stringify:
+            return json.dumps(dataset, default=json_serial, indent=4)
+        return dataset
+
+    def generate_variables(self, attributes: list) -> list:
+        """
+        Generates 0..n variables from MCF attributes
+
+        :param attributes: `list` of attribute `dict`s (`name` required,
+                           `units` and `abstract` optional)
+
+        :returns: `list` of variable objects
+        """
+
+        # optional MCF attribute key -> TrainingDML-AI variable key
+        optional = (('units', 'unit'), ('abstract', 'description'))
+
+        result = []
+        for attr in attributes:
+            # 'name' is mandatory: a missing key raises KeyError
+            entry = {'name': attr['name']}
+            for src, dst in optional:
+                if src in attr:
+                    entry[dst] = attr[src]
+            result.append(entry)
+
+        return result
+
+    def generate_classes(self, classes: list) -> list:
+        """
+        Generates 0..n class objects from MCF classes
+
+        Each class name is paired with its zero-based position in the
+        list, which serves as the class value.
+
+        :param classes: `list` of class names
+
+        :returns: `list` of class objects (`key`: name, `value`: index)
+        """
+
+        # enumerate() supplies the index that becomes each class 'value'
+        indexed = enumerate(classes)
+
+        return [
+            {'key': name, 'value': index} for index, name in indexed
+        ]
+
+    def generate_tasks(self, tasks: dict) -> list:
+        """
+        Generates 0..n tasks
+
+        :param tasks: `dict` of task definitions keyed by task identifier;
+                      each value provides `description` and `type`
+
+        :returns: `list` of tasks
+        """
+
+        # 'type' carries the object type and 'taskType' the MCF task type
+        return [
+            {
+                'id': key,
+                'type': 'AI_EOTask',
+                'description': value['description'],
+                'taskType': value['type']
+            }
+            for key, value in tasks.items()
+        ]
+
+    def generate_providers(self, contact: dict) -> list:
+        """
+        Generates 0..n providers
+
+        :param contact: `dict` of contacts, each with an `organization`
+
+        :returns: `list` of provider organization names, one per contact
+        """
+
+        # roles (the dict keys) are not needed, only each organization;
+        # contacts sharing an organization yield repeated entries
+        organizations = [
+            details['organization'] for details in contact.values()
+        ]
+        return organizations
+
+    def generate_data(self, training_data: dict) -> list:
+        """
+        Generates training data objects from MCF training-data object
+
+        :param training_data: `dict` of MCF training-data
+
+        :returns: `list` of training data objects
+        """
+
+        datas = []
+        # one AI_EOTrainingData object per MCF training-data entry
+        for key, value in training_data.items():
+            data = {
+                'type': 'AI_EOTrainingData',
+                'id': key,
+                'dataURL': [value['url']],
+                'labels': []
+            }
+            for label in value.get('labels', []):
+                data['labels'].append({
+                    'type': f"AI_{label['type']}Label",
+                    f"{label['type']}LabelURL": label['url'],
+                    f"{label['type']}LabelField": label['field'],
+                })
+
+            datas.append(data)
+
+        return datas
diff --git a/sample-tdml-ai.yml b/sample-tdml-ai.yml
new file mode 100644
index 0000000..a54a412
--- /dev/null
+++ b/sample-tdml-ai.yml
@@ -0,0 +1,164 @@
+mcf:
+    version: 1.0
+
+metadata:
+    identifier: 3f342f64-9348-11df-ba6a-0014c2c00eab
+    language: en
+    language_alternate: fr
+    charset: utf8
+    parentidentifier: someparentid
+    hierarchylevel: dataset
+    datestamp: 2014-11-11
+    dataseturi: http://some/minted/uri
+
+spatial:
+    datatype: vector
+    geomtype: point
+
+identification:
+    language: eng; CAN
+    charset: utf8
+    title:
+        en: title in English
+        fr: title in French
+    abstract:
+        en: abstract in English
+        fr: abstract in French
+    edition: 1.8.0
+    dates:
+        creation: 2000-09-01T00:00:00Z
+        publication: 2001-11-11
+    keywords:
+        default:
+            keywords:
+                en: [kw1 in English,kw2 in English,kw3 in English]
+                fr: [kw1 in French,kw2 in French,kw3 in French]
+        wmo:
+            keywords:
+                en: [FOO,BAR]
+            keywords_type: theme
+            vocabulary:
+                name:
+                    en: My vocabulary
+                    fr: Mon vocabulaire
+                url: http://example.org/vocab
+        gc_cst:
+            keywords:
+                en: [kw1,kw2]
+                fr: [kw1,kw2]
+    topiccategory:
+        - climatologyMeteorologyAtmosphere
+    extents:
+        spatial:
+            - bbox: [-141,42,-52,84]
+              crs: 4326
+        temporal:
+            - begin: 1950-07-31
+              end: now
+              resolution: P1Y
+    fees: None
+    accessconstraints: otherRestrictions
+    license:
+        name: CC BY 4.0
+        url: https://creativecommons.org/licenses/by/4.0
+    rights:
+        en: Copyright (c) 2010 Her Majesty the Queen in Right of Canada
+        fr: Copyright (c) 2010 Her Majesty the Queen in Right of Canada
+    url: http://geogratis.ca/geogratis/en/product/search.do?id=08DB5E85-7405-FE3A-2860-CC3663245625
+    status: onGoing
+    maintenancefrequency: continual
+
+content_info:
+    type: image
+    cloud_cover: 72
+    processing_level: "1.0"
+    attributes:
+        - name: foo
+          units: m
+        - name: bar
+          units: K
+    dimensions:
+        - name: B1
+          units: nm
+          min: 932
+          max: 958
+
+# platform metadata, applicable to iso19139-2 output
+acquisition:
+    platforms:
+        - identifier: LANDSAT_8
+          description: Landsat 8
+          instruments:
+              - identifier: OLI_TIRS
+                type: INS-NOBS
+
+contact:
+    pointOfContact: &contact_poc
+        organization: Environment Canada
+        url: https://www.ec.gc.ca/
+        individualname: Tom Kralidis
+        positionname: Senior Systems Scientist
+        phone: +01-123-456-7890
+        fax: +01-123-456-7890
+        address: 4905 Dufferin Street
+        city: Toronto
+        administrativearea: Ontario
+        postalcode: M3H 5T4
+        country: Canada
+        email: foo@bar.tld
+        hoursofservice: 0700h - 1500h EST
+        contactinstructions: email
+
+    distributor: *contact_poc
+
+distribution:
+    waf:
+        url: https://example.org/data
+        type: WWW:LINK
+        rel: canonical
+        name: my waf
+        description:
+            en: description in English
+            fr: description in French
+        function: download
+
+    wms:
+        url: https://example.org/wms
+        type: OGC:WMS
+        rel: service
+        name:
+            en: roads
+            fr: routes
+        description:
+            en: description in English
+            fr: description in French
+        function: download
+
+dataquality:
+    scope:
+        level: dataset
+    lineage:
+        statement: this dataset was derived from a custom process against dataset xyz
+
+attributes:
+    - name: red
+    - name: green
+    - name: blue
+
+tasks:
+    task1:
+        type: http://demo#point_cloud_semantic_segmentation
+        description: Point Cloud Semantic Segmentation
+
+classes:
+    - Ground
+    - Pole
+    - Fence
+
+training-data:
+    L001:
+        url: data/L001.ply
+        labels:
+            - url: data/L001.ply2
+              type: Point
+              field: scalar_Label