From 7160fd4c5c2cc54225e0677689a0be7c52ed4773 Mon Sep 17 00:00:00 2001
From: Patrick Titzler <ptitzler@us.ibm.com>
Date: Wed, 7 Oct 2020 10:21:15 -0700
Subject: [PATCH] cleanup

---
 README.md                                     |  8 +-
 dax-data-set-descriptors/jfk.yaml             | 74 +++++++++++++++++++
 .../lorem_ipsum.yaml                          | 23 +++++-
 templates/openaihub_out.yaml                  | 21 ++++++
 4 files changed, 120 insertions(+), 6 deletions(-)
 create mode 100644 dax-data-set-descriptors/jfk.yaml
 rename {samples => dax-data-set-descriptors}/lorem_ipsum.yaml (70%)
 create mode 100644 templates/openaihub_out.yaml

diff --git a/README.md b/README.md
index d704bd2..adad658 100644
--- a/README.md
+++ b/README.md
@@ -42,9 +42,13 @@ Replace `{{...}}` placeholders in `my.template` with values from `my.yaml`. The
 $ python metadata_converter/apply.py my.yaml my.template -o my_completed.template
 ```
 
-## Examples
+## Templates
 
-Example templates and YAML file can be found in [samples/](/samples).
+Example template files for DLF and OpenAIHub can be found in the [templates/](/templates) directory.
+
+## DAX data set descriptors
+
+Descriptor files for DAX data sets can be found in the [dax-data-set-descriptors/](/dax-data-set-descriptors) directory.
 
 ## License
 
diff --git a/dax-data-set-descriptors/jfk.yaml b/dax-data-set-descriptors/jfk.yaml
new file mode 100644
index 0000000..1c14a08
--- /dev/null
+++ b/dax-data-set-descriptors/jfk.yaml
@@ -0,0 +1,74 @@
+id: noaa-weather-data-jfk-airport
+name: NOAA Weather Data – JFK Airport
+description: Local climatological data originally collected at JFK airport.
+version: 1.1.4
+# TBD use timestamp for better accuracy and timezone support
+created: 2019-07-19
+updated: 2020-07-19
+# TBD how to handle compound types (a data set comprising multiple files that use different formats)
+format:
+  type: CSV
+  mime_type: text/csv
+domain: Time Series
+
+# Information about the entity that makes the data set available
+provider:
+  name: Data Asset eXchange
+  url: https://developer.ibm.com/exchanges/data/all/jfk-weather-data/
+
+# identifies where the data set is stored and how it is stored (REQUIRED)
+repository:
+  type: HTTP
+  url: https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/noaa-weather-data-jfk-airport.tar.gz
+  mime_type: application/x-tar
+  sha_512: e3f27a8fcc0db5289df356e3f48aef6df56236798d5b3ae3889d358489ec6609d2d797e4c4932b86016d2ce4a379ac0a0749b6fb2c293ebae4e585ea1c8422ac
+  size: 3.5M
+
+# REQUIRED; data set license information
+license:
+  commercial: false
+  name: CDLA-Sharing
+  url: https://cdla.io/sharing-1-0/
+
+# REQUIRED; describes relevant files in the data set archive
+content:
+  - file_name: noaa-weather-data-jfk-airport/jfk_weather.csv
+    description: Raw data file
+    records: 114546
+    size: 30M
+    type: CSV
+    mime_type: text/csv
+  - file_name: noaa-weather-data-jfk-airport/jfk_weather_cleaned.csv
+    description: jfk_weather.csv cleaned-up using noaa-weather-data-jfk-airport/clean_data.py
+    records: 75125
+    size: 6M
+    type: CSV
+    mime_type: text/csv
+
+# OPTIONAL; Identifies where the data set was obtained from
+source:
+  name: National Oceanic and Atmospheric Administration
+  url: https://www.ncdc.noaa.gov/
+
+# OPTIONAL; but recommended
+seo_tags:
+  - weather
+  - time series
+
+# OPTIONAL; assets that complement this data set, e.g. notebooks
+related_assets:
+  - name: Explore the data
+    description: Data preview and glossary
+    url: https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/data-preview/index.html
+  - name: DAX Weather project
+    description: Watson Studio Gallery project for the NOAA data set
+    mime_type: text/html
+    url: https://dataplatform.cloud.ibm.com/exchange/public/entry/view/a7432f0c29c5bda2fb42749f363bd9ff
+  - name: Effective Farming - Monitor Crop Growth
+    description: Accelerator for Cloud Pak for Data
+    mime_type: text/html
+    url: https://community.ibm.com/community/user/cloudpakfordata/viewdocument/effective-farming-monitor-crop-gr
+  - name: Elyra pipeline
+    description: Elyra example pipeline
+    mime_type: text/html
+    url: https://github.com/elyra-ai/examples/tree/master/pipelines/dax_noaa_weather_data
diff --git a/samples/lorem_ipsum.yaml b/dax-data-set-descriptors/lorem_ipsum.yaml
similarity index 70%
rename from samples/lorem_ipsum.yaml
rename to dax-data-set-descriptors/lorem_ipsum.yaml
index fba06fd..64b62bf 100644
--- a/samples/lorem_ipsum.yaml
+++ b/dax-data-set-descriptors/lorem_ipsum.yaml
@@ -19,17 +19,32 @@ provider:                           # Information about the repository that make
 # identifies where the data set is stored and how it is stored (REQUIRED)
 repository:
   type: HTTP                         # 
-  url: http://.../loremipsum.tar.gz  # Data set file download URL
-  mime_type: application/x-tar       # Data set file MIME type (https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types)
+  url: http://.../loremipsum.tar.gz  # Data set archive download URL
+  mime_type: application/x-tar       # Data set archive MIME type (https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types)
   sha_512: cf83e135...7eefb          # SHA-512 checksum of the data set file
-  size: 12 M                         # Data set file size including units (E, P, T, G, M, K) (which isn't necessarily the size of the data file)
+  size: 12M                          # Data set archive size including units (E, P, T, G, M, K) (which isn't necessarily the size of the data file)
 
 # REQUIRED; data set license information
 license:
-  # TBD comercial license indicator?
+  commercial: false                  # if true, a commercial license (assume false if not set)
   name: CDLA-Sharing                 # License name
   url: https://cdla.io/sharing-1-0/  # Link to public license text
 
+# REQUIRED; describes relevant files in the data set archive
+content:
+  - file_name: path/lorem-ipsum1.csv  # includes path
+    description: l-i training data    # free form text describing the file content
+    records: 5000                     # including units (E, P, T, G, M, K), if applicable
+    size: 1M                          # size including units (E, P, T, G, M, K), if applicable
+    type: CSV                         # file format (user friendly)
+    mime_type: text/csv               # File MIME type (https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types)
+  - file_name: path/lorem-ipsum2.csv  # includes path
+    description: l-i test data        # free form text describing the file content
+    records: 3000                     # including units (E, P, T, G, M, K), if applicable
+    size: 500K                        # size including units (E, P, T, G, M, K), if applicable
+    type: CSV                         # file format (user friendly)
+    mime_type: text/csv               # File MIME type (https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types)
+
 # OPTIONAL; Identifies where the data set was obtained from
 source:
   name: entity-name                   # Name of the owning entity
diff --git a/templates/openaihub_out.yaml b/templates/openaihub_out.yaml
new file mode 100644
index 0000000..0af54a6
--- /dev/null
+++ b/templates/openaihub_out.yaml
@@ -0,0 +1,21 @@
+# Copyright 2019 IBM Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: com.ibm/v1alpha1
+kind: Dataset
+metadata:
+  name: "{{name}}"
+spec:
+  type: "ARCHIVE"
+  url: "{{repository.url}}"
+  format: "{{repository.mime_type}}"