From 7160fd4c5c2cc54225e0677689a0be7c52ed4773 Mon Sep 17 00:00:00 2001 From: Patrick Titzler Date: Wed, 7 Oct 2020 10:21:15 -0700 Subject: [PATCH] cleanup --- README.md | 8 +- dax-data-set-descriptors/jfk.yaml | 74 +++++++++++++++++++ .../lorem_ipsum.yaml | 23 +++++- templates/openaihub_out.yaml | 21 ++++++ 4 files changed, 120 insertions(+), 6 deletions(-) create mode 100644 dax-data-set-descriptors/jfk.yaml rename {samples => dax-data-set-descriptors}/lorem_ipsum.yaml (70%) create mode 100644 templates/openaihub_out.yaml diff --git a/README.md b/README.md index d704bd2..adad658 100644 --- a/README.md +++ b/README.md @@ -42,9 +42,13 @@ Replace `{{...}}` placeholders in `my.template` with values from `my.yaml`. The $ python metadata_converter/apply.py my.yaml my.template -o my_completed.template ``` -## Examples +## Templates -Example templates and YAML file can be found in [samples/](/samples). +Example template files for DLF and OpenAIHub can be found in the [templates/](/templates) directory. + +## DAX data set descriptors + +Descriptor files for DAX data sets can be found in the [dax-data-set-descriptors/](/dax-data-set-descriptors) directory. ## License diff --git a/dax-data-set-descriptors/jfk.yaml b/dax-data-set-descriptors/jfk.yaml new file mode 100644 index 0000000..1c14a08 --- /dev/null +++ b/dax-data-set-descriptors/jfk.yaml @@ -0,0 +1,74 @@ +id: noaa-weather-data–jfk-airport +name: NOAA Weather Data – JFK Airport +description: Local climatological data originally collected at JFK airport. 
+version: 1.1.4 +# TBD use timestamp for better accuracy and timezone support +created: 2019-07-19 +updated: 2020-07-19 +# TBD how to handle compound types (a data set consists of multiple files using different formats) +format: + type: CSV + mime_type: text/csv +domain: Time Series + +# Information about the entity that makes the data set available +provider: + name: Data Asset eXchange + url: https://developer.ibm.com/exchanges/data/all/jfk-weather-data/ + +# identifies where the data set is stored and how it is stored (REQUIRED) +repository: + type: HTTP + url: https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/noaa-weather-data-jfk-airport.tar.gz + mime_type: application/x-tar + sha_512: e3f27a8fcc0db5289df356e3f48aef6df56236798d5b3ae3889d358489ec6609d2d797e4c4932b86016d2ce4a379ac0a0749b6fb2c293ebae4e585ea1c8422ac + size: 3.5M + +# REQUIRED; data set license information +license: + commercial: false + name: CDLA-Sharing + url: https://cdla.io/sharing-1-0/ + +# REQUIRED; describes relevant files in the data set archive +content: + - file_name: noaa-weather-data-jfk-airport/jfk_weather.csv + description: Raw data file + records: 114546 + size: 30M + type: CSV + mime_type: text/csv + - file_name: noaa-weather-data-jfk-airport/jfk_weather_cleaned.csv + description: jfk_weather.csv cleaned-up using noaa-weather-data-jfk-airport/clean_data.py + records: 75125 + size: 6M + type: CSV + mime_type: text/csv + +# OPTIONAL; Identifies where the data set was obtained from +source: + name: National Oceanic and Atmospheric Administration + url: https://www.ncdc.noaa.gov/ + +# OPTIONAL; but recommended +seo_tags: + - weather + - time series + +# OPTIONAL; assets that complement this data set, e.g. 
notebooks +related_assets: + - name: Explore the data + description: Data preview and glossary + url: https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/data-preview/index.html + - name: DAX Weather project + description: Watson Studio Gallery project for the NOAA data set + mime_type: text/html + url: https://dataplatform.cloud.ibm.com/exchange/public/entry/view/a7432f0c29c5bda2fb42749f363bd9ff + - name: Effective Farming - Monitor Crop Growth + description: Accelerator for Cloud Pak for Data + mime_type: text/html + url: https://community.ibm.com/community/user/cloudpakfordata/viewdocument/effective-farming-monitor-crop-gr + - name: Elyra pipeline + description: Elyra example pipeline + mime_type: text/html + url: https://github.com/elyra-ai/examples/tree/master/pipelines/dax_noaa_weather_data diff --git a/samples/lorem_ipsum.yaml b/dax-data-set-descriptors/lorem_ipsum.yaml similarity index 70% rename from samples/lorem_ipsum.yaml rename to dax-data-set-descriptors/lorem_ipsum.yaml index fba06fd..64b62bf 100644 --- a/samples/lorem_ipsum.yaml +++ b/dax-data-set-descriptors/lorem_ipsum.yaml @@ -19,17 +19,32 @@ provider: # Information about the repository that make # identifies where the data set is stored and how it is stored (REQUIRED) repository: type: HTTP # - url: http://.../loremipsum.tar.gz # Data set file download URL - mime_type: application/x-tar # Data set file MIME type (https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types) + url: http://.../loremipsum.tar.gz # Data set archive download URL + mime_type: application/x-tar # Data set archive MIME type (https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types) sha_512: cf83e135...7eefb # SHA-512 checksum of the data set file - size: 12 M # Data set file size including units (E, P, T, G, M, K) (which isn't necessarily the size of the data file) + size: 12M # Data set archive size including units (E, P, T, G, M, K) 
(which isn't necessarily the size of the data file) # REQUIRED; data set license information license: - # TBD comercial license indicator? + commercial: false # if true, a commercial license (assume false if not set) name: CDLA-Sharing # License name url: https://cdla.io/sharing-1-0/ # Link to public license text +# REQUIRED; describes relevant files in the data set archive +content: + - file_name: path/lorem-ipsum1.csv # includes path + description: l-i training data # free form text describing the file content + records: 5000 # including units (E, P, T, G, M, K), if applicable + size: 1M # size including units (E, P, T, G, M, K), if applicable + type: CSV # file format (user friendly) + mime_type: text/csv # File MIME type (https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types) + - file_name: path/lorem-ipsum2.csv # includes path + description: l-i test data # free form text describing the file content + records: 3000 # including units (E, P, T, G, M, K), if applicable + size: 500K # size including units (E, P, T, G, M, K), if applicable + type: CSV # file format (user friendly) + mime_type: text/csv # File MIME type (https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types) + # OPTIONAL; Identifies where the data set was obtained from source: name: entity-name # Name of the owning entity diff --git a/templates/openaihub_out.yaml b/templates/openaihub_out.yaml new file mode 100644 index 0000000..0af54a6 --- /dev/null +++ b/templates/openaihub_out.yaml @@ -0,0 +1,21 @@ +# Copyright 2019 IBM Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: com.ibm/v1alpha1 +kind: Dataset +metadata: + name: {{name}} +spec: + type: "ARCHIVE" + url: {{repository.url}} + format: {{repository.mime_type}}