From 54084772479a1626feca84f02fb00d5c24430f33 Mon Sep 17 00:00:00 2001 From: Aninda Ghosh Date: Thu, 13 Jun 2024 17:19:00 -0700 Subject: [PATCH 1/5] Converters for Austria and Brazil --- .../datasets/fieldscapes_austria_2021.py | 167 ++++++++++++++++++ fiboa_cli/datasets/fieldscapes_brazil_2020.py | 136 ++++++++++++++ 2 files changed, 303 insertions(+) create mode 100644 fiboa_cli/datasets/fieldscapes_austria_2021.py create mode 100644 fiboa_cli/datasets/fieldscapes_brazil_2020.py diff --git a/fiboa_cli/datasets/fieldscapes_austria_2021.py b/fiboa_cli/datasets/fieldscapes_austria_2021.py new file mode 100644 index 0000000..fbff4d7 --- /dev/null +++ b/fiboa_cli/datasets/fieldscapes_austria_2021.py @@ -0,0 +1,167 @@ +# TEMPLATE FOR A FIBOA CONVERTER +# +# Copy this file and rename it to something sensible. +# The name of the file will be the name of the converter in the cli. +# If you name it 'de_abc' you'll be able to run `fiboa convert de_abc` in the cli. + +from ..convert_utils import convert as convert_ + +# File to read the data from +# Can read any tabular data format that GeoPandas can read through read_file() +# Supported protocols: HTTP(S), GCS, S3, or the local file system + +# Local URI added to the repository for initial conversion, Original Source https://beta.source.coop/esa/fusion-competition/ +URI = "/home/byteboogie/work/labwork_hkerner/fieldscapes/austria/boundaries_austria_2021.gpkg" + +# Unique identifier for the collection +ID = "fieldscapes_austria_2021" +# Title of the collection +TITLE = "Field boundaries for Austria (Fieldscapes)" +# Description of the collection. Can be multiline and include CommonMark. 
+DESCRIPTION = """ The dataset contains field boundaries for the Austria.""" +# Bounding box of the data in WGS84 coordinates +BBOX = [13.239974981742014, 48.204179578647796, 16.960943738443856, 48.974515524098045] + +# Provider name, can be None if not applicable, must be provided if PROVIDER_URL is provided +PROVIDER_NAME = "Euro Crops" +# URL to the homepage of the data or the provider, can be None if not applicable +PROVIDER_URL = "https://data.europa.eu/data/datasets/ama_invekosreferenzensterreich2021?locale=en" +# Attribution, can be None if not applicable +ATTRIBUTION = "Publications Office of the European Union." + +# License of the data, either +# 1. a SPDX license identifier (including "dl-de/by-2-0" / "dl-de/zero-2-0"), or +LICENSE = "CC-BY-4.0" +# 2. a STAC Link Object with relation type "license" +# LICENSE = {"title": "CC-BY-4.0", "href": "https://creativecommons.org/licenses/by/4.0/", "type": "text/html", "rel": "license"} + +# Map original column names to fiboa property names +# You also need to list any column that you may have added in the MIGRATION function (see below). +COLUMNS = { + "FS_KENNUNG": "id", + "SL_FLAECHE": "area", + "EC_hcat_c": "crop_id", + "EC_hcat_n": "crop_name", + "geometry": "geometry", + "SNAR_BEZEI": "SNAR_BEZEI", + "GEO_ID" : "GEO_ID", + "SNAR_CODE" : "SNAR_CODE", + "GEO_PART_K" : "GEO_PART_K", + "FART_ID" : "FART_ID", + "GML_LENGTH" : "GML_LENGTH", + "EC_trans_n" : "EC_trans_n" +} + +# Add columns with constant values. +# The key is the column name, the value is a constant value that's used for all rows. +ADD_COLUMNS = { + "determination_datetime": "2021-01-01T00:00:00Z" +} + +# A list of implemented extension identifiers +EXTENSIONS = [] + +# Functions to migrate data in columns to match the fiboa specification. +# Example: You have a column area_m in square meters and want to convert +# to hectares as required for the area field in fiboa. 
+# Function signature: +# func(column: pd.Series) -> pd.Series +COLUMN_MIGRATIONS = { + +} + +# Filter columns to only include the ones that are relevant for the collection, +# e.g. only rows that contain the word "agriculture" but not "forest" in the column "land_cover_type". +# Lambda function accepts a Pandas Series and returns a Series or a Tuple with a Series and True to invert the mask. +COLUMN_FILTERS = { + +} + +# Custom function to migrate the GeoDataFrame if the other options are not sufficient +# This should be the last resort! +# Function signature: +# func(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame +MIGRATION = None + +# Schemas for the fields that are not defined in fiboa +# Keys must be the values from the COLUMNS dict, not the keys +MISSING_SCHEMAS = { + "required": ["SNAR_BEZEI", "GEO_ID", "SNAR_CODE", "GEO_PART_K", "FART_ID", "GML_LENGTH", "EC_trans_n"], # i.e. non-nullable properties + "properties": { + "SNAR_BEZEI": { + "type": "string" + }, + "GEO_ID": { + "type": "int64" + }, + "SNAR_CODE": { + "type": "int64" + }, + "GEO_PART_K": { + "type": "int64" + }, + "FART_ID": { + "type": "int64" + }, + "GML_LENGTH": { + "type": "int64" + }, + "EC_trans_n": { + "type": "string" + } + } +} + + +# Conversion function, usually no changes required +def convert(output_file, cache_file = None, source_coop_url = None, collection = False, compression = None): + """ + Converts the field boundary datasets to fiboa. + + For reference, this is the order in which the conversion steps are applied: + 0. Read GeoDataFrame from file + 1. Run global migration (if provided through MIGRATION) + 2. Run filters to remove rows that shall not be in the final data + (if provided through COLUMN_FILTERS) + 3. Add columns with constant values + 4. Run column migrations (if provided through COLUMN_MIGRATIONS) + 5. Duplicate columns (if an array is provided as the value in COLUMNS) + 6. Rename columns (as provided in COLUMNS) + 7. 
Remove columns (if column is not present as value in COLUMNS) + 8. Create the collection + 9. Change data types of the columns based on the provided schemas + (fiboa spec, extensions, and MISSING_SCHEMAS) + 10. Write the data to the Parquet file + + Parameters: + output_file (str): Path where the Parquet file shall be stored. + cache_file (str): Path to a cached file of the data. Default: None. + Can be used to avoid repetitive downloads from the original data source. + source_coop_url (str): URL to the (future) Source Cooperative repository. Default: None + collection (bool): Additionally, store the collection separate from Parquet file. Default: False + compression (str): Compression method for the Parquet file. Default: zstd + kwargs: Additional keyword arguments for GeoPanda's read_file() or read_parquet() function. + """ + convert_( + output_file, + cache_file, + URI, + COLUMNS, + ID, + TITLE, + DESCRIPTION, + BBOX, + provider_name=PROVIDER_NAME, + provider_url=PROVIDER_URL, + source_coop_url=source_coop_url, + extensions=EXTENSIONS, + missing_schemas=MISSING_SCHEMAS, + column_additions=ADD_COLUMNS, + column_migrations=COLUMN_MIGRATIONS, + column_filters=COLUMN_FILTERS, + migration=MIGRATION, + attribution=ATTRIBUTION, + store_collection=collection, + license=LICENSE, + compression=compression, + ) \ No newline at end of file diff --git a/fiboa_cli/datasets/fieldscapes_brazil_2020.py b/fiboa_cli/datasets/fieldscapes_brazil_2020.py new file mode 100644 index 0000000..b4d9455 --- /dev/null +++ b/fiboa_cli/datasets/fieldscapes_brazil_2020.py @@ -0,0 +1,136 @@ +# TEMPLATE FOR A FIBOA CONVERTER +# +# Copy this file and rename it to something sensible. +# The name of the file will be the name of the converter in the cli. +# If you name it 'de_abc' you'll be able to run `fiboa convert de_abc` in the cli. 
+ +from ..convert_utils import convert as convert_ + +# File to read the data from +# Can read any tabular data format that GeoPandas can read through read_file() +# Supported protocols: HTTP(S), GCS, S3, or the local file system + +# Local URI added to the repository for initial conversion, Original Source https://beta.source.coop/esa/fusion-competition/ +URI = "/home/byteboogie/work/labwork_hkerner/fieldscapes/brazil/boundaries_brazil_2020.gpkg" + +# Unique identifier for the collection +ID = "fieldscapes_brazil_2020" +# Title of the collection +TITLE = "Field boundaries for Brazil (Fieldscapes)" +# Description of the collection. Can be multiline and include CommonMark. +DESCRIPTION = """ The dataset contains field boundaries for the Brazil.""" +# Bounding box of the data in WGS84 coordinates +BBOX = [-46.39769258914609, -13.832659641089542, -45.56417133292678, -11.835700893930944] + +# Provider name, can be None if not applicable, must be provided if PROVIDER_URL is provided +PROVIDER_NAME = "Brazilian Biomes project (Brazil Data Cube), funded by the Amazon Fund through the financial collaboration of the Brazilian Development Bank (BNDES) and the Foundation for Science, Technology and Space Applications (FUNCATE)" +# URL to the homepage of the data or the provider, can be None if not applicable +PROVIDER_URL = "https://data.mendeley.com/datasets/vz6d7tw87f/1#file-5ac1542b-12ef-4dce-8258-113b5c5d87c9" +# Attribution, can be None if not applicable +ATTRIBUTION = "Mendeley Data" + +# License of the data, either +# 1. a SPDX license identifier (including "dl-de/by-2-0" / "dl-de/zero-2-0"), or +LICENSE = "CC-BY-4.0" +# 2. a STAC Link Object with relation type "license" +# LICENSE = {"title": "CC-BY-4.0", "href": "https://creativecommons.org/licenses/by/4.0/", "type": "text/html", "rel": "license"} + +# Map original column names to fiboa property names +# You also need to list any column that you may have added in the MIGRATION function (see below). 
+COLUMNS = { + "id": "id", + "geometry": "geometry" +} + +# Add columns with constant values. +# The key is the column name, the value is a constant value that's used for all rows. +ADD_COLUMNS = { + "determination_datetime": "2020-01-01T00:00:00Z" +} + +# A list of implemented extension identifiers +EXTENSIONS = [] + +# Functions to migrate data in columns to match the fiboa specification. +# Example: You have a column area_m in square meters and want to convert +# to hectares as required for the area field in fiboa. +# Function signature: +# func(column: pd.Series) -> pd.Series +COLUMN_MIGRATIONS = { + +} + +# Filter columns to only include the ones that are relevant for the collection, +# e.g. only rows that contain the word "agriculture" but not "forest" in the column "land_cover_type". +# Lambda function accepts a Pandas Series and returns a Series or a Tuple with a Series and True to invert the mask. +COLUMN_FILTERS = { + +} + +# Custom function to migrate the GeoDataFrame if the other options are not sufficient +# This should be the last resort! +# Function signature: +# func(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame +MIGRATION = None + +# Schemas for the fields that are not defined in fiboa +# Keys must be the values from the COLUMNS dict, not the keys +MISSING_SCHEMAS = { + "required": [], # i.e. non-nullable properties + "properties": { + } +} + + +# Conversion function, usually no changes required +def convert(output_file, cache_file = None, source_coop_url = None, collection = False, compression = None): + """ + Converts the field boundary datasets to fiboa. + + For reference, this is the order in which the conversion steps are applied: + 0. Read GeoDataFrame from file + 1. Run global migration (if provided through MIGRATION) + 2. Run filters to remove rows that shall not be in the final data + (if provided through COLUMN_FILTERS) + 3. Add columns with constant values + 4. Run column migrations (if provided through COLUMN_MIGRATIONS) + 5. 
Duplicate columns (if an array is provided as the value in COLUMNS) + 6. Rename columns (as provided in COLUMNS) + 7. Remove columns (if column is not present as value in COLUMNS) + 8. Create the collection + 9. Change data types of the columns based on the provided schemas + (fiboa spec, extensions, and MISSING_SCHEMAS) + 10. Write the data to the Parquet file + + Parameters: + output_file (str): Path where the Parquet file shall be stored. + cache_file (str): Path to a cached file of the data. Default: None. + Can be used to avoid repetitive downloads from the original data source. + source_coop_url (str): URL to the (future) Source Cooperative repository. Default: None + collection (bool): Additionally, store the collection separate from Parquet file. Default: False + compression (str): Compression method for the Parquet file. Default: zstd + kwargs: Additional keyword arguments for GeoPanda's read_file() or read_parquet() function. + """ + convert_( + output_file, + cache_file, + URI, + COLUMNS, + ID, + TITLE, + DESCRIPTION, + BBOX, + provider_name=PROVIDER_NAME, + provider_url=PROVIDER_URL, + source_coop_url=source_coop_url, + extensions=EXTENSIONS, + missing_schemas=MISSING_SCHEMAS, + column_additions=ADD_COLUMNS, + column_migrations=COLUMN_MIGRATIONS, + column_filters=COLUMN_FILTERS, + migration=MIGRATION, + attribution=ATTRIBUTION, + store_collection=collection, + license=LICENSE, + compression=compression, + ) \ No newline at end of file From 492e50951f0bbe731a8107cbb8e648972e824c01 Mon Sep 17 00:00:00 2001 From: Aninda Ghosh Date: Fri, 14 Jun 2024 14:03:51 -0700 Subject: [PATCH 2/5] Removed unnecessary columns --- .../datasets/fieldscapes_austria_2021.py | 33 ++----------------- 1 file changed, 3 insertions(+), 30 deletions(-) diff --git a/fiboa_cli/datasets/fieldscapes_austria_2021.py b/fiboa_cli/datasets/fieldscapes_austria_2021.py index fbff4d7..b449ef1 100644 --- a/fiboa_cli/datasets/fieldscapes_austria_2021.py +++ 
b/fiboa_cli/datasets/fieldscapes_austria_2021.py @@ -42,14 +42,7 @@ "SL_FLAECHE": "area", "EC_hcat_c": "crop_id", "EC_hcat_n": "crop_name", - "geometry": "geometry", - "SNAR_BEZEI": "SNAR_BEZEI", - "GEO_ID" : "GEO_ID", - "SNAR_CODE" : "SNAR_CODE", - "GEO_PART_K" : "GEO_PART_K", - "FART_ID" : "FART_ID", - "GML_LENGTH" : "GML_LENGTH", - "EC_trans_n" : "EC_trans_n" + "geometry": "geometry" } # Add columns with constant values. @@ -86,29 +79,9 @@ # Schemas for the fields that are not defined in fiboa # Keys must be the values from the COLUMNS dict, not the keys MISSING_SCHEMAS = { - "required": ["SNAR_BEZEI", "GEO_ID", "SNAR_CODE", "GEO_PART_K", "FART_ID", "GML_LENGTH", "EC_trans_n"], # i.e. non-nullable properties + "required": [ ], # i.e. non-nullable properties "properties": { - "SNAR_BEZEI": { - "type": "string" - }, - "GEO_ID": { - "type": "int64" - }, - "SNAR_CODE": { - "type": "int64" - }, - "GEO_PART_K": { - "type": "int64" - }, - "FART_ID": { - "type": "int64" - }, - "GML_LENGTH": { - "type": "int64" - }, - "EC_trans_n": { - "type": "string" - } + } } From 1b88f6164a5a871337c1af2d677c14871950fd6e Mon Sep 17 00:00:00 2001 From: Aninda Ghosh Date: Mon, 17 Jun 2024 17:28:24 -0700 Subject: [PATCH 3/5] Updated Converters with the Missing Schemas, made some cosmetic changes. --- fiboa_cli/datasets/fieldscapes_austria_2021.py | 9 +++++++-- fiboa_cli/datasets/fieldscapes_brazil_2020.py | 8 +++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/fiboa_cli/datasets/fieldscapes_austria_2021.py b/fiboa_cli/datasets/fieldscapes_austria_2021.py index b449ef1..3e46aa7 100644 --- a/fiboa_cli/datasets/fieldscapes_austria_2021.py +++ b/fiboa_cli/datasets/fieldscapes_austria_2021.py @@ -79,9 +79,14 @@ # Schemas for the fields that are not defined in fiboa # Keys must be the values from the COLUMNS dict, not the keys MISSING_SCHEMAS = { - "required": [ ], # i.e. non-nullable properties + "required": [ "crop_id", "crop_name" ], # i.e. 
non-nullable properties "properties": { - + "crop_id": { + "type": "int64" + }, + "crop_name": { + "type": "string" + } } } diff --git a/fiboa_cli/datasets/fieldscapes_brazil_2020.py b/fiboa_cli/datasets/fieldscapes_brazil_2020.py index b4d9455..5534da3 100644 --- a/fiboa_cli/datasets/fieldscapes_brazil_2020.py +++ b/fiboa_cli/datasets/fieldscapes_brazil_2020.py @@ -76,8 +76,14 @@ # Schemas for the fields that are not defined in fiboa # Keys must be the values from the COLUMNS dict, not the keys MISSING_SCHEMAS = { - "required": [], # i.e. non-nullable properties + "required": [ "crop_id", "crop_name" ], # i.e. non-nullable properties "properties": { + "crop_id": { + "type": "int64" + }, + "crop_name": { + "type": "string" + } } } From fa102992531e0dbb8b8fb9f5fc13820ae442791f Mon Sep 17 00:00:00 2001 From: Aninda Ghosh Date: Mon, 17 Jun 2024 17:29:01 -0700 Subject: [PATCH 4/5] Cosmetic Changes --- fiboa_cli/datasets/fieldscapes_austria_2021.py | 2 +- fiboa_cli/datasets/fieldscapes_brazil_2020.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fiboa_cli/datasets/fieldscapes_austria_2021.py b/fiboa_cli/datasets/fieldscapes_austria_2021.py index 3e46aa7..cdd36f6 100644 --- a/fiboa_cli/datasets/fieldscapes_austria_2021.py +++ b/fiboa_cli/datasets/fieldscapes_austria_2021.py @@ -18,7 +18,7 @@ # Title of the collection TITLE = "Field boundaries for Austria (Fieldscapes)" # Description of the collection. Can be multiline and include CommonMark. -DESCRIPTION = """ The dataset contains field boundaries for the Austria.""" +DESCRIPTION = "The dataset contains field boundaries for the Austria." 
# Bounding box of the data in WGS84 coordinates BBOX = [13.239974981742014, 48.204179578647796, 16.960943738443856, 48.974515524098045] diff --git a/fiboa_cli/datasets/fieldscapes_brazil_2020.py b/fiboa_cli/datasets/fieldscapes_brazil_2020.py index 5534da3..44b75c7 100644 --- a/fiboa_cli/datasets/fieldscapes_brazil_2020.py +++ b/fiboa_cli/datasets/fieldscapes_brazil_2020.py @@ -18,7 +18,7 @@ # Title of the collection TITLE = "Field boundaries for Brazil (Fieldscapes)" # Description of the collection. Can be multiline and include CommonMark. -DESCRIPTION = """ The dataset contains field boundaries for the Brazil.""" +DESCRIPTION = "The dataset contains field boundaries for the Brazil." # Bounding box of the data in WGS84 coordinates BBOX = [-46.39769258914609, -13.832659641089542, -45.56417133292678, -11.835700893930944] From 7bbb992db41e29f3a4786b6ca6e972d6a5a5b10e Mon Sep 17 00:00:00 2001 From: Aninda Ghosh Date: Fri, 12 Jul 2024 15:44:45 -0700 Subject: [PATCH 5/5] Updated Datetime for both Austria and Brazil Dataset --- fiboa_cli/datasets/fieldscapes_austria_2021.py | 2 +- fiboa_cli/datasets/fieldscapes_brazil_2020.py | 14 +++++--------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/fiboa_cli/datasets/fieldscapes_austria_2021.py b/fiboa_cli/datasets/fieldscapes_austria_2021.py index cdd36f6..5d25dca 100644 --- a/fiboa_cli/datasets/fieldscapes_austria_2021.py +++ b/fiboa_cli/datasets/fieldscapes_austria_2021.py @@ -48,7 +48,7 @@ # Add columns with constant values. # The key is the column name, the value is a constant value that's used for all rows. 
ADD_COLUMNS = { - "determination_datetime": "2021-01-01T00:00:00Z" + "determination_datetime": "2021-12-31T00:00:00Z" } # A list of implemented extension identifiers diff --git a/fiboa_cli/datasets/fieldscapes_brazil_2020.py b/fiboa_cli/datasets/fieldscapes_brazil_2020.py index 44b75c7..dfabdd9 100644 --- a/fiboa_cli/datasets/fieldscapes_brazil_2020.py +++ b/fiboa_cli/datasets/fieldscapes_brazil_2020.py @@ -39,13 +39,14 @@ # You also need to list any column that you may have added in the MIGRATION function (see below). COLUMNS = { "id": "id", - "geometry": "geometry" + "geometry": "geometry", + "datetime" : "determination_datetime", } # Add columns with constant values. # The key is the column name, the value is a constant value that's used for all rows. ADD_COLUMNS = { - "determination_datetime": "2020-01-01T00:00:00Z" + } # A list of implemented extension identifiers @@ -76,14 +77,9 @@ # Schemas for the fields that are not defined in fiboa # Keys must be the values from the COLUMNS dict, not the keys MISSING_SCHEMAS = { - "required": [ "crop_id", "crop_name" ], # i.e. non-nullable properties + "required": [ ], # i.e. non-nullable properties "properties": { - "crop_id": { - "type": "int64" - }, - "crop_name": { - "type": "string" - } + } }