From a4e924907d720a12e23f71097c51c2aa800c743a Mon Sep 17 00:00:00 2001 From: Aninda Ghosh Date: Thu, 6 Jun 2024 13:03:25 -0700 Subject: [PATCH 1/4] Added South Africa and Germany Fieldscapes Converter --- fiboa_cli/datasets/fs_de_bb.py | 144 +++++++++++++++++++++++++++++++++ fiboa_cli/datasets/fs_za_ct.py | 144 +++++++++++++++++++++++++++++++++ 2 files changed, 288 insertions(+) create mode 100644 fiboa_cli/datasets/fs_de_bb.py create mode 100644 fiboa_cli/datasets/fs_za_ct.py diff --git a/fiboa_cli/datasets/fs_de_bb.py b/fiboa_cli/datasets/fs_de_bb.py new file mode 100644 index 0000000..adf5bbc --- /dev/null +++ b/fiboa_cli/datasets/fs_de_bb.py @@ -0,0 +1,144 @@ +# TEMPLATE FOR A FIBOA CONVERTER +# +# Copy this file and rename it to something sensible. +# The name of the file will be the name of the converter in the cli. +# If you name it 'de_abc' you'll be able to run `fiboa convert de_abc` in the cli. + +from ..convert_utils import convert as convert_ + +# File to read the data from +# Can read any tabular data format that GeoPandas can read through read_file() +# Supported protcols: HTTP(S), GCS, S3, or the local file system + +# Local URI added to the repository for initial conversion, Original Source https://beta.source.coop/esa/fusion-competition/ +URI = "/home/byteboogie/fieldscapes/germany/fs_de_bb.gpkg" + +# Unique identifier for the collection +ID = "fs_de_bb" +# Title of the collection +TITLE = "Field boundaries for Germany, Brandenburg" +# Description of the collection. Can be multiline and include CommonMark. 
+DESCRIPTION = """ The dataset contains field boundaries for the German state of Brandenburg.""" +# Bounding box of the data in WGS84 coordinates +BBOX = [13.635334610075107, 52.41814553442972, 14.35270427904761, 52.849468757681805] + +# Provider name, can be None if not applicable, must be provided if PROVIDER_URL is provided +PROVIDER_NAME = "ESA" +# URL to the homepage of the data or the provider, can be None if not applicable +PROVIDER_URL = "https://beta.source.coop/esa/fusion-competition/" +# Attribution, can be None if not applicable +ATTRIBUTION = "© GeoBasis-DE/LGB" + +# License of the data, either +# 1. a SPDX license identifier (including "dl-de/by-2-0" / "dl-de/zero-2-0"), or +LICENSE = "DL-DE->BY-2.0" +# 2. a STAC Link Object with relation type "license" +# LICENSE = {"title": "CC-BY-4.0", "href": "https://creativecommons.org/licenses/by/4.0/", "type": "text/html", "rel": "license"} + +# Map original column names to fiboa property names +# You also need to list any column that you may have added in the MIGRATION function (see below). +COLUMNS = { + 'fid': 'id', + 'grid_id': 'grid_id', + "SHAPE_AREA": "area", + "SHAPE_LEN": "perimeter", + 'geometry': 'geometry', + 'crop_id': 'crop_id', + 'crop_name': 'crop_name' +} + +# Add columns with constant values. +# The key is the column name, the value is a constant value that's used for all rows. +ADD_COLUMNS = { + "determination_datetime": "2018-01-01T00:00:00Z" +} + +# A list of implemented extension identifiers +EXTENSIONS = [] + +# Functions to migrate data in columns to match the fiboa specification. +# Example: You have a column area_m in square meters and want to convert +# to hectares as required for the area field in fiboa. +# Function signature: +# func(column: pd.Series) -> pd.Series +COLUMN_MIGRATIONS = { + +} + +# Filter columns to only include the ones that are relevant for the collection, +# e.g. only rows that contain the word "agriculture" but not "forest" in the column "land_cover_type". 
+# Lamda function accepts a Pandas Series and returns a Series or a Tuple with a Series and True to inverse the mask. +COLUMN_FILTERS = { + +} + +# Custom function to migrate the GeoDataFrame if the other options are not sufficient +# This should be the last resort! +# Function signature: +# func(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame +MIGRATION = None + +# Schemas for the fields that are not defined in fiboa +# Keys must be the values from the COLUMNS dict, not the keys +MISSING_SCHEMAS = { + "required": ["grid_id"], # i.e. non-nullable properties + "properties": { + "grid_id": { + "type": "string" + } + } +} + + +# Conversion function, usually no changes required +def convert(output_file, cache_file = None, source_coop_url = None, collection = False, compression = None): + """ + Converts the field boundary datasets to fiboa. + + For reference, this is the order in which the conversion steps are applied: + 0. Read GeoDataFrame from file + 1. Run global migration (if provided through MIGRATION) + 2. Run filters to remove rows that shall not be in the final data + (if provided through COLUMN_FILTERS) + 3. Add columns with constant values + 4. Run column migrations (if provided through COLUMN_MIGRATIONS) + 5. Duplicate columns (if an array is provided as the value in COLUMNS) + 6. Rename columns (as provided in COLUMNS) + 7. Remove columns (if column is not present as value in COLUMNS) + 8. Create the collection + 9. Change data types of the columns based on the provided schemas + (fiboa spec, extensions, and MISSING_SCHEMAS) + 10. Write the data to the Parquet file + + Parameters: + output_file (str): Path where the Parquet file shall be stored. + cache_file (str): Path to a cached file of the data. Default: None. + Can be used to avoid repetitive downloads from the original data source. + source_coop_url (str): URL to the (future) Source Cooperative repository. Default: None + collection (bool): Additionally, store the collection separate from Parquet file. 
Default: False + compression (str): Compression method for the Parquet file. Default: zstd + kwargs: Additional keyword arguments for GeoPanda's read_file() or read_parquet() function. + """ + convert_( + output_file, + cache_file, + URI, + COLUMNS, + ID, + TITLE, + DESCRIPTION, + BBOX, + provider_name=PROVIDER_NAME, + provider_url=PROVIDER_URL, + source_coop_url=source_coop_url, + extensions=EXTENSIONS, + missing_schemas=MISSING_SCHEMAS, + column_additions=ADD_COLUMNS, + column_migrations=COLUMN_MIGRATIONS, + column_filters=COLUMN_FILTERS, + migration=MIGRATION, + attribution=ATTRIBUTION, + store_collection=collection, + license=LICENSE, + compression=compression, + ) \ No newline at end of file diff --git a/fiboa_cli/datasets/fs_za_ct.py b/fiboa_cli/datasets/fs_za_ct.py new file mode 100644 index 0000000..569906b --- /dev/null +++ b/fiboa_cli/datasets/fs_za_ct.py @@ -0,0 +1,144 @@ +# TEMPLATE FOR A FIBOA CONVERTER +# +# Copy this file and rename it to something sensible. +# The name of the file will be the name of the converter in the cli. +# If you name it 'de_abc' you'll be able to run `fiboa convert de_abc` in the cli. + +from ..convert_utils import convert as convert_ + +# File to read the data from +# Can read any tabular data format that GeoPandas can read through read_file() +# Supported protcols: HTTP(S), GCS, S3, or the local file system + +# Local URI added to the repository for initial conversion, Original Source https://beta.source.coop/esa/fusion-competition/ +URI = "/home/byteboogie/fieldscapes/south_africa/fs_za_ct.gpkg" + +# Unique identifier for the collection +ID = "fs_za_ct" +# Title of the collection +TITLE = "Field boundaries for Cape Town, South Africa" +# Description of the collection. Can be multiline and include CommonMark. 
+DESCRIPTION = """ The dataset contains field boundaries for the Cape Town, South Africa.""" +# Bounding box of the data in WGS84 coordinates +BBOX = [20.521492384730347, -34.39922362572791, 21.04341451023305, -33.980506187460875] + +# Provider name, can be None if not applicable, must be provided if PROVIDER_URL is provided +PROVIDER_NAME = "Planet, Radiant Earth Foundation, Western Cape Department of Agriculture, & German Aerospace Center (DLR)" +# URL to the homepage of the data or the provider, can be None if not applicable +PROVIDER_URL = "https://beta.source.coop/esa/fusion-competition/" +# Attribution, can be None if not applicable +ATTRIBUTION = "ESA Fusion Competition" + +# License of the data, either +# 1. a SPDX license identifier (including "dl-de/by-2-0" / "dl-de/zero-2-0"), or +LICENSE = "CC BY-NC-SA 4.0" +# 2. a STAC Link Object with relation type "license" +# LICENSE = {"title": "CC-BY-4.0", "href": "https://creativecommons.org/licenses/by/4.0/", "type": "text/html", "rel": "license"} + +# Map original column names to fiboa property names +# You also need to list any column that you may have added in the MIGRATION function (see below). +COLUMNS = { + 'fid': 'id', + 'grid_id': 'grid_id', + "SHAPE_AREA": "area", + "SHAPE_LEN": "perimeter", + 'geometry': 'geometry', + 'crop_id': 'crop_id', + 'crop_name': 'crop_name' +} + +# Add columns with constant values. +# The key is the column name, the value is a constant value that's used for all rows. +ADD_COLUMNS = { + "determination_datetime": "2021-01-01T00:00:00Z" +} + +# A list of implemented extension identifiers +EXTENSIONS = [] + +# Functions to migrate data in columns to match the fiboa specification. +# Example: You have a column area_m in square meters and want to convert +# to hectares as required for the area field in fiboa. 
+# Function signature: +# func(column: pd.Series) -> pd.Series +COLUMN_MIGRATIONS = { + "area_m": lambda column: column * 0.0001 +} + +# Filter columns to only include the ones that are relevant for the collection, +# e.g. only rows that contain the word "agriculture" but not "forest" in the column "land_cover_type". +# Lamda function accepts a Pandas Series and returns a Series or a Tuple with a Series and True to inverse the mask. +COLUMN_FILTERS = { + +} + +# Custom function to migrate the GeoDataFrame if the other options are not sufficient +# This should be the last resort! +# Function signature: +# func(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame +MIGRATION = None + +# Schemas for the fields that are not defined in fiboa +# Keys must be the values from the COLUMNS dict, not the keys +MISSING_SCHEMAS = { + "required": ["grid_id"], # i.e. non-nullable properties + "properties": { + "grid_id": { + "type": "string" + } + } +} + + +# Conversion function, usually no changes required +def convert(output_file, cache_file = None, source_coop_url = None, collection = False, compression = None): + """ + Converts the field boundary datasets to fiboa. + + For reference, this is the order in which the conversion steps are applied: + 0. Read GeoDataFrame from file + 1. Run global migration (if provided through MIGRATION) + 2. Run filters to remove rows that shall not be in the final data + (if provided through COLUMN_FILTERS) + 3. Add columns with constant values + 4. Run column migrations (if provided through COLUMN_MIGRATIONS) + 5. Duplicate columns (if an array is provided as the value in COLUMNS) + 6. Rename columns (as provided in COLUMNS) + 7. Remove columns (if column is not present as value in COLUMNS) + 8. Create the collection + 9. Change data types of the columns based on the provided schemas + (fiboa spec, extensions, and MISSING_SCHEMAS) + 10. Write the data to the Parquet file + + Parameters: + output_file (str): Path where the Parquet file shall be stored. 
+ cache_file (str): Path to a cached file of the data. Default: None. + Can be used to avoid repetitive downloads from the original data source. + source_coop_url (str): URL to the (future) Source Cooperative repository. Default: None + collection (bool): Additionally, store the collection separate from Parquet file. Default: False + compression (str): Compression method for the Parquet file. Default: zstd + kwargs: Additional keyword arguments for GeoPanda's read_file() or read_parquet() function. + """ + convert_( + output_file, + cache_file, + URI, + COLUMNS, + ID, + TITLE, + DESCRIPTION, + BBOX, + provider_name=PROVIDER_NAME, + provider_url=PROVIDER_URL, + source_coop_url=source_coop_url, + extensions=EXTENSIONS, + missing_schemas=MISSING_SCHEMAS, + column_additions=ADD_COLUMNS, + column_migrations=COLUMN_MIGRATIONS, + column_filters=COLUMN_FILTERS, + migration=MIGRATION, + attribution=ATTRIBUTION, + store_collection=collection, + license=LICENSE, + compression=compression, + ) \ No newline at end of file From 3c014a85ded2e2f51a0896b85443f9674adaa947 Mon Sep 17 00:00:00 2001 From: Aninda Ghosh Date: Thu, 13 Jun 2024 17:10:53 -0700 Subject: [PATCH 2/4] Updated Converters for Germnay and South Africa (Fieldscapes Subset) --- .../datasets/fieldscapes_germany_2021.py | 138 ++++++++++++++++++ .../datasets/fieldscapes_southafrica_2021.py | 138 ++++++++++++++++++ 2 files changed, 276 insertions(+) create mode 100644 fiboa_cli/datasets/fieldscapes_germany_2021.py create mode 100644 fiboa_cli/datasets/fieldscapes_southafrica_2021.py diff --git a/fiboa_cli/datasets/fieldscapes_germany_2021.py b/fiboa_cli/datasets/fieldscapes_germany_2021.py new file mode 100644 index 0000000..ccc9327 --- /dev/null +++ b/fiboa_cli/datasets/fieldscapes_germany_2021.py @@ -0,0 +1,138 @@ +# TEMPLATE FOR A FIBOA CONVERTER +# +# Copy this file and rename it to something sensible. +# The name of the file will be the name of the converter in the cli. 
+# If you name it 'de_abc' you'll be able to run `fiboa convert de_abc` in the cli. + +from ..convert_utils import convert as convert_ + +# File to read the data from +# Can read any tabular data format that GeoPandas can read through read_file() +# Supported protocols: HTTP(S), GCS, S3, or the local file system + +# Local URI added to the repository for initial conversion, Original Source https://beta.source.coop/esa/fusion-competition/ +URI = "/home/byteboogie/work/labwork_hkerner/fieldscapes/germany/boundaries_germany_2021.gpkg" + +# Unique identifier for the collection +ID = "fieldscapes_germany_2021" +# Title of the collection +TITLE = "Field boundaries for Germany, Brandenburg (Fieldscapes)" +# Description of the collection. Can be multiline and include CommonMark. +DESCRIPTION = """ The dataset contains field boundaries for the German state of Brandenburg.""" +# Bounding box of the data in WGS84 coordinates +BBOX = [13.635334610075107, 52.41814553442972, 14.35270427904761, 52.849468757681805] + +# Provider name, can be None if not applicable, must be provided if PROVIDER_URL is provided +PROVIDER_NAME = "ESA" +# URL to the homepage of the data or the provider, can be None if not applicable +PROVIDER_URL = "https://beta.source.coop/esa/fusion-competition/" +# Attribution, can be None if not applicable +ATTRIBUTION = "© GeoBasis-DE/LGB" + +# License of the data, either +# 1. an SPDX license identifier (including "dl-de/by-2-0" / "dl-de/zero-2-0"), or +LICENSE = "dl-de/by-2-0" +# 2. a STAC Link Object with relation type "license" +# LICENSE = {"title": "CC-BY-4.0", "href": "https://creativecommons.org/licenses/by/4.0/", "type": "text/html", "rel": "license"} + +# Map original column names to fiboa property names +# You also need to list any column that you may have added in the MIGRATION function (see below).
+COLUMNS = { + "id": "id", + "SHAPE_AREA": "area", + "SHAPE_LEN": "perimeter", + "geometry": "geometry", + "crop_id": "crop_id", + "crop_name": "crop_name" +} + +# Add columns with constant values. +# The key is the column name, the value is a constant value that's used for all rows. +ADD_COLUMNS = { + "determination_datetime": "2021-01-01T00:00:00Z" +} + +# A list of implemented extension identifiers +EXTENSIONS = [] + +# Functions to migrate data in columns to match the fiboa specification. +# Example: You have a column area_m in square meters and want to convert +# to hectares as required for the area field in fiboa. +# Function signature: +# func(column: pd.Series) -> pd.Series +COLUMN_MIGRATIONS = { + +} + +# Filter columns to only include the ones that are relevant for the collection, +# e.g. only rows that contain the word "agriculture" but not "forest" in the column "land_cover_type". +# Lambda function accepts a Pandas Series and returns a Series or a Tuple with a Series and True to invert the mask. +COLUMN_FILTERS = { + +} + +# Custom function to migrate the GeoDataFrame if the other options are not sufficient +# This should be the last resort! +# Function signature: +# func(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame +MIGRATION = None + +# Schemas for the fields that are not defined in fiboa +# Keys must be the values from the COLUMNS dict, not the keys +MISSING_SCHEMAS = { + +} + + +# Conversion function, usually no changes required +def convert(output_file, cache_file = None, source_coop_url = None, collection = False, compression = None): + """ + Converts the field boundary datasets to fiboa. + + For reference, this is the order in which the conversion steps are applied: + 0. Read GeoDataFrame from file + 1. Run global migration (if provided through MIGRATION) + 2. Run filters to remove rows that shall not be in the final data + (if provided through COLUMN_FILTERS) + 3. Add columns with constant values + 4.
Run column migrations (if provided through COLUMN_MIGRATIONS) + 5. Duplicate columns (if an array is provided as the value in COLUMNS) + 6. Rename columns (as provided in COLUMNS) + 7. Remove columns (if column is not present as value in COLUMNS) + 8. Create the collection + 9. Change data types of the columns based on the provided schemas + (fiboa spec, extensions, and MISSING_SCHEMAS) + 10. Write the data to the Parquet file + + Parameters: + output_file (str): Path where the Parquet file shall be stored. + cache_file (str): Path to a cached file of the data. Default: None. + Can be used to avoid repetitive downloads from the original data source. + source_coop_url (str): URL to the (future) Source Cooperative repository. Default: None + collection (bool): Additionally, store the collection separate from Parquet file. Default: False + compression (str): Compression method for the Parquet file. Default: zstd + kwargs: Additional keyword arguments for GeoPanda's read_file() or read_parquet() function. + """ + convert_( + output_file, + cache_file, + URI, + COLUMNS, + ID, + TITLE, + DESCRIPTION, + BBOX, + provider_name=PROVIDER_NAME, + provider_url=PROVIDER_URL, + source_coop_url=source_coop_url, + extensions=EXTENSIONS, + missing_schemas=MISSING_SCHEMAS, + column_additions=ADD_COLUMNS, + column_migrations=COLUMN_MIGRATIONS, + column_filters=COLUMN_FILTERS, + migration=MIGRATION, + attribution=ATTRIBUTION, + store_collection=collection, + license=LICENSE, + compression=compression, + ) \ No newline at end of file diff --git a/fiboa_cli/datasets/fieldscapes_southafrica_2021.py b/fiboa_cli/datasets/fieldscapes_southafrica_2021.py new file mode 100644 index 0000000..21cfb8f --- /dev/null +++ b/fiboa_cli/datasets/fieldscapes_southafrica_2021.py @@ -0,0 +1,138 @@ +# TEMPLATE FOR A FIBOA CONVERTER +# +# Copy this file and rename it to something sensible. +# The name of the file will be the name of the converter in the cli. 
+# If you name it 'de_abc' you'll be able to run `fiboa convert de_abc` in the cli. + +from ..convert_utils import convert as convert_ + +# File to read the data from +# Can read any tabular data format that GeoPandas can read through read_file() +# Supported protocols: HTTP(S), GCS, S3, or the local file system + +# Local URI added to the repository for initial conversion, Original Source https://beta.source.coop/esa/fusion-competition/ +URI = "/home/byteboogie/work/labwork_hkerner/fieldscapes/southafrica/boundaries_southafrica_2021.gpkg" + +# Unique identifier for the collection +ID = "fieldscapes_southafrica_2021" +# Title of the collection +TITLE = "Field boundaries for Cape Town, South Africa" +# Description of the collection. Can be multiline and include CommonMark. +DESCRIPTION = """ The dataset contains field boundaries for the Cape Town, South Africa.""" +# Bounding box of the data in WGS84 coordinates +BBOX = [20.521492384730347, -34.39922362572791, 21.04341451023305, -33.980506187460875] + +# Provider name, can be None if not applicable, must be provided if PROVIDER_URL is provided +PROVIDER_NAME = "Planet, Radiant Earth Foundation, Western Cape Department of Agriculture, & German Aerospace Center (DLR)" +# URL to the homepage of the data or the provider, can be None if not applicable +PROVIDER_URL = "https://beta.source.coop/esa/fusion-competition/" +# Attribution, can be None if not applicable +ATTRIBUTION = "ESA Fusion Competition" + +# License of the data, either +# 1. an SPDX license identifier (including "dl-de/by-2-0" / "dl-de/zero-2-0"), or +LICENSE = "CC-BY-NC-SA-4.0" +# 2. a STAC Link Object with relation type "license" +# LICENSE = {"title": "CC-BY-4.0", "href": "https://creativecommons.org/licenses/by/4.0/", "type": "text/html", "rel": "license"} + +# Map original column names to fiboa property names +# You also need to list any column that you may have added in the MIGRATION function (see below).
+COLUMNS = { + "id": "id", + "SHAPE_AREA": "area", + "SHAPE_LEN": "perimeter", + "geometry": "geometry", + "crop_id": "crop_id", + "crop_name": "crop_name" +} + +# Add columns with constant values. +# The key is the column name, the value is a constant value that's used for all rows. +ADD_COLUMNS = { + "determination_datetime": "2021-01-01T00:00:00Z" +} + +# A list of implemented extension identifiers +EXTENSIONS = [] + +# Functions to migrate data in columns to match the fiboa specification. +# Example: You have a column area_m in square meters and want to convert +# to hectares as required for the area field in fiboa. +# Function signature: +# func(column: pd.Series) -> pd.Series +COLUMN_MIGRATIONS = { + +} + +# Filter columns to only include the ones that are relevant for the collection, +# e.g. only rows that contain the word "agriculture" but not "forest" in the column "land_cover_type". +# Lambda function accepts a Pandas Series and returns a Series or a Tuple with a Series and True to invert the mask. +COLUMN_FILTERS = { + +} + +# Custom function to migrate the GeoDataFrame if the other options are not sufficient +# This should be the last resort! +# Function signature: +# func(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame +MIGRATION = None + +# Schemas for the fields that are not defined in fiboa +# Keys must be the values from the COLUMNS dict, not the keys +MISSING_SCHEMAS = { + +} + + +# Conversion function, usually no changes required +def convert(output_file, cache_file = None, source_coop_url = None, collection = False, compression = None): + """ + Converts the field boundary datasets to fiboa. + + For reference, this is the order in which the conversion steps are applied: + 0. Read GeoDataFrame from file + 1. Run global migration (if provided through MIGRATION) + 2. Run filters to remove rows that shall not be in the final data + (if provided through COLUMN_FILTERS) + 3. Add columns with constant values + 4.
Run column migrations (if provided through COLUMN_MIGRATIONS) + 5. Duplicate columns (if an array is provided as the value in COLUMNS) + 6. Rename columns (as provided in COLUMNS) + 7. Remove columns (if column is not present as value in COLUMNS) + 8. Create the collection + 9. Change data types of the columns based on the provided schemas + (fiboa spec, extensions, and MISSING_SCHEMAS) + 10. Write the data to the Parquet file + + Parameters: + output_file (str): Path where the Parquet file shall be stored. + cache_file (str): Path to a cached file of the data. Default: None. + Can be used to avoid repetitive downloads from the original data source. + source_coop_url (str): URL to the (future) Source Cooperative repository. Default: None + collection (bool): Additionally, store the collection separate from Parquet file. Default: False + compression (str): Compression method for the Parquet file. Default: zstd + kwargs: Additional keyword arguments for GeoPanda's read_file() or read_parquet() function. 
+ """ + convert_( + output_file, + cache_file, + URI, + COLUMNS, + ID, + TITLE, + DESCRIPTION, + BBOX, + provider_name=PROVIDER_NAME, + provider_url=PROVIDER_URL, + source_coop_url=source_coop_url, + extensions=EXTENSIONS, + missing_schemas=MISSING_SCHEMAS, + column_additions=ADD_COLUMNS, + column_migrations=COLUMN_MIGRATIONS, + column_filters=COLUMN_FILTERS, + migration=MIGRATION, + attribution=ATTRIBUTION, + store_collection=collection, + license=LICENSE, + compression=compression, + ) \ No newline at end of file From 58d22465791a00a605b7d58991431198d89e1841 Mon Sep 17 00:00:00 2001 From: Aninda Ghosh Date: Thu, 13 Jun 2024 17:11:18 -0700 Subject: [PATCH 3/4] Removed older converters --- fiboa_cli/datasets/fs_de_bb.py | 144 --------------------------------- fiboa_cli/datasets/fs_za_ct.py | 144 --------------------------------- 2 files changed, 288 deletions(-) delete mode 100644 fiboa_cli/datasets/fs_de_bb.py delete mode 100644 fiboa_cli/datasets/fs_za_ct.py diff --git a/fiboa_cli/datasets/fs_de_bb.py b/fiboa_cli/datasets/fs_de_bb.py deleted file mode 100644 index adf5bbc..0000000 --- a/fiboa_cli/datasets/fs_de_bb.py +++ /dev/null @@ -1,144 +0,0 @@ -# TEMPLATE FOR A FIBOA CONVERTER -# -# Copy this file and rename it to something sensible. -# The name of the file will be the name of the converter in the cli. -# If you name it 'de_abc' you'll be able to run `fiboa convert de_abc` in the cli. 
- -from ..convert_utils import convert as convert_ - -# File to read the data from -# Can read any tabular data format that GeoPandas can read through read_file() -# Supported protcols: HTTP(S), GCS, S3, or the local file system - -# Local URI added to the repository for initial conversion, Original Source https://beta.source.coop/esa/fusion-competition/ -URI = "/home/byteboogie/fieldscapes/germany/fs_de_bb.gpkg" - -# Unique identifier for the collection -ID = "fs_de_bb" -# Title of the collection -TITLE = "Field boundaries for Germany, Brandenburg" -# Description of the collection. Can be multiline and include CommonMark. -DESCRIPTION = """ The dataset contains field boundaries for the German state of Brandenburg.""" -# Bounding box of the data in WGS84 coordinates -BBOX = [13.635334610075107, 52.41814553442972, 14.35270427904761, 52.849468757681805] - -# Provider name, can be None if not applicable, must be provided if PROVIDER_URL is provided -PROVIDER_NAME = "ESA" -# URL to the homepage of the data or the provider, can be None if not applicable -PROVIDER_URL = "https://beta.source.coop/esa/fusion-competition/" -# Attribution, can be None if not applicable -ATTRIBUTION = "© GeoBasis-DE/LGB" - -# License of the data, either -# 1. a SPDX license identifier (including "dl-de/by-2-0" / "dl-de/zero-2-0"), or -LICENSE = "DL-DE->BY-2.0" -# 2. a STAC Link Object with relation type "license" -# LICENSE = {"title": "CC-BY-4.0", "href": "https://creativecommons.org/licenses/by/4.0/", "type": "text/html", "rel": "license"} - -# Map original column names to fiboa property names -# You also need to list any column that you may have added in the MIGRATION function (see below). -COLUMNS = { - 'fid': 'id', - 'grid_id': 'grid_id', - "SHAPE_AREA": "area", - "SHAPE_LEN": "perimeter", - 'geometry': 'geometry', - 'crop_id': 'crop_id', - 'crop_name': 'crop_name' -} - -# Add columns with constant values. 
-# The key is the column name, the value is a constant value that's used for all rows. -ADD_COLUMNS = { - "determination_datetime": "2018-01-01T00:00:00Z" -} - -# A list of implemented extension identifiers -EXTENSIONS = [] - -# Functions to migrate data in columns to match the fiboa specification. -# Example: You have a column area_m in square meters and want to convert -# to hectares as required for the area field in fiboa. -# Function signature: -# func(column: pd.Series) -> pd.Series -COLUMN_MIGRATIONS = { - -} - -# Filter columns to only include the ones that are relevant for the collection, -# e.g. only rows that contain the word "agriculture" but not "forest" in the column "land_cover_type". -# Lamda function accepts a Pandas Series and returns a Series or a Tuple with a Series and True to inverse the mask. -COLUMN_FILTERS = { - -} - -# Custom function to migrate the GeoDataFrame if the other options are not sufficient -# This should be the last resort! -# Function signature: -# func(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame -MIGRATION = None - -# Schemas for the fields that are not defined in fiboa -# Keys must be the values from the COLUMNS dict, not the keys -MISSING_SCHEMAS = { - "required": ["grid_id"], # i.e. non-nullable properties - "properties": { - "grid_id": { - "type": "string" - } - } -} - - -# Conversion function, usually no changes required -def convert(output_file, cache_file = None, source_coop_url = None, collection = False, compression = None): - """ - Converts the field boundary datasets to fiboa. - - For reference, this is the order in which the conversion steps are applied: - 0. Read GeoDataFrame from file - 1. Run global migration (if provided through MIGRATION) - 2. Run filters to remove rows that shall not be in the final data - (if provided through COLUMN_FILTERS) - 3. Add columns with constant values - 4. Run column migrations (if provided through COLUMN_MIGRATIONS) - 5. 
Duplicate columns (if an array is provided as the value in COLUMNS) - 6. Rename columns (as provided in COLUMNS) - 7. Remove columns (if column is not present as value in COLUMNS) - 8. Create the collection - 9. Change data types of the columns based on the provided schemas - (fiboa spec, extensions, and MISSING_SCHEMAS) - 10. Write the data to the Parquet file - - Parameters: - output_file (str): Path where the Parquet file shall be stored. - cache_file (str): Path to a cached file of the data. Default: None. - Can be used to avoid repetitive downloads from the original data source. - source_coop_url (str): URL to the (future) Source Cooperative repository. Default: None - collection (bool): Additionally, store the collection separate from Parquet file. Default: False - compression (str): Compression method for the Parquet file. Default: zstd - kwargs: Additional keyword arguments for GeoPanda's read_file() or read_parquet() function. - """ - convert_( - output_file, - cache_file, - URI, - COLUMNS, - ID, - TITLE, - DESCRIPTION, - BBOX, - provider_name=PROVIDER_NAME, - provider_url=PROVIDER_URL, - source_coop_url=source_coop_url, - extensions=EXTENSIONS, - missing_schemas=MISSING_SCHEMAS, - column_additions=ADD_COLUMNS, - column_migrations=COLUMN_MIGRATIONS, - column_filters=COLUMN_FILTERS, - migration=MIGRATION, - attribution=ATTRIBUTION, - store_collection=collection, - license=LICENSE, - compression=compression, - ) \ No newline at end of file diff --git a/fiboa_cli/datasets/fs_za_ct.py b/fiboa_cli/datasets/fs_za_ct.py deleted file mode 100644 index 569906b..0000000 --- a/fiboa_cli/datasets/fs_za_ct.py +++ /dev/null @@ -1,144 +0,0 @@ -# TEMPLATE FOR A FIBOA CONVERTER -# -# Copy this file and rename it to something sensible. -# The name of the file will be the name of the converter in the cli. -# If you name it 'de_abc' you'll be able to run `fiboa convert de_abc` in the cli. 
- -from ..convert_utils import convert as convert_ - -# File to read the data from -# Can read any tabular data format that GeoPandas can read through read_file() -# Supported protcols: HTTP(S), GCS, S3, or the local file system - -# Local URI added to the repository for initial conversion, Original Source https://beta.source.coop/esa/fusion-competition/ -URI = "/home/byteboogie/fieldscapes/south_africa/fs_za_ct.gpkg" - -# Unique identifier for the collection -ID = "fs_za_ct" -# Title of the collection -TITLE = "Field boundaries for Cape Town, South Africa" -# Description of the collection. Can be multiline and include CommonMark. -DESCRIPTION = """ The dataset contains field boundaries for the Cape Town, South Africa.""" -# Bounding box of the data in WGS84 coordinates -BBOX = [20.521492384730347, -34.39922362572791, 21.04341451023305, -33.980506187460875] - -# Provider name, can be None if not applicable, must be provided if PROVIDER_URL is provided -PROVIDER_NAME = "Planet, Radiant Earth Foundation, Western Cape Department of Agriculture, & German Aerospace Center (DLR)" -# URL to the homepage of the data or the provider, can be None if not applicable -PROVIDER_URL = "https://beta.source.coop/esa/fusion-competition/" -# Attribution, can be None if not applicable -ATTRIBUTION = "ESA Fusion Competition" - -# License of the data, either -# 1. a SPDX license identifier (including "dl-de/by-2-0" / "dl-de/zero-2-0"), or -LICENSE = "CC BY-NC-SA 4.0" -# 2. a STAC Link Object with relation type "license" -# LICENSE = {"title": "CC-BY-4.0", "href": "https://creativecommons.org/licenses/by/4.0/", "type": "text/html", "rel": "license"} - -# Map original column names to fiboa property names -# You also need to list any column that you may have added in the MIGRATION function (see below). 
-COLUMNS = { - 'fid': 'id', - 'grid_id': 'grid_id', - "SHAPE_AREA": "area", - "SHAPE_LEN": "perimeter", - 'geometry': 'geometry', - 'crop_id': 'crop_id', - 'crop_name': 'crop_name' -} - -# Add columns with constant values. -# The key is the column name, the value is a constant value that's used for all rows. -ADD_COLUMNS = { - "determination_datetime": "2021-01-01T00:00:00Z" -} - -# A list of implemented extension identifiers -EXTENSIONS = [] - -# Functions to migrate data in columns to match the fiboa specification. -# Example: You have a column area_m in square meters and want to convert -# to hectares as required for the area field in fiboa. -# Function signature: -# func(column: pd.Series) -> pd.Series -COLUMN_MIGRATIONS = { - "area_m": lambda column: column * 0.0001 -} - -# Filter columns to only include the ones that are relevant for the collection, -# e.g. only rows that contain the word "agriculture" but not "forest" in the column "land_cover_type". -# Lamda function accepts a Pandas Series and returns a Series or a Tuple with a Series and True to inverse the mask. -COLUMN_FILTERS = { - -} - -# Custom function to migrate the GeoDataFrame if the other options are not sufficient -# This should be the last resort! -# Function signature: -# func(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame -MIGRATION = None - -# Schemas for the fields that are not defined in fiboa -# Keys must be the values from the COLUMNS dict, not the keys -MISSING_SCHEMAS = { - "required": ["grid_id"], # i.e. non-nullable properties - "properties": { - "grid_id": { - "type": "string" - } - } -} - - -# Conversion function, usually no changes required -def convert(output_file, cache_file = None, source_coop_url = None, collection = False, compression = None): - """ - Converts the field boundary datasets to fiboa. - - For reference, this is the order in which the conversion steps are applied: - 0. Read GeoDataFrame from file - 1. Run global migration (if provided through MIGRATION) - 2. 
Run filters to remove rows that shall not be in the final data - (if provided through COLUMN_FILTERS) - 3. Add columns with constant values - 4. Run column migrations (if provided through COLUMN_MIGRATIONS) - 5. Duplicate columns (if an array is provided as the value in COLUMNS) - 6. Rename columns (as provided in COLUMNS) - 7. Remove columns (if column is not present as value in COLUMNS) - 8. Create the collection - 9. Change data types of the columns based on the provided schemas - (fiboa spec, extensions, and MISSING_SCHEMAS) - 10. Write the data to the Parquet file - - Parameters: - output_file (str): Path where the Parquet file shall be stored. - cache_file (str): Path to a cached file of the data. Default: None. - Can be used to avoid repetitive downloads from the original data source. - source_coop_url (str): URL to the (future) Source Cooperative repository. Default: None - collection (bool): Additionally, store the collection separate from Parquet file. Default: False - compression (str): Compression method for the Parquet file. Default: zstd - kwargs: Additional keyword arguments for GeoPanda's read_file() or read_parquet() function. 
- """ - convert_( - output_file, - cache_file, - URI, - COLUMNS, - ID, - TITLE, - DESCRIPTION, - BBOX, - provider_name=PROVIDER_NAME, - provider_url=PROVIDER_URL, - source_coop_url=source_coop_url, - extensions=EXTENSIONS, - missing_schemas=MISSING_SCHEMAS, - column_additions=ADD_COLUMNS, - column_migrations=COLUMN_MIGRATIONS, - column_filters=COLUMN_FILTERS, - migration=MIGRATION, - attribution=ATTRIBUTION, - store_collection=collection, - license=LICENSE, - compression=compression, - ) \ No newline at end of file From 52b744b14c6b2dd381a373db91888ad444c82083 Mon Sep 17 00:00:00 2001 From: Aninda Ghosh Date: Mon, 8 Jul 2024 16:03:28 -0700 Subject: [PATCH 4/4] Updated Years --- ...germany_2021.py => fieldscapes_germany.py} | 19 ++++++++++++++----- ...021.py => fieldscapes_southafrica_2018.py} | 16 ++++++++++++---- 2 files changed, 26 insertions(+), 9 deletions(-) rename fiboa_cli/datasets/{fieldscapes_germany_2021.py => fieldscapes_germany.py} (93%) rename fiboa_cli/datasets/{fieldscapes_southafrica_2021.py => fieldscapes_southafrica_2018.py} (94%) diff --git a/fiboa_cli/datasets/fieldscapes_germany_2021.py b/fiboa_cli/datasets/fieldscapes_germany.py similarity index 93% rename from fiboa_cli/datasets/fieldscapes_germany_2021.py rename to fiboa_cli/datasets/fieldscapes_germany.py index ccc9327..9b03266 100644 --- a/fiboa_cli/datasets/fieldscapes_germany_2021.py +++ b/fiboa_cli/datasets/fieldscapes_germany.py @@ -11,10 +11,10 @@ # Supported protcols: HTTP(S), GCS, S3, or the local file system # Local URI added to the repository for initial conversion, Original Source https://beta.source.coop/esa/fusion-competition/ -URI = "/home/byteboogie/work/labwork_hkerner/fieldscapes/germany/boundaries_germany_2021.gpkg" +URI = "/home/byteboogie/work/labwork_hkerner/fieldscapes/germany/boundaries_germany.gpkg" # Unique identifier for the collection -ID = "fieldscapes_germany_2021" +ID = "fieldscapes_germany" # Title of the collection TITLE = "Field boundaries for Germany, 
Brandenburg (Fieldscapes)" # Description of the collection. Can be multiline and include CommonMark. @@ -43,13 +43,14 @@ "SHAPE_LEN": "perimeter", "geometry": "geometry", "crop_id": "crop_id", - "crop_name": "crop_name" + "crop_name": "crop_name", + "determination_datetime": "determination_datetime" } # Add columns with constant values. # The key is the column name, the value is a constant value that's used for all rows. ADD_COLUMNS = { - "determination_datetime": "2021-01-01T00:00:00Z" + } # A list of implemented extension identifiers @@ -80,7 +81,15 @@ # Schemas for the fields that are not defined in fiboa # Keys must be the values from the COLUMNS dict, not the keys MISSING_SCHEMAS = { - + "required": [ "crop_id", "crop_name" ], # i.e. non-nullable properties + "properties": { + "crop_id": { + "type": "int64" + }, + "crop_name": { + "type": "string" + } + } } diff --git a/fiboa_cli/datasets/fieldscapes_southafrica_2021.py b/fiboa_cli/datasets/fieldscapes_southafrica_2018.py similarity index 94% rename from fiboa_cli/datasets/fieldscapes_southafrica_2021.py rename to fiboa_cli/datasets/fieldscapes_southafrica_2018.py index 21cfb8f..68ad866 100644 --- a/fiboa_cli/datasets/fieldscapes_southafrica_2021.py +++ b/fiboa_cli/datasets/fieldscapes_southafrica_2018.py @@ -11,10 +11,10 @@ # Supported protcols: HTTP(S), GCS, S3, or the local file system # Local URI added to the repository for initial conversion, Original Source https://beta.source.coop/esa/fusion-competition/ -URI = "/home/byteboogie/work/labwork_hkerner/fieldscapes/southafrica/boundaries_southafrica_2021.gpkg" +URI = "/home/byteboogie/work/labwork_hkerner/fieldscapes/southafrica/boundaries_southafrica_2018.gpkg" # Unique identifier for the collection -ID = "boundaries_southafrica_2021" +ID = "boundaries_southafrica_2018" # Title of the collection TITLE = "Field boundaries for Cape Town, South Africa" # Description of the collection. Can be multiline and include CommonMark. 
@@ -49,7 +49,7 @@ # Add columns with constant values. # The key is the column name, the value is a constant value that's used for all rows. ADD_COLUMNS = { - "determination_datetime": "2021-01-01T00:00:00Z" + "determination_datetime": "2018-03-31T00:00:00Z" } # A list of implemented extension identifiers @@ -80,7 +80,15 @@ # Schemas for the fields that are not defined in fiboa # Keys must be the values from the COLUMNS dict, not the keys MISSING_SCHEMAS = { - + "required": [ "crop_id", "crop_name" ], # i.e. non-nullable properties + "properties": { + "crop_id": { + "type": "int64" + }, + "crop_name": { + "type": "string" + } + } }