Fiboa Cli Converter for Brazil #84

Merged 6 commits on Jul 30, 2024
206 changes: 206 additions & 0 deletions fiboa_cli/datasets/fs_br.py
@@ -0,0 +1,206 @@
from ..convert_utils import convert as convert_

# Please REMOVE this comment after review:
# I originally used the link "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/vz6d7tw87f-1.zip",
# copied from the "Download All" button on the website (https://data.mendeley.com/datasets/vz6d7tw87f/1).
# But it produced an error: "'/vsizip//tmp/tmp7puy6g90/vz6d7tw87f-1.zip' not recognized as a supported file format.
# It might help to specify the correct driver explicitly by prefixing the file path with '<DRIVER>:', e.g. 'CSV:path'"
SOURCES = "https://github.com/fiboa/data/files/15438544/LEM_dataset.zip"
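# Hedged sketch (illustration only, not part of the converter): the "Download All"
# zip nests the dataset zip inside it, which is likely why GDAL's /vsizip/ handler
# found no supported format at the top level. Extracting the inner zip first and
# reading it via fiona's zip:// scheme would work around that; the file names
# below are assumptions based on the links above:
#
#   import zipfile
#   import geopandas as gpd
#   with zipfile.ZipFile("vz6d7tw87f-1.zip") as outer:
#       outer.extract("LEM_dataset.zip", "/tmp")       # pull out the nested zip
#   gdf = gpd.read_file("zip:///tmp/LEM_dataset.zip")  # shapefile inside the inner zip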


ID = "br"
SHORT_NAME = "Brazil"
TITLE = "Field boundaries for Brazil"
DESCRIPTION = """
This dataset is the supplementary data of a paper published in the Data in Brief Journal.

The dataset, in ESRI shapefile format (spatial reference system: WGS 84, EPSG: 4326), provides monthly land use
information about 1854 fields from October 2019 to September 2020 from Luís Eduardo Magalhães (LEM) and other
municipalities in the west of Bahia state, Brazil. The majority of the 16 land uses classes are related to crops.
"""


PROVIDERS = [
{
"name": "Mendeley Data",
"url": "https://data.mendeley.com/datasets/vz6d7tw87f/1#file-5ac1542b-12ef-4dce-8258-113b5c5d87c9",
"roles": ["producer", "licensor"]
}
]

ATTRIBUTION = "Copyright © 2024 Elsevier Inc., its licensors, and contributors."
LICENSE = "CC-BY-4.0"

# Please REMOVE this comment after review:
# The commented-out entries are my proposed column attributes, but it doesn't seem
# logical to me to have months and years as the data's attributes. What do you think?
# A possible alternative is sketched after the COLUMNS dict below.
COLUMNS = {
    'geometry': 'geometry',
    'id': 'id',
    # 'OCT_2019': 'oct_2019',
    # 'NOV_2019': 'nov_2019',
    # 'DEC_2019': 'dec_2019',
    # 'JAN_2020': 'jan_2020',
    # 'FEB_2020': 'feb_2020',
    # 'MAR_2020': 'mar_2020',
    # 'APR_2020': 'apr_2020',
    # 'MAY_2020': 'may_2020',
    # 'JUN_2020': 'jun_2020',
    # 'JUL_2020': 'jul_2020',
    # 'AUG_2020': 'aug_2020',
    # 'SEP_2020': 'sep_2020',
    # 'NOTE': 'note'
}
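# Hedged sketch (illustration only, not used by this converter): instead of twelve
# near-identical monthly attributes, a FILE_MIGRATION could melt the wide monthly
# columns into a single long-format "month"/"land_use" pair. The hook signature
# below is an assumption, not the confirmed fiboa-cli interface:
#
#   def file_migration(gdf, path, uri, layer=None):
#       months = [c for c in gdf.columns if c.endswith(("_2019", "_2020"))]
#       return gdf.melt(
#           id_vars=[c for c in gdf.columns if c not in months],
#           value_vars=months, var_name="month", value_name="land_use",
#       )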

# Please REMOVE this comment after review:
# I think it is safe to assume the determination date is the last day of September 2020,
# as the dataset has SEP_2020 (the latest month) as one of its column attributes.
ADD_COLUMNS = {
    "determination_datetime": "2020-09-30T00:00:00Z"
}

EXTENSIONS = []

COLUMN_MIGRATIONS = {
}

COLUMN_FILTERS = {
}

MIGRATION = None

FILE_MIGRATION = None

# Please REMOVE this comment after review:
# I omitted one field, the `NOTE` field in the dataset, because it is nullable;
# see the sketch after MISSING_SCHEMAS for how it could still be declared.
MISSING_SCHEMAS = {
    # Note: a Python dict cannot repeat the "required" and "properties" keys per
    # month, so the proposed per-month schemas are merged into a single schema:
    # "required": [
    #     "oct_2019", "nov_2019", "dec_2019", "jan_2020", "feb_2020", "mar_2020",
    #     "apr_2020", "may_2020", "jun_2020", "jul_2020", "aug_2020", "sep_2020"
    # ],
    # "properties": {
    #     "oct_2019": {"type": "string"},
    #     "nov_2019": {"type": "string"},
    #     "dec_2019": {"type": "string"},
    #     "jan_2020": {"type": "string"},
    #     "feb_2020": {"type": "string"},
    #     "mar_2020": {"type": "string"},
    #     "apr_2020": {"type": "string"},
    #     "may_2020": {"type": "string"},
    #     "jun_2020": {"type": "string"},
    #     "jul_2020": {"type": "string"},
    #     "aug_2020": {"type": "string"},
    #     "sep_2020": {"type": "string"}
    # }
}
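# Hedged sketch (an assumption about the fiboa schema convention, not confirmed):
# a nullable field like `NOTE` could presumably still be declared by listing it
# under "properties" without adding it to "required", e.g.:
#
#   "properties": {"note": {"type": "string"}}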


# Conversion function, usually no changes required
def convert(output_file, input_files=None, cache=None, source_coop_url=None, collection=False, compression=None):
    """
    Converts the field boundary datasets to fiboa.

    For reference, this is the order in which the conversion steps are applied:
    0. Read GeoDataFrame from file(s) / layer(s) and run the FILE_MIGRATION function if provided
    1. Run global migration (if provided through MIGRATION)
    2. Run filters to remove rows that shall not be in the final data
       (if provided through COLUMN_FILTERS)
    3. Add columns with constant values
    4. Run column migrations (if provided through COLUMN_MIGRATIONS)
    5. Duplicate columns (if an array is provided as the value in COLUMNS)
    6. Rename columns (as provided in COLUMNS)
    7. Remove columns (if column is not present as value in COLUMNS)
    8. Create the collection
    9. Change data types of the columns based on the provided schemas
       (fiboa spec, extensions, and MISSING_SCHEMAS)
    10. Write the data to the Parquet file

    Parameters:
    output_file (str): Path where the Parquet file shall be stored.
    input_files (list): Input files to use instead of downloading SOURCES. Default: None.
    cache (str): Path to a cached folder for the data. Default: None.
        Can be used to avoid repetitive downloads from the original data source.
    source_coop_url (str): URL to the (future) Source Cooperative repository. Default: None
    collection (bool): Additionally, store the collection separate from the Parquet file. Default: False
    compression (str): Compression method for the Parquet file. Default: zstd
    """
    convert_(
        output_file,
        cache,
        SOURCES,
        COLUMNS,
        ID,
        TITLE,
        DESCRIPTION,
        input_files=input_files,
        providers=PROVIDERS,
        source_coop_url=source_coop_url,
        extensions=EXTENSIONS,
        missing_schemas=MISSING_SCHEMAS,
        column_additions=ADD_COLUMNS,
        column_migrations=COLUMN_MIGRATIONS,
        column_filters=COLUMN_FILTERS,
        migration=MIGRATION,
        file_migration=FILE_MIGRATION,
        attribution=ATTRIBUTION,
        store_collection=collection,
        license=LICENSE,
        compression=compression,
    )
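
# Hedged usage sketch (assuming the usual fiboa CLI invocation for dataset
# converters; the exact flags may differ between versions):
#
#   fiboa convert br -o brazil.parquet --cache /tmp/fiboa_cache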