
Automate publication process #58


Draft: ivorbosloper wants to merge 31 commits into main from publish_cli

31 commits
a94f941
Automate publication process
ivorbosloper Jun 18, 2024
028b888
Improve robustness
ivorbosloper Jun 21, 2024
de13b81
Update cache-option with main
ivorbosloper Jun 24, 2024
ed71e08
Update publication process
ivorbosloper Aug 22, 2024
1b233eb
source coop has changed
ivorbosloper Nov 9, 2024
1fcf9c7
Automatically generate README.md and LICENSE.txt
ivorbosloper Nov 10, 2024
7a5bf94
Update
ivorbosloper Nov 10, 2024
a4fde7a
Extend with option to use DATA_SURVEY_FILE from pull-request
ivorbosloper Nov 12, 2024
71e7a52
Fix generated urls
ivorbosloper Nov 12, 2024
bfd6be2
Remove beta from source-coop url
ivorbosloper Nov 12, 2024
9f0f3be
Resolve PR suggestions
ivorbosloper Nov 12, 2024
18ca830
Update
ivorbosloper Nov 12, 2024
4d3a524
Add publish option with input files
ivorbosloper Nov 13, 2024
fac4cd9
Fix issue with source.coop / data.source.coop
ivorbosloper Nov 14, 2024
72c0799
Fix urls
ivorbosloper Nov 14, 2024
95502c1
License update
ivorbosloper Nov 14, 2024
f4b6d98
Merge branch 'main' into publish_cli
ivorbosloper Nov 15, 2024
fd1926d
Use proper download_url
ivorbosloper Nov 15, 2024
ccc06d0
Merge fix
ivorbosloper Nov 15, 2024
16fb220
Use FlatGeobuf, GeoJSON can become huge
ivorbosloper Nov 27, 2024
66493d8
Merge branch 'main' into publish_cli
ivorbosloper Dec 3, 2024
80599ac
Test replace temporary_file by pipe for ogr2ogr to tippecanoe
ivorbosloper Dec 3, 2024
6036b2a
prepare for class-based converter
ivorbosloper Dec 13, 2024
6446e28
Simplify stuff
ivorbosloper Dec 24, 2024
9e75d37
Merge branch 'main' into publish_cli
ivorbosloper Jan 5, 2025
13857c5
Add changelog
ivorbosloper Jan 5, 2025
ef93935
Fix python 3.9 compatibility
ivorbosloper Jan 5, 2025
6a3cbf0
Merge branch 'main' into publish_cli
ivorbosloper Jan 22, 2025
ea9e749
Merge branch 'main' into publish_cli
ivorbosloper Mar 19, 2025
99c48a5
Fix ruff
ivorbosloper Mar 19, 2025
be4505a
Improve template
ivorbosloper Mar 20, 2025
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -43,7 +43,16 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
- Converter for Latvia (from original source)
- Converter for Japan, currently based on supplied (non-fiboa) parquet files
- Many converters implement the admin extension
- Ensure tests don't download external sources
- `fiboa convert`: New parameter `--original-geometries` / `-og` to keep the original geometries
- Command `fiboa publish` to automate the source coop publication process (see the sketch below):
  - Checks that the source coop repository URL exists
  - Runs the converter to produce the GeoParquet file
  - Validates the GeoParquet file
  - Checks for README.md; if missing, generates one based on the data survey (if available) and the converter
  - Checks for LICENSE.txt; if missing, generates one based on the converter file
  - Generates the PMTiles file
  - Checks the AWS environment variables, then synchronizes parquet + PMTiles + README/LICENSE to the source coop repository
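
A minimal sketch of driving the new command from Python instead of the CLI, assuming the fiboa-cli package from this branch is installed; the dataset id and repository slug are placeholders in the style of the `xx_yy` examples used elsewhere in this PR:

```python
# Sketch only; "xx_yy" / "xx-yy" are placeholders, not real dataset names.
from fiboa_cli.publish import publish

publish(
    dataset="xx_yy",                # converter id; must be a registered fiboa converter
    directory="data/xx-yy",         # working dir for parquet, pmtiles, stac, README, LICENSE
    cache="cache",                  # reuse previously downloaded source data
    source_coop_extension="xx-yy",  # slug in https://source.coop/fiboa/xx-yy/
    # input_files=[...]             # optional: local source files instead of downloads
)
```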

### Changed

43 changes: 43 additions & 0 deletions fiboa_cli/__init__.py
@@ -16,6 +16,7 @@
from .jsonschema import jsonschema as jsonschema_
from .merge import DEFAULT_CRS
from .merge import merge as merge_
from .publish import publish as publish_
from .rename_extension import rename_extension as rename_extension_
from .util import (
check_ext_schema_for_cli,
@@ -753,6 +754,47 @@ def improve(
sys.exit(1)


## Publish
@click.command()
@click.argument("dataset", nargs=1, type=click.Choice(list_all_converter_ids()))
@click.argument("directory", nargs=1, type=click.Path(exists=False))
@click.option(
"--cache",
"-c",
type=click.Path(exists=False),
help="By default the CLI downloads the source data on every execution. Specify a local folder to avoid downloading the files again. If the files exist, reads from there, otherwise stores the files there.",
default=None,
)
@click.option(
"--source-coop-extension",
"-e",
type=click.STRING,
help="(Future) source_coop extension, will be used as https://beta.source.coop/fiboa/xx-yy/",
default=None,
)
@click.option(
"--input",
"-i",
type=click.STRING,
help="File(s) or URL(s) to read from. Can be used multiple times. Specific files from ZIP and 7Z archives can be picked by providing the archive path and the file path in the archive separated by a pipe sign. To pick multiple files from a single archive separate them by comma. Example: /path/to/archive.zip|file1.gpkg,subfolder/file2.gpkg",
callback=parse_converter_input_files,
multiple=True,
default=None,
)
def publish(dataset, directory, cache, source_coop_extension, input):
"""
Publish a fiboa collection on source.coop
"""
log(f"Trying to publish on source coop CLI {__version__}\n", "success")
try:
directory = os.path.abspath(directory)
publish_(dataset, directory, cache, source_coop_extension, input)
log(f"Dataset published from {directory}", "success")
except Exception as e:
log(e, "error")
sys.exit(1)
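# Example CLI invocation (hypothetical dataset id and slug, following the
# xx_yy / xx-yy placeholders used elsewhere in this PR):
#   fiboa publish xx_yy data/xx-yy --cache cache --source-coop-extension xx-yy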


cli.add_command(describe)
cli.add_command(validate)
cli.add_command(validate_schema)
@@ -763,6 +805,7 @@ def improve(
cli.add_command(converters)
cli.add_command(rename_extension)
cli.add_command(merge)
cli.add_command(publish)
cli.add_command(improve)

if __name__ == "__main__":
251 changes: 251 additions & 0 deletions fiboa_cli/publish.py
@@ -0,0 +1,251 @@
import json
import os
import re
import sys
from datetime import date
from functools import cache

import requests

from fiboa_cli import list_all_converter_ids, version
from fiboa_cli.convert import convert
from fiboa_cli.convert import read_converter as _read_converter
from fiboa_cli.validate import validate

from .util import log

STAC_EXTENSION = "https://stac-extensions.github.io/web-map-links/v1.2.0/schema.json"


def read_converter(converter_id):
# Temporary getattr proxy that falls back to lowercase attribute names
# (e.g. SHORT_NAME -> short_name) to ease the transition to class-based converters
converter = _read_converter(converter_id)

class Proxy:
def __getattr__(self, attr):
if not hasattr(converter, attr):
attr = attr.lower()
return getattr(converter, attr)

return Proxy()


def exc(cmd):
# Run a shell command and abort if it exits with a non-zero status
assert os.system(cmd) == 0


def check_command(cmd, name=None):
if os.system(f"{cmd} --version") != 0:
log(f"Missing command {cmd}. Please install {name or cmd}", "error")
sys.exit(1)


@cache
def get_data_survey(dataset):
base = dataset.replace("_", "-").upper()
# override data survey location with env variable, e.g. for unmerged pull-requests
data_survey = (
os.getenv("FIBOA_DATA_SURVEY")
or f"https://raw.githubusercontent.com/fiboa/data-survey/refs/heads/main/data/{base}.md"
)
response = requests.get(data_survey)
assert response.ok, (
f"Missing data survey {base}.md at {data_survey}. Can not auto-generate file"
)
return dict(re.findall(r"- \*\*(.+?):\*\* (.+?)\n", response.text))
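

# A data survey is a markdown file of "- **Key:** value" bullets. A hypothetical
# excerpt and the dict this function would return (values invented for illustration):
#
#   - **Data Provider (Legal Entity):** Example Ministry of Agriculture
#   - **License:** CC-BY-4.0
#
#   -> {"Data Provider (Legal Entity)": "Example Ministry of Agriculture",
#       "License": "CC-BY-4.0"}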


def readme_attribute_table(stac_data):
# Build a markdown table of the dataset columns from the STAC "table:columns"
# metadata, skipping the geometry column
cols = [["Property", "Data Type", "Description"]] + [
[s["name"], re.search(r"\w+", s["type"])[0], ""]
for s in stac_data["assets"]["data"]["table:columns"]
if s["name"] != "geometry"
]
# Pad each cell to the widest entry in its column, then insert the
# markdown separator row after the header
widths = [max(len(c[i]) for c in cols) for i in range(3)]
aligned_rows = [[f" {c:<{w}} " for c, w in zip(row, widths)] for row in cols]
aligned_rows.insert(1, ["-" * (w + 2) for w in widths])
return "\n".join(["|" + "|".join(row) + "|" for row in aligned_rows])


def make_license(dataset, **kwargs):
props = get_data_survey(dataset)
text = ""
if "license" in props:
text += props["license"] + "\n\n"
converter = read_converter(dataset)
if hasattr(converter, "LICENSE"):
text += (
converter.LICENSE["title"] if isinstance(converter.LICENSE, dict) else converter.LICENSE
)
return text


def make_readme(dataset, file_name, stac, source_coop_extension):
source_coop_data = f"https://data.source.coop/fiboa/{source_coop_extension}/"

converter = read_converter(dataset)
stac_data = json.load(open(stac))
count = stac_data["assets"]["data"]["table:row_count"]
columns = readme_attribute_table(stac_data)
props = get_data_survey(dataset)
_download_urls = converter.get_urls().keys() or ["manually downloaded file"]
downloaded_urls = "\n".join([(" - " + url) for url in _download_urls])

return f"""# Field boundaries for {converter.SHORT_NAME}

Provides {count} official field boundaries from {converter.SHORT_NAME}.
It has been converted to a fiboa GeoParquet file from data obtained from {props["Data Provider (Legal Entity)"]}.

- **Source Data Provider:** [{props["Data Provider (Legal Entity)"]}]({props["Homepage"]})
- **Converted by:** {props["Submitter (Affiliation)"]}
- **License:** {props["License"]}
- **Projection:** {props["Projection"]}

---

- [Download the data as fiboa GeoParquet]({source_coop_data}{file_name}.parquet)
- [STAC Browser](https://radiantearth.github.io/stac-browser/#/external/data.source.coop/fiboa/{source_coop_extension}/stac/collection.json)
- [STAC Collection]({source_coop_data}stac/collection.json)
- [PMTiles]({source_coop_data}{file_name}.pmtiles)

## Columns

{columns}

## Lineage

- Data downloaded on {date.today()} from:
{downloaded_urls}
- Converted to GeoParquet using [fiboa-cli](https://github.com/fiboa/cli), version {version.__version__}
"""


def publish(dataset, directory, cache, source_coop_extension, input_files=None):
"""
Implement https://github.com/fiboa/data/blob/main/HOWTO.md#each-time-you-update-the-dataset

You need GDAL 3.8 or later (for ogr2ogr) with libgdal-arrow-parquet, tippecanoe, and AWS CLI
- https://gdal.org/
- https://github.com/felt/tippecanoe
- https://aws.amazon.com/cli/
"""
assert dataset in list_all_converter_ids()
os.makedirs(directory, exist_ok=True)

parent = os.path.dirname(directory)
os.chdir(parent)

if not source_coop_extension:
source_coop_extension = dataset.replace("_", "-")

file_name = source_coop_extension.replace("-", "_") # not sure if we want this

parquet_file = os.path.join(directory, f"{file_name}.parquet")
source_coop_url = f"https://source.coop/fiboa/{source_coop_extension}/"
source_coop_data = f"https://data.source.coop/fiboa/{source_coop_extension}/"

assert (
requests.get(
f"https://source.coop/api/v1/repositories/fiboa/{source_coop_extension}"
).status_code
== 200
), f"Missing repo at {source_coop_url}"

collection_file = os.path.join(directory, "collection.json")

stac_directory = os.path.join(directory, "stac")
done_convert = os.path.exists(parquet_file) and os.path.exists(
os.path.join(stac_directory, "collection.json")
)

if not done_convert:
# fiboa convert xx_yy -o data/xx-yy.parquet -h https://data.source.coop/fiboa/xx-yy/ --collection
log(f"Converting file for {dataset} to {parquet_file}\n", "success")
convert(
dataset,
parquet_file,
cache=cache,
source_coop_url=source_coop_url,
collection=True,
input_files=input_files,
)
log("Done\n", "success")
else:
log(f"Using existing file {parquet_file} for {dataset}\n", "success")

# fiboa validate data/xx-yy.parquet --data
log(f"Validating {parquet_file}", "info")
result = validate(parquet_file, config={"data": True})
if result:
log("\n => VALID\n", "success")
else:
log("\n => INVALID\n", "error")
sys.exit(1)

# mkdir data/stac; mv data/collection.json data/stac
stac_target = os.path.join(stac_directory, "collection.json")
if not done_convert:
os.makedirs(stac_directory, exist_ok=True)
data = json.load(open(collection_file))
assert data["id"] == dataset, f"Wrong collection dataset id: {data['id']} != {dataset}"

if STAC_EXTENSION not in data["stac_extensions"]:
data["stac_extensions"].append(STAC_EXTENSION)
data["links"].append(
{
"href": f"{source_coop_data}{file_name}.pmtiles",
"type": "application/vnd.pmtiles",
"rel": "pmtiles",
}
)

with open(stac_target, "w", encoding="utf-8") as f:
json.dump(data, f)
os.remove(collection_file)

for required in ("README.md", "LICENSE.txt"):
path = os.path.join(directory, required)
if not os.path.exists(path):
log(f"Missing {required}. Generating at {path}", "warning")
if required == "README.md":
text = make_readme(
dataset,
file_name=file_name,
stac=stac_target,
source_coop_extension=source_coop_extension,
)
else:
text = make_license(
dataset, source_coop_url=source_coop_url, file_name=file_name, stac=stac_target
)
with open(path, "w") as f:
f.write(text)
log(f"Please complete the {path} before continuing", "warning")
sys.exit(1)

pm_file = os.path.join(directory, f"{file_name}.pmtiles")
if not os.path.exists(pm_file):
log("Running ogr2ogr | tippecanoe", "info")
check_command("tippecanoe")
check_command("ogr2ogr", name="GDAL")
exc(
f"ogr2ogr -t_srs EPSG:4326 -f geojson /vsistdout/ {parquet_file} | tippecanoe -zg --projection=EPSG:4326 -o {pm_file} -l {dataset} --drop-densest-as-needed"
)

log("Uploading to aws", "info")
if not os.environ.get("AWS_SECRET_ACCESS_KEY"):
log(f"Get your credentials at {source_coop_url}manage/", "info")
log(" Then press 'ACCESS DATA',\n and click 'Create API Key',", "info")
log(
" Run export AWS_ENDPOINT_URL=https://data.source.coop AWS_ACCESS_KEY_ID=<> AWS_SECRET_ACCESS_KEY=<>\n"
" where you copy-past the access key and secret",
"info",
)
log("Please set AWS_ env vars from source_coop", "error")
sys.exit(1)

assert os.environ.get("AWS_ENDPOINT_URL") == "https://data.source.coop", (
"AWS_ENDPOINT_URL env var must be set to https://data.source.coop"
)
check_command("aws")
exc(f"aws s3 sync {directory} s3://fiboa/{source_coop_extension}/")