From 0e9be7f46439c2184307eb272e76d4a38921a1a4 Mon Sep 17 00:00:00 2001
From: Thomas Grandjean
Date: Sun, 15 Dec 2024 21:58:24 +0100
Subject: [PATCH] start pushing catalog to python client

---
 .../cartiflette/cartiflette/client.py         | 94 +++++++++++++++++--
 .../cartiflette/cartiflette/constants.py      |  5 +
 .../cartiflette/cartiflette/utils.py          | 28 ++++++
 3 files changed, 121 insertions(+), 6 deletions(-)

diff --git a/python-package/cartiflette/cartiflette/client.py b/python-package/cartiflette/cartiflette/client.py
index a20078d..1ac967d 100644
--- a/python-package/cartiflette/cartiflette/client.py
+++ b/python-package/cartiflette/cartiflette/client.py
@@ -1,13 +1,25 @@
-from requests_cache import CachedSession
-import os
-import typing
-import geopandas as gpd
 from datetime import date
 import logging
+import os
+import typing
 
-from cartiflette.constants import DIR_CACHE, CACHE_NAME, BUCKET, PATH_WITHIN_BUCKET
+from requests_cache import CachedSession
+import geopandas as gpd
+import pandas as pd
+
+from cartiflette.constants import (
+    DIR_CACHE,
+    CACHE_NAME,
+    BUCKET,
+    PATH_WITHIN_BUCKET,
+    CATALOG,
+)
 from cartiflette.config import _config
-from cartiflette.utils import create_path_bucket, standardize_inputs
+from cartiflette.utils import (
+    create_path_bucket,
+    standardize_inputs,
+    flatten_dict,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -94,6 +106,76 @@ def download_cartiflette_single(
         else:
             return gdf
 
+    def get_catalog(self, **kwargs) -> typing.Optional[pd.DataFrame]:
+        """
+        Retrieve and load cartiflette's current datasets' inventory (as a
+        dataframe).
+
+        Inventory columns are [
+            'source',
+            'year',
+            'administrative_level',
+            'crs',
+            'filter_by',
+            'value',
+            'vectorfile_format',
+            'territory',
+            'simplification'
+        ]
+
+        Each row corresponds to an available DataFrame.
+
+        Parameters
+        ----------
+        **kwargs
+            Currently unused by this method.
+
+        Returns
+        -------
+        df : pd.DataFrame or None
+            Inventory DataFrame, or None when the inventory could not be
+            downloaded or parsed (the error is logged).
+
+        """
+
+        # CATALOG is already a complete URL (scheme and host included), so it
+        # must be used as is: prefixing the host again yields an invalid URL.
+        url = CATALOG
+
+        try:
+            r = self.get(url)
+            d = r.json()
+        except Exception as e:
+            logger.error(
+                f"There was an error while reading the file from the URL: {url}"
+            )
+            logger.error(f"Error message: {str(e)}")
+            return
+
+        d = flatten_dict(d)
+
+        # Each flattened key is the tuple of nested keys leading to one leaf;
+        # the levels are named up front rather than by mutating the index.
+        index = pd.MultiIndex.from_tuples(
+            list(d),
+            names=[
+                "source",
+                "year",
+                "administrative_level",
+                "crs",
+                "filter_by",
+                "value",
+                "vectorfile_format",
+                "territory",
+            ],
+        )
+        df = pd.DataFrame(
+            list(d.values()), index=index, columns=["simplification"]
+        )
+
+        df = df.reset_index(drop=False)
+        return df
+
     def get_dataset(
         self,
         values: typing.List[typing.Union[str, int, float]],
diff --git a/python-package/cartiflette/cartiflette/constants.py b/python-package/cartiflette/cartiflette/constants.py
index c662279..a4d3f1a 100644
--- a/python-package/cartiflette/cartiflette/constants.py
+++ b/python-package/cartiflette/cartiflette/constants.py
@@ -10,3 +10,8 @@ CACHE_NAME = "cartiflette_http_cache.sqlite"
 
 BUCKET = "projet-cartiflette"
 PATH_WITHIN_BUCKET = "production"
+
+CATALOG = url = (
+    "https://minio.lab.sspcloud.fr/"
+    f"{BUCKET}/{PATH_WITHIN_BUCKET}/inventory.json"
+)
diff --git a/python-package/cartiflette/cartiflette/utils.py b/python-package/cartiflette/cartiflette/utils.py
index 0961fcc..7a1a42f 100644
--- a/python-package/cartiflette/cartiflette/utils.py
+++ b/python-package/cartiflette/cartiflette/utils.py
@@ -152,3 +152,31 @@ def create_path_bucket(config: ConfigDict) -> str:
         write_path += f"/raw.{vectorfile_format}"
 
     return write_path
+
+
+def flatten_dict(d: dict, parent_key: tuple = ()) -> dict:
+    """
+    Convenience function, flattens a nested dictionary into a single-level
+    dictionary keyed by tuples of the nested keys.
+
+    Parameters
+    ----------
+    d : dict
+        Nested dictionary
+    parent_key : tuple, optional
+        Optional key, used for recursive purposes. The default is ().
+
+    Returns
+    -------
+    dict
+        flattened dictionary
+
+    """
+    items = []
+    for k, v in d.items():
+        new_key = parent_key + (k,)
+        if isinstance(v, dict):
+            items.extend(flatten_dict(v, new_key).items())
+        else:
+            items.append((new_key, v))
+    return dict(items)