Skip to content

Commit

Permalink
start pushing catalog to python client
Browse files Browse the repository at this point in the history
  • Loading branch information
tgrandje committed Dec 15, 2024
1 parent 289b386 commit 0e9be7f
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 6 deletions.
94 changes: 88 additions & 6 deletions python-package/cartiflette/cartiflette/client.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,25 @@
from requests_cache import CachedSession
import os
import typing
import geopandas as gpd
from datetime import date
import logging
import os
import typing

from cartiflette.constants import DIR_CACHE, CACHE_NAME, BUCKET, PATH_WITHIN_BUCKET
from requests_cache import CachedSession
import geopandas as gpd
import pandas as pd

from cartiflette.constants import (
DIR_CACHE,
CACHE_NAME,
BUCKET,
PATH_WITHIN_BUCKET,
CATALOG,
)
from cartiflette.config import _config
from cartiflette.utils import create_path_bucket, standardize_inputs
from cartiflette.utils import (
create_path_bucket,
standardize_inputs,
flatten_dict,
)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -94,6 +106,76 @@ def download_cartiflette_single(
else:
return gdf

def get_catalog(self, **kwargs) -> pd.DataFrame:
    """
    Retrieve and load cartiflette's current datasets' inventory (as a
    dataframe).

    Inventory columns are [
        'source',
        'year',
        'administrative_level',
        'crs',
        'filter_by',
        'value',
        'vectorfile_format',
        'territory',
        'simplification'
    ]

    Each row corresponds to an available DataFrame.

    Parameters
    ----------
    **kwargs
        Currently ignored; kept for signature stability.

    Returns
    -------
    df : pd.DataFrame
        Inventory DataFrame, or None if the inventory could not be
        retrieved (the error is logged).
    """

    # CATALOG is already a complete URL (scheme + host + path) — see
    # constants.py. Do NOT prefix it with the endpoint again, which
    # would yield "https://minio.../https://minio.../..." and 404.
    url = CATALOG

    try:
        r = self.get(url)
        d = r.json()
    except Exception as e:
        # Best-effort: log and return None rather than crash the client.
        logger.error(
            "There was an error while reading the file from the URL: %s",
            url,
        )
        logger.error("Error message: %s", str(e))
        return

    # Flatten the nested inventory JSON into a single-level dict mapping
    # (source, year, ..., territory) tuples to the simplification value.
    d = flatten_dict(d)

    # Name the levels at construction time instead of mutating the index
    # after the DataFrame is built (safer: no reliance on the DataFrame
    # sharing the mutated index object).
    index = pd.MultiIndex.from_tuples(
        d.keys(),
        names=[
            "source",
            "year",
            "administrative_level",
            "crs",
            "filter_by",
            "value",
            "vectorfile_format",
            "territory",
        ],
    )
    df = pd.DataFrame(
        list(d.values()), index=index, columns=["simplification"]
    )

    # Expose the index levels as ordinary columns.
    df = df.reset_index(drop=False)
    return df

def get_dataset(
self,
values: typing.List[typing.Union[str, int, float]],
Expand Down
5 changes: 5 additions & 0 deletions python-package/cartiflette/cartiflette/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,8 @@
# Name of the sqlite file used for HTTP response caching.
CACHE_NAME = "cartiflette_http_cache.sqlite"
# S3/MinIO bucket hosting cartiflette's published datasets.
BUCKET = "projet-cartiflette"
# Path prefix used within the bucket.
PATH_WITHIN_BUCKET = "production"

# Complete URL of the datasets' inventory file. Note this is a full URL
# (endpoint included): consumers must not prepend the endpoint again.
# (Fixed: removed the stray chained `url =` assignment that leaked an
# accidental module-level `url` name.)
CATALOG = (
    "https://minio.lab.sspcloud.fr/"
    f"{BUCKET}/{PATH_WITHIN_BUCKET}/inventory.json"
)
28 changes: 28 additions & 0 deletions python-package/cartiflette/cartiflette/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,31 @@ def create_path_bucket(config: ConfigDict) -> str:
write_path += f"/raw.{vectorfile_format}"

return write_path


def flatten_dict(d: dict, parent_key: tuple = ()) -> dict:
    """
    Flatten a nested dictionary into a single-level dict keyed by tuples.

    Each leaf value is mapped to the tuple of keys leading to it, e.g.
    {"a": {"b": 1}} -> {("a", "b"): 1}.

    Parameters
    ----------
    d : dict
        Nested dictionary.
    parent_key : tuple, optional
        Key prefix accumulated during recursion. The default is ().

    Returns
    -------
    dict
        Flattened dictionary mapping key-path tuples to leaf values.
    """
    flat = {}
    for key, value in d.items():
        path = (*parent_key, key)
        if isinstance(value, dict):
            # Recurse into sub-dicts, extending the key path.
            flat.update(flatten_dict(value, path))
        else:
            flat[path] = value
    return flat

0 comments on commit 0e9be7f

Please sign in to comment.