Merge remote-tracking branch 'origin/mapshaper' into demoday

InseeFrLab · Nov 16, 2023 · e9a4b2a · e9a4b2a
2 parents 17c7ba0 + 172f401
commit e9a4b2a
Show file tree

Hide file tree

Showing 15 changed files with 823 additions and 175 deletions.
diff --git a/cartiflette/__init__.py b/cartiflette/__init__.py
@@ -10,3 +10,5 @@
 from cartiflette.utils import *
 from cartiflette.download import *
 from cartiflette.s3 import *
+from cartiflette.pipeline import *
+from cartiflette.mapshaper import *
diff --git a/cartiflette/mapshaper/__init__.py b/cartiflette/mapshaper/__init__.py
@@ -0,0 +1 @@
+from .mapshaper_split import *
diff --git a/cartiflette/mapshaper/mapshaper_split.py b/cartiflette/mapshaper/mapshaper_split.py
@@ -0,0 +1,168 @@
+import subprocess
+
+
+
+# Maps an aggregation level (the `niveau_agreg` argument of the functions
+# below) to the field name handed to mapshaper's `-split` command.
+# Used as the default value of the `dict_corresp` argument.
+DICT_CORRESP_IGN = {"REGION": "INSEE_REG", "DEPARTEMENT": "INSEE_DEP"}
+
+
+def mapshaperize_split(
+    local_dir="temp",
+    filename_initial="COMMUNE",
+    extension_initial="shp",
+    format_output="topojson",
+    niveau_agreg="DEPARTEMENT",
+    provider="IGN",
+    source="EXPRESS-COG-CARTO-TERRITOIRE",
+    year=2022,
+    dataset_family="ADMINEXPRESS",
+    territory="metropole",
+    crs=4326,
+    simplification=0,
+    dict_corresp=DICT_CORRESP_IGN
+):
+    """
+    Split a geographic layer into one file per aggregation unit with Mapshaper.
+
+    A single ``mapshaper`` command is built and executed through the shell.
+    It reads ``{local_dir}/{filename_initial}.{extension_initial}``,
+    reprojects the layer, optionally simplifies it, adds a ``SOURCE`` field
+    to every feature and splits the layer on the field associated with
+    ``niveau_agreg`` in ``dict_corresp``.
+
+    Parameters
+    ----------
+    local_dir : str, optional
+        The local directory holding the input file and receiving the
+        output tree, by default "temp".
+    filename_initial : str, optional
+        The input filename without extension, by default "COMMUNE".
+    extension_initial : str, optional
+        The input file extension, by default "shp".
+    format_output : str, optional
+        The output format, also used as output file extension,
+        by default "topojson".
+    niveau_agreg : str, optional
+        The level of aggregation for the split, by default "DEPARTEMENT".
+        Must be a key of ``dict_corresp``.
+    provider : str, optional
+        The data provider, written in the ``SOURCE`` field,
+        by default "IGN".
+    source : str, optional
+        The data source, written in the ``SOURCE`` field,
+        by default "EXPRESS-COG-CARTO-TERRITOIRE".
+    year : int, optional
+        The year of the data, by default 2022. Currently not used when
+        building the command.
+    dataset_family : str, optional
+        The dataset family, by default "ADMINEXPRESS". Currently not used
+        when building the command.
+    territory : str, optional
+        The territory of the data, by default "metropole". Currently not
+        used when building the command.
+    crs : int, optional
+        The EPSG code of the target coordinate reference system,
+        by default 4326.
+    simplification : int, optional
+        The simplification percentage passed to ``-simplify``, by default 0.
+        ``0`` or ``None`` disables simplification.
+    dict_corresp : dict
+        A dictionary giving the correspondence between the ``niveau_agreg``
+        argument and the name of the field used to split the layer.
+
+    Returns
+    -------
+    str
+        The output directory,
+        ``{local_dir}/{niveau_agreg}/{format_output}/simplification=<value>``,
+        where ``<value>`` is the ``simplification`` argument as received.
+
+    Raises
+    ------
+    KeyError
+        If ``niveau_agreg`` is not a key of ``dict_corresp``.
+    subprocess.CalledProcessError
+        If the ``mapshaper`` command exits with a non-zero status.
+
+    """
+
+    # Resolve the split field first so that an unknown aggregation level
+    # fails before any external process is started.
+    split_field = dict_corresp[niveau_agreg]
+
+    simplification_percent = simplification if simplification is not None else 0
+
+    # `{simplification=}` renders as "simplification=<value>": the directory
+    # name embeds the raw argument (e.g. "simplification=0").
+    output_path = f"{local_dir}/{niveau_agreg}/{format_output}/{simplification=}"
+
+    if simplification_percent != 0:
+        option_simplify = f"-simplify {simplification_percent}% "
+    else:
+        option_simplify = ""
+
+    # NOTE(review): arguments are interpolated unescaped into a shell command;
+    # only call this function with trusted values.
+    cmd = (
+        f"mapshaper {local_dir}/{filename_initial}.{extension_initial} name='' -proj EPSG:{crs} "
+        f"{option_simplify}"
+        f"-each \"SOURCE='{provider}:{source}'\" "
+        f"-split {split_field} "
+        f"-o {output_path} format={format_output} extension=\".{format_output}\" singles"
+    )
+
+    # check=True: surface a mapshaper failure instead of returning the path
+    # of an output that was never written.
+    subprocess.run(
+        cmd,
+        shell=True,
+        check=True
+    )
+
+    return output_path
+
+
+
+def mapshaperize_split_merge(
+    local_dir="temp",
+    extension_initial="shp",
+    format_output="topojson",
+    niveau_agreg="DEPARTEMENT",
+    provider="IGN",
+    source="EXPRESS-COG-CARTO-TERRITOIRE",
+    year=2022,
+    dataset_family="ADMINEXPRESS",
+    territory="metropole",
+    crs=4326,
+    simplification=0,
+    dict_corresp=DICT_CORRESP_IGN
+):
+    """
+    Merge communes with municipal arrondissements, then split with Mapshaper.
+
+    Four ``mapshaper`` commands are executed in sequence through the shell:
+
+    1. ``{local_dir}/COMMUNE.{extension_initial}`` is reprojected, the
+       communes whose ``INSEE_COM`` is 69123, 13055 or 75056 are removed
+       (presumably Lyon, Marseille and Paris, to be replaced by their
+       arrondissements -- to be confirmed) and ``INSEE_COM`` is copied
+       into ``INSEE_COG``; the result is written as GeoJSON.
+    2. ``{local_dir}/ARRONDISSEMENT_MUNICIPAL.{extension_initial}`` is
+       reprojected, ``INSEE_ARM`` is renamed ``INSEE_COG``, ``INSEE_DEP``
+       is set to the first two characters of ``INSEE_COG`` and ``STATUT``
+       to "Arrondissement municipal"; the result is written as GeoJSON.
+    3. Both intermediate files are combined and merged into a single
+       ``COMMUNE_ARRONDISSEMENT`` layer written to ``raw.geojson``.
+    4. The merged layer is optionally simplified, reprojected, tagged with
+       a ``SOURCE`` field and split on the field associated with
+       ``niveau_agreg`` in ``dict_corresp``.
+
+    Intermediate and final files are all written below the returned
+    output directory.
+
+    Parameters
+    ----------
+    local_dir : str, optional
+        The local directory holding the input files and receiving the
+        output tree, by default "temp".
+    extension_initial : str, optional
+        The input file extension, by default "shp".
+    format_output : str, optional
+        The output format, also used as output file extension,
+        by default "topojson".
+    niveau_agreg : str, optional
+        The level of aggregation for the split, by default "DEPARTEMENT".
+        Must be a key of ``dict_corresp``.
+    provider : str, optional
+        The data provider, written in the ``SOURCE`` field,
+        by default "IGN".
+    source : str, optional
+        The data source, written in the ``SOURCE`` field,
+        by default "EXPRESS-COG-CARTO-TERRITOIRE".
+    year : int, optional
+        The year of the data, by default 2022. Currently not used when
+        building the commands.
+    dataset_family : str, optional
+        The dataset family, by default "ADMINEXPRESS". Currently not used
+        when building the commands.
+    territory : str, optional
+        The territory of the data, by default "metropole". Currently not
+        used when building the commands.
+    crs : int, optional
+        The EPSG code of the target coordinate reference system,
+        by default 4326.
+    simplification : int, optional
+        The simplification percentage passed to ``-simplify``, by default 0.
+        ``0`` or ``None`` disables simplification.
+    dict_corresp : dict
+        A dictionary giving the correspondence between the ``niveau_agreg``
+        argument and the name of the field used to split the layer.
+
+    Returns
+    -------
+    str
+        The output directory,
+        ``{local_dir}/{niveau_agreg}/{format_output}/simplification=<value>``,
+        where ``<value>`` is the ``simplification`` argument as received.
+
+    Raises
+    ------
+    KeyError
+        If ``niveau_agreg`` is not a key of ``dict_corresp``.
+    subprocess.CalledProcessError
+        If any of the ``mapshaper`` commands exits with a non-zero status;
+        the remaining commands are then not executed.
+
+    """
+
+    # Resolve the split field first so that an unknown aggregation level
+    # fails before the preprocessing commands are run.
+    split_field = dict_corresp[niveau_agreg]
+
+    simplification_percent = simplification if simplification is not None else 0
+
+    # `{simplification=}` renders as "simplification=<value>": the directory
+    # name embeds the raw argument (e.g. "simplification=0").
+    output_path = f"{local_dir}/{niveau_agreg}/{format_output}/{simplification=}"
+
+    if simplification_percent != 0:
+        option_simplify = f"-simplify {simplification_percent}% "
+    else:
+        option_simplify = ""
+
+    format_intermediate = "geojson"
+
+    # PREPROCESS CITIES
+    cmd_communes = (
+        f"mapshaper {local_dir}/COMMUNE.{extension_initial} name='COMMUNE' "
+        f"-proj EPSG:{crs} "
+        f"-filter '\"69123,13055,75056\".indexOf(INSEE_COM) > -1' invert "
+        f"-each \"INSEE_COG=INSEE_COM\" "
+        f"-o {output_path}/communes_simples.{format_intermediate} format={format_intermediate} extension=\".{format_intermediate}\" singles"
+    )
+
+    # PREPROCESS ARRONDISSEMENT
+    cmd_arrondissements = (
+        f"mapshaper {local_dir}/ARRONDISSEMENT_MUNICIPAL.{extension_initial} name='ARRONDISSEMENT_MUNICIPAL' "
+        f"-proj EPSG:{crs} "
+        f"-rename-fields INSEE_COG=INSEE_ARM "
+        f"-each 'INSEE_DEP=INSEE_COG.substr(0,2), STATUT=\"Arrondissement municipal\" ' "
+        f"-o {output_path}/arrondissements.{format_intermediate} format={format_intermediate} extension=\".{format_intermediate}\""
+    )
+
+    # MERGE CITIES AND ARRONDISSEMENT
+    cmd_merge = (
+        f"mapshaper {output_path}/communes_simples.{format_intermediate} {output_path}/arrondissements.{format_intermediate} snap combine-files "
+        f"-proj EPSG:{crs} "
+        f"-rename-layers COMMUNE,ARRONDISSEMENT_MUNICIPAL "
+        f"-merge-layers target=COMMUNE,ARRONDISSEMENT_MUNICIPAL force "
+        f"-rename-layers COMMUNE_ARRONDISSEMENT "
+        f"-o {output_path}/raw.{format_intermediate} format={format_intermediate} extension=\".{format_intermediate}\""
+    )
+
+    # TRANSFORM AS NEEDED
+    cmd_split = (
+        f"mapshaper {output_path}/raw.{format_intermediate} "
+        f"{option_simplify}"
+        f"-proj EPSG:{crs} "
+        f"-each \"SOURCE='{provider}:{source}'\" "
+        f"-split {split_field} "
+        f"-o {output_path} format={format_output} extension=\".{format_output}\" singles"
+    )
+
+    # NOTE(review): arguments are interpolated unescaped into shell commands;
+    # only call this function with trusted values.
+    # Each step consumes the files written by the previous ones, hence
+    # check=True: stop at the first failure instead of processing stale or
+    # missing intermediate files.
+    for cmd in (cmd_communes, cmd_arrondissements, cmd_merge, cmd_split):
+        subprocess.run(
+            cmd,
+            shell=True,
+            check=True
+        )
+
+    return output_path
diff --git a/cartiflette/pipeline/__init__.py b/cartiflette/pipeline/__init__.py
@@ -0,0 +1,7 @@
+from .cross_product_parameters import (
+    restructure_nested_dict_borders,
+    crossproduct_parameters_production
+)
+
+from .prepare_mapshaper import prepare_local_directory_mapshaper
+from .mapshaper_split_from_s3 import mapshaperize_split_from_s3, mapshaperize_merge_split_from_s3
diff --git a/cartiflette/pipeline/cross_product_parameters.py b/cartiflette/pipeline/cross_product_parameters.py
@@ -0,0 +1,111 @@
+import itertools
+import pandas as pd
+
+
+def restructure_nested_dict_borders(dict_with_list: dict):
+    """
+    Flatten a dictionary of lists into a list of ``[key, value]`` pairs.
+
+    Parameters:
+    -----------
+    dict_with_list : dict
+        A dictionary whose values are iterables (typically lists).
+
+    Returns:
+    --------
+    list
+        One two-element list ``[key, value]`` per value found under each
+        key, in the iteration order of the dictionary and of its values.
+
+    Example:
+    --------
+    Example usage:
+        sample_dict = {'a': [1, 2, 3], 'b': [4, 5]}
+        result = restructure_nested_dict_borders(sample_dict)
+        print(result)
+
+    This will output:
+        [['a', 1], ['a', 2], ['a', 3], ['b', 4], ['b', 5]]
+    """
+    flattened_pairs = []
+
+    for outer_key, inner_values in dict_with_list.items():
+        # One [key, value] pair per element listed under this key.
+        flattened_pairs.extend([outer_key, element] for element in inner_values)
+
+    return flattened_pairs
+
+import itertools
+import pandas as pd
+
+def crossproduct_parameters_production(
+    croisement_filter_by_borders: dict,
+    list_format: list,
+    years: list,
+    crs_list: list,
+    sources: list,
+    simplifications: list
+) -> pd.DataFrame:
+    """
+    Build the table of every combination of the production parameters.
+
+    The dictionary is first flattened into ``[key, value]`` pairs with
+    ``restructure_nested_dict_borders``; the cartesian product of these
+    pairs with the other parameter lists is then stored in a DataFrame,
+    one row per combination.
+
+    Parameters:
+    -----------
+    croisement_filter_by_borders : dict
+        A dictionary of lists; each ``(key, value)`` pair becomes a
+        ``(borders, filter_by)`` couple.
+    list_format : list
+        The formats to combine.
+    years : list
+        The years to combine.
+    crs_list : list
+        The CRS (Coordinate Reference Systems) to combine.
+    sources : list
+        The sources to combine.
+    simplifications : list
+        The simplification levels to combine.
+
+    Returns:
+    --------
+    pd.DataFrame
+        A DataFrame with the columns ``format``, ``year``, ``crs``,
+        ``source``, ``simplification``, ``borders`` and ``filter_by``.
+
+    Example:
+    --------
+    Example usage:
+        sample_dict = {'a': [1, 2, 3], 'b': [4, 5]}
+        formats = ['geojson', 'gpkg']
+        years = [2022, 2022]
+        crs_list = [4326, 2154]
+        sources = ['source1', 'source2']
+        simplifications = [0, 40]
+        result = crossproduct_parameters_production(
+            sample_dict, formats, years, crs_list, sources, simplifications
+        )
+        print(result)
+
+    This will output:
+        A pandas DataFrame with the cross-product of the provided parameters.
+    """
+    border_filter_pairs = restructure_nested_dict_borders(
+        croisement_filter_by_borders
+    )
+
+    parameter_grid = itertools.product(
+        list_format,
+        border_filter_pairs,
+        years,
+        crs_list,
+        sources,
+        simplifications
+    )
+
+    # Unpack each [borders, filter_by] pair while building the rows, so the
+    # records are already laid out in the final column order.
+    records = [
+        (fmt, year, crs, source, simplification, borders, filter_by)
+        for fmt, (borders, filter_by), year, crs, source, simplification
+        in parameter_grid
+    ]
+
+    return pd.DataFrame(
+        records,
+        columns=[
+            "format", "year", "crs", "source", "simplification",
+            "borders", "filter_by"
+        ]
+    )