import subprocess
import tempfile


# Maps an aggregation level to the attribute field mapshaper splits on.
DICT_CORRESP_IGN = {"REGION": "INSEE_REG", "DEPARTEMENT": "INSEE_DEP"}


def _simplify_option(simplification):
    """Return the ``-simplify`` command fragment, or "" when nothing is simplified."""
    percent = simplification if simplification is not None else 0
    return f"-simplify {percent}% " if percent != 0 else ""


def _output_directory(local_dir, niveau_agreg, format_output, simplification):
    """Return the directory that receives the split files."""
    # The parameter has to be called `simplification`: the `=` specifier embeds
    # the expression text, producing e.g. ".../simplification=0".
    return f"{local_dir}/{niveau_agreg}/{format_output}/{simplification=}"


def _run_mapshaper(cmd):
    """Run one mapshaper command line through the shell."""
    # NOTE(review): the exit status is deliberately left unchecked to keep the
    # historical behaviour; a failing mapshaper call does not raise here.
    subprocess.run(cmd, shell=True)


def mapshaperize_split(
    local_dir="temp",
    filename_initial="COMMUNE",
    extension_initial="shp",
    format_output="topojson",
    niveau_agreg="DEPARTEMENT",
    provider="IGN",
    source="EXPRESS-COG-CARTO-TERRITOIRE",
    year=2022,
    dataset_family="ADMINEXPRESS",
    territory="metropole",
    crs=4326,
    simplification=0,
    dict_corresp=DICT_CORRESP_IGN
):
    """
    Reproject a vector file and split it by aggregation level with mapshaper.

    Parameters
    ----------
    local_dir : str, optional
        Directory holding the input file, by default "temp".
    filename_initial : str, optional
        Input file name without extension, by default "COMMUNE".
    extension_initial : str, optional
        Input file extension, by default "shp".
    format_output : str, optional
        Output format, also used as output extension, by default "topojson".
    niveau_agreg : str, optional
        Aggregation level used for the split; must be a key of
        ``dict_corresp``. By default "DEPARTEMENT".
    provider : str, optional
        Data provider, written in the SOURCE field, by default "IGN".
    source : str, optional
        Data source, written in the SOURCE field, by default
        "EXPRESS-COG-CARTO-TERRITOIRE".
    year : int, optional
        Accepted for signature compatibility; not used by this function.
    dataset_family : str, optional
        Accepted for signature compatibility; not used by this function.
    territory : str, optional
        Accepted for signature compatibility; not used by this function.
    crs : int, optional
        EPSG code passed to ``-proj``, by default 4326.
    simplification : int or None, optional
        Simplification percentage; 0 or None disables ``-simplify``.
    dict_corresp : dict, optional
        Correspondence between ``niveau_agreg`` values and the field names to
        split on. Only read, never modified.

    Returns
    -------
    str
        The directory the split files are written to.

    Raises
    ------
    KeyError
        If ``niveau_agreg`` is not a key of ``dict_corresp``.
    """
    split_field = dict_corresp[niveau_agreg]
    output_path = _output_directory(
        local_dir, niveau_agreg, format_output, simplification
    )

    cmd = (
        f"mapshaper {local_dir}/{filename_initial}.{extension_initial} name='' -proj EPSG:{crs} "
        f"{_simplify_option(simplification)}"
        f"-each \"SOURCE='{provider}:{source}'\" "
        f"-split {split_field} "
        f"-o {output_path} format={format_output} extension=\".{format_output}\" singles"
    )
    _run_mapshaper(cmd)

    return output_path


def mapshaperize_split_merge(
    local_dir="temp",
    extension_initial="shp",
    format_output="topojson",
    niveau_agreg="DEPARTEMENT",
    provider="IGN",
    source="EXPRESS-COG-CARTO-TERRITOIRE",
    year=2022,
    dataset_family="ADMINEXPRESS",
    territory="metropole",
    crs=4326,
    simplification=0,
    dict_corresp=DICT_CORRESP_IGN
):
    """
    Merge the COMMUNE and ARRONDISSEMENT_MUNICIPAL layers, then split them.

    Four mapshaper commands are run: communes are filtered and given an
    INSEE_COG field, arrondissements are renamed to the same schema, both
    layers are merged into a COMMUNE_ARRONDISSEMENT layer, and the merged
    layer is split like in :func:`mapshaperize_split`.

    Intermediate GeoJSON files live in a temporary directory so that the
    returned output directory only contains the final split files.

    Parameters
    ----------
    local_dir : str, optional
        Directory holding ``COMMUNE`` and ``ARRONDISSEMENT_MUNICIPAL`` input
        files, by default "temp".
    extension_initial : str, optional
        Input file extension, by default "shp".
    format_output : str, optional
        Output format, also used as output extension, by default "topojson".
    niveau_agreg : str, optional
        Aggregation level used for the split; must be a key of
        ``dict_corresp``. By default "DEPARTEMENT".
    provider : str, optional
        Data provider, written in the SOURCE field, by default "IGN".
    source : str, optional
        Data source, written in the SOURCE field, by default
        "EXPRESS-COG-CARTO-TERRITOIRE".
    year : int, optional
        Accepted for signature compatibility; not used by this function.
    dataset_family : str, optional
        Accepted for signature compatibility; not used by this function.
    territory : str, optional
        Accepted for signature compatibility; not used by this function.
    crs : int, optional
        EPSG code passed to ``-proj``, by default 4326.
    simplification : int or None, optional
        Simplification percentage; 0 or None disables ``-simplify``.
    dict_corresp : dict, optional
        Correspondence between ``niveau_agreg`` values and the field names to
        split on. Only read, never modified.

    Returns
    -------
    str
        The directory the split files are written to.

    Raises
    ------
    KeyError
        If ``niveau_agreg`` is not a key of ``dict_corresp``.
    """
    # Resolve the split field first so a bad level fails before any command runs.
    split_field = dict_corresp[niveau_agreg]
    option_simplify = _simplify_option(simplification)
    output_path = _output_directory(
        local_dir, niveau_agreg, format_output, simplification
    )

    format_intermediate = "geojson"
    output_intermediate = (
        f"format={format_intermediate} extension=\".{format_intermediate}\""
    )

    with tempfile.TemporaryDirectory() as workdir:
        communes = f"{workdir}/communes_simples.{format_intermediate}"
        arrondissements = f"{workdir}/arrondissements.{format_intermediate}"
        merged = f"{workdir}/raw.{format_intermediate}"

        # PREPROCESS CITIES: drop the three communes listed in the filter
        # (presumably the ones replaced by their arrondissements - confirm).
        _run_mapshaper(
            f"mapshaper {local_dir}/COMMUNE.{extension_initial} name='COMMUNE' "
            f"-proj EPSG:{crs} "
            f"-filter '\"69123,13055,75056\".indexOf(INSEE_COM) > -1' invert "
            f"-each \"INSEE_COG=INSEE_COM\" "
            f"-o {communes} {output_intermediate} singles"
        )

        # PREPROCESS ARRONDISSEMENT: align field names with the communes layer.
        _run_mapshaper(
            f"mapshaper {local_dir}/ARRONDISSEMENT_MUNICIPAL.{extension_initial} name='ARRONDISSEMENT_MUNICIPAL' "
            f"-proj EPSG:{crs} "
            f"-rename-fields INSEE_COG=INSEE_ARM "
            f"-each 'INSEE_DEP=INSEE_COG.substr(0,2), STATUT=\"Arrondissement municipal\" ' "
            f"-o {arrondissements} {output_intermediate}"
        )

        # MERGE CITIES AND ARRONDISSEMENT
        _run_mapshaper(
            f"mapshaper {communes} {arrondissements} snap combine-files "
            f"-proj EPSG:{crs} "
            f"-rename-layers COMMUNE,ARRONDISSEMENT_MUNICIPAL "
            f"-merge-layers target=COMMUNE,ARRONDISSEMENT_MUNICIPAL force "
            f"-rename-layers COMMUNE_ARRONDISSEMENT "
            f"-o {merged} {output_intermediate}"
        )

        # TRANSFORM AS NEEDED
        _run_mapshaper(
            f"mapshaper {merged} "
            f"{option_simplify}"
            f"-proj EPSG:{crs} "
            f"-each \"SOURCE='{provider}:{source}'\" "
            f"-split {split_field} "
            f"-o {output_path} format={format_output} extension=\".{format_output}\" singles"
        )

    return output_path
import itertools

import pandas as pd


def restructure_nested_dict_borders(dict_with_list: dict):
    """
    Flatten a dictionary of lists into a list of ``[key, value]`` pairs.

    Parameters
    ----------
    dict_with_list : dict
        A dictionary whose values are iterables.

    Returns
    -------
    list
        One ``[key, value]`` list per value, in dictionary order.

    Examples
    --------
    >>> restructure_nested_dict_borders({'a': [1, 2, 3], 'b': [4, 5]})
    [['a', 1], ['a', 2], ['a', 3], ['b', 4], ['b', 5]]
    """
    return [
        [key, inner_value]
        for key, values in dict_with_list.items()
        for inner_value in values
    ]


def crossproduct_parameters_production(
    croisement_filter_by_borders: dict,
    list_format: list,
    years: list,
    crs_list: list,
    sources: list,
    simplifications: list
) -> pd.DataFrame:
    """
    Build a DataFrame holding the cross-product of the given parameters.

    Parameters
    ----------
    croisement_filter_by_borders : dict
        Maps each ``borders`` value to the list of ``filter_by`` values it is
        combined with.
    list_format : list
        Formats to combine.
    years : list
        Years to combine.
    crs_list : list
        CRS codes to combine.
    sources : list
        Sources to combine.
    simplifications : list
        Simplification levels to combine.

    Returns
    -------
    pd.DataFrame
        One row per combination, with columns ``format``, ``year``, ``crs``,
        ``source``, ``simplification``, ``borders`` and ``filter_by``.

    Examples
    --------
    >>> df = crossproduct_parameters_production(
    ...     {'COMMUNE': ['DEPARTEMENT', 'REGION']},
    ...     ['geojson', 'gpkg'], [2022], [4326, 2154], ['source1'], [0, 40]
    ... )
    >>> len(df)
    16
    """
    border_pairs = restructure_nested_dict_borders(croisement_filter_by_borders)

    # Unpack each (borders, filter_by) pair while iterating instead of storing
    # it in a temporary column; the iteration order of the product is unchanged.
    rows = [
        (fmt, year, crs, source, simplification, borders, filter_by)
        for fmt, (borders, filter_by), year, crs, source, simplification
        in itertools.product(
            list_format,
            border_pairs,
            years,
            crs_list,
            sources,
            simplifications
        )
    ]

    return pd.DataFrame(
        rows,
        columns=[
            "format", "year", "crs", "source", "simplification",
            "borders", "filter_by"
        ]
    )
import os

from cartiflette.config import BUCKET, PATH_WITHIN_BUCKET, FS
from cartiflette.utils import create_path_bucket
from cartiflette.mapshaper import mapshaperize_split, mapshaperize_split_merge
from .prepare_mapshaper import prepare_local_directory_mapshaper


def _resolve_settings(config):
    """Return the pipeline settings read from *config* with defaults applied."""
    return {
        # Rows built by crossproduct_parameters_production name this key
        # "format"; accept it as a fallback so the requested format is honoured.
        "format_output": config.get(
            "format_output", config.get("format", "topojson")
        ),
        "filter_by": config.get("filter_by", "DEPARTEMENT"),
        "borders": config.get("borders", "COMMUNE"),
        "provider": config.get("provider", "IGN"),
        "source": config.get("source", "EXPRESS-COG-CARTO-TERRITOIRE"),
        "year": config.get("year", 2022),
        "dataset_family": config.get("dataset_family", "ADMINEXPRESS"),
        "territory": config.get("territory", "metropole"),
        "crs": config.get("crs", 4326),
        "simplification": config.get("simplification", 0),
        "bucket": config.get("bucket", BUCKET),
        "path_within_bucket": config.get(
            "path_within_bucket", PATH_WITHIN_BUCKET
        ),
        "local_dir": config.get("local_dir", "temp"),
    }


def _upload_outputs(output_path, settings, borders, fs):
    """Upload every file found in *output_path* to its computed S3 path."""
    format_output = settings["format_output"]
    for filename in os.listdir(output_path):
        path_s3 = create_path_bucket(
            {
                "bucket": settings["bucket"],
                "path_within_bucket": settings["path_within_bucket"],
                "year": settings["year"],
                "borders": borders,
                "crs": settings["crs"],
                "filter_by": settings["filter_by"],
                # The file name without its extension is the split value.
                "value": filename.replace(f".{format_output}", ""),
                "vectorfile_format": format_output,
                "provider": settings["provider"],
                "dataset_family": settings["dataset_family"],
                "source": settings["source"],
                "territory": settings["territory"],
                "simplification": settings["simplification"]
            })
        fs.put(f"{output_path}/{filename}", path_s3)


def mapshaperize_split_from_s3(
    path_bucket,
    config,
    fs=FS
):
    """
    Download raw files, split them with mapshaper and upload the results.

    Parameters
    ----------
    path_bucket : str
        Remote directory holding the raw files.
    config : dict
        Optional keys: ``format_output`` (or ``format``), ``filter_by``,
        ``borders``, ``territory``, ``provider``, ``source``, ``year``,
        ``dataset_family``, ``crs``, ``simplification``, ``bucket``,
        ``path_within_bucket`` and ``local_dir``. Missing keys fall back to
        the defaults visible in ``_resolve_settings``.
    fs : FileSystem, optional
        File system object used for downloads and uploads, by default FS.

    Returns
    -------
    str
        The local directory containing the split files.
    """
    settings = _resolve_settings(config)

    prepare_local_directory_mapshaper(
        path_bucket,
        borders=settings["borders"],
        niveau_agreg=settings["filter_by"],
        format_output=settings["format_output"],
        simplification=settings["simplification"],
        local_dir=settings["local_dir"],
        fs=fs
    )

    output_path = mapshaperize_split(
        local_dir=settings["local_dir"],
        filename_initial=settings["borders"],
        extension_initial="shp",
        format_output=settings["format_output"],
        niveau_agreg=settings["filter_by"],
        provider=settings["provider"],
        source=settings["source"],
        year=settings["year"],
        dataset_family=settings["dataset_family"],
        territory=settings["territory"],
        crs=settings["crs"],
        simplification=settings["simplification"]
    )

    _upload_outputs(output_path, settings, settings["borders"], fs)

    return output_path


def mapshaperize_merge_split_from_s3(
    path_bucket,
    config,
    fs=FS
):
    """
    Same pipeline as ``mapshaperize_split_from_s3`` for the merged
    COMMUNE + ARRONDISSEMENT_MUNICIPAL layer.

    The ``borders`` entry of *config* is ignored: both raw layers are always
    downloaded and results are uploaded under ``COMMUNE_ARRONDISSEMENT``.

    Parameters
    ----------
    path_bucket : str
        Remote directory holding the raw files.
    config : dict
        Same optional keys as for ``mapshaperize_split_from_s3``.
    fs : FileSystem, optional
        File system object used for downloads and uploads, by default FS.

    Returns
    -------
    str
        The local directory containing the split files.
    """
    settings = _resolve_settings(config)

    local_paths = None
    for layer in ("COMMUNE", "ARRONDISSEMENT_MUNICIPAL"):
        local_paths = prepare_local_directory_mapshaper(
            path_bucket,
            borders=layer,
            niveau_agreg=settings["filter_by"],
            format_output=settings["format_output"],
            simplification=settings["simplification"],
            local_dir=settings["local_dir"],
            fs=fs
        )

    output_path = mapshaperize_split_merge(
        local_dir=settings["local_dir"],
        extension_initial="shp",
        format_output=settings["format_output"],
        niveau_agreg=settings["filter_by"],
        provider=settings["provider"],
        source=settings["source"],
        year=settings["year"],
        dataset_family=settings["dataset_family"],
        territory=settings["territory"],
        crs=settings["crs"],
        simplification=settings["simplification"]
    )
    if output_path is None:
        # Guard against a callee that returns nothing: os.listdir(None) would
        # silently list (and upload) the current working directory.
        output_path = local_paths["path_destination"]

    _upload_outputs(output_path, settings, "COMMUNE_ARRONDISSEMENT", fs)

    return output_path
import os

from cartiflette.config import FS
from cartiflette.s3 import list_raw_files_level, download_files_from_list


def prepare_local_directory_mapshaper(
    path_bucket,
    borders="COMMUNE",
    niveau_agreg="DEPARTEMENT",
    format_output="topojson",
    simplification=0,
    local_dir="temp",
    fs=FS,
):
    """
    Prepare the local directory for processing with Mapshaper.

    Creates ``local_dir``, downloads into it the raw files of the requested
    ``borders`` level found under ``path_bucket``, and creates the directory
    that will receive the processed files.

    Parameters
    ----------
    path_bucket : str
        The remote directory holding the raw files.
    borders : str, optional
        The level whose raw files are downloaded, by default "COMMUNE".
    niveau_agreg : str, optional
        The level of aggregation, used in the destination path, by default
        "DEPARTEMENT".
    format_output : str, optional
        The output format, used in the destination path, by default
        "topojson".
    simplification : int, optional
        The degree of simplification, used in the destination path, by
        default 0.
    local_dir : str, optional
        The local directory for file storage, by default "temp".
    fs : FileSystem, optional
        The file system object, by default FS.

    Returns
    -------
    dict
        ``{"path_origin": local_dir, "path_destination": <destination>}``.
    """
    os.makedirs(local_dir, exist_ok=True)

    # Get all raw shapefiles from Minio. local_dir must be forwarded,
    # otherwise the files always land in the helper's default "temp" folder.
    list_raw_files = list_raw_files_level(fs, path_bucket, borders=borders)
    download_files_from_list(fs, list_raw_files, local_dir=local_dir)

    local_path_destination = (
        f"{local_dir}/{niveau_agreg}/{format_output}/{simplification=}"
    )
    os.makedirs(local_path_destination, exist_ok=True)

    return {
        "path_origin": local_dir,
        "path_destination": local_path_destination
    }
def _basename(remote_path):
    """Return the last "/"-separated component of *remote_path*."""
    return remote_path.rsplit("/", maxsplit=1)[-1]


def list_raw_files_level(fs, path_bucket, borders):
    """
    List the raw files of a given level within a remote directory.

    Parameters
    ----------
    fs : FileSystem
        The file system object; only its ``ls`` method is used.
    path_bucket : str
        The remote directory to list.
    borders : str
        The level to keep: a file is kept when its name starts with
        ``"<borders>."``.

    Returns
    -------
    list
        The matching paths, in the order returned by ``fs.ls``.
    """
    prefix = f"{borders}."
    return [
        chemin for chemin in fs.ls(path_bucket)
        if _basename(chemin).startswith(prefix)
    ]


def download_files_from_list(fs, list_raw_files, local_dir="temp"):
    """
    Download a list of remote files into a local directory.

    Parameters
    ----------
    fs : FileSystem
        The file system object; only its ``download`` method is used.
    list_raw_files : list
        Remote paths to download.
    local_dir : str, optional
        The local directory receiving the files, by default "temp". It is
        created when missing.

    Returns
    -------
    str
        ``local_dir``, unchanged.
    """
    import os  # local import: this module does not import os at the top

    if local_dir:
        # Make the function usable on its own, without a prior makedirs.
        os.makedirs(local_dir, exist_ok=True)

    for remote_path in list_raw_files:
        fs.download(remote_path, f"{local_dir}/{_basename(remote_path)}")
    return local_dir
from cartiflette.download.download import _download_sources
from cartiflette.utils import create_path_bucket


def upload_s3_raw(
    provider="IGN",
    source="EXPRESS-COG-CARTO-TERRITOIRE",
    year=2022,
    dataset_family="ADMINEXPRESS",
    territory="metropole",
    borders="COMMUNE",
    path_within_bucket="test-download6",
    crs=4326,
    bucket="projet-cartiflette"
):
    """
    Upload raw data to S3 and return the remote directory holding it.

    Parameters
    ----------
    provider : str, optional
        The provider of the data, by default "IGN".
    source : str, optional
        The data source, by default "EXPRESS-COG-CARTO-TERRITOIRE".
    year : int, optional
        The year of the data, by default 2022.
    dataset_family : str, optional
        The dataset family, by default "ADMINEXPRESS".
    territory : str, optional
        The territory of the data, by default "metropole".
    borders : str, optional
        The level whose first reported path is used, by default "COMMUNE".
    path_within_bucket : str, optional
        The path within the S3 bucket, by default "test-download6".
    crs : int, optional
        Accepted but not read by this function; the fallback path is built
        with CRS 2154.
    bucket : str, optional
        The S3 bucket name, only used for the fallback path, by default
        "projet-cartiflette".

    Returns
    -------
    str
        The remote path with its last "/"-separated component removed.
    """
    download_report = _download_sources(
        upload=True,
        providers=provider,
        dataset_families=dataset_family,
        sources=source,
        territories=territory,
        years=year,
        path_within_bucket=path_within_bucket
    )
    entry = download_report[provider][dataset_family][source][territory][year]
    raw_paths = entry['paths']

    if raw_paths is not None:
        reference_file = raw_paths[borders][0]
    else:
        # No path was reported: rebuild the expected raw location instead.
        fallback_config = {
            "bucket": bucket,
            "path_within_bucket": path_within_bucket,
            "year": year,
            "borders": None,
            "crs": 2154,
            "filter_by": "origin",
            "value": "raw",
            "vectorfile_format": "shp",
            "provider": provider,
            "dataset_family": dataset_family,
            "source": source,
            "territory": territory,
            "filename": "COMMUNE.shp",
            "simplification": 0
        }
        reference_file = create_path_bucket(fallback_config)

    # Only the containing directory is of interest to callers.
    return reference_file.rsplit("/", maxsplit=1)[0]
build # bundle source code files +yes | npm install # install dependencies +yes yes | npm run build # bundle source code files sudo npm link # (optional) add global symlinks so scripts are available systemwide \ No newline at end of file diff --git a/misc/prototype_mapshaper.py b/misc/prototype_mapshaper.py index b0c297e3..bf53f4ff 100644 --- a/misc/prototype_mapshaper.py +++ b/misc/prototype_mapshaper.py @@ -1,209 +1,108 @@ -import s3fs -import os -import subprocess - -from cartiflette.download.download import _download_sources -from cartiflette.utils import create_path_bucket - -ENDPOINT_URL = "https://minio.lab.sspcloud.fr" - -fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": ENDPOINT_URL}) - - - -provider = "IGN" -source = "EXPRESS-COG-CARTO-TERRITOIRE", -dict_corresp = {"REGION": "INSEE_REG", "DEPARTEMENT": "INSEE_DEP"} -year = 2022 -provider = "IGN" -dataset_family = "ADMINEXPRESS" -source = "EXPRESS-COG-CARTO-TERRITOIRE" -territory = "metropole" -path_within_bucket = "test-download5" -crs = 4326 -bucket = "projet-cartiflette" - -borders="COMMUNE" #tempdf['borders'].iloc[0] -format_output="topojson" #tempdf['format'].iloc[0] -niveau_agreg="DEPARTEMENT"#tempdf['filter_by'].iloc[0] -simplification = 0 +from cartiflette.s3 import upload_s3_raw +from cartiflette.pipeline import crossproduct_parameters_production +from cartiflette.pipeline import mapshaperize_split_from_s3, mapshaperize_merge_split_from_s3 # DOWNLOAD ========================= - -x = _download_sources( - upload = True, - providers = provider, - dataset_families = dataset_family, - sources = source, - territories = territory, - years = year, - path_within_bucket = path_within_bucket -) - - -# path_manual = create_path_bucket( -# { -# "bucket": bucket, -# "path_within_bucket": path_within_bucket, -# "year": year, -# "borders": None, -# "crs": 2154, -# "filter_by": "origin", -# "value": "raw", -# "vectorfile_format": "shp", -# "provider": provider, -# "dataset_family": dataset_family, -# "source": 
source, -# "territory": territory, -# "filename": "COMMUNE.shp", -# } -# ) - -path = x['IGN']['ADMINEXPRESS']['EXPRESS-COG-CARTO-TERRITOIRE']['metropole'][2022]['paths']['COMMUNE'][0] -path_bucket = path.rsplit("/", maxsplit=1)[0] - - -def list_raw_files_level(fs, path_bucket, borders): - list_raw_files = fs.ls(f"{path_bucket}") - list_raw_files = [ - chemin for chemin in list_raw_files if chemin.rsplit("/", maxsplit=1)[-1].startswith(f'{borders}.') - ] - return list_raw_files +path_within_bucket = "test-download9" -def download_files_from_list(fs, list_raw_files): - for files in list_raw_files: - fs.download( - files, - "temp/" +\ - files.rsplit("/", maxsplit=1)[-1] - ) +path_bucket = upload_s3_raw(path_within_bucket=path_within_bucket) -os.mkdir("temp") - -list_raw_files = list_raw_files_level(fs, path_bucket, borders=borders) -download_files_from_list(fs, list_raw_files) - -os.makedirs(f"{niveau_agreg}/{format_output}/", exist_ok=True) - -simplification_percent = simplification if simplification is not None else 0 - -subprocess.run( - ( - f"mapshaper temp/{borders}.shp name='' -proj EPSG:{crs} " - f"-simplify {simplification_percent}% " - f"-each \"SOURCE='{provider}:{source[0]}'\" " - f"-split {dict_corresp[niveau_agreg]} " - f"-o {niveau_agreg}/{format_output}/ format={format_output} extension=\".{format_output}\" singles" - ), - shell=True +mapshaperize_split_from_s3( + path_bucket, + { + 'path_within_bucket': path_within_bucket, + "borders": "COMMUNE", + "filter_by": "REGION", + "simplification": 50 + } ) -bucket = bucket -path_within_bucket = path_within_bucket - -for values in os.listdir(f"{niveau_agreg}/{format_output}"): - path_s3 = create_path_bucket( - { - "bucket": bucket, - "path_within_bucket": path_within_bucket, - "year": year, - "borders": borders, - "crs": crs, - "filter_by": niveau_agreg, - "value": values.replace(f".{format_output}", ""), - "vectorfile_format": format_output, - "provider": provider, - "dataset_family": dataset_family, - 
"source": source, - "territory": territory, - "simplification": simplification - }) - fs.put(f"{niveau_agreg}/{format_output}/{values}", path_s3, recursive=True) - - -# OLD +mapshaperize_merge_split_from_s3( + path_bucket, + { + 'path_within_bucket': path_within_bucket, + "simplification": 50 + } +) croisement_decoupage_level = { ## structure -> niveau geo: [niveau decoupage macro], - "REGION": ["FRANCE_ENTIERE"], - "ARRONDISSEMENT_MUNICIPAL" : ['DEPARTEMENT'], - "COMMUNE_ARRONDISSEMENT": ["DEPARTEMENT", "REGION", "FRANCE_ENTIERE"], - "COMMUNE": ["DEPARTEMENT", "REGION", "FRANCE_ENTIERE"], - "DEPARTEMENT": ["REGION", "FRANCE_ENTIERE"] + # "REGION": ["FRANCE_ENTIERE"], + #"ARRONDISSEMENT_MUNICIPAL" : ['DEPARTEMENT'], + #"COMMUNE_ARRONDISSEMENT": ["DEPARTEMENT", "REGION"],# "FRANCE_ENTIERE"], + "COMMUNE": ["DEPARTEMENT", "REGION"],# "FRANCE_ENTIERE"], + "DEPARTEMENT": ["REGION"]#, "FRANCE_ENTIERE"] } #formats = ["geoparquet", "shp", "gpkg", "geojson"] -formats = ["topojson"] -#formats = ["geojson"] +formats = ["topojson", "geojson"] #years = [y for y in range(2021, 2023)] years = [2022] #crs_list = [4326, 2154, "official"] -crs_list = [4326] +crs_list = [4326, 2154] sources = ["EXPRESS-COG-CARTO-TERRITOIRE"] -#tempdf = s3.crossproduct_parameters_production( -# croisement_filter_by_borders=croisement_decoupage_level, -# list_format=formats, -# years=years, -# crs_list=crs_list, -# sources=sources, -# ) - -dict_corresp = {"REGION": "INSEE_REG", "DEPARTEMENT": "INSEE_DEP"} - - -bucket = "projet-cartiflette" -path_within_bucket = "diffusion/shapefiles-test2" -year=2022 -provider="IGN" -source='EXPRESS-COG-TERRITOIRE' -field="metropole" - -borders="COMMUNE" #tempdf['borders'].iloc[0] -format_output="topojson" #tempdf['format'].iloc[0] -niveau_agreg="DEPARTEMENT"#tempdf['filter_by'].iloc[0] +tempdf = crossproduct_parameters_production( + croisement_filter_by_borders=croisement_decoupage_level, + list_format=formats, + years=years, + crs_list=crs_list, + sources=sources, + 
simplifications=[0, 50] + ) -path_bucket = f"{bucket}/{path_within_bucket}/{year=}/raw/{provider=}/{source=}/{field=}" +for index, row in tempdf.iterrows(): + print(row) + mapshaperize_split_from_s3( + path_bucket, + { + **{'path_within_bucket': path_within_bucket}, + **row.to_dict() + } + ) -def list_raw_files_level(fs, path_bucket, borders): - list_raw_files = fs.ls(f"{path_bucket}") - list_raw_files = [ - chemin for chemin in list_raw_files if chemin.rsplit("/", maxsplit=1)[-1].startswith(f'{borders}.') - ] - return list_raw_files +# niveau commune_arrondissement +from cartiflette.config import FS +from cartiflette.pipeline.prepare_mapshaper import prepare_local_directory_mapshaper +from cartiflette.mapshaper import mapshaperize_split_merge + +local_dir = "temp/" + +format_intermediate = "geojson" +local_directories = prepare_local_directory_mapshaper( + path_bucket, + borders="COMMUNE", + niveau_agreg="DEPARTEMENT", + format_output="topojson", + simplification=0, + local_dir=local_dir, + fs=FS +) +local_directories = prepare_local_directory_mapshaper( + path_bucket, + borders="ARRONDISSEMENT_MUNICIPAL", + niveau_agreg="DEPARTEMENT", + format_output="topojson", + simplification=0, + local_dir=local_dir, + fs=FS +) -def download_files_from_list(fs, list_raw_files): - for files in list_raw_files: - fs.download( - files, - "temp/" +\ - files.rsplit("/", maxsplit=1)[-1] - ) -os.mkdir("temp") -list_raw_files = list_raw_files_level(fs, path_bucket, borders=borders) -download_files_from_list(fs, list_raw_files) -os.mkdir(niveau_agreg) -subprocess.run( - f"mapshaper temp/{borders}.shp name='' -proj wgs84 \ - -each \"SOURCE='{provider}:{source}'\"\ - -split {dict_corresp[niveau_agreg]} \ - -o '{niveau_agreg}/' format={format_output} extension=\".{format_output}\" singles", - shell=True -) +# A intégrer # topojson & niveau communal format_output="topojson" @@ -270,3 +169,78 @@ def download_files_from_list(fs, list_raw_files): shell=True ) + + +# old + +from 
cartiflette.config import ENDPOINT_URL +fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": ENDPOINT_URL}) + + + +provider = "IGN" +source = "EXPRESS-COG-CARTO-TERRITOIRE", +year = 2022 +provider = "IGN" +dataset_family = "ADMINEXPRESS" +source = "EXPRESS-COG-CARTO-TERRITOIRE" +territory = "metropole" +path_within_bucket = "test-download6" +crs = 4326 +bucket = "projet-cartiflette" + +dict_corresp = {"REGION": "INSEE_REG", "DEPARTEMENT": "INSEE_DEP"} + +borders="COMMUNE" #tempdf['borders'].iloc[0] +format_output="topojson" #tempdf['format'].iloc[0] +niveau_agreg="DEPARTEMENT"#tempdf['filter_by'].iloc[0] +simplification = 0 + + + + +bucket = "projet-cartiflette" +#path_within_bucket = "shapefiles-test2" +year=2022 +provider="IGN" +source='EXPRESS-COG-TERRITOIRE' +field="metropole" + +borders="COMMUNE" #tempdf['borders'].iloc[0] +format_output="topojson" #tempdf['format'].iloc[0] +niveau_agreg="DEPARTEMENT"#tempdf['filter_by'].iloc[0] + + +path_bucket = f"{bucket}/{path_within_bucket}/{year=}/raw/{provider=}/{source=}/{field=}" + + +def list_raw_files_level(fs, path_bucket, borders): + list_raw_files = fs.ls(f"{path_bucket}") + list_raw_files = [ + chemin for chemin in list_raw_files if chemin.rsplit("/", maxsplit=1)[-1].startswith(f'{borders}.') + ] + return list_raw_files + + +def download_files_from_list(fs, list_raw_files): + for files in list_raw_files: + fs.download( + files, + "temp/" +\ + files.rsplit("/", maxsplit=1)[-1] + ) + +os.mkdir("temp") +list_raw_files = list_raw_files_level(fs, path_bucket, borders=borders) +download_files_from_list(fs, list_raw_files) + +os.mkdir(niveau_agreg) + + +subprocess.run( + f"mapshaper temp/{borders}.shp name='' -proj wgs84 \ + -each \"SOURCE='{provider}:{source}'\"\ + -split {dict_corresp[niveau_agreg]} \ + -o '{niveau_agreg}/' format={format_output} extension=\".{format_output}\" singles", + shell=True +) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py new file mode 100644 index 00000000..a74f1a36 --- 
import pandas as pd
import pytest


from cartiflette.pipeline import restructure_nested_dict_borders, crossproduct_parameters_production


@pytest.mark.parametrize(
    "nested, expected",
    [
        (
            {'a': [1, 2, 3], 'b': [4, 5]},
            [['a', 1], ['a', 2], ['a', 3], ['b', 4], ['b', 5]],
        ),
        ({}, []),
        ({'a': [1]}, [['a', 1]]),
    ],
)
def test_restructure_nested_dict_borders(nested, expected):
    """Each (key, value) pair is emitted once, in dictionary order."""
    assert restructure_nested_dict_borders(nested) == expected


def test_crossproduct_parameters_production():
    """The cross-product yields one row per combination with flat columns."""
    result = crossproduct_parameters_production(
        croisement_filter_by_borders={"COMMUNE": ["DEPARTEMENT", "REGION"]},
        list_format=["topojson"],
        years=[2022],
        crs_list=[4326],
        sources=["EXPRESS-COG-CARTO-TERRITOIRE"],
        simplifications=[0, 50],
    )

    assert isinstance(result, pd.DataFrame)
    assert list(result.columns) == [
        "format", "year", "crs", "source", "simplification",
        "borders", "filter_by"
    ]
    assert len(result) == 4
    assert result["borders"].tolist() == ["COMMUNE"] * 4
    assert result["filter_by"].tolist() == [
        "DEPARTEMENT", "DEPARTEMENT", "REGION", "REGION"
    ]
    assert result["simplification"].tolist() == [0, 50, 0, 50]