diff --git a/.gitignore b/.gitignore
index 7fbcbea0..35dc79bb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -130,4 +130,6 @@ dmypy.json
*.sqlite
*.sqlite*
-/argo-pipeline/src/cartiflette-s3-cache
+**/cartiflette-s3-cache/*
+*.json
+/cartiflette/pipeline/cartiflette-s3-cache
diff --git a/Dockerfile b/Dockerfile
index 9863757b..73b6d1af 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -31,4 +31,6 @@ COPY docker/test.py .
RUN curl https://install.python-poetry.org/ | python -
RUN poetry install --only main --no-interaction
+# TODO : is this necessary? This should throw an exception if datasets have
+# not already been (manually) uploaded.
CMD ["python", "test.py"]
\ No newline at end of file
diff --git a/argo-pipeline/api.py b/argo-pipeline/api.py
index 4156cac8..859feaf2 100644
--- a/argo-pipeline/api.py
+++ b/argo-pipeline/api.py
@@ -1,10 +1,12 @@
"""A simple API to expose cartiflette files"""
+
import typing
from fastapi import FastAPI, Response
from fastapi.responses import FileResponse
from cartiflette.api import download_from_cartiflette_inner
-from cartiflette.config import PATH_WITHIN_BUCKET
+from cartiflette.config import PATH_WITHIN_BUCKET, DATASETS_HIGH_RESOLUTION
+from cartiflette.pipeline_constants import COG_TERRITOIRE # , IRIS
app = FastAPI(
title="API de récupération des fonds de carte avec cartiflette
",
@@ -42,9 +44,10 @@ def download_from_cartiflette_api(
year=2022,
crs=4326,
simplification=simplification,
- provider="IGN",
- dataset_family="ADMINEXPRESS",
- source="EXPRESS-COG-CARTO-TERRITOIRE",
+ provider="Cartiflette",
+ dataset_family="production",
+ # TODO : source can also be IRIS[DATASETS_HIGH_RESOLUTION]
+ source=COG_TERRITOIRE[DATASETS_HIGH_RESOLUTION],
return_as_json=False,
path_within_bucket=PATH_WITHIN_BUCKET,
)
diff --git a/argo-pipeline/pipeline.yaml b/argo-pipeline/pipeline.yaml
index 0a1b9a0e..e6ecbff5 100644
--- a/argo-pipeline/pipeline.yaml
+++ b/argo-pipeline/pipeline.yaml
@@ -24,257 +24,100 @@ spec:
# ---------------------------
dag:
+ failFast: false
tasks:
- # STEP 0: RETRIEVE IGN FILE AND DUPLICATE IN MINIO
- - name: duplicate-ign
- template: duplicate-ign
- - name: test-volume
- template: test-volume
- dependencies: [ duplicate-ign ]
- # STEP 1.1. SPLIT BY DEPARTEMENT
- - name: prepare-split-departement
- template: prepare-split
- dependencies: [ duplicate-ign ]
- arguments:
- parameters:
- - name: restrict_field
- value: "DEPARTEMENT"
- - name: split-departement
- template: split-dataset
- dependencies: [ prepare-split-departement ]
- arguments:
- parameters:
- - name: split_type
- value: "DEPARTEMENT"
- - name: format_output
- value: "{{item.format-output}}"
- - name: year
- value: "{{item.year}}"
- - name: crs
- value: "{{item.crs}}"
- - name: source
- value: "{{item.source}}"
- - name: simplification
- value: "{{item.simplification}}"
- - name: level_polygons
- value: "{{item.level-polygons}}"
- - name: filter_by
- value: "{{item.filter-by}}"
- withParam: "{{tasks.prepare-split-departement.outputs.result}}"
- # STEP 1.2. SPLIT BY COMMUNE
- - name: prepare-split-commune
- template: prepare-split
- dependencies: [ duplicate-ign ]
- arguments:
- parameters:
- - name: restrict_field
- value: "COMMUNE"
- - name: split-commune
- template: split-dataset
- dependencies: [ prepare-split-commune ]
- arguments:
- parameters:
- - name: split_type
- value: "COMMUNE"
- - name: format_output
- value: "{{item.format-output}}"
- - name: year
- value: "{{item.year}}"
- - name: crs
- value: "{{item.crs}}"
- - name: source
- value: "{{item.source}}"
- - name: simplification
- value: "{{item.simplification}}"
- - name: level_polygons
- value: "{{item.level-polygons}}"
- - name: filter_by
- value: "{{item.filter-by}}"
- withParam: "{{tasks.prepare-split-commune.outputs.result}}"
- # STEP 1.3. SPLIT BY REGION
- - name: prepare-split-region
- template: prepare-split
- dependencies: [ duplicate-ign ]
- arguments:
- parameters:
- - name: restrict_field
- value: "REGION"
- - name: split-region
- template: split-dataset
- dependencies: [ prepare-split-region ]
- arguments:
- parameters:
- - name: split_type
- value: "REGION"
- - name: format_output
- value: "{{item.format-output}}"
- - name: year
- value: "{{item.year}}"
- - name: crs
- value: "{{item.crs}}"
- - name: source
- value: "{{item.source}}"
- - name: simplification
- value: "{{item.simplification}}"
- - name: level_polygons
- value: "{{item.level-polygons}}"
- - name: filter_by
- value: "{{item.filter-by}}"
- withParam: "{{tasks.prepare-split-region.outputs.result}}"
- # STEP 1.4. SPLIT BY BASSIN VIE
- - name: prepare-split-bassin-vie
- template: prepare-split
- dependencies: [ duplicate-ign ]
- arguments:
- parameters:
- - name: restrict_field
- value: "BASSIN_VIE"
- - name: split-bassin-vie
- template: split-dataset
- dependencies: [ prepare-split-bassin-vie ]
- arguments:
- parameters:
- - name: split_type
- value: "BASSIN_VIE"
- - name: format_output
- value: "{{item.format-output}}"
- - name: year
- value: "{{item.year}}"
- - name: crs
- value: "{{item.crs}}"
- - name: source
- value: "{{item.source}}"
- - name: simplification
- value: "{{item.simplification}}"
- - name: level_polygons
- value: "{{item.level-polygons}}"
- - name: filter_by
- value: "{{item.filter-by}}"
- withParam: "{{tasks.prepare-split-bassin-vie.outputs.result}}"
- # STEP 1.5. SPLIT BY ZONE_EMPLOI
- - name: prepare-split-zone-emploi
- template: prepare-split
- dependencies: [ duplicate-ign ]
+
+ # TASK 0 : MOUNT VOLUMES AND CHECK PERMISSIONS
+ - name: init-and-test-volume
+ template: init-and-test-volume
+
+ # TASK 1 : RETRIEVE ALL (NEW) FILES FROM SOURCES AND UPLOAD TO MINIO
+ - name: download-all-sources
+ template: download-all-sources
+ dependencies: [ init-and-test-volume ]
+
+    # TASK 2 : CHECK WHICH VINTAGE SHOULD BE RE-PROCESSED FROM (NEW) RAW SOURCES
+ - name: select-downstream-vintage-to-process
+ template: select-downstream-vintage-to-process
+ dependencies: [ download-all-sources ]
arguments:
parameters:
- - name: restrict_field
- value: "ZONE_EMPLOI"
- - name: split-zone-emploi
- template: split-dataset
- dependencies: [ prepare-split-zone-emploi ]
+ - name: download_results
+ value: "{{tasks.download-all-sources.outputs.parameters.download_all_results}}"
+
+ # TASK 3.1 : CREATE BASE GEODATASETS ON MINIO FROM RAW TERRITORIAL FILES
+ - name: make-geodatasets
+ template: make-geodatasets
+ dependencies: [ select-downstream-vintage-to-process ]
arguments:
parameters:
- - name: split_type
- value: "ZONE_EMPLOI"
- - name: format_output
- value: "{{item.format-output}}"
- name: year
- value: "{{item.year}}"
- - name: crs
- value: "{{item.crs}}"
- - name: source
- value: "{{item.source}}"
- - name: simplification
- value: "{{item.simplification}}"
- - name: level_polygons
- value: "{{item.level-polygons}}"
- - name: filter_by
- value: "{{item.filter-by}}"
- withParam: "{{tasks.prepare-split-zone-emploi.outputs.result}}"
- # STEP 1.6. SPLIT BY UNITE_URBAINE
- - name: prepare-split-unite-urbaine
- template: prepare-split
- dependencies: [ duplicate-ign ]
- arguments:
- parameters:
- - name: restrict_field
- value: "UNITE_URBAINE"
- - name: split-unite-urbaine
- template: split-dataset
- dependencies: [ prepare-split-unite-urbaine ]
+ value: "{{item}}"
+ withParam: "{{tasks.select-downstream-vintage-to-process.outputs.parameters.geodatasets_vintage_to_update}}"
+
+ # TASK 3.2 : CREATE METADATA FILES ON MINIO FROM RAW INSEE FILES
+ - name: make-metadata
+ template: make-metadata
+ dependencies: [ select-downstream-vintage-to-process ]
arguments:
parameters:
- - name: split_type
- value: "UNITE_URBAINE"
- - name: format_output
- value: "{{item.format-output}}"
- - name: year
- value: "{{item.year}}"
- - name: crs
- value: "{{item.crs}}"
- - name: source
- value: "{{item.source}}"
- - name: simplification
- value: "{{item.simplification}}"
- - name: level_polygons
- value: "{{item.level-polygons}}"
- - name: filter_by
- value: "{{item.filter-by}}"
- withParam: "{{tasks.prepare-split-unite-urbaine.outputs.result}}"
- # STEP 1.6. SPLIT BY AIRE_ATTRACTION_VILLES
- - name: prepare-split-aire-attraction
- template: prepare-split
- dependencies: [ duplicate-ign ]
+ - name: years
+ value: "{{tasks.select-downstream-vintage-to-process.outputs.parameters.metadata_vintage_to_update}}"
+
+ # TASK 4: SELECT DOWNSTREAM YEARS TO GENERATE
+    # (FAN-OUT STEP TO ENSURE SCALABILITY AND KEEP RESULTS UNDER ARGO'S MAX LENGTH IN NEXT STEP)
+ - name: operationnal-selection-of-vintages-to-generate
+ template: operationnal-selection-of-vintages-to-generate
+ dependencies: [ make-geodatasets, make-metadata ]
arguments:
parameters:
- - name: restrict_field
- value: "AIRE_ATTRACTION_VILLES"
- - name: split-aire-attraction
- template: split-dataset
- dependencies: [ prepare-split-aire-attraction ]
+ - name: years_geodatasets
+ value: "{{tasks.make-geodatasets.outputs.parameters.updated_geodata}}"
+ - name: years_metadata
+ value: "{{tasks.make-metadata.outputs.parameters.updated_metadata}}"
+
+ # TASK 5 : TASK WITH 2 NESTED STEPS FOR
+ # step 5.1 selecting geodatasets to generate
+ # step 5.2 creating selected geodatasets
+ - name: generate-downstream-datasets
+ template: generate-downstream-datasets
+ dependencies: [ operationnal-selection-of-vintages-to-generate ]
arguments:
parameters:
- - name: split_type
- value: "AIRE_ATTRACTION_VILLES"
- - name: format_output
- value: "{{item.format-output}}"
- name: year
- value: "{{item.year}}"
- - name: crs
- value: "{{item.crs}}"
- - name: source
- value: "{{item.source}}"
- - name: simplification
- value: "{{item.simplification}}"
- - name: level_polygons
- value: "{{item.level-polygons}}"
- - name: filter_by
- value: "{{item.filter-by}}"
- withParam: "{{tasks.prepare-split-aire-attraction.outputs.result}}"
-
+ value: "{{item}}"
+ withParam: "{{tasks.operationnal-selection-of-vintages-to-generate.outputs.parameters.years}}"
+
+ # TASK 6 : GENERATE CATALOG
+ - name: make-catalog
+ template: make-catalog
+ dependencies: [generate-downstream-datasets]
# --------------------------
# TEMPLATES DEFINITION
# ---------------------------
- # First step: retrieving and duplicating IGN tiles ------------------
- - name: duplicate-ign
+ - name: init-and-test-volume
inputs:
artifacts:
- name: code
path: /mnt/bin
git:
repo: https://github.com/inseefrlab/cartiflette
- revision: "main"
+ revision: "feat/refacto_pipeline_first_steps"
container:
- image: inseefrlab/cartiflette
+ image: inseefrlab/cartiflette:latest
command: [sh, -c]
- args: ["
- mkdir -p $LOCAL_DATA_PATH ;
- mkdir -p /mnt/bin/src ;
+ args: ["mkdir -p /mnt/bin/src ;
mv /mnt/bin/argo-pipeline/src/* /mnt/bin/src ;
- python /mnt/bin/src/duplicate_in_bucket.py --path $PATH_WRITING_S3 --localpath $LOCAL_DATA_PATH ;
+ echo $ENVIRONMENT;
"]
volumeMounts:
- name: volume-workflow-tmp
mountPath: /mnt
env: &env_parameters
- - name: PATH_WRITING_S3
- value: "production"
- name: PYTHONPATH
value: "${PYTHONPATH}:/mnt/bin"
- - name: LOCAL_DATA_PATH
- value: "/mnt/data"
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
@@ -291,62 +134,182 @@ spec:
value: minio.lab.sspcloud.fr
- name: MC_HOST_s3
value: https://$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY@$AWS_S3_ENDPOINT
+ - name: ENVIRONMENT
+        # set value to "test" to simplify pipeline execution (2 years, only topojson, etc.); otherwise use "preprod" or "prod"
+ # -> this will also configure the path_within_bucket constant
+ value: test
+
+ - name: download-all-sources
+ outputs:
+ parameters:
+ - name: download_all_results
+ valueFrom:
+ path: download_all_results.json
+ container:
+ image: inseefrlab/cartiflette:latest
+ command: [sh, -c]
+ args: ["
+ python /mnt/bin/src/download_all_sources.py;
+ "]
+ volumeMounts:
+ - name: volume-workflow-tmp
+ mountPath: /mnt
+ env: *env_parameters
+
+ - name: select-downstream-vintage-to-process
+ inputs:
+ parameters:
+ - name: download_results
+ outputs:
+ parameters:
+ - name: geodatasets_vintage_to_update
+ valueFrom:
+ path: geodatasets_years.json
+ - name: metadata_vintage_to_update
+ valueFrom:
+ path: metadata_years.json
+ container:
+ image: inseefrlab/cartiflette:latest
+ command: [sh, -c]
+ volumeMounts:
+ - name: volume-workflow-tmp
+ mountPath: /mnt
+ args: ["
+ python /mnt/bin/src/select_downstream_vintage_to_process.py --download_results '{{inputs.parameters.download_results}}'
+ "]
+ env: *env_parameters
- - name: test-volume
+ - name: make-geodatasets
+ inputs:
+ parameters:
+ - name: year
+ outputs:
+ parameters:
+ - name: updated_geodata
+ valueFrom:
+ path: "geodataset_years/{{inputs.parameters.year}}.json"
container:
- image: inseefrlab/cartiflette
+ image: inseefrlab/cartiflette:latest
command: [sh, -c]
- args: ["echo $PATH_WRITING_S3 ;
- head -n 1 ${LOCAL_DATA_PATH}/tagc.csv"]
+ volumeMounts:
+ - name: volume-workflow-tmp
+ mountPath: /mnt
+ args: ["
+ python /mnt/bin/src/make_geodata_datasets.py --year '{{inputs.parameters.year}}';
+ "]
env: *env_parameters
+
+ - name: make-metadata
+ inputs:
+ parameters:
+ - name: years
+ outputs:
+ parameters:
+ - name: updated_metadata
+ valueFrom:
+ path: metadata_years.json
+ container:
+ image: inseefrlab/cartiflette:latest
+ command: [sh, -c]
volumeMounts:
- name: volume-workflow-tmp
mountPath: /mnt
+ args: ["
+ python /mnt/bin/src/make_metadata_datasets.py --years '{{inputs.parameters.years}}';
+ "]
+ env: *env_parameters
- # Step 2: creating template task for splitting ------------------
- - name: prepare-split
+ - name: operationnal-selection-of-vintages-to-generate
inputs:
parameters:
- - name: restrict_field
+ - name: years_geodatasets
+ - name: years_metadata
+ outputs:
+ parameters:
+ - name: years
+ valueFrom:
+ path: vintages_operationnal_generation.json
container:
- image: inseefrlab/cartiflette
+ image: inseefrlab/cartiflette:latest
command: [sh, -c]
volumeMounts:
- name: volume-workflow-tmp
mountPath: /mnt
+ env: *env_parameters
args: ["
- python /mnt/bin/src/crossproduct.py --restrictfield '{{inputs.parameters.restrict_field}}'
+ python /mnt/bin/src/filter_vintages_operationnal.py --years-geodatasets '{{inputs.parameters.years_geodatasets}}' --years-metadata '{{inputs.parameters.years_metadata}}';
"]
- - name: split-dataset
+  - name: generate-downstream-datasets
+ inputs:
+ parameters:
+ - name: year
+ steps:
+ - - name: select-configs-for-generation
+ template: select-configs-for-generation
+ arguments:
+ parameters:
+ - name: year
+ value: "{{inputs.parameters.year}}"
+ - - name: generate-datasets
+ template: generate-datasets
+ arguments:
+ parameters:
+ - name: year
+ value: "{{inputs.parameters.year}}"
+ - name: config_generation
+ value: "{{steps.select-configs-for-generation.outputs.parameters.configs}}"
+
+ - name: select-configs-for-generation
+ inputs:
+ parameters:
+ - name: year
+ outputs:
+ parameters:
+ - name: configs
+ valueFrom:
+ path: "configs_datasets_to_generate/{{ inputs.parameters.year }}.json"
+ container:
+ image: inseefrlab/cartiflette:latest
+ command: [sh, -c]
+ volumeMounts:
+ - name: volume-workflow-tmp
+ mountPath: /mnt
+ env: *env_parameters
+ args: ["
+ python /mnt/bin/src/crossproduct.py --year '{{inputs.parameters.year}}';
+ "]
+
+ - name: generate-datasets
inputs:
parameters:
- - name: split_type
- - name: format_output
- name: year
- - name: crs
- - name: source
- - name: simplification
- - name: level_polygons
- - name: filter_by
+ - name: config_generation
+ outputs:
+ parameters:
+ - name: result
+ valueFrom:
+ path: "generation/{{ inputs.parameters.year }}/result.json"
container:
- image: inseefrlab/cartiflette
+ image: inseefrlab/cartiflette:latest
command: ["sh", "-c"]
args: ["
- mkdir -p temp/ && cp /mnt/data/tagc.csv temp/tagc.csv ;
- python /mnt/bin/src/split_merge_tiles.py \
- --path $PATH_WRITING_S3 \
- --format_output {{inputs.parameters.format_output}} \
- --year {{inputs.parameters.year}} \
- --crs {{inputs.parameters.crs}} \
- --source {{inputs.parameters.source}} \
- --simplification {{inputs.parameters.simplification}} \
- --level_polygons {{inputs.parameters.level_polygons}} \
- --filter_by {{inputs.parameters.filter_by}}"
+ python /mnt/bin/src/split_merge_tiles_multithreading.py \
+ --year '{{inputs.parameters.year}}' \
+ --configs '{{inputs.parameters.config_generation}}'"
]
volumeMounts:
- name: volume-workflow-tmp
mountPath: /mnt
env: *env_parameters
-
+
+ - name: make-catalog
+ container:
+ image: inseefrlab/cartiflette:latest
+ command: ["sh", "-c"]
+ args: ["python /mnt/bin/src/catalog.py"]
+ volumeMounts:
+ - name: volume-workflow-tmp
+ mountPath: /mnt
+ env: *env_parameters
diff --git a/argo-pipeline/src/catalog.py b/argo-pipeline/src/catalog.py
new file mode 100644
index 00000000..1b91bfec
--- /dev/null
+++ b/argo-pipeline/src/catalog.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Create cartiflette's catalog
+"""
+
+import logging
+
+from s3fs import S3FileSystem
+
+from cartiflette.config import (
+ BUCKET,
+ PATH_WITHIN_BUCKET,
+ FS,
+)
+from cartiflette.s3 import make_s3_inventory
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+logger.info("=" * 50)
+logger.info("\n%s", __doc__)
+logger.info("=" * 50)
+
+# Nota : no parser needed for this command
+
+
+def main(
+ bucket: str = BUCKET,
+ path_within_bucket: str = PATH_WITHIN_BUCKET,
+ fs: S3FileSystem = FS,
+):
+
+ make_s3_inventory(
+ fs=fs, bucket=bucket, path_within_bucket=path_within_bucket
+ )
+
+ logger.info("Success!")
+
+
+if __name__ == "__main__":
+ main(
+ bucket=BUCKET,
+ path_within_bucket=PATH_WITHIN_BUCKET,
+ fs=FS,
+ )
diff --git a/argo-pipeline/src/crossproduct.py b/argo-pipeline/src/crossproduct.py
index 26da9839..cb7b49da 100644
--- a/argo-pipeline/src/crossproduct.py
+++ b/argo-pipeline/src/crossproduct.py
@@ -1,67 +1,126 @@
-import json
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Step 4.2 of pipeline
+
+Prepare arguments for next step
+"""
+
import argparse
+import json
+import logging
+import os
+from typing import List
+
+from s3fs import S3FileSystem
+
from cartiflette.pipeline import crossproduct_parameters_production
+from cartiflette.config import (
+ BUCKET,
+ PATH_WITHIN_BUCKET,
+ FS,
+)
+from cartiflette.pipeline_constants import (
+ PIPELINE_SIMPLIFICATION_LEVELS,
+ # PIPELINE_FORMATS,
+ # PIPELINE_CRS,
+)
+
+logger = logging.getLogger(__name__)
+
+logging.basicConfig(level=logging.INFO)
+
+logger.info("=" * 50)
+logger.info("\n" + __doc__)
+logger.info("=" * 50)
+
parser = argparse.ArgumentParser(description="Crossproduct Script")
+
parser.add_argument(
- "--restrictfield", type=str, default=None, help="Field to restrict level-polygons"
+ "-y",
+ "--year",
+ default="2023",
+ help="Filter downstream vintage to process",
)
+# parser.add_argument(
+# "-f",
+# "--formats",
+# default=",".join(PIPELINE_FORMATS),
+# help="Desired output formats, as a comma separated values list",
+# )
+
+# parser.add_argument(
+# "-c",
+# "--crs",
+# default=",".join([str(x) for x in PIPELINE_CRS]),
+# help="Desired projections as EPSG codes, as a comma separated values list",
+# )
+
+parser.add_argument(
+ "-s",
+ "--simplifications",
+ default=",".join([str(x) for x in PIPELINE_SIMPLIFICATION_LEVELS]),
+ help="Desired simplifications levels, as a comma separated values list",
+)
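+
+# Example invocation (argument values are illustrative; in the Argo pipeline
+# the "select-configs-for-generation" template only passes --year):
+#   python crossproduct.py --year 2023 --simplifications 40
+# The computed configs are also dumped to configs_datasets_to_generate/<year>.json
+# for consumption by the "generate-datasets" template.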
-# parameters
-formats = ["topojson", "geojson"]
-years = [2022]
-crs_list = [4326]
-sources = ["EXPRESS-COG-CARTO-TERRITOIRE"]
-
-croisement_decoupage_level = {
- # structure -> niveau geo: [niveau decoupage macro],
- "COMMUNE": [
- "BASSIN_VIE",
- "ZONE_EMPLOI",
- "UNITE_URBAINE",
- "AIRE_ATTRACTION_VILLES", # zonages d'études
- "DEPARTEMENT",
- "REGION", # zonages administratifs
- "TERRITOIRE",
- "FRANCE_ENTIERE",
- "FRANCE_ENTIERE_DROM_RAPPROCHES",
- ],
- "DEPARTEMENT": [
- "REGION",
- "TERRITOIRE",
- "FRANCE_ENTIERE",
- "FRANCE_ENTIERE_DROM_RAPPROCHES",
- ],
- "REGION": ["TERRITOIRE", "FRANCE_ENTIERE", "FRANCE_ENTIERE_DROM_RAPPROCHES"],
- "BASSIN_VIE": ["TERRITOIRE", "FRANCE_ENTIERE", "FRANCE_ENTIERE_DROM_RAPPROCHES"],
- "ZONE_EMPLOI": ["TERRITOIRE", "FRANCE_ENTIERE", "FRANCE_ENTIERE_DROM_RAPPROCHES"],
- "UNITE_URBAINE": ["TERRITOIRE", "FRANCE_ENTIERE", "FRANCE_ENTIERE_DROM_RAPPROCHES"],
- "AIRE_ATTRACTION_VILLES": ["TERRITOIRE", "FRANCE_ENTIERE", "FRANCE_ENTIERE_DROM_RAPPROCHES"],
-}
args = parser.parse_args()
+year = args.year
+# formats = args.formats.split(",")
+# crs = args.crs.split(",")
+simplifications = args.simplifications.split(",")
+
+
+# TODO : convert bucket & path_within_bucket to parsable arguments
-def main():
- tempdf = crossproduct_parameters_production(
- croisement_filter_by_borders=croisement_decoupage_level,
- list_format=formats,
- years=years,
- crs_list=crs_list,
- sources=sources,
- simplifications=[0, 50],
+
+def main(
+ year: int = None,
+ simplifications: List[str] = None,
+ formats: List[str] = None,
+ crs: List[int] = None,
+ bucket: str = BUCKET,
+ path_within_bucket: str = PATH_WITHIN_BUCKET,
+ fs: S3FileSystem = FS,
+):
+
+ simplifications = (
+ simplifications if simplifications else PIPELINE_SIMPLIFICATION_LEVELS
+ )
+
+ logger.info("Crossproduct with year=%s", year)
+ logger.info("Crossproduct with simplifications=%s", simplifications)
+ logger.info("Crossproduct with formats=%s", formats)
+ logger.info("Crossproduct with crs=%s", crs)
+
+ configs = crossproduct_parameters_production(
+ # list_format=formats,
+ year=year,
+ # crs_list=crs,
+ simplifications=simplifications,
+ fs=fs,
+ bucket=bucket,
+ path_within_bucket=path_within_bucket,
)
- tempdf.columns = tempdf.columns.str.replace("_", "-")
- # Apply filtering if restrictfield is provided
- if args.restrictfield:
- tempdf = tempdf.loc[tempdf["level-polygons"] == args.restrictfield]
+ try:
+ os.makedirs("configs_datasets_to_generate")
+ except FileExistsError:
+ pass
- output = tempdf.to_json(orient="records")
- parsed = json.loads(output)
- print(json.dumps(parsed))
+ with open(f"configs_datasets_to_generate/{year}.json", "w") as out:
+ json.dump(configs, out)
+ return configs
if __name__ == "__main__":
- main()
+ configs = main(
+ year=year,
+ simplifications=simplifications,
+ bucket=BUCKET,
+ path_within_bucket=PATH_WITHIN_BUCKET,
+ fs=FS,
+ )
diff --git a/argo-pipeline/src/download_all_sources.py b/argo-pipeline/src/download_all_sources.py
new file mode 100644
index 00000000..8bbc61ce
--- /dev/null
+++ b/argo-pipeline/src/download_all_sources.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+First step of pipeline
+
+Performs a full pipeline to download data and store them on MinIO. The
+target files are described in cartiflette/constants.py under the
+constant PIPELINE_DOWNLOAD_ARGS. Those files' characteristics must also be
+described in the cartiflette/utils/sources.yaml file.
+
+Note: for easier debugging, set cartiflette.config.THREADS_DOWNLOAD to 1
+(this deactivates multithreading, which can be painful to debug).
+
+During the operation:
+    * GIS files should be reprojected to 4326 if the current projection has
+      no EPSG code
+    * each file should be re-encoded in UTF-8
+    * invalid geometries will be tentatively fixed using a 0 buffer
+
+"""
+
+import argparse
+from datetime import date
+import logging
+import os
+import json
+
+from cartiflette.config import BUCKET, PATH_WITHIN_BUCKET, FS
+from cartiflette.pipeline import download_all
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+logger.info("=" * 50)
+logger.info("\n" + __doc__)
+logger.info("=" * 50)
+
+
+# Initialize ArgumentParser
+parser = argparse.ArgumentParser(
+ description="Run Cartiflette pipeline download script."
+)
+
+default_years = ",".join(str(x) for x in range(2020, date.today().year + 1))
+parser.add_argument(
+ "--years",
+ type=str,
+ help="List of years to perform download on (as comma separated values)",
+ default=default_years,
+)
+
+parser.add_argument(
+ "--skip",
+ action="store_true",
+ help="Skip download for speeding debugging purposes",
+)
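+
+# Example invocations (illustrative; the Argo "download-all-sources" template
+# runs this script without arguments and relies on the defaults):
+#   python download_all_sources.py --years 2023,2024
+#   python download_all_sources.py --skip   # bypass downloads while debugging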
+
+# Parse arguments
+args = parser.parse_args()
+
+bucket = BUCKET
+years = args.years
+skip = args.skip
+
+if os.environ.get("ENVIRONMENT", None) == "test":
+ logging.warning(
+ "test environment -> restrict download to 2023 & 2024 only"
+ )
+ years = "2023,2024"
+
+if years:
+ years = [int(x) for x in years.split(",")]
+
+fs = FS
+
+
+try:
+ if not skip:
+ results = download_all(
+ bucket, PATH_WITHIN_BUCKET, fs=fs, upload=True, years=years
+ )
+ else:
+ results = dict()
+ logger.warning(
+ "\n\n!!!! Download skipped !!!\n\n"
+ "To reset download, remove --skip flag from pipeline yaml (from "
+ "download-all-sources template)!"
+ )
+
+ with open("download_all_results.json", "w") as out:
+ json.dump(results, out)
+except Exception:
+ try:
+ os.unlink("download_all_results.json")
+ except FileNotFoundError:
+ pass
+ raise
+
+logger.info(results)
diff --git a/argo-pipeline/src/duplicate_in_bucket.py b/argo-pipeline/src/duplicate_in_bucket.py
deleted file mode 100644
index 1853ff99..00000000
--- a/argo-pipeline/src/duplicate_in_bucket.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import argparse
-import os
-
-from cartiflette.config import BUCKET, PATH_WITHIN_BUCKET, FS
-from cartiflette.utils import create_path_bucket
-from cartiflette.pipeline.combine_adminexpress_france import (
- combine_adminexpress_territory,
-)
-from cartiflette.pipeline.prepare_cog_metadata import prepare_cog_metadata
-
-# Initialize ArgumentParser
-parser = argparse.ArgumentParser(description="Run Cartiflette pipeline script.")
-parser.add_argument(
- "-p", "--path", help="Path within bucket", default=PATH_WITHIN_BUCKET
-)
-parser.add_argument(
- "-lp", "--localpath", help="Path within bucket", default="temp"
-)
-
-# Parse arguments
-args = parser.parse_args()
-
-bucket = BUCKET
-path_within_bucket = args.path
-local_path = args.localpath
-
-year = 2022
-fs = FS
-
-os.makedirs(local_path, exist_ok=True)
-
-# PART 1/ COMBINE RAW FILES TOGETHER AND WRITE TO S3
-
-
-def main(path_within_bucket, localpath, bucket=BUCKET, year=year):
-
- path_combined_files = combine_adminexpress_territory(
- path_within_bucket=path_within_bucket,
- intermediate_dir=localpath
- )
-
- path_raw_s3 = create_path_bucket(
- {
- "bucket": bucket,
- "path_within_bucket": path_within_bucket,
- "year": year,
- "borders": "france",
- "crs": 4326,
- "filter_by": "preprocessed",
- "value": "before_cog",
- "vectorfile_format": "geojson",
- "provider": "IGN",
- "dataset_family": "ADMINEXPRESS",
- "source": "EXPRESS-COG-CARTO-TERRITOIRE",
- "territory": "france",
- "filename": "raw.geojson",
- "simplification": 0,
- }
- )
-
- fs.put_file(path_combined_files, path_raw_s3)
-
- # Retrieve COG metadata
- tagc_metadata = prepare_cog_metadata(
- path_within_bucket, local_dir=localpath)
- tagc_metadata.drop(columns=["LIBGEO"]).to_csv(f"{localpath}/tagc.csv")
-
- data = {"preprocessed": path_combined_files, "metadata": f"{localpath}/tagc.csv"}
-
- return data
-
-
-if __name__ == "__main__":
- main(path_within_bucket, localpath=local_path)
diff --git a/argo-pipeline/src/filter_vintages_operationnal.py b/argo-pipeline/src/filter_vintages_operationnal.py
new file mode 100644
index 00000000..f7daf740
--- /dev/null
+++ b/argo-pipeline/src/filter_vintages_operationnal.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Step 4.1 of pipeline
+
+Filter years for which geodata OR metadata have been successfully generated.
+This is a dummy task in the DAG, only used to force a fan-out step, which
+should ensure the next step does not exceed Argo's 262,144-character limit on
+outputs/inputs.
+"""
+
+import argparse
+import json
+import os
+import logging
+
+from cartiflette.config import (
+ BUCKET,
+ PATH_WITHIN_BUCKET,
+ FS,
+)
+
+logger = logging.getLogger(__name__)
+
+logger.info("=" * 50)
+logger.info("\n" + __doc__)
+logger.info("=" * 50)
+
+
+parser = argparse.ArgumentParser(description="Crossproduct Script")
+
+parser.add_argument(
+ "-yg",
+ "--years-geodatasets",
+ default=r'["{\"2023\": true}"]',
+ help="Updated geodataset's vintages",
+)
+
+parser.add_argument(
+ "-ym",
+ "--years-metadata",
+ default="[2023]",
+ help="Updated metadata's vintages",
+)
+
+args = parser.parse_args()
+
+years_geodatasets = [json.loads(x) for x in json.loads(args.years_geodatasets)]
+years_geodatasets = {
+ int(year)
+ for d in years_geodatasets
+ for (year, result) in d.items()
+ if result
+}
+
+years_metadata = {int(x) for x in json.loads(args.years_metadata)}
+
+years = sorted(list(years_geodatasets | years_metadata))
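+
+# Worked example (hypothetical argument values):
+#   --years-geodatasets '["{\"2023\": true}", "{\"2024\": false}"]'
+#   --years-metadata '[2024]'
+# gives years_geodatasets == {2023} and years_metadata == {2024},
+# hence years == [2023, 2024].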
+
+if os.environ.get("ENVIRONMENT", None) == "test":
+ logging.warning("test environment -> restrict generation to 2023, 2024 ")
+ years = [2023, 2024]
+
+logger.info(
+ "selected downstream years for operationnal generation of datasets : %s",
+ years,
+)
+
+with open("vintages_operationnal_generation.json", "w") as out:
+ json.dump(years, out)
diff --git a/argo-pipeline/src/make_geodata_datasets.py b/argo-pipeline/src/make_geodata_datasets.py
new file mode 100644
index 00000000..77bc38ce
--- /dev/null
+++ b/argo-pipeline/src/make_geodata_datasets.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+3rd step of pipeline - part 1
+
+Retrieve each territory's raw geodata files and merge them into single
+files for each vintage.
+"""
+
+import argparse
+import json
+import logging
+import os
+from typing import List
+
+from cartiflette.config import (
+ BUCKET,
+ PATH_WITHIN_BUCKET,
+ FS,
+ INTERMEDIATE_FORMAT,
+)
+from cartiflette.pipeline_constants import PIPELINE_SIMPLIFICATION_LEVELS
+from cartiflette.pipeline.prepare_geodatasets import (
+ create_one_year_geodataset_batch,
+)
+
+logging.basicConfig(level=logging.INFO)
+
+logger = logging.getLogger(__name__)
+
+
+logger.info("=" * 50)
+logger.info("\n" + __doc__)
+logger.info("=" * 50)
+
+# Initialize ArgumentParser
+parser = argparse.ArgumentParser(
+ description="Preprocess geodatasets from raw sources"
+)
+parser.add_argument(
+ "-y", "--year", help="Vintage to perform computation on", default="2023"
+)
+
+parser.add_argument(
+ "-s",
+ "--simplify",
+ help="Simplifications levels to perform",
+ default=PIPELINE_SIMPLIFICATION_LEVELS,
+)
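+
+# Example invocation, as issued by the "make-geodatasets" Argo template:
+#   python make_geodata_datasets.py --year 2023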
+
+# Parse arguments
+args = parser.parse_args()
+year = args.year
+simplifications = args.simplify
+
+bucket = BUCKET
+fs = FS
+
+
+def main(
+ simplifications: List[int],
+ bucket=BUCKET,
+ year: int = None,
+ path_within_bucket: str = PATH_WITHIN_BUCKET,
+):
+
+ created = create_one_year_geodataset_batch(
+ year,
+ format_output=INTERMEDIATE_FORMAT,
+ simplifications_values=simplifications,
+ bucket=bucket,
+ path_within_bucket=path_within_bucket,
+ fs=fs,
+ )
+
+ try:
+ os.makedirs("geodataset_years")
+ except FileExistsError:
+ pass
+
+ with open(f"geodataset_years/{year}.json", "w") as out:
+ json.dump(created, out)
+
+ return created
+
+
+if __name__ == "__main__":
+ data = main(simplifications=simplifications, year=year)
diff --git a/argo-pipeline/src/make_metadata_datasets.py b/argo-pipeline/src/make_metadata_datasets.py
new file mode 100644
index 00000000..bd0079c1
--- /dev/null
+++ b/argo-pipeline/src/make_metadata_datasets.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+3rd step of pipeline - part 2
+
+Update/create vintaged metadata files and send those to S3
+"""
+
+import argparse
+import json
+import logging
+import os
+import tempfile
+
+from cartiflette.config import BUCKET, PATH_WITHIN_BUCKET, FS
+from cartiflette.utils import create_path_bucket
+from cartiflette.pipeline.prepare_cog_metadata import prepare_cog_metadata
+
+
+logging.basicConfig(level=logging.INFO)
+
+logger = logging.getLogger(__name__)
+
+logger.info("=" * 50)
+logger.info("\n%s", __doc__)
+logger.info("=" * 50)
+
+# Initialize ArgumentParser
+parser = argparse.ArgumentParser(
+ description="Preprocess metadata from raw sources"
+)
+
+parser.add_argument(
+ "-y", "--years", help="Vintage to perform computation on", default="[]"
+)
+
+# Parse arguments
+args = parser.parse_args()
+
+bucket = BUCKET
+years = args.years
+
+years = json.loads(years)
+
+fs = FS
+
+
+def main(
+ path_within_bucket: str = PATH_WITHIN_BUCKET,
+ bucket: str = BUCKET,
+ years: int = None,
+):
+
+ created = []
+
+ with tempfile.TemporaryDirectory() as tempdir:
+ for year in years:
+ logger.info("-" * 50)
+ logger.info("Computing metadata for year=%s", year)
+ logger.info("-" * 50)
+
+ config = {
+ "bucket": bucket,
+ "path_within_bucket": path_within_bucket,
+ "year": year,
+ "crs": None,
+ "filter_by": "preprocessed",
+ "value": "tagc",
+ "vectorfile_format": "csv",
+ "provider": "Cartiflette",
+ "dataset_family": "metadata",
+ "source": "INSEE",
+ "territory": "france",
+ "filename": "metadata.csv",
+ "simplification": 0,
+ }
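+
+            # Descriptive note: this config is passed to create_path_bucket
+            # below to build a partitioned S3 key (key=value path segments,
+            # e.g. provider=Cartiflette/dataset_family=metadata/source=INSEE/
+            # year=<year>/...), one metadata.csv per "borders" level.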
+
+ # Retrieve COG metadata
+ # TODO : update prepare_cog_metadata to send directly to S3
+ metadata = prepare_cog_metadata(
+ bucket=bucket,
+ path_within_bucket=path_within_bucket,
+ year=year,
+ )
+ if metadata is None:
+ continue
+
+ for key in [
+ "COMMUNE",
+ "ARRONDISSEMENT_MUNICIPAL",
+ "CANTON",
+ "IRIS",
+ ]:
+ try:
+ metadata_border = metadata[key]
+ except KeyError:
+ continue
+ if metadata_border.empty:
+ continue
+ config["borders"] = key
+ path_raw_s3 = create_path_bucket(config)
+ localfile = f"{tempdir}/metadata.csv"
+ metadata_border.to_csv(localfile, index=False)
+ try:
+ logger.info("sending %s -> %s", localfile, path_raw_s3)
+ fs.put_file(localfile, path_raw_s3)
+ except Exception:
+ raise
+ finally:
+ os.unlink(localfile)
+
+            # mark this year as updated (metadata were computed for it)
+ created.append(year)
+
+ created = sorted(list(set(created)))
+
+ with open("metadata_years.json", "w", encoding="utf8") as out:
+ json.dump(created, out)
+
+ return created
+
+
+if __name__ == "__main__":
+ data = main(years=years)
diff --git a/argo-pipeline/src/select_downstream_vintage_to_process.py b/argo-pipeline/src/select_downstream_vintage_to_process.py
new file mode 100644
index 00000000..b29ed0fd
--- /dev/null
+++ b/argo-pipeline/src/select_downstream_vintage_to_process.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+2nd step of pipeline
+
+Select which geodatasets should be updated (those whose raw dataset components
+have been re-downloaded) in order to trigger the relevant downstream steps
+"""
+
+import argparse
+import logging
+import os
+import json
+
+from cartiflette.config import (
+ DATASETS_HIGH_RESOLUTION,
+)
+from cartiflette.pipeline_constants import COG_TERRITOIRE, IRIS
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+logger.info("=" * 50)
+logger.info("\n%s", __doc__)
+logger.info("=" * 50)
+
+parser = argparse.ArgumentParser(description="Select vintage to update")
+parser.add_argument(
+ "--download_results",
+ type=str,
+ default="{}",
+ help="Results of download pipeline",
+)
+
+args = parser.parse_args()
+download_results = args.download_results
+
+download_results = json.loads(download_results)
+
+# Example of download_results
+# {"IGN": {"ADMINEXPRESS": {"EXPRESS-COG-TERRITOIRE": {"guadeloupe": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=5490/origin=raw/vectorfile_format=shp/territory=guadeloupe/simplification=0/COMMUNE.shp"]}}}, "martinique": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=5490/origin=raw/vectorfile_format=shp/territory=martinique/simplification=0/COMMUNE.shp"]}}}, "guyane": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=2972/origin=raw/vectorfile_format=shp/territory=guyane/simplification=0/COMMUNE.shp"]}}}, "reunion": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=2975/origin=raw/vectorfile_format=shp/territory=reunion/simplification=0/COMMUNE.shp"]}}}, "mayotte": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=4326/origin=raw/vectorfile_format=shp/territory=mayotte/simplification=0/COMMUNE.shp"]}}}, "metropole": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=2154/origin=raw/vectorfile_format=shp/territory=metropole/simplification=0/COMMUNE.shp"]}}}}}}, "Insee": {"COG": {"DEPARTEMENT": {"france_entiere": {"2024": {"downloaded": false, "paths": null}}}, "REGION": {"france_entiere": {"2024": {"downloaded": false, "paths": null}}}}, "TAGC": {"APPARTENANCE": {"france_entiere": {"2024": {"downloaded": true, "paths": {"table-appartenance-geo-communes-2024": ["projet-cartiflette/test/provider=Insee/dataset_family=TAGC/source=APPARTENANCE/year=2024/administrative_level=None/crs=None/origin=raw/vectorfile_format=xlsx/territory=france_entiere/simplification=0/table-appartenance-geo-communes-2024.xlsx"]}}}}}}}
+
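+# Given a payload like the one above, this script writes two small JSON files
+# exposed as Argo outputs of this task: geodatasets_years.json and
+# metadata_years.json, each holding a sorted list of vintages to re-process
+# (forced to [2023, 2024] in the test environment).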
+
+if os.environ.get("ENVIRONMENT", None) == "test":
+ logging.warning("test environment -> force generation of only 2023 & 2024")
+
+
+def store_to_json(name, years):
+ "util function to store vintage selections to json for argo results"
+ with open(name, "w", encoding="utf8") as out:
+ json.dump(years, out)
+ return years
+
+
+def filter_geodata(results):
+ "filter the downloaded vintages of geodatasets"
+ if os.environ.get("ENVIRONMENT", None) == "test":
+ return store_to_json("geodatasets_years.json", [2023, 2024])
+
+ years = set()
+ keys_geo = (
+ ("ADMINEXPRESS", COG_TERRITOIRE[DATASETS_HIGH_RESOLUTION]),
+ ("IRIS", IRIS[DATASETS_HIGH_RESOLUTION]),
+ )
+ try:
+ raw = [results["IGN"][family][geo] for family, geo in keys_geo]
+ except KeyError:
+ years = []
+ else:
+ for dset in raw:
+ for dict_results in dset.values():
+ for year, dict_results_this_year in dict_results.items():
+ if dict_results_this_year["downloaded"]:
+ years.add(year)
+
+ years = sorted(list(years))
+ logger.info("selected downstream geodatasets : %s", years)
+ return store_to_json("geodatasets_years.json", years)
+
+
+def filter_metadata(results):
+ "filter the downloaded vintages of metadatasets"
+ if os.environ.get("ENVIRONMENT", None) == "test":
+ return store_to_json("metadata_years.json", [2023, 2024])
+
+ years = set()
+ try:
+ raw = [dset for provider, dset in results.items() if provider != "IGN"]
+ except KeyError:
+ years = []
+ else:
+
+ for dset_provider in raw:
+ for dset_family in dset_provider.values():
+ for dset in dset_family.values():
+ for dict_results in dset.values():
+ for (
+ year,
+ dict_results_this_year,
+ ) in dict_results.items():
+ if dict_results_this_year["downloaded"]:
+ years.add(year)
+
+ years = sorted(list(years))
+ logger.info("selected downstream metadatasets : %s", years)
+ return store_to_json("metadata_years.json", years)
+
+
+if __name__ == "__main__":
+ filter_geodata(download_results)
+ filter_metadata(download_results)
diff --git a/argo-pipeline/src/split_merge_tiles.py b/argo-pipeline/src/split_merge_tiles.py
deleted file mode 100644
index 1e392614..00000000
--- a/argo-pipeline/src/split_merge_tiles.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import argparse
-from cartiflette.config import PATH_WITHIN_BUCKET
-from cartiflette.pipeline import (
- mapshaperize_split_from_s3,
- mapshaperize_merge_split_from_s3,
-)
-import logging
-
-
-parser = argparse.ArgumentParser(description="Process command line arguments.")
-logger = logging.getLogger(__name__)
-
-# Define the arguments with their default values
-parser.add_argument(
- "--path", type=str, default=PATH_WITHIN_BUCKET, help="Path in bucket"
-)
-parser.add_argument(
- "--format_output", type=str, default="geojson", help="Output format"
-)
-parser.add_argument("--year", type=int, default=2022, help="Year for the data")
-parser.add_argument("--crs", type=int, default=4326, help="Coordinate Reference System")
-parser.add_argument(
- "--source", type=str, default="EXPRESS-COG-CARTO-TERRITOIRE", help="Data source"
-)
-parser.add_argument(
- "--simplification", type=float, default=0, help="Simplification level"
-)
-parser.add_argument(
- "--level_polygons", type=str, default="COMMUNE", help="Level of polygons"
-)
-parser.add_argument(
- "--filter_by", type=str, default="DEPARTEMENT", help="Splitting criteria"
-)
-
-# Parse the arguments
-args = parser.parse_args()
-
-# Create a dictionary from the parsed arguments
-args_dict = {
- "path_within_bucket": args.path,
- "format_output": args.format_output,
- "year": args.year,
- "crs": args.crs,
- "source": args.source,
- "simplification": args.simplification,
- "level_polygons": args.level_polygons,
- "filter_by": args.filter_by,
-}
-
-
-def main(args_dict):
- logger.info("Processing with provided arguments")
- logger.info("Arguments for mapshaperize_split_from_s3 ---> {0}".format(args_dict))
- mapshaperize_split_from_s3(args_dict)
-
- if args_dict["level_polygons"] != "COMMUNE":
- return None
-
- logger.info("Also processing for COMMUNE_ARRONDISSEMENT borders")
- args_dict["level_polygons"] = "COMMUNE_ARRONDISSEMENT"
- mapshaperize_merge_split_from_s3(args_dict)
-
- return args_dict
-
-
-if __name__ == "__main__":
- main(args_dict)
diff --git a/argo-pipeline/src/split_merge_tiles_multithreading.py b/argo-pipeline/src/split_merge_tiles_multithreading.py
new file mode 100644
index 00000000..61af7d68
--- /dev/null
+++ b/argo-pipeline/src/split_merge_tiles_multithreading.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Last step of pipeline (with multithreading)
+
+Create all geodatasets served by cartiflette
+"""
+
+import argparse
+import json
+import logging
+import os
+
+from s3fs import S3FileSystem
+
+from cartiflette.config import (
+ BUCKET,
+ PATH_WITHIN_BUCKET,
+ FS,
+)
+from cartiflette.pipeline import mapshaperize_split_from_s3_multithreading
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+logger.info("=" * 50)
+logger.info("\n%s", __doc__)
+logger.info("=" * 50)
+
+parser = argparse.ArgumentParser(description="Process command line arguments.")
+
+# Define the arguments with their default values
+parser.add_argument(
+ "-y",
+ "--year",
+ default="2023",
+ help="Filter downstream vintage to process",
+)
+
+parser.add_argument(
+ "-c",
+ "--configs",
+ default='[{"mesh_init":"ARRONDISSEMENT_MUNICIPAL","source_geodata":"EXPRESS-COG-CARTO-TERRITOIRE","simplification":40,"dissolve_by":"ARRONDISSEMENT_MUNICIPAL","territories":["FRANCE_ENTIERE_DROM_RAPPROCHES","FRANCE_ENTIERE"]}]',
+ help="Configurations for child datasets",
+)
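+
+# Each entry of --configs describes one child dataset to generate: the initial
+# mesh ("mesh_init"), the raw geodata source, the simplification level, the
+# field to dissolve polygons by and the target territories (see also the
+# commented-out IRIS example at the bottom of this file).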
+
+# Parse the arguments
+args = parser.parse_args()
+
+
+def main(
+ year,
+ config_generation: list,
+ bucket: str = BUCKET,
+ path_within_bucket: str = PATH_WITHIN_BUCKET,
+ fs: S3FileSystem = FS,
+):
+
+ result = mapshaperize_split_from_s3_multithreading(
+ year=year,
+ configs=config_generation,
+ bucket=bucket,
+ path_within_bucket=path_within_bucket,
+ fs=fs,
+ )
+
+ out_path = f"generation/{year}/result.json"
+ try:
+ os.makedirs(os.path.dirname(out_path))
+ except FileExistsError:
+ pass
+ with open(out_path, "w", encoding="utf8") as out:
+ json.dump(result, out)
+
+ return result
+
+
+if __name__ == "__main__":
+ main(
+ year=args.year,
+ config_generation=json.loads(args.configs),
+ # config_generation=[
+ # {
+ # "mesh_init": "IRIS",
+ # "source_geodata": "CONTOUR-IRIS",
+ # "simplification": "40",
+ # "dissolve_by": "IRIS",
+ # "territories": [
+ # "FRANCE_ENTIERE_IDF_DROM_RAPPROCHES",
+ # ],
+ # }
+ # ],
+ bucket=BUCKET,
+ path_within_bucket=PATH_WITHIN_BUCKET,
+ fs=FS,
+ )
diff --git a/cartiflette/api/output.py b/cartiflette/api/output.py
index c17854e6..89cfbef1 100644
--- a/cartiflette/api/output.py
+++ b/cartiflette/api/output.py
@@ -8,7 +8,7 @@
import s3fs
import geopandas as gpd
-from cartiflette.download.scraper import MasterScraper
+from cartiflette.download import Scraper
from cartiflette.utils import create_path_bucket, standardize_inputs
from cartiflette.config import BUCKET, PATH_WITHIN_BUCKET, FS
@@ -305,7 +305,7 @@ def download_vectorfile_single(
fs.download(remote_file, local_path)
else:
- with MasterScraper(*args, **kwargs) as s:
+ with Scraper(*args, **kwargs) as s:
# Note that python should cleanup all tmpfile by itself
if format_read == "shp":
@@ -314,9 +314,7 @@ def download_vectorfile_single(
successes = []
for remote_file in files:
remote = os.path.splitext(url)[0] + f".{ext}"
- success, tmp = s.download_to_tempfile_http(
- url=remote
- )
+ success, tmp = s.download_to_tempfile_http(url=remote)
successes.append(success)
shutil.copy(tmp, f"{tdir.name}/raw.{ext}")
local_path = f"{tdir.name}/raw.shp"
diff --git a/cartiflette/config.py b/cartiflette/config.py
index e0c44844..3a60aa25 100644
--- a/cartiflette/config.py
+++ b/cartiflette/config.py
@@ -1,12 +1,13 @@
# -*- coding: utf-8 -*-
import os
+import warnings
from dotenv import load_dotenv
import s3fs
-load_dotenv()
+load_dotenv(override=True)
BUCKET = "projet-cartiflette"
-PATH_WITHIN_BUCKET = "production"
+PATH_WITHIN_BUCKET = os.environ.get("ENVIRONMENT", "test")
ENDPOINT_URL = "https://minio.lab.sspcloud.fr"
kwargs = {}
@@ -16,9 +17,27 @@
except KeyError:
continue
FS = s3fs.S3FileSystem(client_kwargs={"endpoint_url": ENDPOINT_URL}, **kwargs)
+# Double the standard timeouts
+FS.read_timeout = 30
+FS.connect_timeout = 10
THREADS_DOWNLOAD = 5
# Nota : each thread may also span the same number of children threads;
# set to 1 for debugging purposes (will deactivate multithreading)
-LEAVE_TQDM = False
+RETRYING = True # WHETHER TO USE RETRYING MODULE ON DOWNLOAD/UPLOAD
+
+# =============================================================================
+# PIPELINE CONFIG
+# =============================================================================
+
+# use low resolution datasets in the test environment, high resolution otherwise
+INTERMEDIATE_FORMAT = "geojson"
+DATASETS_HIGH_RESOLUTION = os.environ.get("ENVIRONMENT", "test") != "test"
+MAPSHAPER_QUIET = os.environ.get("ENVIRONMENT", "test") != "test"
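+# For instance, ENVIRONMENT=test (or unset) -> low resolution datasets and
+# verbose mapshaper; ENVIRONMENT=preprod or prod -> high resolution datasets
+# and quiet mapshaper.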
+
+if not DATASETS_HIGH_RESOLUTION:
+ warnings.warn(
+ "cartiflette is running with test configuration, using only low "
+ "resolution datasets"
+ )
diff --git a/cartiflette/constants.py b/cartiflette/constants.py
index 28e10a12..3463cc16 100644
--- a/cartiflette/constants.py
+++ b/cartiflette/constants.py
@@ -28,46 +28,17 @@
"geometry": box(44.7437, -13.2733, 45.507, -12.379),
},
{
- "location": "saint_pierre_et_miquelon",
+ "location": "saint-pierre-et-miquelon",
"geometry": box(-56.6975, 46.5488, -55.9066, 47.3416),
},
+ {
+ "location": "saint-barthelemy",
+ "geometry": box(-62.951118, 17.870818, -62.789027, 17.974103),
+ },
+ {
+ "location": "saint-martin",
+ "geometry": box(-63.153327, 18.046591, -62.970338, 18.125203),
+ },
]
REFERENCES = gpd.GeoDataFrame(REFERENCES, crs=4326)
-
-DOWNLOAD_PIPELINE_ARGS = {
- "ADMIN-EXPRESS": [
- "IGN",
- "ADMINEXPRESS",
- "EXPRESS-COG-TERRITOIRE",
- [
- "guadeloupe",
- "martinique",
- "guyane",
- "reunion",
- "mayotte",
- "metropole",
- ],
- ],
- "BDTOPO": ["IGN", "BDTOPO", "ROOT", "france_entiere"],
- "IRIS": ["IGN", "CONTOUR-IRIS", "ROOT", None],
- "COG": [
- "Insee",
- "COG",
- [
- "COMMUNE",
- "CANTON",
- "ARRONDISSEMENT",
- "DEPARTEMENT",
- "REGION",
- "COLLECTIVITE",
- "PAYS",
- ],
- "france_entiere",
- ],
- "BV 2022": ["Insee", "BV", "FondsDeCarte_BV_2022", "france_entiere"],
-}
-
-# EXPRESS-COG ?
-# EXPRESS-COG-CARTO-TERRITOIRE ?
-# EXPRESS-COG-CARTO ?
diff --git a/cartiflette/download/__init__.py b/cartiflette/download/__init__.py
index c8e07b4e..1f675160 100644
--- a/cartiflette/download/__init__.py
+++ b/cartiflette/download/__init__.py
@@ -1,14 +1,6 @@
-# from cartiflette.download.dev import (
-# get_vectorfile_communes_arrondissement,
-# # get_BV,
-# )
+# -*- coding: utf-8 -*-
+from .scraper import Scraper
+from .download import _download_and_store_sources
-from cartiflette.download.pipeline import (
- download_all,
-)
-
-
-__all__ = [
- "download_all",
-]
+__all__ = ["Scraper", "_download_and_store_sources"]
diff --git a/cartiflette/download/dataset.py b/cartiflette/download/dataset.py
index 018a6f56..65556185 100644
--- a/cartiflette/download/dataset.py
+++ b/cartiflette/download/dataset.py
@@ -6,23 +6,25 @@
import json
import logging
import os
-import pebble
-import py7zr
import re
-import s3fs
import tempfile
from typing import Tuple
+import warnings
import zipfile
+import pebble
+import py7zr
+import s3fs
+
from cartiflette.utils import import_yaml_config, hash_file, deep_dict_update
from cartiflette.config import BUCKET, PATH_WITHIN_BUCKET, FS
logger = logging.getLogger(__name__)
-class Dataset:
+class RawDataset:
"""
- Class representing a dataset stored in the yaml meant to be retrieved
+    Class representing a raw dataset described in the yaml file and meant to be retrieved
"""
md5 = None
@@ -40,7 +42,7 @@ def __init__(
fs: s3fs.S3FileSystem = FS,
):
"""
- Initialize a Dataset object.
+ Initialize a RawDataset object.
Parameters
----------
@@ -85,7 +87,10 @@ def __str__(self):
territory = self.territory
provider = self.provider
- name = f""
+ name = (
+ f""
+ )
return name
def __repr__(self):
@@ -113,15 +118,15 @@ def _md5(file_path: str) -> str:
def _get_last_md5(self) -> None:
"""
Read the last md5 hash value of the target on the s3 and store it
- as an attribute of the Dataset : self.md5
+ as an attribute of the RawDataset : self.md5
"""
try:
with self.fs.open(self.json_md5, "r") as f:
all_md5 = json.load(f)
- except Exception as e:
- logger.error(e)
- logger.error("md5 json not found on MinIO")
+ except FileNotFoundError as e:
+ # use warnings instead of logging to display this only once
+ warnings.warn(f"md5 json not found on MinIO - {e}")
return
try:
md5 = all_md5[self.provider][self.dataset_family][self.source][
@@ -129,8 +134,7 @@ def _get_last_md5(self) -> None:
][str(self.year)]
self.md5 = md5
except Exception as e:
- logger.debug(e)
- logger.debug("file not referenced in md5 json")
+ logger.debug("file not referenced in md5 json %s", e)
@pebble.synchronized
def update_json_md5(self, md5: str) -> bool:
@@ -213,6 +217,9 @@ def get_path_from_provider(self) -> str:
d = d[key]
try:
self.pattern = d["pattern"]
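+            # sources.yaml may declare "pattern" as a single string or a list;
+            # normalise it to a list so downstream code can always iterate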
+ if isinstance(self.pattern, str):
+ self.pattern = [self.pattern]
+
break
except KeyError:
continue
@@ -269,7 +276,16 @@ def get_path_from_provider(self) -> str:
url = url.format(**kwargs)
- logger.debug(f"using {url}")
+ try:
+ # check if {territory} is part of self.pattern:
+ if territory != "":
+ self.pattern = [
+ x.format(**{"territory": territory}) for x in self.pattern
+ ]
+ except UnboundLocalError:
+ pass
+
+ logger.debug("using %s", url)
return url
@@ -295,7 +311,7 @@ def unpack(self, protocol: str) -> Tuple[str, Tuple[Tuple[str, ...], ...]]:
the decompressed files. Note that this folder will be stored in the
temporary cache, but requires manual cleanup.
If nested archives (ie zip in zip), will unpack all nested data and
- look for target pattern **INSIDE** the nested archive only
+ look for target pattern(s) **INSIDE** the nested archive only
Every file Path
@@ -326,7 +342,7 @@ def unpack(self, protocol: str) -> Tuple[str, Tuple[Tuple[str, ...], ...]]:
# unzip in temp directory
location = tempfile.mkdtemp()
- logger.debug(f"Extracting to {location}")
+ logger.debug("Extracting to %s", location)
year = self.year
source = self.source
@@ -351,7 +367,7 @@ def get_utils_from_protocol(protocol):
list_files = "namelist"
extract = "extractall"
targets_kw = "members"
- # TODO
+        # TODO : other archive formats? (rar, tar, gz, ...)
# rar files, see https://pypi.org/project/rarfile/
# tar files
# gz files
@@ -362,14 +378,18 @@ def get_utils_from_protocol(protocol):
archives_to_process = [(self.temp_archive_path, protocol)]
while archives_to_process:
archive, protocol = archives_to_process.pop()
- loader, list_files, extract, targets_kw = get_utils_from_protocol(protocol)
+ loader, list_files, extract, targets_kw = get_utils_from_protocol(
+ protocol
+ )
with loader(archive, mode="r") as archive:
everything = getattr(archive, list_files)()
# Handle nested archives (and presume there is no mixup in
# formats...)
archives = [
- x for x in everything if x.endswith(".zip") or x.endswith(".7z")
+ x
+ for x in everything
+ if x.endswith(".zip") or x.endswith(".7z")
]
archives = [(x, x.split(".")[-1]) for x in archives]
for nested_archive, protocol in archives:
@@ -378,10 +398,16 @@ def get_utils_from_protocol(protocol):
(io.BytesIO(nested.read()), protocol)
)
- files = filter_case_insensitive(self.pattern, everything)
+ files = [
+ file
+ for pattern in self.pattern
+ for file in filter_case_insensitive(pattern, everything)
+ ]
if year <= 2020 and source.endswith("-TERRITOIRE"):
- territory_code = sources["territory"][territory].split("_")[0]
+ territory_code = sources["territory"][territory].split(
+ "_"
+ )[0]
files = {x for x in files if territory_code in x}
# Find all auxiliary files sharing the same name as those found
@@ -414,18 +440,24 @@ def get_utils_from_protocol(protocol):
# when using dbf) -> return only target but extract all
patterns = {x.rsplit(".", maxsplit=1)[0] for x in targets}
real_extracts = {
- x for x in everything if x.rsplit(".", maxsplit=1)[0] in patterns
+ x
+ for x in everything
+ if x.rsplit(".", maxsplit=1)[0] in patterns
}
kwargs = {"path": location, targets_kw: real_extracts}
getattr(archive, extract)(**kwargs)
- extracted += [os.path.join(location, target) for target in targets]
+ extracted += [
+ os.path.join(location, target) for target in targets
+ ]
# self._list_levels(extracted)
if any(x.lower().endswith(".shp") for x in extracted):
shapefiles_pattern = {
- os.path.splitext(x)[0] for x in files if x.lower().endswith(".shp")
+ os.path.splitext(x)[0]
+ for x in files
+ if x.lower().endswith(".shp")
}
extracted = [
diff --git a/cartiflette/download/download.py b/cartiflette/download/download.py
index f782d955..9e977461 100644
--- a/cartiflette/download/download.py
+++ b/cartiflette/download/download.py
@@ -5,28 +5,56 @@
# pipeline, please refer yourself to cartiflette\download\pipeline.py
# =============================================================================
-from collections import OrderedDict
+from collections import OrderedDict, Counter
from itertools import product
import logging
-from pebble import ThreadPool
-import s3fs
import shutil
import traceback
from typing import Union
-from cartiflette.config import BUCKET, PATH_WITHIN_BUCKET, FS, THREADS_DOWNLOAD
+from pebble import ThreadPool
+from retrying import retry
+import s3fs
+
+from cartiflette.config import (
+ BUCKET,
+ PATH_WITHIN_BUCKET,
+ FS,
+ THREADS_DOWNLOAD,
+ RETRYING,
+)
from cartiflette.utils import (
deep_dict_update,
create_path_bucket,
)
-from cartiflette.download.scraper import MasterScraper
-from cartiflette.download.dataset import Dataset
+from .scraper import Scraper
+from .dataset import RawDataset
logger = logging.getLogger(__name__)
+if not RETRYING:
+    # patch retrying: replace @retry with a no-op decorator
+ def retry(*args, **kwargs):
+ def decorator(func):
+ return func
+
+ return decorator
+
+
+def _result_is_ko(result):
+ """
+ return True if result is ko
+ used to check if _upload_raw_dataset_to_s3 should be retried
+ """
+ return result is not None and len(result) == 0
+
+
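+# i.e. attempt the upload up to 3 times, waiting 2 seconds between attempts,
+# as long as it returns an empty dict (see _result_is_ko above)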
+@retry(
+ retry_on_result=_result_is_ko, stop_max_attempt_number=3, wait_fixed=2000
+)
def _upload_raw_dataset_to_s3(
- dataset: Dataset,
+ dataset: RawDataset,
result: dict,
bucket: str = BUCKET,
path_within_bucket: str = PATH_WITHIN_BUCKET,
@@ -40,8 +68,8 @@ def _upload_raw_dataset_to_s3(
Parameters
----------
- dataset : Dataset
- Dataset object to store into s3
+ dataset : RawDataset
+ RawDataset object to store into s3
result : dict
result of the dataset's download
bucket : str, optional
@@ -80,7 +108,7 @@ def _upload_raw_dataset_to_s3(
try:
# DUPLICATE SOURCES IN BUCKET
errors_encountered = False
- dataset_paths = dict()
+ dataset_paths = {}
for key, layer in result["layers"].items():
layer_paths = []
for path, rename_basename in layer.files_to_upload.items():
@@ -105,7 +133,7 @@ def _upload_raw_dataset_to_s3(
layer_paths.append(path_within)
- logger.debug(f"upload to {path_within}")
+ logger.info("upload to %s", path_within)
try:
fs.put(path, path_within, recursive=True)
@@ -114,7 +142,9 @@ def _upload_raw_dataset_to_s3(
errors_encountered = True
if any(x.lower().endswith(".shp") for x in layer_paths):
- layer_paths = [x for x in layer_paths if x.lower().endswith(".shp")]
+ layer_paths = [
+ x for x in layer_paths if x.lower().endswith(".shp")
+ ]
dataset_paths[key] = layer_paths
@@ -131,22 +161,20 @@ def _upload_raw_dataset_to_s3(
# to allow for further tentatives)
dataset.update_json_md5(result["hash"])
return dataset_paths
- else:
- return {}
+ return {}
-def _download_sources(
+def _download_and_store_sources(
providers: Union[list[str, ...], str],
dataset_families: Union[list[str, ...], str],
sources: Union[list[str, ...], str],
- territories: Union[list[str, ...], str],
years: Union[list[str, ...], str],
+ territories: Union[list[str, ...], str] = None,
bucket: str = BUCKET,
path_within_bucket: str = PATH_WITHIN_BUCKET,
fs: s3fs.S3FileSystem = FS,
upload: bool = True,
) -> dict:
- # TODO : contrôler return
"""
Main function to perform downloads of datasets and store them the s3.
All available combinations will be tested; hence an unfound file might not
@@ -162,10 +190,12 @@ def _download_sources(
List of datasets family in the yaml file
sources : list[str, ...]
List of sources in the yaml file
- territories : list[str, ...]
- List of territoires in the yaml file
years : list[int, ...]
List of years in the yaml file
+ territories : list[str, ...], optional
+ List of territories in the yaml file. The default is None (corresponds
+ to datasets where that field is absent), which will set
+ "territory=france_entiere" when uploading to the S3 FileSystem.
bucket : str, optional
Bucket to use. The default is BUCKET.
path_within_bucket : str, optional
@@ -178,10 +208,6 @@ def _download_sources(
Returns
-------
- dict
- DESCRIPTION.
-
-
files : dict
Structure of the nested dict will use the following keys :
provider
@@ -192,22 +218,23 @@ def _download_sources(
{downloaded: bool, paths: list:str}
For instance:
{
- 'IGN': {
- 'BDTOPO': {
- 'ROOT': {
+ 'Insee': {
+ 'COG': {
+ 'COMMUNE': {
'france_entiere': {
- 2017: {
+ 2023: {
'downloaded': True,
'paths': {
- 'CHEF_LIEU': [
- 'projet-cartiflette/diffusion/shapefiles-test4/year=2017/administrative_level=None/crs=4326/None=None/vectorfile_format=shp/provider=IGN/dataset_family=BDTOPO/source=ROOT/territory=martinique/CHEF_LIEU.shp'
- ],
- 'COMMUNE': [
- 'projet-cartiflette/diffusion/shapefiles-test4/year=2017/administrative_level=None/crs=4326/None=None/vectorfile_format=shp/provider=IGN/dataset_family=BDTOPO/source=ROOT/territory=martinique/COMMUNE.shp'
- ],
- 'ARRONDISSEMENT': [
- 'projet-cartiflette/diffusion/shapefiles-test4/year=2017/administrative_level=None/crs=4326/None=None/vectorfile_format=shp/provider=IGN/dataset_family=BDTOPO/source=ROOT/territory=metropole/ARRONDISSEMENT.shp'
- ]
+ 'dummy_file_2023': [
+ 'projet-cartiflette/.../dummy.csv'
+ ]
+ }
+ },
+ 2024: {
+ 'downloaded': True,
+ 'paths': {
+ 'dummy_file_2024': [
+ 'projet-cartiflette/.../dummy.csv']
}
}
}
@@ -215,7 +242,12 @@ def _download_sources(
}
}
}
+
"""
+
+ if not territories:
+ territories = "france_entiere"
+
kwargs = OrderedDict()
items = [
("sources", sources),
@@ -225,28 +257,89 @@ def _download_sources(
("dataset_families", dataset_families),
]
for key, val in items:
- if isinstance(val, str) or isinstance(val, int):
+ if isinstance(val, (str, int)):
kwargs[key] = [val]
elif not val:
kwargs[key] = [None]
- elif isinstance(val, list) or isinstance(val, tuple) or isinstance(val, set):
+ elif isinstance(val, (list, tuple, set)):
kwargs[key] = list(val)
combinations = list(product(*kwargs.values()))
+ order = "source", "territory", "year", "provider", "dataset_family"
+ combinations = [dict(zip(order, x)) for x in combinations]
+
+ # Check whether (some) urls are reused in this batch
+ reused_urls = set()
+ datasets = [
+ RawDataset(
+ bucket=bucket,
+ path_within_bucket=path_within_bucket,
+ **x,
+ )
+ for y in years
+ for x in combinations
+ if x["year"] == y
+ ]
+
+ def try_get_path(dset):
+ try:
+ return dset.get_path_from_provider()
+ except ValueError:
+ # Do not bother to log this, a warning will be raised later on
+ # when Cartiflette tries to retrieve the datasets
+ pass
+
+ reused = {
+ (url, md5)
+ for (url, md5), count in Counter(
+ (try_get_path(dset), dset.md5)
+ for dset in datasets
+ if try_get_path(dset)
+ ).items()
+ if count > 1
+ }
+ reused_urls.update(reused)
+ reused_urls = list(reused_urls)
+ # reused_urls = [(url_1, md5_1), (url_2, md5_2)]
+
+ # -> download reused urls immediately (discarding the result) so that
+ # requests-cache can serve the cached responses to the individual
+ # datasets later
+ if reused_urls:
+ with Scraper() as s:
+ threads = min(THREADS_DOWNLOAD, len(reused_urls))
+ if threads > 1:
+ with ThreadPool(
+ threads,
+ ) as pool:
+ iterator = pool.map(
+ s.simple_download, *zip(*reused_urls), timeout=60 * 10
+ ).result()
+ index = 0
+ while True:
+ try:
+ next(iterator)
+ except StopIteration:
+ break
+ except Exception:
+ logger.error(traceback.format_exc())
+ logger.error("url was %s", reused_urls[index])
+ finally:
+ index += 1
+ else:
+ for url, md5 in reused_urls:
+ try:
+ s.simple_download(url, md5)
+ except Exception:
+ logger.error(traceback.format_exc())
+ logger.error("url was %s", (url, md5))
files = {}
- with MasterScraper() as s:
-
- def func(args):
- source, territory, year, provider, dataset_family = args
- datafile = Dataset(
- dataset_family,
- source,
- year,
- provider,
- territory,
- bucket,
- path_within_bucket,
+ with Scraper() as s:
+
+ def func(kwargs):
+ datafile = RawDataset(
+ bucket=bucket, path_within_bucket=path_within_bucket, **kwargs
)
try:
result = s.download_unpack(datafile)
@@ -254,11 +347,11 @@ def func(args):
logger.warning(e)
this_result = {
- provider: {
- dataset_family: {
- source: {
- territory: {
- year: {
+ kwargs["provider"]: {
+ kwargs["dataset_family"]: {
+ kwargs["source"]: {
+ kwargs["territory"]: {
+ kwargs["year"]: {
"downloaded": False,
"paths": None,
}
@@ -283,24 +376,39 @@ def func(args):
result["paths"] = paths
this_result = {
- provider: {dataset_family: {source: {territory: {year: result}}}}
+ kwargs["provider"]: {
+ kwargs["dataset_family"]: {
+ kwargs["source"]: {
+ kwargs["territory"]: {kwargs["year"]: result}
+ }
+ }
+ }
}
return this_result
if THREADS_DOWNLOAD > 1:
with ThreadPool(THREADS_DOWNLOAD) as pool:
- iterator = pool.map(func, combinations).result()
+ iterator = pool.map(
+ func, combinations, timeout=60 * 10
+ ).result()
+ index = 0
while True:
try:
files = deep_dict_update(files, next(iterator))
except StopIteration:
break
- except Exception as e:
- logger.error(e)
+ except Exception:
logger.error(traceback.format_exc())
+ logger.error("config was %s", combinations[index])
+ finally:
+ index += 1
else:
for args in combinations:
- files = deep_dict_update(files, func(args))
+ try:
+ files = deep_dict_update(files, func(args))
+ except Exception:
+ logger.error(traceback.format_exc())
+ logger.error("config was %s", args)
return files
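
The RETRYING switch above swaps retrying.retry for a pass-through decorator, while _result_is_ko flags an empty dict so that _upload_raw_dataset_to_s3 gets retried. A minimal, self-contained sketch of the same pattern (flaky_upload is a hypothetical stand-in, not part of the pipeline):

from retrying import retry

RETRYING = False  # assumption: mirrors cartiflette.config.RETRYING

if not RETRYING:
    # pass-through decorator: functions run exactly once, no back-off
    def retry(*args, **kwargs):  # noqa: F811
        def decorator(func):
            return func

        return decorator


def _result_is_ko(result):
    # an empty dict means the upload produced nothing -> retry
    return result is not None and len(result) == 0


@retry(retry_on_result=_result_is_ko, stop_max_attempt_number=3, wait_fixed=2000)
def flaky_upload():
    # hypothetical stand-in for _upload_raw_dataset_to_s3
    return {}


# With RETRYING=False this returns {} immediately; with the real decorator it
# would be attempted 3 times (2s apart) and then raise retrying.RetryError.
print(flaky_upload())
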
diff --git a/cartiflette/download/layer.py b/cartiflette/download/layer.py
index ef7160dc..4fda3f58 100644
--- a/cartiflette/download/layer.py
+++ b/cartiflette/download/layer.py
@@ -1,24 +1,32 @@
# -*- coding: utf-8 -*-
+
+import logging
+import os
+
from charset_normalizer import from_bytes, is_binary
import fiona
import geopandas as gpd
-import logging
-import os
+import pyogrio
from shapely.geometry import box
-from cartiflette.download.dataset import Dataset
+from .dataset import RawDataset
from cartiflette.constants import REFERENCES
logger = logging.getLogger(__name__)
class Layer:
- def __init__(self, dataset: Dataset, cluster_name: str, files: dict):
+ """
+ Layer present in a dataset.
+ """
+
+ def __init__(self, dataset: RawDataset, cluster_name: str, files: dict):
"""
- Layer present in a dataset. A layer is defined by a distinctive
- combination of path and basename (without extension). To that effect,
- each auxialary file associated to a shapefile shall be present in the
- same layer.
+ Layer present in a dataset.
+
+ A layer is defined by a distinctive combination of path and basename
+ (without extension). To that effect, each auxiliary file associated to
+ a shapefile shall be present in the same layer.
Nota : distinction between selected and unselected files in `files`
argument helps to evaluate territory using a shapefile even if the
@@ -26,8 +34,8 @@ def __init__(self, dataset: Dataset, cluster_name: str, files: dict):
Parameters
----------
- dataset : Dataset
- Dataset containing layers
+ dataset : RawDataset
+ RawDataset containing layers
cluster_name : str
Unique name for a layer (computed by the scraper after data
unpacking) and corresponding to the minimum recursive distinct path
@@ -63,14 +71,18 @@ def __repr__(self):
return self.__str__()
def _get_format(self):
- if any(x.lower().split(".")[-1] == "shp" for x in self.files_to_upload):
+ if any(
+ x.lower().split(".")[-1] == "shp" for x in self.files_to_upload
+ ):
self.format = "shp"
else:
# assume there is only one file
self.format = list(self.files_to_upload)[0].split(".")[-1]
def _get_encoding(self):
- ref_cpg_file = [x for x in self.files if x.lower().split(".")[-1] == "cpg"]
+ ref_cpg_file = [
+ x for x in self.files if x.lower().split(".")[-1] == "cpg"
+ ]
try:
ref_cpg_file = ref_cpg_file[0]
except IndexError:
@@ -81,7 +93,9 @@ def _get_encoding(self):
return encoding.lower()
def _get_gis_file(self):
- ref_gis_file = [x for x in self.files if x.lower().split(".")[-1] == "shp"]
+ ref_gis_file = [
+ x for x in self.files if x.lower().split(".")[-1] == "shp"
+ ]
try:
ref_gis_file = ref_gis_file[0]
except IndexError:
@@ -96,13 +110,22 @@ def _gis_and_encoding_evaluation(self):
ref_gis_file = self._get_gis_file()
try:
# Note : read all rows to evaluate bbox / territory
+
+ # Disable fiona logger
+ fiona_logger = logging.getLogger("fiona")
+ init = fiona_logger.level
+ fiona_logger.setLevel(logging.CRITICAL)
gdf = gpd.read_file(ref_gis_file, **kwargs)
+
+ fiona_logger.setLevel(init)
+
self.crs = gdf.crs.to_epsg()
if not self.crs:
- logger.warning(
- f"{self} - projection without known EPSG, "
- "layer will be reprojected to 4326"
+ logger.info(
+ "%s - projection without known EPSG, "
+ "layer will be reprojected to 4326",
+ self,
)
# Let's reproject...
@@ -110,20 +133,35 @@ def _gis_and_encoding_evaluation(self):
self.crs = 4326
# let's overwrite initial files
- gdf.to_file(ref_gis_file, encoding="utf-8")
+ gdf.to_file(ref_gis_file, encoding="utf-8", engine="fiona")
elif encoding and encoding != "utf-8":
- logger.warning(
- f"{self} - encoding={encoding}, " "layer will be re-encoded to UTF8"
+ logger.info(
+ "%s - encoding=%s, layer will be re-encoded to UTF8",
+ self,
+ encoding,
)
# let's overwrite initial files with utf8...
- gdf.to_file(ref_gis_file, encoding="utf-8")
-
- except (AttributeError, fiona.errors.DriverError):
+ gdf.to_file(ref_gis_file, encoding="utf-8", engine="fiona")
+
+ except (
+ AttributeError,
+ fiona.errors.DriverError,
+ pyogrio.errors.DataSourceError,
+ pyogrio.errors.CRSError,
+ ):
# Non-native-GIS dataset
self.crs = None
+ fiona_logger.setLevel(init)
if self.crs:
+ # check if geometries are valid:
+ geometries_valid = gdf["geometry"].is_valid.all()
+ if not geometries_valid:
+ # try to fix geometries and overwrite file
+ gdf["geometry"] = gdf["geometry"].buffer(0)
+ gdf.to_file(ref_gis_file, encoding="utf-8", engine="fiona")
+
bbox = box(*gdf.total_bounds)
bbox = gpd.GeoSeries([bbox], crs=gdf.crs)
@@ -139,17 +177,19 @@ def _gis_and_encoding_evaluation(self):
elif len(intersects) > 1 and "metropole" in intersects:
self.territory = "france_entiere"
else:
- logger.warning(
- f"{self} : spatial join used for territory recognition "
- "failed, dataset's raw description will be used instead"
+ logger.info(
+ "%s : spatial join used for territory recognition "
+ "failed, dataset's raw description will be used instead",
+ self,
)
self.territory = self.dataset.territory
elif not self.crs:
# TODO : chercher un champ de clefs INSEE ?
logger.info(
- f"{self} : coverage analysis of non-gis files is not yet "
- "implemented, dataset's raw description will be used instead"
+ "%s : coverage analysis of non-gis files is not yet "
+ "implemented, dataset's raw description will be used instead",
+ self,
)
self.territory = self.dataset.territory
@@ -175,9 +215,11 @@ def _gis_and_encoding_evaluation(self):
pass
else:
if encoding != "utf_8":
- logger.warning(
- f"{self} - encoding={encoding}, "
- "layer will be re-encoded to UTF8"
+ logger.info(
+ "%s - encoding=%s, "
+ "layer will be re-encoded to UTF8",
+ self,
+ encoding,
)
with open(file, "w", encoding="utf8"):
data.decode(encoding)
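
The new validity check above repairs broken polygons with a zero-width buffer before computing the bounding box. A standalone sketch of that repair, assuming a shapefile exists at the hypothetical path example.shp:

import logging

import geopandas as gpd

# silence fiona while reading, as Layer._gis_and_encoding_evaluation does
fiona_logger = logging.getLogger("fiona")
initial_level = fiona_logger.level
fiona_logger.setLevel(logging.CRITICAL)
try:
    gdf = gpd.read_file("example.shp")  # hypothetical input
finally:
    fiona_logger.setLevel(initial_level)

if not gdf["geometry"].is_valid.all():
    # buffer(0) rebuilds self-intersecting rings into valid polygons
    gdf["geometry"] = gdf["geometry"].buffer(0)
    gdf.to_file("example.shp", encoding="utf-8", engine="fiona")
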
diff --git a/cartiflette/download/scraper.py b/cartiflette/download/scraper.py
index b5f4e95e..066d1c67 100644
--- a/cartiflette/download/scraper.py
+++ b/cartiflette/download/scraper.py
@@ -1,27 +1,36 @@
# -*- coding: utf-8 -*-
-import magic
from glob import glob
import logging
-import numpy as np
import os
import re
-import requests
-import requests_cache
+from retrying import retry
import tempfile
-from tqdm import tqdm
from typing import TypedDict
+
+import magic
+import requests
+import requests_cache
from unidecode import unidecode
from cartiflette.utils import hash_file
-from cartiflette.download.dataset import Dataset
-from cartiflette.download.layer import Layer
-from cartiflette.config import LEAVE_TQDM
+from .dataset import RawDataset
+from .layer import Layer
+from cartiflette.config import RETRYING
+
+if not RETRYING:
+ # patch retrying
+ def retry(*args, **kwargs):
+ def decorator(func):
+ return func
+
+ return decorator
+
logger = logging.getLogger(__name__)
-class MasterScraper(requests_cache.CachedSession):
+class Scraper(requests_cache.CachedSession):
"""
Scraper class which could be used to perform either http/https get
downloads.
@@ -76,7 +85,35 @@ def __init__(
except KeyError:
continue
- def download_unpack(self, datafile: Dataset, **kwargs) -> DownloadReturn:
+ def simple_download(self, url: str, hash_: str = None, **kwargs):
+ """
+ Trigger a simple download to a temporary file (immediately cleaned up)
+
+ Use this only to cache the http response (for instance, when you know
+ that the same url will be requested multiple times).
+
+ Parameters
+ ----------
+ url : str
+ url to download
+ hash_ : str, optional
+ previous hash signature of the file at latest download. The default
+ is None.
+ **kwargs :
+ Optional arguments to pass to requests.Session object.
+
+ Returns
+ -------
+ None.
+
+ """
+ done, _, temp = download_to_tempfile_http(url, hash_, self, **kwargs)
+ if done:
+ os.unlink(temp)
+
+ def download_unpack(
+ self, datafile: RawDataset, **kwargs
+ ) -> DownloadReturn:
"""
Performs a download (through http, https) to a tempfile
which will be cleaned automatically ; unzip targeted files to a 2nd
@@ -96,8 +133,8 @@ def download_unpack(self, datafile: Dataset, **kwargs) -> DownloadReturn:
Parameters
----------
- datafile : Dataset
- Dataset object to download.
+ datafile : RawDataset
+ RawDataset object to download.
**kwargs :
Optional arguments to pass to requests.Session object.
@@ -117,17 +154,17 @@ def download_unpack(self, datafile: Dataset, **kwargs) -> DownloadReturn:
'hash': '5435fca3e488ca0372505b9bcacfde30',
'layers': {
'CONTOURS-IRIS_2-1_SHP_RGR92UTM40S_REU-2022_CONTOURS-IRIS':
- < Layer CONTOURS - IRIS_2 - 1_SHP_RGR92UTM40S_REU - 2022_CONTOURS - IRIS from < Dataset IGN CONTOUR - IRIS ROOT None 2022 >> ,
+ < Layer CONTOURS - IRIS_2 - 1_SHP_RGR92UTM40S_REU - 2022_CONTOURS - IRIS from < RawDataset IGN CONTOUR - IRIS ROOT None 2022 >> ,
'CONTOURS-IRIS_2-1_SHP_RGAF09UTM20_GLP-2022_CONTOURS-IRIS':
- < Layer CONTOURS - IRIS_2 - 1_SHP_RGAF09UTM20_GLP - 2022_CONTOURS - IRIS from < Dataset IGN CONTOUR - IRIS ROOT None 2022 >> ,
+ < Layer CONTOURS - IRIS_2 - 1_SHP_RGAF09UTM20_GLP - 2022_CONTOURS - IRIS from < RawDataset IGN CONTOUR - IRIS ROOT None 2022 >> ,
'CONTOURS-IRIS_2-1_SHP_RGAF09UTM20_MTQ-2022_CONTOURS-IRIS':
- < Layer CONTOURS - IRIS_2 - 1_SHP_RGAF09UTM20_MTQ - 2022_CONTOURS - IRIS from < Dataset IGN CONTOUR - IRIS ROOT None 2022 >> ,
+ < Layer CONTOURS - IRIS_2 - 1_SHP_RGAF09UTM20_MTQ - 2022_CONTOURS - IRIS from < RawDataset IGN CONTOUR - IRIS ROOT None 2022 >> ,
'CONTOURS-IRIS_2-1_SHP_LAMB93_FXX-2022_CONTOURS-IRIS':
- < Layer CONTOURS - IRIS_2 - 1_SHP_LAMB93_FXX - 2022_CONTOURS - IRIS from < Dataset IGN CONTOUR - IRIS ROOT None 2022 >> ,
+ < Layer CONTOURS - IRIS_2 - 1_SHP_LAMB93_FXX - 2022_CONTOURS - IRIS from < RawDataset IGN CONTOUR - IRIS ROOT None 2022 >> ,
'CONTOURS-IRIS_2-1_SHP_UTM22RGFG95_GUF-2022_CONTOURS-IRIS':
- < Layer CONTOURS - IRIS_2 - 1_SHP_UTM22RGFG95_GUF - 2022_CONTOURS - IRIS from < Dataset IGN CONTOUR - IRIS ROOT None 2022 >> ,
+ < Layer CONTOURS - IRIS_2 - 1_SHP_UTM22RGFG95_GUF - 2022_CONTOURS - IRIS from < RawDataset IGN CONTOUR - IRIS ROOT None 2022 >> ,
'CONTOURS-IRIS_2-1_SHP_RGM04UTM38S_MYT-2022_CONTOURS-IRIS':
- < Layer CONTOURS - IRIS_2 - 1_SHP_RGM04UTM38S_MYT - 2022_CONTOURS - IRIS from < Dataset IGN CONTOUR - IRIS ROOT None 2022 >>
+ < Layer CONTOURS - IRIS_2 - 1_SHP_RGM04UTM38S_MYT - 2022_CONTOURS - IRIS from < RawDataset IGN CONTOUR - IRIS ROOT None 2022 >>
},
'root_cleanup': 'C:\\Users\\tintin.milou\\AppData\\Local\\Temp\\tmpnbvoes9g'
}
@@ -135,7 +172,7 @@ def download_unpack(self, datafile: Dataset, **kwargs) -> DownloadReturn:
"""
- hash_ = None
+ hash_ = datafile.md5
url = datafile.get_path_from_provider()
# Download to temporary file
@@ -164,13 +201,13 @@ def download_unpack(self, datafile: Dataset, **kwargs) -> DownloadReturn:
datafile.set_temp_file_path(temp_archive_file_raw)
+ simple_copy = ["Microsoft Excel 2007+", "Unicode text", "CSV text"]
+
if "7-zip" in filetype:
root_folder, files_locations = datafile.unpack(protocol="7z")
elif "Zip archive" in filetype:
- root_folder, files_locations = datafile.unpack(
- protocol="zip"
- )
- elif "Unicode text" in filetype or "CSV text" in filetype:
+ root_folder, files_locations = datafile.unpack(protocol="zip")
+ elif any(x for x in simple_copy if x in filetype):
# copy in temp directory without processing
root_folder = tempfile.mkdtemp()
with open(temp_archive_file_raw, "rb") as f:
@@ -178,13 +215,17 @@ def download_unpack(self, datafile: Dataset, **kwargs) -> DownloadReturn:
filename = "_".join(
x for x in re.split(r"\W+", filename) if x
)
- path = os.path.join(root_folder, filename + ".csv")
+ if filetype == "Microsoft Excel 2007+":
+ ext = ".xlsx"
+ else:
+ ext = ".csv"
+
+ path = os.path.join(root_folder, filename + ext)
with open(path, "wb") as out:
out.write(f.read())
- logger.debug(f"Storing CSV to {root_folder}")
+ logger.debug("Storing file to %s", root_folder)
files_locations = ((path,),)
-
else:
raise NotImplementedError(f"{filetype} encountered")
except Exception as e:
@@ -211,7 +252,7 @@ def download_unpack(self, datafile: Dataset, **kwargs) -> DownloadReturn:
for basename, cluster in basenames.items()
}
- layers = dict()
+ layers = {}
for cluster_name, cluster_filtered in paths.items():
cluster_pattern = {
os.path.splitext(x)[0] for x in cluster_filtered
@@ -253,6 +294,7 @@ def validate_file(file_path, hash_):
return hash_file(file_path) == hash_
+@retry(stop_max_attempt_number=3, wait_fixed=2000)
def download_to_tempfile_http(
url: str,
hash_: str = None,
@@ -312,55 +354,64 @@ def download_to_tempfile_http(
head = r.headers
if not r.ok:
- raise IOError(f"download failed with {r.status_code} code")
+ raise IOError(f"download failed with {r.status_code} code at {url=}")
try:
expected_md5 = head["content-md5"]
- logger.debug(f"File MD5 is {expected_md5}")
+ logger.debug("File MD5 is %s", expected_md5)
except KeyError:
expected_md5 = None
- logger.debug(f"md5 not found in header at url {url}")
+ logger.debug("md5 not found in header at url %s", url)
else:
if hash_ and expected_md5 == hash_:
# unchanged file -> exit
- logger.info(f"md5 matched at {url} - download prevented")
+ logger.info("md5 matched at %s - download prevented", url)
return False, None, None
- finally:
- try:
- # No MD5 in header -> check requested file's size
- expected_file_size = int(head["Content-length"])
- logger.debug(f"File size is {expected_file_size}")
- except KeyError:
- expected_file_size = None
- msg = f"Content-Length not found in header at url {url}"
- logger.debug(msg)
with tempfile.NamedTemporaryFile("wb", delete=False) as temp_file:
file_path = temp_file.name
- logger.debug(f"Downloading to {file_path}")
-
- logger.debug(f"starting download at {url}")
+ logger.debug("Downloading to %s", file_path)
+ logger.info("starting download at %s", url)
r = session.get(url, stream=True, **kwargs)
if not r.ok:
raise IOError(f"download failed with {r.status_code} code")
- if expected_file_size:
- total = int(np.ceil(expected_file_size / block_size))
- else:
- total = None
- with tqdm(
- desc="Downloading: ",
- total=total,
- unit="iB",
- unit_scale=True,
- unit_divisor=1024,
- leave=LEAVE_TQDM,
- ) as pbar:
- for chunk in r.iter_content(chunk_size=block_size):
- if chunk: # filter out keep-alive new chunks
- size = temp_file.write(chunk)
- pbar.update(size)
+ try:
+ # Nota : check Content-length after the full request (not only the
+ # head request), as some sites adapt the header to the kind of
+ # http request performed
+ head = r.headers
+ expected_file_size = int(head["Content-length"])
+ logger.debug("File size is %s", expected_file_size)
+ except KeyError:
+ expected_file_size = None
+ msg = f"Content-Length not found in header at url {url}"
+ logger.debug(msg)
+
+ # =====================================================================
+ # This is not working (yet) with requests-cache:
+ # =====================================================================
+ # if expected_file_size:
+ # total = int(np.ceil(expected_file_size / block_size))
+ # else:
+ # total = None
+ # with tqdm(
+ # desc="Downloading: ",
+ # total=total,
+ # unit="iB",
+ # unit_scale=True,
+ # unit_divisor=1024,
+ # leave=LEAVE_TQDM,
+ # ) as pbar:
+ # for chunk in r.iter_content(chunk_size=block_size):
+ # if chunk: # filter out keep-alive new chunks
+ # size = temp_file.write(chunk)
+ # pbar.update(size)
+
+ for chunk in r.iter_content(chunk_size=block_size):
+ if chunk:
+ temp_file.write(chunk)
# Check that the downloaded file has the expected characteristics
if expected_md5:
@@ -370,16 +421,25 @@ def download_to_tempfile_http(
elif expected_file_size:
# check that the downloaded file is the expected size
if not expected_file_size == os.path.getsize(file_path):
+ logger.error(
+     "size mismatch: expected %s, got %s",
+     expected_file_size,
+     os.path.getsize(file_path),
+ )
os.unlink(file_path)
raise IOError("download failed (corrupted file)")
# if there's a hash value, check if there are any changes
if hash_ and validate_file(file_path, hash_):
# unchanged file -> exit (after deleting the downloaded file)
- logger.debug(f"md5 matched at {url} after download")
+ logger.info("md5 matched at %s after download", url)
os.unlink(file_path)
return False, None, None
+ logger.info(
+ "NO md5 match at %s after download : hash_=%s and new hash=%s "
+ "-> keeping new file",
+ url,
+ hash_,
+ hash_file(file_path),
+ )
+
filetype = magic.from_file(file_path)
return True, filetype, file_path
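
For reference, the validation flow now implemented by download_to_tempfile_http (head request, optional Content-MD5 short-circuit, streamed body, post-download hash check) boils down to the following sketch with plain requests; the url and hash are placeholders:

import hashlib
import os
import tempfile

import requests


def fetch_if_changed(url: str, known_md5: str = None, block_size: int = 4096):
    """Return a temp-file path, or None when the remote file is unchanged."""
    head = requests.head(url, timeout=30).headers
    if known_md5 and head.get("content-md5") == known_md5:
        return None  # Content-MD5 matches the last download, skip it
    md5 = hashlib.md5()
    with tempfile.NamedTemporaryFile("wb", delete=False) as tmp:
        r = requests.get(url, stream=True, timeout=30)
        r.raise_for_status()
        for chunk in r.iter_content(chunk_size=block_size):
            if chunk:  # filter out keep-alive chunks
                tmp.write(chunk)
                md5.update(chunk)
    if known_md5 and md5.hexdigest() == known_md5:
        os.unlink(tmp.name)  # same content as last run
        return None
    return tmp.name


# path = fetch_if_changed("https://example.com/archive.7z")  # hypothetical url
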
diff --git a/cartiflette/mapshaper/__init__.py b/cartiflette/mapshaper/__init__.py
index b5b57a06..d53c7cd7 100644
--- a/cartiflette/mapshaper/__init__.py
+++ b/cartiflette/mapshaper/__init__.py
@@ -1,13 +1,40 @@
"""
Handling spatial data with mapshaper behind the stage
"""
-from .mapshaperize import mapshaperize_split, mapshaperize_split_merge
-from .mapshaper_convert_mercator import mapshaper_convert_mercator
+
+from .mapshaper_split import mapshaper_split
+from .mapshaper_convert_reproject import mapshaper_convert_reproject
from .mapshaper_closer import mapshaper_bring_closer
+from .mapshaper_enrich import mapshaper_enrich
+from .mapshaper_dissolve import mapshaper_dissolve
+from .mapshaper_concat import mapshaper_concat
+from .mapshaper_remove_cities_with_districts import (
+ mapshaper_remove_cities_with_districts,
+)
+from .mapshaper_process_communal_districts import (
+ mapshaper_process_communal_districts,
+)
+from .mapshaper_combine_districts_and_cities import (
+ mapshaper_combine_districts_and_cities,
+)
+from .mapshaper_simplify import mapshaper_simplify
+from .mapshaper_add_field import mapshaper_add_field
+from .mapshaper_capture_cities_from_ultramarine_territories import (
+ mapshaper_capture_cities_from_ultramarine_territories,
+)
+
__all__ = [
- "mapshaperize_split",
- "mapshaperize_split_merge",
- "mapshaper_convert_mercator",
+ "mapshaper_convert_reproject",
"mapshaper_bring_closer",
+ "mapshaper_enrich",
+ "mapshaper_split",
+ "mapshaper_dissolve",
+ "mapshaper_concat",
+ "mapshaper_remove_cities_with_districts",
+ "mapshaper_process_communal_districts",
+ "mapshaper_combine_districts_and_cities",
+ "mapshaper_simplify",
+ "mapshaper_add_field",
+ "mapshaper_capture_cities_from_ultramarine_territories",
]
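
The old monolithic wrappers are replaced by single-purpose functions meant to be chained. A hedged example of such a chain, where the input path and the field names (INSEE_DEP, IDF) are assumptions about the upstream data:

from cartiflette.mapshaper import (
    mapshaper_add_field,
    mapshaper_convert_reproject,
    mapshaper_split,
)

communes = "temp/COMMUNE.shp"  # hypothetical raw communal file

# reproject to WGS84 once, then work on the intermediate geojson
reprojected = mapshaper_convert_reproject(communes, epsg=4326, output_name="communes")

# mapshaper_split drops a helper field named IDF, so tag one first
tagged = mapshaper_add_field(reprojected, label="IDF", value="0", output_name="tagged")

# one output file per departement
per_departement = mapshaper_split(
    tagged, split_variable="INSEE_DEP", output_dir="temp/split"
)
print(per_departement)
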
diff --git a/cartiflette/mapshaper/mapshaper_add_field.py b/cartiflette/mapshaper/mapshaper_add_field.py
new file mode 100644
index 00000000..729119f1
--- /dev/null
+++ b/cartiflette/mapshaper/mapshaper_add_field.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import os
+
+from .utils import run
+
+
+def mapshaper_add_field(
+ input_file: str,
+ label: str,
+ value: str,
+ output_dir: str = "temp",
+ output_name: str = "output",
+ output_format: str = "geojson",
+ quiet: bool = True,
+) -> str:
+ """
+ Add a field (= a column/attribute) to the dataset.
+ To add a static value, nest single quotes inside the double quotes
+ ("'blah'" for instance). To compute a dynamic field from existing fields,
+ pass a plain expression string (for instance "INSEE_DEP+INSEE_CAN").
+
+ Parameters
+ ----------
+ input_file : str
+ Path to the input file.
+ label : str
+ The added field's name.
+ value : str
+ The value of the added field.
+ output_dir : str
+ Directory to store the output file. The default is "temp"
+ output_name : str, optional
+ Name of the written file, without extension. The default is "output".
+ output_format : str, optional
+ The format to write the outputfile. The default is "geojson".
+ quiet : bool, optional
+ If True, inhibits console messages. The default is True.
+
+ Returns
+ -------
+ output : str
+ Path of the created file
+
+ """
+
+ try:
+ os.makedirs(output_dir)
+ except FileExistsError:
+ pass
+
+ output = f"{output_dir}/{output_name}.{output_format}"
+ quiet = "-quiet " if quiet else " "
+
+ cmd = (
+ f"mapshaper {input_file} "
+ f'-each "{label}={value}" '
+ f"{quiet}"
+ "-proj EPSG:4326 "
+ f" -o {output} "
+ f'format={output_format} extension=".{output_format}" force'
+ )
+
+ # Run Mapshaper command
+ run(cmd)
+
+ return output
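
Two calls illustrating the quoting rule described above; paths and field names are assumptions:

from cartiflette.mapshaper import mapshaper_add_field

# static value: the nested single quotes make mapshaper treat IGN as a string literal
with_source = mapshaper_add_field(
    "temp/communes.geojson", label="SOURCE", value="'IGN'", output_name="with_source"
)

# dynamic value: a bare expression is evaluated against each feature's fields
with_key = mapshaper_add_field(
    with_source, label="DEP_CAN", value="INSEE_DEP+INSEE_CAN", output_name="with_key"
)
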
diff --git a/cartiflette/mapshaper/mapshaper_capture_cities_from_ultramarine_territories.py b/cartiflette/mapshaper/mapshaper_capture_cities_from_ultramarine_territories.py
new file mode 100644
index 00000000..0afe8d4f
--- /dev/null
+++ b/cartiflette/mapshaper/mapshaper_capture_cities_from_ultramarine_territories.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os
+
+from .utils import run
+
+
+def mapshaper_capture_cities_from_ultramarine_territories(
+ input_city_file: str,
+ output_dir: str = "temp",
+ output_name: str = "output",
+ output_format: str = "geojson",
+ quiet: bool = True,
+) -> str:
+ """
+ Keep only cities from the overseas collectivities (Saint-Martin,
+ Saint-Barthelemy, Saint-Pierre-et-Miquelon), dropping all cities attached
+ to departements.
+
+ Parameters
+ ----------
+ input_city_file : str
+ Path to the input file.
+ output_dir : str
+ Directory to store the output file. The default is "temp".
+ output_name : str, optional
+ Name of the written file, without extension. The default is "output".
+ output_format : str, optional
+ Format for output file. The default is "geojson".
+ quiet : bool, optional
+ If True, inhibits console messages. The default is True.
+
+ Returns
+ -------
+ output : str
+ Path of the created file
+
+ """
+ try:
+ os.makedirs(output_dir)
+ except FileExistsError:
+ pass
+
+ output = f"{output_dir}/{output_name}.{output_format}"
+ quiet = "-quiet " if quiet else " "
+
+ cmd = (
+ f"mapshaper {input_city_file} name='COMMUNE' -proj EPSG:4326 "
+ "-filter \"'saint-barthelemy,saint-pierre-et-miquelon,saint-martin'"
+ '.indexOf(AREA) > -1" '
+ "-drop fields=TYP_IRIS "
+ f"{quiet}"
+ "-o force "
+ f'{output} format={output_format} extension=".{output_format}" singles'
+ )
+ run(cmd)
+
+ return output
diff --git a/cartiflette/mapshaper/mapshaper_closer.py b/cartiflette/mapshaper/mapshaper_closer.py
index d17fb7e0..b825b5a4 100644
--- a/cartiflette/mapshaper/mapshaper_closer.py
+++ b/cartiflette/mapshaper/mapshaper_closer.py
@@ -1,134 +1,186 @@
-import subprocess
+import logging
import os
+from cartiflette.mapshaper.utils import run
+
+logger = logging.getLogger(__name__)
+
+# TODO : TOM (St-Martin, St-Barthelemy, St-Pierre-et-Miquelon)
+
logical_conditions = {
"EMPRISES": {
+ # left, bottom, right, top (epsg=3857)
+ "ile de france": "IDF==1",
"metropole": "bbox=-572324.2901945524,5061666.243842439,1064224.7522608414,6638201.7541528195",
"guadeloupe": "bbox=-6880639.760944527,1785277.734007631,-6790707.017202182,1864381.5053494961",
- "martinique": 'bbox=-6815985.711078632,1618842.9696702233,-6769303.6899859235,1675227.3853840816',
+ "martinique": "bbox=-6815985.711078632,1618842.9696702233,-6769303.6899859235,1675227.3853840816",
"guyane": "bbox=-6078313.094526156,235057.05702474713,-5746208.123095576,641016.7211362486",
- "reunion": 'bbox=6146675.557436854,-2438398.996947137,6215705.133130206,-2376601.891080389',
- "mayotte": 'bbox=5011418.778972076,-1460351.1566339568,5042772.003914668,-1418243.6428180535'
- },
- "DEPARTEMENT": {
- "ile de france": "['75', '92', '93', '94'].includes(INSEE_DEP)",
- "zoom idf": 4,
- },
- "REGION": {
- "ile de france": "INSEE_REG == 11",
- "zoom idf": 1.5
- },
- "BASSIN_VIE": {
- "ile de france": "BV2012 == 75056",
- "zoom idf": 1.5
- },
- "UNITE_URBAINE": {
- "ile de france": "UU2020 == '00851'",
- "zoom idf": 1.5
+ "reunion": "bbox=6146675.557436854,-2438398.996947137,6215705.133130206,-2376601.891080389",
+ "mayotte": "bbox=5011418.778972076,-1460351.1566339568,5042772.003914668,-1418243.6428180535",
+ "saint-martin": "bbox=-7034906.766337046, 2038329.0872462029, -7009537.630813715, 2056865.7060235194",
+ "saint-pierre-et-miquelon": "bbox=-6298822.299318486, 5894013.594517256, -6239181.296921183, 5973004.907786214",
+ "saint-barthelemy": "bbox=-7003557.376380256, 2018598.440800959, -6985037.106437805, 2033965.5078367123",
},
- "ZONE_EMPLOI": {
- "ile de france": 'ZE2020 == 1109',
- "zoom idf": 1.5
- },
- "AIRE_ATTRACTION_VILLES": {
- "ile de france": "AAV2020 == '001'",
- "zoom idf": 1.5
- }
-
+ "IRIS": 8,
}
shift = {
- 'guadeloupe': '6355000,3330000',
- 'martinique': '6480000,3505000',
- 'guyane': '5760000,4720000',
- 'reunion': '-6170000,7560000',
- 'mayotte': '-4885000,6590000',
+ # X, Y shift
+ "guadeloupe": "6355000,3330000",
+ "martinique": "6480000,3505000",
+ "guyane": "5760000,4720000",
+ "reunion": "-6170000,7560000",
+ "mayotte": "-4885000,6590000",
+ "saint-martin": "5690000,-900000",
+ "saint-pierre-et-miquelon": "2880000,-2910000",
+ "saint-barthelemy": "5670000,-730000",
}
scale = {
- 'guadeloupe': '1.5',
- 'martinique': '1.5',
- 'guyane': '0.35',
- 'reunion': '1.5',
- 'mayotte': '1.5'
+ "guadeloupe": "1.5",
+ "martinique": "1.5",
+ "guyane": "0.35",
+ "reunion": "1.5",
+ "mayotte": "1.5",
+ "saint-martin": "2.5",
+ "saint-pierre-et-miquelon": "2",
+ "saint-barthelemy": "2.5",
}
def mapshaper_bring_closer(
- france_vector_path="temp.geojson",
- level_agreg="DEPARTEMENT"
- ):
-
- output_path = "temp/preprocessed_transformed/idf_combined.geojson"
- output_dir = os.path.dirname(output_path)
+ input_file: str,
+ bring_out_idf: bool = True,
+ output_dir: str = "temp",
+ output_name: str = "output",
+ output_format: str = "geojson",
+ level_agreg: str = "DEPARTEMENT",
+ quiet: bool = True,
+):
+ """
+ Bring DROM closer and zoom over IDF.
+
+ Parameters
+ ----------
+ input_file : str
+ Path to the input file.
+ bring_out_idf : bool, optional
+ If True, will extract IdF and zoom on it. The default is True.
+ output_dir : str
+ Directory to store the output file. The default is "temp"
+ output_name : str, optional
+ Name of the written file, without extension. The default is "output".
+ output_format : str, optional
+ The format to write the outputfile. The default is "geojson".
+ level_agreg : str, optional
+ Desired aggregation configuration. The default is "DEPARTEMENT".
+ quiet : bool, optional
+ If True, inhibits console messages. The default is True.
+
+ Returns
+ -------
+ str
+ Local path to the output file
+
+ """
+
+ try:
+ os.makedirs(output_dir)
+ except FileExistsError:
+ pass
+
+ logical_idf = logical_conditions["EMPRISES"]["ile de france"]
+ zoom_idf = logical_conditions.get(level_agreg, 5)
+ if zoom_idf < 5:
+ shift_idf = "-650000,275000"
+ elif zoom_idf < 6:
+ shift_idf = "-650000,320000"
+ else:
+ shift_idf = "-650000,450000"
- logical_idf = logical_conditions[level_agreg]["ile de france"]
- zoom_idf = logical_conditions[level_agreg]["zoom idf"]
logical_metropole = logical_conditions["EMPRISES"]["metropole"]
- idf_zoom = (
- f"mapshaper -i {france_vector_path} "
- f"-proj EPSG:3857 "
- f'-filter "{logical_idf}" '
- f"-affine shift=-650000,275000 scale={zoom_idf} "
- f"-o {output_dir}/idf_zoom.geojson"
- )
-
- france_metropolitaine = (
- f"mapshaper -i {france_vector_path} "
- f"-proj EPSG:3857 "
- f'-filter "{logical_metropole}" '
- f"-o {output_dir}/metropole.geojson"
- )
-
- subprocess.run(
- idf_zoom,
- shell=True,
- check=True,
- )
-
- subprocess.run(
- france_metropolitaine,
- shell=True,
- check=True,
- )
-
- for region, shift_value in shift.items():
- print(f"Processing {region}")
- cmd = (
- f"mapshaper -i {france_vector_path} "
+ quiet = "-quiet " if quiet else " "
+
+ try:
+ france_metropolitaine = (
+ f"mapshaper -i {input_file} "
f"-proj EPSG:3857 "
- f'-filter "{logical_conditions["EMPRISES"][region]}" '
- f'-affine shift={shift_value} scale={scale[region]} '
- f"-o {output_dir}/{region}.geojson"
+ f'-filter "{logical_metropole}" '
+ f"{quiet}"
+ f"-o {output_dir}/metropole.{output_format}"
+ )
+
+ if bring_out_idf:
+ idf_zoom = (
+ f"mapshaper -i {input_file} "
+ f"-proj EPSG:3857 "
+ f'-filter "{logical_idf}" '
+ f"-affine shift={shift_idf} scale={zoom_idf} "
+ f"{quiet}"
+ f"-o {output_dir}/idf_zoom.{output_format}"
+ )
+
+ run(idf_zoom)
+
+ run(france_metropolitaine)
+
+ for region, shift_value in shift.items():
+ logger.info("Processing %s", region)
+ cmd = (
+ f"mapshaper -i {input_file} "
+ f"-proj EPSG:3857 "
+ f'-filter "{logical_conditions["EMPRISES"][region]}" '
+ f"-affine shift={shift_value} scale={scale[region]} "
+ f"{quiet}"
+ f"-o {output_dir}/{region}.{output_format}"
+ )
+ run(cmd)
+
+ # fix_geo = "fix-geometry" if output_format == "topojson" else ""
+
+ output = f"{output_dir}/{output_name}.{output_format}"
+ bring_out_idf = (
+ f"{output_dir}/idf_zoom.{output_format} " if bring_out_idf else ""
)
- subprocess.run(
- cmd,
- shell=True,
- check=True,
+ cmd_combined = (
+ f"mapshaper "
+ f"{output_dir}/metropole.{output_format} "
+ + bring_out_idf
+ + f"{output_dir}/guadeloupe.{output_format} "
+ f"{output_dir}/martinique.{output_format} "
+ f"{output_dir}/guyane.{output_format} "
+ f"{output_dir}/reunion.{output_format} "
+ f"{output_dir}/mayotte.{output_format} "
+ f"snap combine-files "
+ f'-proj wgs84 init="EPSG:3857" target=* '
+ f"-rename-layers FRANCE,IDF,GDP,MTQ,GUY,REU,MAY "
+ f"-merge-layers target=FRANCE,IDF,GDP,MTQ,GUY,REU,MAY force "
+ f"-rename-layers FRANCE_TRANSFORMED "
+ "-explode "
+ f"{quiet}"
+ f"-o {output} "
+ # f"{fix_geo}"
)
- cmd_combined = (
- f"mapshaper "
- f"{output_dir}/metropole.geojson "
- f"{output_dir}/idf_zoom.geojson "
- f"{output_dir}/guadeloupe.geojson "
- f"{output_dir}/martinique.geojson "
- f"{output_dir}/guyane.geojson "
- f"{output_dir}/reunion.geojson "
- f"{output_dir}/mayotte.geojson "
- f"snap combine-files "
- f'-proj wgs84 init="EPSG:3857" target=* '
- f"-rename-layers FRANCE,IDF,GDP,MTQ,GUY,REU,MAY "
- f"-merge-layers target=FRANCE,IDF,GDP,MTQ,GUY,REU,MAY force "
- f"-rename-layers FRANCE_TRANSFORMED "
- f"-o {output_dir}/idf_combined.geojson "
- )
-
- subprocess.run(
- cmd_combined,
- shell=True,
- check=True,
- )
-
- return f"{output_dir}/idf_combined.geojson"
\ No newline at end of file
+ run(cmd_combined)
+ except Exception:
+ raise
+
+ finally:
+ for tempfile in [
+ "metropole",
+ "idf_zoom",
+ "guadeloupe",
+ "martinique",
+ "guyane",
+ "reunion",
+ "mayotte",
+ ]:
+ try:
+ os.unlink(f"{output_dir}/{tempfile}.{output_format}")
+ except FileNotFoundError:
+ pass
+
+ return output
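
The shift and scale tables above are affine parameters (in EPSG:3857 metres) that move each overseas territory next to the hexagon. A usage sketch, assuming the input was tagged upstream with an IDF field (1 for Ile-de-France, 0 otherwise):

from cartiflette.mapshaper import mapshaper_bring_closer

output = mapshaper_bring_closer(
    "temp/departements.geojson",  # hypothetical France-wide file
    bring_out_idf=True,           # also produce a zoomed Ile-de-France inset
    level_agreg="DEPARTEMENT",
    output_name="france_transformed",
)
print(output)  # temp/france_transformed.geojson, with DROM brought closer
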
diff --git a/cartiflette/mapshaper/mapshaper_combine_districts_and_cities.py b/cartiflette/mapshaper/mapshaper_combine_districts_and_cities.py
new file mode 100644
index 00000000..42d68c1f
--- /dev/null
+++ b/cartiflette/mapshaper/mapshaper_combine_districts_and_cities.py
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+
+import os
+
+from .utils import run
+
+
+def mapshaper_combine_districts_and_cities(
+ input_city_file: str,
+ input_communal_districts_file: str,
+ output_dir: str,
+ output_name: str = "output",
+ output_format: str = "geojson",
+ quiet: bool = True,
+) -> str:
+ """
+ Combine the cities dataset with the communal districts one, ensuring
+ layers are renamed before merging.
+
+ Parameters
+ ----------
+ input_city_file : str
+ Path to the input cities file.
+ input_communal_districts_file : str
+ Path to the input communal districts file.
+ output_dir : str
+ Directory to store the output file.
+ output_name : str, optional
+ Name of the written file, without extension. The default is "output".
+ output_format : str, optional
+ Format for output file. The default is "geojson".
+ quiet : bool, optional
+ If True, inhibits console messages. The default is True.
+
+ Returns
+ -------
+ output : str
+ Path of the created file
+
+ """
+
+ try:
+ os.makedirs(output_dir)
+ except FileExistsError:
+ pass
+
+ # fix_geo = "fix-geometry" if output_format == "topojson" else ""
+ quiet = "-quiet " if quiet else " "
+ output = f"{output_dir}/{output_name}.{output_format}"
+
+ rename = "INSEE_ARM=INSEE_COG,NOM_ARM=NOM"
+ drop = "STATUT,INSEE_ARR,INSEE_CAN,INSEE_DEP,INSEE_REG,SIREN_EPCI"
+
+ cmd = (
+ f"mapshaper {input_city_file} {input_communal_districts_file} "
+ "snap combine-files "
+ "-proj EPSG:4326 "
+ "-rename-layers COMMUNE,ARRONDISSEMENT_MUNICIPAL "
+ "-merge-layers target=COMMUNE,ARRONDISSEMENT_MUNICIPAL force "
+ "-rename-layers ARRONDISSEMENT_MUNICIPAL "
+ f"-rename-fields {rename} "
+ f"-drop fields={drop} "
+ f"-o {output} "
+ # f"{fix_geo} "
+ f"{quiet}"
+ f"format={output_format} "
+ f'extension=".{output_format}"'
+ )
+ run(cmd)
+
+ return output
diff --git a/cartiflette/mapshaper/mapshaper_concat.py b/cartiflette/mapshaper/mapshaper_concat.py
new file mode 100644
index 00000000..5c1959b7
--- /dev/null
+++ b/cartiflette/mapshaper/mapshaper_concat.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os
+
+from .utils import run
+
+
+def mapshaper_concat(
+ input_dir: str,
+ input_format: str = "*",
+ output_dir: str = "temp",
+ output_name: str = "concatenated",
+ output_format: str = "geojson",
+ quiet: bool = True,
+) -> str:
+ """
+ Concat multiple files (all files should have the same projection).
+
+ Parameters
+ ----------
+ input_dir : str
+ Directory containing the files to concat
+ input_format : str, optional
+ Input files' format. If "*", any extension is matched.
+ The default is "*"
+ output_dir : str
+ Directory to store the output file. The default is "temp"
+ output_name : str, optional
+ Name of the written file, without extension. The default is "concatenated".
+ output_format : str, optional
+ The format to write the outputfile. The default is "geojson".
+ quiet : bool, optional
+ If True, inhibits console messages. The default is True.
+
+ Returns
+ -------
+ output : str
+ Path of the created file
+
+ """
+
+ try:
+ os.makedirs(output_dir)
+ except FileExistsError:
+ pass
+
+ quiet = "-quiet " if quiet else " "
+ output = f"{output_dir}/{output_name}.{output_format}"
+
+ cmd = (
+ f"mapshaper -i {input_dir}/*.{input_format}"
+ f" combine-files name='{output_name}' "
+ f"-proj EPSG:4326 "
+ f"-merge-layers "
+ f"{quiet}"
+ f'-o {output} format={output_format} extension=".{output_format}" '
+ "singles"
+ )
+
+ run(cmd)
+
+ return output
diff --git a/cartiflette/mapshaper/mapshaper_convert_mercator.py b/cartiflette/mapshaper/mapshaper_convert_mercator.py
deleted file mode 100644
index ce66b9c9..00000000
--- a/cartiflette/mapshaper/mapshaper_convert_mercator.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import subprocess
-
-
-def mapshaper_convert_mercator(
- local_dir="temp",
- territory="reunion",
- file="COMMUNE",
- extension_initial="shp",
- format_intermediate="geojson",
- identifier="",
- output_path=None,
-):
- if output_path is None:
- output_path = f"{local_dir}/preprocessed"
-
- output_name = f"{output_path}/{territory}.{format_intermediate}"
-
- if identifier != "":
- identifier = f"-each \"AREA='{identifier}'\" "
-
- subprocess.run(
- (
- f"mapshaper {local_dir}/{territory}/COMMUNE.{extension_initial} name='COMMUNE' "
- f"-proj EPSG:4326 "
- f"{identifier}"
- f"-o {output_name} "
- f'format={format_intermediate} extension=".{format_intermediate}" singles'
- ),
- shell=True,
- check=True,
- )
-
- return output_name
diff --git a/cartiflette/mapshaper/mapshaper_convert_reproject.py b/cartiflette/mapshaper/mapshaper_convert_reproject.py
new file mode 100644
index 00000000..a51b868f
--- /dev/null
+++ b/cartiflette/mapshaper/mapshaper_convert_reproject.py
@@ -0,0 +1,68 @@
+import os
+
+from .utils import run
+
+
+def mapshaper_convert_reproject(
+ input_file: str,
+ epsg: int = 4326,
+ output_dir: str = "temp",
+ output_name: str = "output",
+ output_format: str = "geojson",
+ filter_by: str = "",
+ quiet: bool = True,
+) -> str:
+ """
+ Project a file to a given EPSG (into a given format).
+ If filter_by is given, an AREA field set to that value is added to every
+ feature (used downstream to recognise the territory).
+
+ Parameters
+ ----------
+ input_file : str
+ Path to the input file.
+ epsg : int, optional
+ EPSG code to project into. The default is 4326.
+ output_dir : str, optional
+ Directory to store the output file. The default is "temp"
+ output_name : str, optional
+ Name of the written file, without extension. The default is "output".
+ output_format : str, optional
+ The format to write the outputfile. The default is "geojson".
+ filter_by: str, optional
+ Value assigned to an AREA field added to every feature. The default is
+ "", which does not add the field.
+ quiet : bool, optional
+ If True, inhibits console messages. The default is True.
+
+ Returns
+ -------
+ output : str
+ Path of the created file
+
+ """
+
+ try:
+ os.makedirs(output_dir)
+ except FileExistsError:
+ pass
+
+ quiet = "-quiet " if quiet else " "
+ output = f"{output_dir}/{output_name}.{output_format}"
+
+ if filter_by != "":
+ filter_by = f"-each \"AREA='{filter_by}'\" "
+
+ cmd = (
+ f"mapshaper {input_file} name='{output_name}' "
+ f"-proj EPSG:{epsg} "
+ f"{filter_by} "
+ f"{quiet}"
+ f"-o {output} force "
+ f'format={output_format} extension=".{output_format}" singles'
+ )
+
+ run(cmd)
+
+ return output
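
A usage sketch showing the AREA tagging performed by filter_by; the raw path is an assumption:

from cartiflette.mapshaper import mapshaper_convert_reproject

# reproject a raw file to WGS84 and stamp AREA='reunion' on every feature,
# which downstream steps use to recognise the territory
out = mapshaper_convert_reproject(
    "temp/reunion/COMMUNE.shp",
    epsg=4326,
    output_name="reunion",
    filter_by="reunion",
)
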
diff --git a/cartiflette/mapshaper/mapshaper_dissolve.py b/cartiflette/mapshaper/mapshaper_dissolve.py
new file mode 100644
index 00000000..ad1b2cd6
--- /dev/null
+++ b/cartiflette/mapshaper/mapshaper_dissolve.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os
+from typing import List
+
+from .utils import run
+
+
+def mapshaper_dissolve(
+ input_file: str,
+ by: List[str],
+ copy_fields: List[str] = None,
+ calc: List[str] = None,
+ output_dir: str = "temp",
+ output_name: str = "output",
+ output_format: str = "geojson",
+ quiet: bool = True,
+) -> str:
+ """
+ Dissolve geometries
+
+ Dissolve geometries on field `by`, keeping fields `copy_fields`. Other
+ fields can be computed using javascript functions passed through the
+ `calc` argument.
+
+
+ Parameters
+ ----------
+ input_file : str
+ Path to the input file.
+ by : List[str]
+ Fields used to dissolve on.
+ copy_fields : List[str], optional
+ Copies values from the first feature in each group of dissolved
+ features. The default is None.
+ calc : List[str], optional
+ Fields to compute during the dissolve, described as valid js
+ expressions. For instance ["POPULATION=sum(POPULATION)"]. The default
+ is None.
+ output_dir : str
+ Directory to store the output file. The default is "temp"
+ output_name : str, optional
+ Name of the written file, without extension. The default is "output".
+ output_format : str, optional
+ The format to write the outputfile. The default is "geojson".
+ quiet : bool, optional
+ If True, inhibits console messages. The default is True.
+
+ Returns
+ -------
+ output : str
+ Path of the created file
+
+ """
+
+ try:
+ os.makedirs(output_dir)
+ except FileExistsError:
+ pass
+
+ quiet = "-quiet " if quiet else " "
+ output = f"{output_dir}/{output_name}.{output_format}"
+
+ name = "_".join(by)
+ by = ",".join(by)
+ cmd = (
+ f"mapshaper {input_file} "
+ f"name='{name}' "
+ "-proj EPSG:4326 "
+ f"-dissolve {by} "
+ )
+ if calc:
+ calc = ",".join(calc)
+ cmd += f"calc='{calc}' "
+ if copy_fields:
+ cmd += "copy-fields=" + ",".join(copy_fields)
+
+ # fix_geo = "fix-geometry" if output_format == "topojson" else ""
+
+ cmd += (
+ f" {quiet}"
+ f" -o {output} force "
+ # f"{fix_geo}"
+ )
+
+ run(cmd)
+
+ return output
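
A sketch combining copy-fields and calc; the field names are assumptions about the input:

from cartiflette.mapshaper import mapshaper_dissolve

out = mapshaper_dissolve(
    "temp/communes.geojson",               # hypothetical commune-level file
    by=["INSEE_DEP"],
    copy_fields=["INSEE_REG"],             # taken from the first feature of each group
    calc=["POPULATION=sum(POPULATION)"],   # aggregated with a mapshaper js expression
    output_name="departements",
)
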
diff --git a/cartiflette/mapshaper/mapshaper_enrich.py b/cartiflette/mapshaper/mapshaper_enrich.py
new file mode 100644
index 00000000..ca6c68ea
--- /dev/null
+++ b/cartiflette/mapshaper/mapshaper_enrich.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os
+from typing import List
+
+from .utils import run
+
+
+def mapshaper_enrich(
+ input_geodata_file: str,
+ input_metadata_file: str,
+ keys: List[str],
+ dtype: dict = None,
+ drop: list = None,
+ rename: dict = None,
+ output_dir: str = "temp",
+ output_name: str = "output",
+ output_format: str = "geojson",
+ quiet: bool = True,
+) -> str:
+ """
+ Enriches an initial geodata file with additional data using Mapshaper.
+
+ Parameters
+ ----------
+ input_geodata_file : str
+ Path to the input geodata file.
+ input_metadata_file : str
+ Path to the input metadata file to join to the geodata file.
+ keys : List[str]
+ List of fields used for joining the dataframes. Should be a tuple
+ corresponding to left-field and right-field, for instance
+ ['INSEE_COM', 'CODGEO']
+ dtype : dict, optional
+ Dtypes (among "str", "string", "num", "number"), for
+ instance {"INSEE_REG": "str"} . Default is None.
+ drop : list, optional
+ List of columns to drop (if not None). Default is None.
+ rename : dict, optional
+ List of columns to rename (if not None) in a pandas' syntax-like.
+ To rename A -> B, pass {"A": "B"}. The default is None.
+ output_dir : str, optional
+ Directory to store the output file. The default is "temp"
+ output_name : str, optional
+ Name of the written file, without extension. The default is "output".
+ output_format : str, optional
+ The format to write the outputfile. The default is "geojson".
+ quiet : bool, optional
+ If True, inhibits console messages. The default is True.
+
+ Returns
+ -------
+ output : str
+ Path of the created file
+
+ """
+
+ try:
+ os.makedirs(output_dir)
+ except FileExistsError:
+ pass
+
+ quiet = "-quiet " if quiet else " "
+ output = f"{output_dir}/{output_name}.{output_format}"
+ dtype = ",".join(
+ [f"{key}:{val}" for key, val in dtype.items()] if dtype else []
+ )
+ keys = ",".join(keys)
+ drop = ",".join(drop if drop else [])
+
+ # Warning : for mapshaper, to rename A -> B, use B=A syntax!
+ rename = ",".join(
+ [f"{val}={key}" for key, val in rename.items()] if rename else []
+ )
+
+ # Mapshaper command for the enrichment process
+ cmd = (
+ f"mapshaper {input_geodata_file} "
+ "name='' -proj EPSG:4326 "
+ f"-join {input_metadata_file} keys={keys} field-types={dtype} "
+ f"-filter-fields {drop} invert "
+ f"-rename-fields {rename} "
+ "-each \"PAYS='France'\" "
+ f"{quiet}"
+ f"-o {output} force"
+ )
+
+ # Run Mapshaper command
+ run(cmd)
+
+ return output
+
+
+# %%
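
A sketch of the join performed by mapshaper_enrich, mirroring the arguments of the former mapshaper_wrangling version; the file paths are assumptions:

from cartiflette.mapshaper import mapshaper_enrich

out = mapshaper_enrich(
    "temp/COMMUNE.shp",                    # geodata side of the join
    "temp/tagc.csv",                       # metadata side of the join
    keys=["INSEE_COM", "CODGEO"],          # left field, right field
    dtype={"INSEE_COM": "str", "CODGEO": "str"},
    drop=["NOM_M"],                        # columns removed after the join
    rename={"DEP": "INSEE_DEP", "REG": "INSEE_REG"},  # pandas-like: old -> new
    output_name="enriched",
)
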
diff --git a/cartiflette/mapshaper/mapshaper_process_communal_districts.py b/cartiflette/mapshaper/mapshaper_process_communal_districts.py
new file mode 100644
index 00000000..4fa8cdf0
--- /dev/null
+++ b/cartiflette/mapshaper/mapshaper_process_communal_districts.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os
+
+from .utils import run
+
+
+def mapshaper_process_communal_districts(
+ input_communal_districts_file: str,
+ output_dir: str,
+ output_name: str = "output",
+ output_format: str = "geojson",
+ quiet: bool = True,
+) -> str:
+ """
+ Preprocess the communal districts file: reproject it, rename INSEE_ARM to
+ INSEE_COG and set a STATUT field, so it can be merged with the cities
+ dataset.
+
+ Parameters
+ ----------
+ input_communal_districts_file : str
+ Path to the input file.
+ output_dir : str
+ Directory to store the output file. The default is "temp".
+ output_name : str, optional
+ Name of the written file, without extension. The default is "output".
+ output_format : str, optional
+ Format for output file. The default is "geojson".
+ quiet : bool, optional
+ If True, inhibits console messages. The default is True.
+
+ Returns
+ -------
+ output : str
+ Path of the created file
+
+ """
+ try:
+ os.makedirs(output_dir)
+ except FileExistsError:
+ pass
+
+ quiet = "-quiet " if quiet else " "
+ output = f"{output_dir}/{output_name}.{output_format}"
+
+ cmd = (
+ f"mapshaper {input_communal_districts_file} "
+ "name='ARRONDISSEMENT_MUNICIPAL' "
+ "-proj EPSG:4326 "
+ "-rename-fields INSEE_COG=INSEE_ARM "
+ "-each 'STATUT=\"Arrondissement municipal\"' "
+ f"{quiet}"
+ "-o force "
+ f'{output} format={output_format} extension=".{output_format}"'
+ )
+ run(cmd)
+
+ return output
diff --git a/cartiflette/mapshaper/mapshaper_remove_cities_with_districts.py b/cartiflette/mapshaper/mapshaper_remove_cities_with_districts.py
new file mode 100644
index 00000000..840d693e
--- /dev/null
+++ b/cartiflette/mapshaper/mapshaper_remove_cities_with_districts.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os
+
+from .utils import run
+
+
+def mapshaper_remove_cities_with_districts(
+ input_city_file: str,
+ output_dir: str = "temp",
+ output_name: str = "output",
+ output_format: str = "geojson",
+ quiet: bool = True,
+) -> str:
+ """
+ Remove cities with communal districts (Paris, Lyon, Marseille) from the
+ base cities geodataset.
+
+ Parameters
+ ----------
+ input_city_file : str
+ Path to the input file.
+ output_dir : str
+ Directory to store the output file. The default is "temp".
+ output_name : str, optional
+ Name of the written file, without extension. The default is "output".
+ output_format : str, optional
+ Format for output file. The default is "geojson".
+ quiet : bool, optional
+ If True, inhibits console messages. The default is True.
+
+ Returns
+ -------
+ output : str
+ Path of the created file
+
+ """
+ try:
+ os.makedirs(output_dir)
+ except FileExistsError:
+ pass
+
+ quiet = "-quiet " if quiet else " "
+ output = f"{output_dir}/{output_name}.{output_format}"
+
+ cmd = (
+ f"mapshaper {input_city_file} name='COMMUNE' -proj EPSG:4326 "
+ "-filter \"'69123,13055,75056'.indexOf(INSEE_COM) > -1\" invert "
+ '-each "INSEE_COG=INSEE_COM" '
+ f"{quiet}"
+ "-o force "
+ f'{output} format={output_format} extension=".{output_format}" singles'
+ )
+ run(cmd)
+
+ return output
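
The three wrappers for the Paris/Lyon/Marseille case are meant to be used together; a sketch, where the raw file names and their fields (INSEE_COM, INSEE_ARM) are assumptions:

from cartiflette.mapshaper import (
    mapshaper_combine_districts_and_cities,
    mapshaper_process_communal_districts,
    mapshaper_remove_cities_with_districts,
)

# drop Paris, Lyon and Marseille from the cities layer...
cities = mapshaper_remove_cities_with_districts(
    "temp/COMMUNE.geojson", output_name="cities_without_plm"
)

# ...normalise the districts layer (INSEE_ARM -> INSEE_COG, STATUT set)...
districts = mapshaper_process_communal_districts(
    "temp/ARRONDISSEMENT_MUNICIPAL.geojson",
    output_dir="temp",
    output_name="districts",
)

# ...and merge both into a single district-level layer
combined = mapshaper_combine_districts_and_cities(
    cities, districts, output_dir="temp", output_name="cities_with_districts"
)
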
diff --git a/cartiflette/mapshaper/mapshaper_simplify.py b/cartiflette/mapshaper/mapshaper_simplify.py
new file mode 100644
index 00000000..9d9287c6
--- /dev/null
+++ b/cartiflette/mapshaper/mapshaper_simplify.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os
+
+from .utils import run
+
+
+def mapshaper_simplify(
+ input_file: str,
+ option_simplify: str = "",
+ output_dir: str = "temp",
+ output_name: str = "output",
+ output_format: str = "geojson",
+ quiet: bool = True,
+) -> str:
+ """
+ Simplify geometries.
+
+
+ Parameters
+ ----------
+ input_file : str
+ Path to the input file.
+ option_simplify : str, optional
+ Additional options for simplifying geometries, for instance
+ "-simplify 50%". The default is "".
+ output_dir : str
+ Directory to store the output file. The default is "temp"
+ output_name : str, optional
+ Name of the written file, without extension. The default is "output".
+ output_format : str, optional
+ The format to write the outputfile. The default is "geojson".
+ quiet : bool, optional
+ If True, inhibits console messages. The default is True.
+
+ Returns
+ -------
+ output : str
+ Path of the created file
+
+ """
+
+ try:
+ os.makedirs(output_dir)
+ except FileExistsError:
+ pass
+
+ # fix_geo = "fix-geometry" if output_format == "topojson" else ""
+
+ quiet = "-quiet " if quiet else " "
+ output = f"{output_dir}/{output_name}.{output_format}"
+
+ cmd = (
+ f"mapshaper {input_file} "
+ "-proj EPSG:4326 "
+ f"{option_simplify} "
+ f"{quiet}"
+ f" -o {output} force "
+ # f"{fix_geo}"
+ )
+
+ run(cmd)
+
+ return output
diff --git a/cartiflette/mapshaper/mapshaper_split.py b/cartiflette/mapshaper/mapshaper_split.py
new file mode 100644
index 00000000..b1e54675
--- /dev/null
+++ b/cartiflette/mapshaper/mapshaper_split.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+from glob import glob
+import os
+from typing import List
+
+from .utils import run
+
+
+def mapshaper_split(
+ input_file: str,
+ layer_name: str = "",
+ split_variable: str = "DEPARTEMENT",
+ output_dir: str = "temp",
+ output_format: str = "geojson",
+ crs: int = 4326,
+ option_simplify: str = "",
+ quiet: bool = True,
+) -> List[str]:
+ """
+ Splits a GeoJSON file based on a specified variable using Mapshaper.
+
+ Parameters
+ ----------
+ input_file : str
+ The input file to be split.
+ layer_name : str, optional
+ The name of the layer within the file. The default is "".
+ split_variable : str, optional
+ The variable used for splitting the file. The default is "DEPARTEMENT".
+ output_dir : str, optional
+ Directory to store output files. The default is "temp".
+ output_format : str, optional
+ Format for output files. The default is "geojson".
+ crs : int, optional
+ The coordinate reference system EPSG code. The default is 4326.
+ option_simplify : str, optional
+ Additional options for simplifying geometries, for instance
+ "-simplify 50%". The default is "".
+ quiet : bool, optional
+ If True, inhibits console messages. The default is True.
+
+ Returns
+ -------
+ final_files : List[str]
+ List of paths of created files
+
+ """
+
+ # make a temporary inner directory to retrieve the full list of produced
+ # files at the end
+ temp_output_dir = os.path.join(output_dir, "this_is_a_dumb_temp_directory")
+ try:
+ os.makedirs(temp_output_dir)
+ except FileExistsError:
+ pass
+
+ quiet = "-quiet " if quiet else " "
+
+ # Mapshaper command for the splitting process
+ cmd = (
+ f"mapshaper {input_file} name='{layer_name}' -proj EPSG:{crs} "
+ f"{option_simplify} "
+ f"-split {split_variable} "
+ "-drop fields=IDF " # remove IDF used for tagging IdF entities on every level
+ f"{quiet}"
+ f"-o {temp_output_dir}/ "
+ f'format={output_format} extension=".{output_format}" singles'
+ )
+
+ # Run Mapshaper command
+ run(cmd)
+
+ produced_files = glob(os.path.join(temp_output_dir, f"*.{output_format}"))
+ final_files = [
+ file.replace(temp_output_dir, output_dir) for file in produced_files
+ ]
+ for src, dst in zip(produced_files, final_files):
+     os.replace(src, dst)
+ os.rmdir(temp_output_dir)
+
+ return final_files
diff --git a/cartiflette/mapshaper/mapshaper_wrangling.py b/cartiflette/mapshaper/mapshaper_wrangling.py
deleted file mode 100644
index 827db5aa..00000000
--- a/cartiflette/mapshaper/mapshaper_wrangling.py
+++ /dev/null
@@ -1,92 +0,0 @@
-"""
-Data wrangling (geo)operations wrappers from mapshaper.
-"""
-
-import subprocess
-from cartiflette.utils import DICT_CORRESP_ADMINEXPRESS
-
-
-def mapshaper_enrich(
- local_dir: str = "temp",
- filename_initial: str = "COMMUNE",
- extension_initial: str = "shp",
- output_path: str = "temp.geojson",
- metadata_file: str = "temp/tagc.csv",
- dict_corresp: dict = DICT_CORRESP_ADMINEXPRESS,
-) -> None:
- """
- Enriches an initial shapefile with additional data using Mapshaper and a specified
- correspondence dictionary.
-
- Parameters:
- - local_dir (str): The local directory where the initial shapefile is stored and
- Mapshaper will be executed (default is "temp").
- - filename_initial (str): The name of the initial shapefile without extension
- (default is "COMMUNE").
- - extension_initial (str): The extension of the initial shapefile (default is "shp").
- - output_path (str): The path for the output GeoJSON file after enrichment
- (default is "temp.geojson").
- - dict_corresp (dict): A dictionary containing correspondences for field renaming
- and value assignment (default is DICT_CORRESP_ADMINEXPRESS).
-
- Returns:
- - None: The function runs Mapshaper with the specified commands and enriches
- the initial shapefile.
- """
-
- # Mapshaper command for the enrichment process
- cmd_step1 = (
- f"mapshaper {local_dir}/{filename_initial}.{extension_initial} "
- f"name='' -proj EPSG:4326 "
- f"-join {metadata_file} "
- f"keys=INSEE_COM,CODGEO field-types=INSEE_COM:str,CODGEO:str "
- f"-filter-fields INSEE_CAN,INSEE_ARR,SIREN_EPCI,INSEE_DEP,INSEE_REG,NOM_M invert "
- f"-rename-fields INSEE_DEP=DEP,INSEE_REG=REG "
- f"-each \"{dict_corresp['FRANCE_ENTIERE']}='France'\" "
- f"-o {output_path}"
- )
-
- # Run Mapshaper command
- subprocess.run(cmd_step1, shell=True, check=True)
-
-
-def mapshaper_split(
- input_file: str = "temp.geojson",
- layer_name: str = "",
- split_variable: str = "DEPARTEMENT",
- output_path: str = "temp2.geojson",
- format_output: str = "geojson",
- crs: int = 4326,
- option_simplify: str = "",
- source_identifier: str = "",
-) -> None:
- """
- Splits a GeoJSON file based on a specified variable using Mapshaper.
-
- Parameters:
- - input_file (str): The input GeoJSON file to be split (default is "temp.geojson").
- - layer_name (str): The name of the layer within the GeoJSON file (default is "").
- - split_variable (str): The variable used for splitting the GeoJSON file
- (default is "DEPARTEMENT").
- - output_path (str): The path for the output GeoJSON file after splitting
- (default is "temp2.geojson").
- - format_output (str): The format for the output GeoJSON file (default is "geojson").
- - crs (int): The coordinate reference system EPSG code (default is 4326).
- - option_simplify (str): Additional options for simplifying geometries (default is "").
- - source_identifier (str): Identifier for the data source (default is "").
-
- Returns:
- - None: The function runs Mapshaper with the specified commands and splits the GeoJSON file.
- """
-
- # Mapshaper command for the splitting process
- cmd_step2 = (
- f"mapshaper {input_file} name='{layer_name}' -proj EPSG:{crs} "
- f"{option_simplify}"
- f"-each \"SOURCE='{source_identifier}'\" "
- f"-split {split_variable} "
- f'-o {output_path} format={format_output} extension=".{format_output}" singles'
- )
-
- # Run Mapshaper command
- subprocess.run(cmd_step2, shell=True, check=True)
diff --git a/cartiflette/mapshaper/mapshaperize.py b/cartiflette/mapshaper/mapshaperize.py
deleted file mode 100644
index 54d97506..00000000
--- a/cartiflette/mapshaper/mapshaperize.py
+++ /dev/null
@@ -1,262 +0,0 @@
-import os
-import subprocess
-
-from cartiflette.utils import DICT_CORRESP_ADMINEXPRESS
-from .mapshaper_wrangling import mapshaper_enrich, mapshaper_split
-from .mapshaper_closer import mapshaper_bring_closer
-
-
-def mapshaperize_split(
- local_dir="temp",
- config_file_city={},
- format_output="topojson",
- niveau_polygons="COMMUNE",
- niveau_agreg="DEPARTEMENT",
- provider="IGN",
- source="EXPRESS-COG-CARTO-TERRITOIRE",
- territory="metropole",
- crs=4326,
- simplification=0,
- dict_corresp=DICT_CORRESP_ADMINEXPRESS,
-):
- """
- Processes shapefiles and splits them based on specified parameters using Mapshaper.
-
- Parameters
- ----------
- local_dir : str, optional
- The local directory for file storage, by default "temp".
- filename_initial : str, optional
- The initial filename, by default "COMMUNE".
- extension_initial : str, optional
- The initial file extension, by default "shp".
- format_output : str, optional
- The output format, by default "topojson".
- niveau_agreg : str, optional
- The level of aggregation for the split, by default "DEPARTEMENT".
- provider : str, optional
- The data provider, by default "IGN".
- source : str, optional
- The data source, by default "EXPRESS-COG-CARTO-TERRITOIRE".
- year : int, optional
- The year of the data, by default 2022.
- dataset_family : str, optional
- The dataset family, by default "ADMINEXPRESS".
- territory : str, optional
- The territory of the data, by default "metropole".
- crs : int, optional
- The coordinate reference system (CRS) code, by default 4326.
- simplification : int, optional
- The degree of simplification, by default 0.
- dict_corresp: dict
- A dictionary giving correspondance between niveau_agreg argument
- and variable names.
-
- Returns
- -------
- str
- The output path of the processed and split shapefiles.
-
- """
-
- simplification_percent = simplification if simplification is not None else 0
-
- # City level borders, file location
- directory_city = config_file_city.get("location", local_dir)
- initial_filename_city = config_file_city.get("filename", "COMMUNE")
- extension_initial_city = config_file_city.get("extension", "shp")
-
- output_path = (
- f"{local_dir}/{territory}/{niveau_agreg}/{format_output}/{simplification=}"
- )
-
- os.makedirs(output_path, exist_ok=True)
-
- if simplification_percent != 0:
- option_simplify = f"-simplify {simplification_percent}% "
- else:
- option_simplify = ""
-
- temp_filename = "temp.geojson"
-
- # STEP 1: ENRICHISSEMENT AVEC COG
- mapshaper_enrich(
- local_dir=directory_city,
- filename_initial=initial_filename_city,
- extension_initial=extension_initial_city,
- dict_corresp=dict_corresp,
- output_path=temp_filename,
- )
-
- if niveau_polygons != initial_filename_city:
- csv_list_vars = (
- f"{dict_corresp[niveau_polygons]}," f"{dict_corresp[niveau_agreg]}"
- )
- libelle_niveau_polygons = dict_corresp.get("LIBELLE_" + niveau_polygons, "")
- if libelle_niveau_polygons != "":
- libelle_niveau_polygons = f",{libelle_niveau_polygons}"
- libelle_niveau_agreg = dict_corresp.get("LIBELLE_" + niveau_agreg, "")
- if libelle_niveau_polygons != "":
- libelle_niveau_agreg = f",{libelle_niveau_agreg}"
- csv_list_vars = (
- f"{csv_list_vars}{libelle_niveau_polygons}{libelle_niveau_agreg}"
- )
-
- # STEP 1B: DISSOLVE IF NEEDED
- cmd_dissolve = (
- f"mapshaper {temp_filename} "
- f"name='' -proj EPSG:4326 "
- f"-dissolve {dict_corresp[niveau_polygons]} "
- f"calc='POPULATION=sum(POPULATION)' "
- f"copy-fields={csv_list_vars} "
- "-o temp.geojson force"
- )
- subprocess.run(cmd_dissolve, shell=True, check=True)
-
- # IF WE DESIRE TO BRING "DROM" CLOSER TO FRANCE
- if niveau_agreg.upper() == "FRANCE_ENTIERE_DROM_RAPPROCHES":
- niveau_filter_drom = "DEPARTEMENT"
- if niveau_polygons != "COMMUNE":
- niveau_filter_drom = niveau_polygons
- input_path = mapshaper_bring_closer(
- temp_filename, level_agreg=niveau_filter_drom
- )
- else:
- input_path = "temp.geojson"
-
- print(input_path)
-
- # STEP 2: SPLIT ET SIMPLIFIE
- mapshaper_split(
- input_file=input_path,
- layer_name="",
- split_variable=dict_corresp[niveau_agreg],
- output_path=output_path,
- format_output=format_output,
- crs=crs,
- option_simplify=option_simplify,
- source_identifier=f"{provider}:{source}",
- )
-
- return output_path
-
-
-def mapshaperize_split_merge(
- format_output="topojson",
- niveau_agreg="DEPARTEMENT",
- provider="IGN",
- source="EXPRESS-COG-CARTO-TERRITOIRE",
- territory="metropole",
- config_file_city={},
- config_file_arrondissement={},
- local_dir="temp",
- crs=4326,
- simplification=0,
- dict_corresp=DICT_CORRESP_ADMINEXPRESS,
-):
- simplification_percent = simplification if simplification is not None else 0
-
- # City level borders, file location
- directory_city = config_file_city.get("location", local_dir)
- initial_filename_city = config_file_city.get("filename", "COMMUNE")
- extension_initial_city = config_file_city.get("extension", "shp")
-
- # Arrondissement level borders, file location
- directory_arrondissement = config_file_arrondissement.get("location", local_dir)
- initial_filename_arrondissement = config_file_arrondissement.get(
- "filename", "ARRONDISSEMENT_MUNICIPAL"
- )
- extension_initial_arrondissement = config_file_arrondissement.get(
- "extension", "shp"
- )
-
- # Intermediate output location
- output_path = (
- f"{local_dir}/{territory}/{niveau_agreg}/{format_output}/{simplification=}"
- )
-
- if simplification_percent != 0:
- option_simplify = f"-simplify {simplification_percent}% "
- else:
- option_simplify = ""
-
- format_intermediate = "geojson"
-
- # PREPROCESS CITIES
- file_city = f"{directory_city}/{initial_filename_city}.{extension_initial_city}"
- subprocess.run(
- (
- f"mapshaper {file_city} name='COMMUNE' "
- f"-proj EPSG:4326 "
- f"-filter '\"69123,13055,75056\".indexOf(INSEE_COM) > -1' invert "
- f'-each "INSEE_COG=INSEE_COM" '
- f"-o {output_path}/communes_simples.{format_intermediate} "
- f'format={format_intermediate} extension=".{format_intermediate}" singles'
- ),
- shell=True,
- check=True,
- )
-
- # PREPROCESS ARRONDISSEMENT
- file_arrondissement = (
- f"{directory_arrondissement}/"
- f"{initial_filename_arrondissement}.{extension_initial_arrondissement}"
- )
- subprocess.run(
- (
- f"mapshaper {file_arrondissement} "
- f"name='ARRONDISSEMENT_MUNICIPAL' "
- f"-proj EPSG:4326 "
- f"-rename-fields INSEE_COG=INSEE_ARM "
- f"-each 'STATUT=\"Arrondissement municipal\" ' "
- f"-o {output_path}/arrondissements.{format_intermediate} "
- f'format={format_intermediate} extension=".{format_intermediate}"'
- ),
- shell=True,
- check=True,
- )
-
- # MERGE CITIES AND ARRONDISSEMENT
- subprocess.run(
- (
- f"mapshaper "
- f"{output_path}/communes_simples.{format_intermediate} "
- f"{output_path}/arrondissements.{format_intermediate} snap combine-files "
- f"-proj EPSG:4326 "
- f"-rename-layers COMMUNE,ARRONDISSEMENT_MUNICIPAL "
- f"-merge-layers target=COMMUNE,ARRONDISSEMENT_MUNICIPAL force "
- f"-rename-layers COMMUNE_ARRONDISSEMENT "
- f"-o {output_path}/raw.{format_intermediate} "
- f'format={format_intermediate} extension=".{format_intermediate}"'
- ),
- shell=True,
- check=True,
- )
-
- # STEP 1: ENRICHISSEMENT AVEC COG
- mapshaper_enrich(
- local_dir=output_path,
- filename_initial="raw",
- extension_initial=format_intermediate,
- output_path=f"{output_path}/raw2.{format_intermediate}",
- dict_corresp=DICT_CORRESP_ADMINEXPRESS,
- )
-
- input_path = f"{output_path}/raw2.{format_intermediate}"
-
- if niveau_agreg.upper() == "FRANCE_ENTIERE_DROM_RAPPROCHES":
- input_path = mapshaper_bring_closer(input_path)
-
- # TRANSFORM AS NEEDED
- mapshaper_split(
- input_file=input_path,
- layer_name="",
- split_variable=dict_corresp[niveau_agreg],
- output_path=output_path,
- format_output=format_output,
- crs=crs,
- option_simplify=option_simplify,
- source_identifier=f"{provider}:{source}",
- )
-
- return output_path
diff --git a/cartiflette/mapshaper/utils.py b/cartiflette/mapshaper/utils.py
new file mode 100644
index 00000000..a83bf783
--- /dev/null
+++ b/cartiflette/mapshaper/utils.py
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+"""
+Utils to ensure subprocess is run with the same level of debugging on Windows
+and Linux
+"""
+import logging
+import os
+import subprocess
+
+
+def run(cmd):
+ if os.name == "nt":
+ kwargs = {"shell": True, "text": True, "capture_output": True}
+ result = subprocess.run(cmd, **kwargs)
+ logging.info(result.stdout)
+        if result.returncode != 0:
+ logging.warning(result.stderr)
+ raise subprocess.CalledProcessError(result.returncode, cmd)
+ else:
+            # on Windows, mapshaper's output seems to always end up in stderr,
+            # whether there was an error or not
+ logging.info(result.stderr)
+
+ else:
+ kwargs = {
+ "shell": True,
+ "check": True,
+ "text": True,
+ }
+ subprocess.run(cmd, **kwargs)
+
+ return
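+
+
+# Hedged usage sketch: callers assemble the whole mapshaper command as a single
+# string and delegate execution, e.g.
+#     from cartiflette.mapshaper.utils import run
+#     run("mapshaper input.geojson -o output.geojson")
+# On both platforms a non-zero exit ends up raising
+# subprocess.CalledProcessError (through check=True on Linux, re-raised
+# explicitly on Windows after logging the captured output).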
diff --git a/cartiflette/pipeline/__init__.py b/cartiflette/pipeline/__init__.py
index 355511e8..6ed4c100 100644
--- a/cartiflette/pipeline/__init__.py
+++ b/cartiflette/pipeline/__init__.py
@@ -1,18 +1,15 @@
from .cross_product_parameters import (
- restructure_nested_dict_borders,
crossproduct_parameters_production,
)
-
-from .prepare_mapshaper import prepare_local_directory_mapshaper
from .mapshaper_split_from_s3 import (
mapshaperize_split_from_s3,
- mapshaperize_merge_split_from_s3,
+ mapshaperize_split_from_s3_multithreading,
)
+from .download import download_all
__all__ = [
- "restructure_nested_dict_borders",
"crossproduct_parameters_production",
- "prepare_local_directory_mapshaper",
"mapshaperize_split_from_s3",
- "mapshaperize_merge_split_from_s3",
+ "mapshaperize_split_from_s3_multithreading",
+ "download_all",
]
diff --git a/cartiflette/pipeline/combine_adminexpress_france.py b/cartiflette/pipeline/combine_adminexpress_france.py
deleted file mode 100644
index 43d6d078..00000000
--- a/cartiflette/pipeline/combine_adminexpress_france.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import subprocess
-
-from cartiflette.config import FS, PATH_WITHIN_BUCKET
-from cartiflette.utils import import_yaml_config
-from cartiflette.mapshaper import mapshaper_convert_mercator
-from cartiflette.s3 import upload_s3_raw
-from .prepare_mapshaper import prepare_local_directory_mapshaper
-
-
-def combine_adminexpress_territory(
- intermediate_dir="temp", path_within_bucket=PATH_WITHIN_BUCKET, fs=FS
-):
- local_dir = intermediate_dir
- format_intermediate = "geojson"
-
- yaml = import_yaml_config()
-
- list_territories = yaml["IGN"]["ADMINEXPRESS"]["EXPRESS-COG-TERRITOIRE"][
- "territory"
- ].keys()
-
- list_location_raw = {
- territ: upload_s3_raw(
- path_within_bucket=path_within_bucket, year=2022, territory=territ
- )
- for territ in list_territories
- }
-
- for territory, path_bucket in list_location_raw.items():
- prepare_local_directory_mapshaper(
- path_bucket,
- borders="COMMUNE",
- territory=territory,
- niveau_agreg="COMMUNE",
- format_output="geojson",
- simplification=0,
- local_dir=local_dir,
- fs=fs,
- )
-
- for territ in list_territories:
- mapshaper_convert_mercator(
- local_dir=local_dir, territory=territ, identifier=territ
- )
-
- output_path = f"{local_dir}/preprocessed_combined/raw.{format_intermediate}"
-
- subprocess.run(
- (
- f"mapshaper -i {local_dir}/preprocessed/*.geojson combine-files name='COMMUNE' "
- f"-proj EPSG:4326 "
- f"-merge-layers "
- f"-o {output_path} "
- f'format={format_intermediate} extension=".{format_intermediate}" singles'
- ),
- shell=True,
- check=True,
- )
-
- return output_path
diff --git a/cartiflette/pipeline/cross_product_parameters.py b/cartiflette/pipeline/cross_product_parameters.py
index 9282d7e2..414f9fef 100644
--- a/cartiflette/pipeline/cross_product_parameters.py
+++ b/cartiflette/pipeline/cross_product_parameters.py
@@ -1,109 +1,331 @@
-import itertools
+import logging
+
import pandas as pd
+from pebble import ThreadPool
+from s3fs import S3FileSystem
+from cartiflette.config import (
+ FS,
+ BUCKET,
+ PATH_WITHIN_BUCKET,
+ THREADS_DOWNLOAD,
+ INTERMEDIATE_FORMAT,
+)
+from cartiflette.pipeline_constants import (
+ AVAILABLE_DISSOLUTIONS_FROM_RAW_MESH,
+ AVAILABLE_TERRITORIAL_SPLITS_FOR_BORDERS,
+ PIPELINE_DOWNLOAD_ARGS,
+ PIPELINE_SIMPLIFICATION_LEVELS,
+)
+from cartiflette.s3 import S3GeoDataset
-def restructure_nested_dict_borders(dict_with_list: dict):
- """
- Restructures a nested dictionary by flattening its values and their corresponding keys.
+logger = logging.getLogger(__name__)
- Parameters:
- -----------
- dict_with_list : dict
- A dictionary with list values to be restructured.
- Returns:
- --------
- list
- A list of lists containing key-value pairs obtained by flattening the input dictionary.
+def flatten_dict_to_list(dict_with_list: dict) -> list:
+ """
+ Restructures a nested dictionary by flattening its values and their
+ corresponding keys.
- Example:
- --------
- Example usage:
- sample_dict = {'a': [1, 2, 3], 'b': [4, 5]}
- result = restructure_nested_dict_borders(sample_dict)
- print(result)
+ Parameters:
+ -----------
+ dict_with_list : dict
+ A dictionary with list values to be restructured.
+
+ Returns:
+ --------
+ flattened_list : list
+ A list of lists containing key-value pairs obtained by flattening
+ the input dictionary.
+
+ Example:
+ --------
+ Example usage:
+ sample_dict = {'a': [1, 2, 3], 'b': [4, 5]}
+    result = flatten_dict_to_list(sample_dict)
+ print(result)
- This will output:
- [['a', 1], ['a', 2], ['a', 3], ['b', 4], ['b', 5]]
+ This will output:
+    [['a', 1], ['a', 2], ['a', 3], ['b', 4], ['b', 5]]
"""
- croisement_filter_by_borders_flat = [
+ flattened_list = [
[key, inner_value]
for key, values in dict_with_list.items()
for inner_value in values
]
- return croisement_filter_by_borders_flat
+ return flattened_list
def crossproduct_parameters_production(
- croisement_filter_by_borders: dict,
- list_format: list,
- years: list,
- crs_list: list,
- sources: list,
- simplifications: list,
-) -> pd.DataFrame:
+ year: int,
+ simplifications: list = None,
+ fs: S3FileSystem = FS,
+ bucket: str = BUCKET,
+ path_within_bucket: str = PATH_WITHIN_BUCKET,
+) -> list:
"""
- Generates a DataFrame by performing a cross-product of the given parameters.
+    Generates a list of dicts of arguments commanding the generation of output
+    geodatasets. Only the best available process to generate a given dataset
+    is kept (for instance among available IRIS and COMMUNE candidates).
+
+    Note that the length of the returned list corresponds to the number of
+    downstream pods.
Parameters:
-----------
- croisement_filter_by_borders : dict
- A dictionary with nested lists for cross-product generation.
- list_format : list
- A list of formats for cross-product generation.
- years : list
- A list of years for cross-product generation.
- crs_list : list
- A list of CRS (Coordinate Reference Systems) for cross-product generation.
- sources : list
- A list of sources for cross-product generation.
- simplifications : list
- A list of simplifications for cross-product generation.
+ year : int
+        Desired vintage, e.g. 2023
+ simplifications : list, optional
+        A list of simplification levels for cross-product generation. The
+        default is None, which falls back to PIPELINE_SIMPLIFICATION_LEVELS.
+ fs : S3FileSystem, optional
+ S3FileSystem used for storage. The default is FS.
+ bucket : str, optional
+ The bucket used for storage on fs. The default is BUCKET.
+ path_within_bucket : str, optional
+ The path within the bucket used for storage on fs. The default is
+ PATH_WITHIN_BUCKET.
Returns:
--------
- pd.DataFrame
- A pandas DataFrame containing the cross-product of the input parameters.
+ combinations : list
+ A list of dicts used for commanding the generation of a downstream
+ dataset.
+
+ Each dict has 5 keys:
+            * mesh_init: str (for instance 'COMMUNE')
+            * source_geodata: str (for instance 'EXPRESS-COG-CARTO-TERRITOIRE')
+            * simplification: int (for instance 40)
+            * dissolve_by: str (for instance 'ARRONDISSEMENT')
+            * territories: List[str] of territorial splits
Example:
--------
Example usage:
- sample_dict = {'a': [1, 2, 3], 'b': [4, 5]}
- formats = ['geojson', 'gpkg']
- years = [2022, 2022]
- crs_list = [4326, 2154]
- sources = ['source1', 'source2']
- simplifications = [0, 40]
- result = crossproduct_parameters_production(
- sample_dict, formats, years, crs_list, sources, simplifications
- )
- print(result)
+ >>> year = 2023
+ >>> simplifications = [0, 40]
+ >>> result = crossproduct_parameters_production(year, simplifications)
+ >>> print(result)
+ >>> [{
+ 'mesh_init': 'ARRONDISSEMENT_MUNICIPAL',
+ 'source_geodata': 'EXPRESS-COG-CARTO-TERRITOIRE',
+ 'simplification': 40,
+ 'dissolve_by': 'ARRONDISSEMENT_MUNICIPAL',
+ 'territories': ['ZONE_EMPLOI', ..., 'AIRE_ATTRACTION_VILLES']
+ }, ..., {
+ 'mesh_init': 'IRIS',
+ 'source_geodata': 'CONTOUR-IRIS',
+ 'simplification': 40,
+ 'dissolve_by': 'IRIS',
+ 'territories': ['ZONE_EMPLOI', ..., 'AIRE_ATTRACTION_VILLES']
+ }
+ ]
- This will output:
- A pandas DataFrame with the cross-product of the provided parameters.
"""
- croisement_filter_by_borders_flat = restructure_nested_dict_borders(
- croisement_filter_by_borders
+
+ if not simplifications:
+ simplifications = PIPELINE_SIMPLIFICATION_LEVELS
+
+ # prepare a list of (potential) sources from cartiflette's config
+    # (the result will depend on the resolution set in the config)
+ sources = {
+ "ARRONDISSEMENT_MUNICIPAL": PIPELINE_DOWNLOAD_ARGS["ADMIN-EXPRESS"],
+ "COMMUNE": PIPELINE_DOWNLOAD_ARGS["ADMIN-EXPRESS"],
+ "IRIS": PIPELINE_DOWNLOAD_ARGS["IRIS"],
+ "CANTON": PIPELINE_DOWNLOAD_ARGS["ADMIN-EXPRESS"],
+ }
+ sources = pd.DataFrame(sources).T
+ sources.columns = [
+ "geodata_provider",
+ "geodata_dataset_family",
+ "geodata_source",
+ "geodata_territorial_components",
+ ]
+ sources.index.name = "mesh_init"
+ sources = sources.reset_index(drop=False)
+ sources["geodata_territorial_components"] = (
+ sources.geodata_territorial_components.apply(", ".join)
+ )
+
+ sources = sources.drop(
+ ["geodata_provider", "geodata_dataset_family"], axis=1
+ )
+
+ # prepare a list of tuples (
+ # administrative_level = polygon level = borders,
+ # territory used for splitting the file's boundaries = territory
+ # ),
+ croisement_filter_by_borders_flat = pd.DataFrame(
+ flatten_dict_to_list(AVAILABLE_TERRITORIAL_SPLITS_FOR_BORDERS),
+ columns=["borders", "territory"],
)
- combinations = list(
- itertools.product(
- list_format,
+ # prepare a list of tuples (
+ # raw source's polygon level,
+ # mesh created after dissolve
+ # ),
+ geometries_dissolutions = pd.DataFrame(
+ flatten_dict_to_list(AVAILABLE_DISSOLUTIONS_FROM_RAW_MESH),
+ columns=["mesh_init", "dissolve_by"],
+ )
+
+ combinations = sources.merge(
+ geometries_dissolutions.merge(
croisement_filter_by_borders_flat,
- years,
- crs_list,
- sources,
- simplifications,
+ left_on="dissolve_by",
+ right_on="borders",
)
)
+ combinations = (
+ combinations
+ # .join(
+ # pd.Series(list_format, name="format_output"), how="cross"
+ # )
+ # .join(pd.Series(crs_list, name="epsg"), how="cross")
+ .join(pd.Series(simplifications, name="simplification"), how="cross")
+ )
- tempdf = pd.DataFrame(
- combinations,
- columns=["format_output", "nested", "year", "crs", "source", "simplification"],
+ combinations = combinations.drop(
+ ["geodata_territorial_components", "borders"], axis=1
)
- tempdf["level_polygons"] = tempdf["nested"].apply(lambda tup: tup[0])
- tempdf["filter_by"] = tempdf["nested"].apply(lambda tup: tup[1])
- tempdf.drop("nested", axis="columns", inplace=True)
- return tempdf
+ def geodataset_exists(borders, geodata_source, simplification):
+ "check if preprocessed geodata file is found on S3"
+ config = {
+ "bucket": bucket,
+ "path_within_bucket": path_within_bucket,
+ "provider": "Cartiflette",
+ "dataset_family": "geodata",
+ "source": geodata_source,
+ "year": year,
+ "borders": borders,
+ "crs": 4326,
+ "filter_by": "preprocessed",
+ "value": "before_cog",
+ "vectorfile_format": INTERMEDIATE_FORMAT,
+ "territory": "france",
+ "simplification": simplification,
+ "fs": fs,
+ }
+ try:
+ S3GeoDataset(**config, build_from_local=False)
+ return True
+ except ValueError:
+            # preprocessed geodata file does not exist
+ return False
+
+ def metadataset_exists(borders):
+ "check if preprocessed metadata file is found on S3"
+ config = {
+ "bucket": bucket,
+ "path_within_bucket": path_within_bucket,
+ "provider": "Cartiflette",
+ "dataset_family": "metadata",
+ "source": "*",
+ "year": year,
+ "borders": borders,
+ "crs": None,
+ "filter_by": "preprocessed",
+ "value": "tagc",
+ "vectorfile_format": "csv",
+ "territory": "france",
+ "simplification": 0,
+ "fs": fs,
+ }
+ try:
+ S3GeoDataset(**config, build_from_local=False)
+ return True
+ except ValueError:
+            # preprocessed metadata file does not exist
+ return False
+
+ # remove combinations having no available upstream source
+ geodata_unique = combinations[
+ ["mesh_init", "geodata_source", "simplification"]
+ ].drop_duplicates()
+ metadata_unique = combinations[["mesh_init"]].drop_duplicates()
+
+ if THREADS_DOWNLOAD == 1:
+
+ geodata_unique["upstream_geodata_exists"] = geodata_unique.apply(
+ lambda tup: geodataset_exists(*tup), axis=1
+ )
+
+ metadata_unique["upstream_metadata_exists"] = metadata_unique.apply(
+ lambda tup: metadataset_exists(*tup), axis=1
+ )
+
+ else:
+ with ThreadPool(min(THREADS_DOWNLOAD, len(combinations))) as pool:
+ geodata_unique["upstream_geodata_exists"] = list(
+ pool.map(
+ geodataset_exists, *zip(*geodata_unique.values.tolist())
+ ).result()
+ )
+
+ metadata_unique["upstream_metadata_exists"] = list(
+ pool.map(
+ metadataset_exists, *zip(*metadata_unique.values.tolist())
+ ).result()
+ )
+
+ combinations = combinations.merge(geodata_unique).merge(metadata_unique)
+ combinations["upstream_exists"] = (
+ combinations["upstream_geodata_exists"]
+ & combinations["upstream_metadata_exists"]
+ )
+
+ ix = combinations[~combinations.upstream_exists].index
+ combinations = combinations.drop(ix).drop(
+ [
+ "upstream_exists",
+ "upstream_geodata_exists",
+ "upstream_metadata_exists",
+ ],
+ axis=1,
+ )
+
+ logger.debug(
+ "found %s combinations of downstream geodatasets", len(combinations)
+ )
+
+ if len(combinations) == 0:
+ raise ValueError(f"no combination available for {year=}")
+
+ # get best combination available among COMMUNE/IRIS/CANTON
+ # -> for each geodataset to generate, keep COMMUNE if available, IRIS
+ # otherwise (and CANTON for border=CANTON generation)
+ dups = [
+ "dissolve_by",
+ "territory",
+ # "format_output",
+ # "epsg",
+ "simplification",
+ "mesh_init",
+ ]
+ combinations = combinations.sort_values(dups, ascending=False)
+ combinations = combinations.drop_duplicates(dups[:-1], keep="last")
+
+ keys = ["mesh_init", "geodata_source", "simplification", "dissolve_by"]
+ combinations = (
+ combinations.set_index(keys)
+ .groupby(keys)["territory"]
+ .agg(list)
+ .to_dict()
+ )
+ logger.info("%s batch datasets will be created", len(combinations))
+ logger.info("combinations are %s", combinations)
+
+ combinations = [
+ {
+ "mesh_init": key[0],
+ "source_geodata": key[1],
+ "simplification": key[2],
+ "dissolve_by": key[3],
+ "territories": val,
+ }
+ for key, val in combinations.items()
+ ]
+
+ return combinations
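+
+
+# Hedged usage sketch (names as exported by cartiflette.pipeline): each
+# returned dict is meant to drive one downstream pod, typically through
+#     from cartiflette.pipeline import (
+#         crossproduct_parameters_production,
+#         mapshaperize_split_from_s3_multithreading,
+#     )
+#     configs = crossproduct_parameters_production(year=2023)
+#     mapshaperize_split_from_s3_multithreading(year=2023, configs=configs)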
diff --git a/cartiflette/download/pipeline.py b/cartiflette/pipeline/download.py
similarity index 69%
rename from cartiflette/download/pipeline.py
rename to cartiflette/pipeline/download.py
index 4b8761d8..c6ecaeef 100644
--- a/cartiflette/download/pipeline.py
+++ b/cartiflette/pipeline/download.py
@@ -3,12 +3,20 @@
from datetime import date
import json
import logging
+import traceback
+from typing import List
+
from pebble import ThreadPool
import s3fs
-from cartiflette.config import BUCKET, PATH_WITHIN_BUCKET, FS, THREADS_DOWNLOAD
-from cartiflette.constants import DOWNLOAD_PIPELINE_ARGS
-from cartiflette.download.download import _download_sources
+from cartiflette.config import (
+ BUCKET,
+ PATH_WITHIN_BUCKET,
+ FS,
+ THREADS_DOWNLOAD,
+)
+from cartiflette.pipeline_constants import PIPELINE_DOWNLOAD_ARGS
+from cartiflette.download import _download_and_store_sources
from cartiflette.utils import deep_dict_update
logger = logging.getLogger(__name__)
@@ -19,11 +27,12 @@ def download_all(
path_within_bucket: str = PATH_WITHIN_BUCKET,
fs: s3fs.S3FileSystem = FS,
upload: bool = True,
+ years: List[int] = None,
) -> dict:
"""
Performs a full pipeline to download data and store them on MinIO. The
target files are described in cartiflette/constants.py under the
- constant DOWNLOAD_PIPELINE_ARGS. Those files' characteristics must also be
+ constant PIPELINE_DOWNLOAD_ARGS. Those files' characteristics must also be
described in the cartiflette/utils/sources.yaml file.
Note: to perform an easy debugging task, please overwrite
@@ -42,6 +51,9 @@ def download_all(
Whether to store data on MinIO or not. This argument should only be
used for debugging purposes. The default is True, to upload data on
MinIO.
+ years : List[int], optional
+        Years to perform the download on. If not set, falls back to
+        range(2015, date.today().year + 1). The default is None.
Returns
-------
@@ -100,8 +112,15 @@ def download_all(
"""
+ if not years:
+ years = list(range(2015, date.today().year + 1))[-1::-1]
+
+ logger.info(f"performing download on {years=}")
+
if not upload:
- logger.warning("no upload to s3 will be done, set upload=True to upload")
+ logger.warning(
+ "no upload to s3 will be done, set upload=True to upload"
+ )
# Initialize MD5 json if absent
json_md5 = f"{bucket}/{path_within_bucket}/md5.json"
@@ -117,7 +136,6 @@ def download_all(
"fs": fs,
"upload": upload,
}
- years = list(range(2015, date.today().year + 1))[-1::-1]
results = {}
@@ -125,11 +143,32 @@ def download_all(
def func(args):
key, args = args
- results = _download_sources(*args, years=years, **kwargs)
+ try:
+ providers, dataset_families, sources, territories = args
+ except ValueError:
+ # No territories set in constant (will ultimately be stored at
+ # "france_entiere" on the S3 FileSystem)
+ providers, dataset_families, sources = args
+ territories = None
+ logger.info(
+ "looking for %s %s %s %s",
+ providers,
+ dataset_families,
+ sources,
+ territories,
+ )
+ results = _download_and_store_sources(
+ providers=providers,
+ dataset_families=dataset_families,
+ sources=sources,
+ years=years,
+ territories=territories,
+ **kwargs,
+ )
logger.info(f"{key} done")
return results
- datasets_args = DOWNLOAD_PIPELINE_ARGS
+ datasets_args = PIPELINE_DOWNLOAD_ARGS
if THREADS_DOWNLOAD > 1:
with ThreadPool(THREADS_DOWNLOAD) as pool:
@@ -139,97 +178,17 @@ def func(args):
results = deep_dict_update(results, next(iterator))
except StopIteration:
break
- except Exception as e:
- logger.error(e)
+ except Exception:
+ logger.error(traceback.format_exc())
else:
for args in datasets_args.items():
results = deep_dict_update(results, func(args))
return results
-# def download_all_option2():
-# # Dérouler le yaml comme dans le test
-
-# yaml = import_yaml_config()
-
-# with MasterScraper() as scraper:
-# for provider, provider_yaml in yaml.items():
-# if not isinstance(provider_yaml, dict):
-# continue
-
-# for dataset_family, dataset_family_yaml in provider_yaml.items():
-# if not isinstance(dataset_family_yaml, dict):
-# continue
-
-# for source, source_yaml in dataset_family_yaml.items():
-# str_yaml = f"{dataset_family}/{source}"
-
-# if not isinstance(source_yaml, dict):
-# logger.error(
-# f"yaml {str_yaml} contains '{source_yaml}'"
-# )
-# continue
-# elif "FTP" in set(source_yaml.keys()):
-# logger.info("yaml {str_yaml} not checked (FTP)")
-# continue
-
-# years = set(source_yaml.keys()) - {"field", "FTP"}
-# try:
-# territories = set(source_yaml["field"].keys())
-# except KeyError:
-# territories = {""}
-
-# for year in years:
-# for territory in territories:
-# str_yaml = (
-# f"{dataset_family}/{source}/{year}/"
-# f"{provider}/{territory}"
-# )
-
-# if territory == "":
-# territory = None
-# try:
-# ds = Dataset(
-# dataset_family,
-# source,
-# int(year),
-# provider,
-# territory,
-# )
-# except Exception:
-# logger.error(
-# f"error on yaml {str_yaml} : "
-# "dataset not constructed"
-# )
-# continue
-# try:
-# url = ds.get_path_from_provider()
-# except Exception:
-# logger.error(
-# f"error on yaml {str_yaml} : "
-# "url no reconstructed"
-# )
-# continue
-
-# try:
-# r = scraper.get(url, stream=True)
-# except Exception:
-# logger.error(
-# f"error on yaml {str_yaml} : "
-# f"https get request failed on {url}"
-# )
-# continue
-# if not r.ok:
-# logger.error(
-# f"error on yaml {str_yaml} : "
-# "https get request "
-# f"got code {r.status_code} on {url}"
-# )
-
if __name__ == "__main__":
logging.basicConfig(
- level=logging.INFO,
+ level=logging.WARNING,
format="%(levelname)s :%(filename)s:%(lineno)d (%(funcName)s) - %(message)s",
)
-
results = download_all(upload=True)
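+    # e.g. (illustrative): restrict to a single vintage without uploading, for
+    # debugging purposes:
+    # results = download_all(upload=False, years=[2023])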
diff --git a/cartiflette/pipeline/mapshaper_split_from_s3.py b/cartiflette/pipeline/mapshaper_split_from_s3.py
index 57e68e55..3a80ecd2 100644
--- a/cartiflette/pipeline/mapshaper_split_from_s3.py
+++ b/cartiflette/pipeline/mapshaper_split_from_s3.py
@@ -1,200 +1,292 @@
-import os
-import shutil
-
-from cartiflette.config import BUCKET, PATH_WITHIN_BUCKET, FS
-from cartiflette.utils import create_path_bucket
-from cartiflette.mapshaper import mapshaperize_split, mapshaperize_split_merge
-from .prepare_mapshaper import prepare_local_directory_mapshaper
-
-
-def mapshaperize_split_from_s3(config, fs=FS):
- format_output = config.get("format_output", "topojson")
- filter_by = config.get("filter_by", "DEPARTEMENT")
- territory = config.get("territory", "metropole")
- level_polygons = config.get("level_polygons", "COMMUNE")
- territory = config.get("territory", "metropole")
-
- provider = config.get("provider", "IGN")
- source = config.get("source", "EXPRESS-COG-CARTO-TERRITOIRE")
- year = config.get("year", 2022)
- dataset_family = config.get("dataset_family", "ADMINEXPRESS")
- territory = config.get("territory", "metropole")
- crs = config.get("crs", 4326)
- simplification = config.get("simplification", 0)
-
- bucket = config.get("bucket", BUCKET)
- path_within_bucket = config.get("path_within_bucket", PATH_WITHIN_BUCKET)
- local_dir = config.get("local_dir", "temp")
-
- path_raw_s3_combined = create_path_bucket(
- {
- "bucket": bucket,
- "path_within_bucket": path_within_bucket,
- "year": year,
- "borders": "france",
- "crs": 4326,
- "filter_by": "preprocessed",
- "value": "before_cog",
- "vectorfile_format": "geojson",
- "provider": "IGN",
- "dataset_family": "ADMINEXPRESS",
- "source": "EXPRESS-COG-CARTO-TERRITOIRE",
- "territory": "france",
- "filename": "raw.geojson",
- "simplification": 0,
- }
- )
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
- fs.download(path_raw_s3_combined, "temp/preprocessed_combined/COMMUNE.geojson")
-
- output_path = mapshaperize_split(
- local_dir=local_dir,
- config_file_city={
- "location": "temp/preprocessed_combined",
- "filename": "COMMUNE",
- "extension": "geojson",
- },
- format_output=format_output,
- niveau_agreg=filter_by,
- niveau_polygons=level_polygons,
- provider=provider,
- source=source,
- crs=crs,
- simplification=simplification,
- )
+import logging
+import traceback
+from typing import List
- for values in os.listdir(output_path):
- path_s3 = create_path_bucket(
- {
- "bucket": bucket,
- "path_within_bucket": path_within_bucket,
- "year": year,
- "borders": level_polygons,
- "crs": crs,
- "filter_by": filter_by,
- "value": values.replace(f".{format_output}", ""),
- "vectorfile_format": format_output,
- "provider": provider,
- "dataset_family": dataset_family,
- "source": source,
- "territory": territory,
- "simplification": simplification,
- }
- )
- fs.put(f"{output_path}/{values}", path_s3)
-
- shutil.rmtree(output_path)
-
-
-def mapshaperize_merge_split_from_s3(config, fs=FS):
- format_output = config.get("format_output", "topojson")
- filter_by = config.get("filter_by", "DEPARTEMENT")
- territory = config.get("territory", "metropole")
-
- provider = config.get("provider", "IGN")
- source = config.get("source", "EXPRESS-COG-CARTO-TERRITOIRE")
- year = config.get("year", 2022)
- dataset_family = config.get("dataset_family", "ADMINEXPRESS")
- territory = config.get("territory", "metropole")
- crs = config.get("crs", 4326)
- simplification = config.get("simplification", 0)
-
- bucket = config.get("bucket", BUCKET)
- path_within_bucket = config.get("path_within_bucket", PATH_WITHIN_BUCKET)
- local_dir = config.get("local_dir", "temp")
-
- path_raw_s3_combined = create_path_bucket(
- {
- "bucket": bucket,
- "path_within_bucket": path_within_bucket,
- "year": year,
- "borders": "france",
- "crs": 4326,
- "filter_by": "preprocessed",
- "value": "before_cog",
- "vectorfile_format": "geojson",
- "provider": "IGN",
- "dataset_family": "ADMINEXPRESS",
- "source": "EXPRESS-COG-CARTO-TERRITOIRE",
- "territory": "france",
- "filename": "raw.geojson",
- "simplification": 0,
- }
- )
+from pebble import ThreadPool
+from s3fs import S3FileSystem
- fs.download(path_raw_s3_combined, "temp/preprocessed_combined/COMMUNE.geojson")
-
- path_raw_s3_arrondissement = create_path_bucket(
- {
- "bucket": bucket,
- "path_within_bucket": path_within_bucket,
- "year": year,
- "borders": None,
- "crs": 2154,
- "filter_by": "origin",
- "value": "raw",
- "vectorfile_format": "shp",
- "provider": "IGN",
- "dataset_family": "ADMINEXPRESS",
- "source": "EXPRESS-COG-CARTO-TERRITOIRE",
- "territory": "metropole",
- "filename": "ARRONDISSEMENT_MUNICIPAL.shp",
- "simplification": 0,
- }
- )
- path_raw_s3_arrondissement = path_raw_s3_arrondissement.rsplit("/", maxsplit=1)[0]
+from cartiflette.config import (
+ BUCKET,
+ PATH_WITHIN_BUCKET,
+ FS,
+ INTERMEDIATE_FORMAT,
+ THREADS_DOWNLOAD,
+)
+from cartiflette.s3 import S3GeoDataset, S3Dataset
- # retrieve arrondissement
- prepare_local_directory_mapshaper(
- path_raw_s3_arrondissement,
- borders="ARRONDISSEMENT_MUNICIPAL",
- territory="metropole",
- niveau_agreg=filter_by,
- format_output="topojson",
- simplification=simplification,
- local_dir="temp",
- fs=FS,
+
+logger = logging.getLogger(__name__)
+
+
+def mapshaperize_split_from_s3(
+ year: int,
+ init_geometry_level: str,
+ source: str,
+ simplification: int,
+ dissolve_by: str,
+ territorial_splits: list,
+ fs: S3FileSystem = FS,
+ bucket: str = BUCKET,
+ path_within_bucket: str = PATH_WITHIN_BUCKET,
+):
+ logger.info(
+ "processing %s from '%s' geometries and dissolve on '%s'",
+ year,
+ init_geometry_level,
+ dissolve_by,
)
- output_path = mapshaperize_split_merge(
- local_dir=local_dir,
- config_file_city={
- "location": "temp/preprocessed_combined",
- "filename": "COMMUNE",
- "extension": "geojson",
- },
- config_file_arrondissement={
- "location": "temp/metropole",
- "filename": "ARRONDISSEMENT_MUNICIPAL",
- "extension": "shp",
- },
- format_output=format_output,
- niveau_agreg=filter_by,
- provider=provider,
+ kwargs = {
+ "fs": fs,
+ "bucket": bucket,
+ "path_within_bucket": path_within_bucket,
+ "year": year,
+ "borders": init_geometry_level,
+ "filter_by": "preprocessed",
+ "provider": "Cartiflette",
+ "territory": "france",
+ }
+ with S3Dataset(
+ dataset_family="metadata",
+ source="*",
+ crs=None,
+ value="tagc",
+ vectorfile_format="csv",
+ **kwargs,
+ ) as metadata, S3GeoDataset(
+ dataset_family="geodata",
source=source,
- crs=crs,
+ crs=4326,
+ value="before_cog",
+ vectorfile_format=INTERMEDIATE_FORMAT,
simplification=simplification,
- )
+ **kwargs,
+ ) as gis_file:
+
+ failed = []
+ success = []
+ skipped = []
+ for niveau_agreg in territorial_splits:
+
+            # Check that both niveau_agreg and dissolve_by correspond to
+            # actual fields available in either the metadata or the geodata
+ available = set(gis_file._get_columns()) | set(
+ metadata._get_columns()
+ )
+
+ warnings = []
+ for field in niveau_agreg, dissolve_by:
+ if field in [
+ "FRANCE_ENTIERE",
+ "FRANCE_ENTIERE_DROM_RAPPROCHES",
+ "FRANCE_ENTIERE_IDF_DROM_RAPPROCHES",
+ ]:
+ continue
+ try:
+ metadata.find_column_name(field, available)
+ except (ValueError, IndexError) as exc:
+ warnings.append(str(exc))
+ if warnings:
+ skipped.append(
+ {
+ "warning": " - ".join(warnings),
+ "aggreg": niveau_agreg,
+ }
+ )
+ continue
+
+ with gis_file.copy() as gis_copy:
+ try:
+ gis_copy.create_downstream_geodatasets(
+ metadata,
+ niveau_agreg=niveau_agreg,
+ init_geometry_level=init_geometry_level,
+ dissolve_by=dissolve_by,
+ simplification=simplification,
+ )
+ except Exception as exc:
+ failed.append(
+ {
+ "error": exc,
+ "aggreg": niveau_agreg,
+ "traceback": traceback.format_exc(),
+ }
+ )
+ else:
+ success.append(
+ {
+ "aggreg": niveau_agreg,
+ }
+ )
+
+ warning_traceback = []
+ error_traceback = []
+ if skipped:
+ for one_skipped in skipped:
+ msg = "\n".join(
+ [
+ "-" * 50,
+ one_skipped["warning"],
+ f"aggregation: {one_skipped['aggreg']}",
+ ]
+ )
+ logger.warning(msg)
+ warning_traceback.append(msg)
+ if failed:
+ for one_failed in failed:
+ msg = "\n".join(
+ [
+ "=" * 50,
+ f"error: {one_failed['error']}",
+ f"aggregation: {one_failed['aggreg']}",
+ "-" * 50,
+ f"traceback:\n{one_failed['traceback']}",
+ ]
+ )
+ logger.error(msg)
+ error_traceback.append(msg)
+
+ return {
+ "success": success,
+ "skipped": skipped,
+ "failed": failed,
+ "warning_traceback": warning_traceback,
+ "error_traceback": error_traceback,
+ }
+
+
+def mapshaperize_split_from_s3_multithreading(
+ year: int,
+ configs: List[dict],
+ fs: S3FileSystem = FS,
+ bucket: str = BUCKET,
+ path_within_bucket: str = PATH_WITHIN_BUCKET,
+):
+
+ results = {
+ "success": 0,
+ "skipped": 0,
+ "failed": 0,
+ "warning_traceback": [],
+ "error_traceback": [],
+ }
+ if THREADS_DOWNLOAD > 1:
+ with ThreadPool(min(len(configs), THREADS_DOWNLOAD)) as pool:
+ args = [
+ (
+ year,
+ d["mesh_init"],
+ d["source_geodata"],
+ d["simplification"],
+ d["dissolve_by"],
+ d["territories"],
+ fs,
+ bucket,
+ path_within_bucket,
+ )
+ for d in configs
+ ]
+ iterator = pool.map(
+ mapshaperize_split_from_s3, *zip(*args), timeout=60 * 10
+ ).result()
+
+ failed = False
+ index = 0
+ while True:
+ try:
+ this_result = next(iterator)
+ except StopIteration:
+ break
+ except Exception:
+ logger.error(traceback.format_exc())
+ logger.error("args were %s", args[index])
+ else:
+ for key in "success", "skipped", "failed":
+ results[key] += len(this_result[key])
+ for key in "warning_traceback", "error_traceback":
+ results[key] += this_result[key]
+ finally:
+ index += 1
+ else:
+ for d in configs:
+
+ d["init_geometry_level"] = d.pop("mesh_init")
+ d["source"] = d.pop("source_geodata")
+ d["territorial_splits"] = d.pop("territories")
+ try:
+ this_result = mapshaperize_split_from_s3(
+ year=year,
+ fs=fs,
+ bucket=bucket,
+ path_within_bucket=path_within_bucket,
+ **d,
+ )
+ except Exception:
+ logger.error(traceback.format_exc())
+ logger.error("args were %s", d)
+ else:
+ for key in "success", "skipped", "failed":
+ results[key] += len(this_result[key])
+ for key in "warning_traceback", "error_traceback":
+ results[key] += this_result[key]
+
+ skipped = results["skipped"]
+ success = results["success"]
+ failed = results["failed"]
+ warnings = results["warning_traceback"]
+ errors = results["error_traceback"]
+
+ if warnings or errors:
+ level = "warning"
+ if errors:
+ level = "error"
+ log_func = getattr(logger, level)
+ log_func("=" * 50)
+ log_func("Traceback recaps")
+ for msg in warnings:
+ logger.warning(msg)
+ logger.info("%s", "-" * 50)
+ for msg in errors:
+ logger.error(msg)
+ logger.info("%s", "-" * 50)
+
+ logger.info("%s file(s) generation(s) were skipped", skipped)
+ logger.info("%s file(s) generation(s) succeeded", success)
+ logger.error("%s file(s) generation(s) failed", failed)
+
+ if failed:
+ raise ValueError("some datasets' generation failed")
+
+ return {
+ "success": success,
+ "skipped": skipped,
+ "failed": failed,
+ }
+
+
+# if __name__ == "__main__":
+# import logging
+# from cartiflette.pipeline_constants import COG_TERRITOIRE
+# from cartiflette.config import DATASETS_HIGH_RESOLUTION
+
+# logging.basicConfig(level=logging.INFO)
- for values in os.listdir(output_path):
- path_s3 = create_path_bucket(
- {
- "bucket": bucket,
- "path_within_bucket": path_within_bucket,
- "year": year,
- "borders": "COMMUNE_ARRONDISSEMENT",
- "crs": crs,
- "filter_by": filter_by,
- "value": values.replace(f".{format_output}", ""),
- "vectorfile_format": format_output,
- "provider": provider,
- "dataset_family": dataset_family,
- "source": source,
- "territory": territory,
- "simplification": simplification,
- }
- )
- fs.put(f"{output_path}/{values}", path_s3)
-
- shutil.rmtree(output_path)
+# mapshaperize_split_from_s3(
+# year=2023,
+# init_geometry_level="ARRONDISSEMENT_MUNICIPAL",
+# source=COG_TERRITOIRE[DATASETS_HIGH_RESOLUTION],
+# simplification=40,
+# dissolve_by="DEPARTEMENT",
+#         territorial_splits=["FRANCE_ENTIERE_DROM_RAPPROCHES"],
+# )
diff --git a/cartiflette/pipeline/prepare_cog_metadata.py b/cartiflette/pipeline/prepare_cog_metadata.py
index 5706975d..614c26a8 100644
--- a/cartiflette/pipeline/prepare_cog_metadata.py
+++ b/cartiflette/pipeline/prepare_cog_metadata.py
@@ -1,21 +1,101 @@
-import os
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import io
+import logging
+import re
+import warnings
+
+from diskcache import Cache
import pandas as pd
+import numpy as np
+import polars as pl
+from pebble import ThreadPool
import s3fs
-from cartiflette.config import FS
-from cartiflette.s3 import upload_s3_raw
+from cartiflette.config import FS, BUCKET, PATH_WITHIN_BUCKET, THREADS_DOWNLOAD
+
+
+cache = Cache("cartiflette-s3-cache", timeout=3600)
+logger = logging.getLogger(__name__)
+
+
+def s3_to_df(
+ fs: s3fs.S3FileSystem, path_in_bucket: str, **kwargs
+) -> pd.DataFrame:
+ """
+ Retrieve DataFrame from S3 with cache handling.
+
+ Parameters
+ ----------
+ fs : s3fs.S3FileSystem
+ An S3FileSystem object for interacting with the S3 bucket
+ path_in_bucket : str
+ Target file's path on S3 bucket
+ **kwargs :
+        Optional kwargs to pass to the underlying (polars) csv or excel reader
+
+ Returns
+ -------
+ df : pd.DataFrame
+        Downloaded dataset as a DataFrame
+
+ """
+
+ try:
+ return cache[("metadata", path_in_bucket)]
+ except KeyError:
+ pass
+
+ try:
+
+ with fs.open(path_in_bucket, mode="rb") as remote_file:
+ remote = io.BytesIO(remote_file.read())
+ if path_in_bucket.endswith("csv") or path_in_bucket.endswith("txt"):
+ df = pl.read_csv(
+ remote, infer_schema_length=0, **kwargs
+ ).to_pandas()
+
+ elif path_in_bucket.endswith("xls") or path_in_bucket.endswith("xlsx"):
+                # careful: with polars, skip_rows and header_row are summed!
+ kwargs = {"header_row": kwargs["skip_rows"]}
+ df = pl.read_excel(
+ remote,
+ has_header=True,
+ infer_schema_length=0,
+ read_options=kwargs,
+ ).to_pandas()
+
+ except Exception as e:
+ warnings.warn(f"could not read {path_in_bucket=}: {e}")
+ raise
+ df.columns = [x.upper() for x in df.columns]
+
+ # Remove 'ZZZZZ'-like values from INSEE datasets
+ for col in df.columns:
+ ix = df[df[col].fillna("").str.fullmatch("Z+", case=False)].index
+ df.loc[ix, col] = pd.NA
+
+ cache[("metadata", path_in_bucket)] = df
+
+ return df
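+
+
+# Hedged example (the S3 path is illustrative only): downloads go through the
+# local diskcache, so repeated reads of the same file skip the S3 round-trip:
+#     tagc = s3_to_df(
+#         FS, f"{BUCKET}/{PATH_WITHIN_BUCKET}/.../APPARTENANCE.xlsx", skip_rows=5
+#     )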
def prepare_cog_metadata(
- path_within_bucket: str, local_dir: str = "temp", fs: s3fs.core.S3FileSystem = FS
+ year: int,
+ bucket: str = BUCKET,
+ path_within_bucket: str = PATH_WITHIN_BUCKET,
+ fs: s3fs.core.S3FileSystem = FS,
) -> pd.DataFrame:
"""
- Prepares and retrieves COG (French Census Geographic Code) metadata by fetching and merging
- relevant datasets from remote sources, such as DEPARTEMENT, REGION, and TAGC (Appartenance).
+ Prepares and retrieves COG (French Census Geographic Code) metadata by
+ merging relevant datasets from raw sources stored on S3, such as
+ DEPARTEMENT, REGION, and TAGC (Appartenance).
Parameters:
- - path_within_bucket (str): The path within the S3 bucket where the datasets will be stored.
- - local_dir (str): Local directory where the datasets will be downloaded.
+ - year (int): The COG metadata's vintage
+ - bucket (str): The bucket where the dataset are stored
+ - path_within_bucket (str): The path within the S3 bucket where the datasets are stored.
- fs (s3fs.core.S3FileSystem): An S3FileSystem object for interacting with the S3 bucket.
Returns:
@@ -23,81 +103,413 @@ def prepare_cog_metadata(
and TAGC information.
"""
- # Create the local directory if it does not exist
- os.makedirs(local_dir, exist_ok=True)
-
- # Fetch and upload DEPARTEMENT dataset to S3
- path_bucket_cog_departement = upload_s3_raw(
- provider="Insee",
- dataset_family="COG",
- source="DEPARTEMENT",
- territory="france_entiere",
- borders="DATASET_INSEE_COG_DEPARTEMENT_FRANCE_ENTIERE_2022",
- year=2022,
- vectorfile_format="csv",
- path_within_bucket=path_within_bucket,
- )
-
- # Fetch and upload REGION dataset to S3
- path_bucket_cog_region = upload_s3_raw(
- provider="Insee",
- dataset_family="COG",
- source="REGION",
- territory="france_entiere",
- borders="DATASET_INSEE_COG_REGION_FRANCE_ENTIERE_2022",
- year=2022,
- vectorfile_format="csv",
- path_within_bucket=path_within_bucket,
- )
+    # TODO: compute the BANATIC tables, etc.
- # Fetch and upload TAGC APPARTENANCE dataset to S3
- path_bucket_tagc_appartenance = upload_s3_raw(
- provider="Insee",
- dataset_family="TAGC",
- source="APPARTENANCE",
- territory="france_entiere",
- borders="table-appartenance-geo-communes-22",
- year=2022,
- vectorfile_format="xlsx",
- path_within_bucket=path_within_bucket,
- )
+ # =========================================================================
+ # Part 1. : retrieve all paths on S3
+ # =========================================================================
+ paths_bucket = {}
- # Retrieve paths for the uploaded datasets
- path_tagc = fs.ls(path_bucket_tagc_appartenance)[0]
- path_bucket_cog_departement = fs.ls(path_bucket_cog_departement)[0]
- path_bucket_cog_region = fs.ls(path_bucket_cog_region)[0]
-
- # Read datasets from S3 into Pandas DataFrames
- with fs.open(path_tagc, mode="rb") as remote_file:
- tagc = pd.read_excel(
- remote_file,
- skiprows=5,
- dtype_backend="pyarrow",
- dtype={"REG": "string[pyarrow]"},
+ def retrieve_path(provider, family: str, source: str, ext: str):
+ path = (
+ f"{bucket}/{path_within_bucket}/"
+ f"provider={provider}/dataset_family={family}/source={source}"
+ f"/year={year}/**/*.{ext}"
)
+ logger.debug(path)
+ try:
+ path = paths_bucket[(family, source)] = fs.glob(path)[0]
+ except IndexError:
+ warnings.warn(f"missing {family} {source} file for {year=}")
- with fs.open(path_bucket_cog_departement, mode="rb") as remote_file:
- cog_dep = pd.read_csv(
- remote_file, dtype_backend="pyarrow", dtype={"REG": "string[pyarrow]"}
- )
+ args = [
+ ("Insee", "COG", "COMMUNE-OUTRE-MER", "csv"),
+ ("Insee", "COG", "CANTON", "csv"),
+ ("Insee", "COG", "COMMUNE", "csv"),
+ ("Insee", "COG", "ARRONDISSEMENT", "csv"),
+ ("Insee", "COG", "DEPARTEMENT", "csv"),
+ ("Insee", "COG", "REGION", "csv"),
+ ("Insee", "TAGC", "APPARTENANCE", "xlsx"),
+ ("Insee", "TAGIRIS", "APPARTENANCE", "xlsx"),
+ ("DGCL", "BANATIC", "CORRESPONDANCE-SIREN-INSEE-COMMUNES", "xlsx"),
+ ("Insee", "ZONAGES", "EPCI-FP", "xlsx"),
+ ("Insee", "ZONAGES", "EPT", "xlsx"),
+ ("Insee", "ZONAGES", "UNITES-URBAINES", "xlsx"),
+ ("Insee", "ZONAGES", "BASSINS-VIE", "xlsx"),
+ ("Insee", "ZONAGES", "AIRES-ATTRACTION-VILLES", "xlsx"),
+ ("Insee", "ZONAGES", "ZONES-EMPLOI", "xlsx"),
+ ("Insee", "POPULATION", "POPULATION-IRIS-COM", "xlsx"),
+ ("Insee", "POPULATION", "POPULATION-IRIS-FRANCE-HORS-MAYOTTE", "xlsx"),
+ ]
+ if THREADS_DOWNLOAD > 1:
+ with ThreadPool(THREADS_DOWNLOAD) as pool:
+ list(pool.map(retrieve_path, *zip(*args)).result())
+ else:
+ for provider, family, source, ext in args:
+ retrieve_path(
+ provider=provider, family=family, source=source, ext=ext
+ )
- with fs.open(path_bucket_cog_region, mode="rb") as remote_file:
- cog_region = pd.read_csv(
- remote_file, dtype_backend="pyarrow", dtype={"REG": "string[pyarrow]"}
- )
+ try:
+ [
+ paths_bucket[("COG", x)]
+ for x in ("REGION", "DEPARTEMENT", "ARRONDISSEMENT")
+ ]
+ except KeyError:
+ warnings.warn(f"{year=} metadata not constructed!")
+ return
+
+ # =========================================================================
+ # Part 2. : download and read all datasets from S3
+ # =========================================================================
+
+ def download(key, skip_rows):
+ try:
+ path = paths_bucket[key]
+ return key, s3_to_df(fs, path, skip_rows=skip_rows)
+ except KeyError:
+ # not there
+ return key, pd.DataFrame()
+
+ args = [
+ (("COG", "COMMUNE"), 0),
+ (("COG", "ARRONDISSEMENT"), 0),
+ (("COG", "DEPARTEMENT"), 0),
+ (("COG", "REGION"), 0),
+ (("COG", "COMMUNE-OUTRE-MER"), 0),
+ (("BANATIC", "CORRESPONDANCE-SIREN-INSEE-COMMUNES"), 0),
+ (("POPULATION", "POPULATION-IRIS-FRANCE-HORS-MAYOTTE"), 5),
+ (("POPULATION", "POPULATION-IRIS-COM"), 5),
+ (("ZONAGES", "EPCI-FP"), 5),
+ (("ZONAGES", "EPT"), 5),
+ (("ZONAGES", "UNITES-URBAINES"), 5),
+ (("ZONAGES", "BASSINS-VIE"), 5),
+ (("ZONAGES", "AIRES-ATTRACTION-VILLES"), 5),
+ (("ZONAGES", "ZONES-EMPLOI"), 5),
+ (("TAGIRIS", "APPARTENANCE"), 5),
+ (("TAGC", "APPARTENANCE"), 5),
+ (("COG", "CANTON"), 0),
+ ]
+ if THREADS_DOWNLOAD > 1:
+ with ThreadPool(THREADS_DOWNLOAD) as pool:
+ ddf = dict(pool.map(download, *zip(*args)).result())
+ else:
+ ddf = {key: download(key, skip)[-1] for key, skip in args}
- # Merge DEPARTEMENT and REGION COG metadata
+ # Merge ARR, DEPARTEMENT and REGION COG metadata
cog_metadata = (
- cog_dep.loc[:, ["DEP", "REG", "LIBELLE"]]
+ # Note : Mayotte (976) not in ARR
+ # -> take DEP & REG from cog dep & cog reg
+ ddf[("COG", "ARRONDISSEMENT")]
+ .loc[:, ["ARR", "DEP", "LIBELLE"]]
+ .rename({"LIBELLE": "LIBELLE_ARRONDISSEMENT"}, axis=1)
.merge(
- cog_region.loc[:, ["REG", "LIBELLE"]],
- on="REG",
- suffixes=["_DEPARTEMENT", "_REGION"],
+ ddf[("COG", "DEPARTEMENT")]
+ .loc[:, ["DEP", "REG", "LIBELLE"]]
+ .merge(
+ ddf[("COG", "REGION")].loc[:, ["REG", "LIBELLE"]],
+ on="REG",
+ suffixes=["_DEPARTEMENT", "_REGION"],
+ ),
+ on="DEP",
+ how="outer", # Nota : Mayotte not in ARR file
)
- .drop(columns=["REG"])
)
+ # Ex. cog_metadata :
+ # ARR DEP LIBELLE_ARRONDISSEMENT REG LIBELLE_DEPARTEMENT \
+ # 0 011 01 Belley 84 Ain
+ # 1 012 01 Bourg-en-Bresse 84 Ain
+
+ # LIBELLE_REGION
+ # 0 Auvergne-Rhône-Alpes
+ # 1 Auvergne-Rhône-Alpes
+
+ # Compute metadata at COMMUNE level
+ tagc = ddf[("TAGC", "APPARTENANCE")]
+ if tagc.empty:
+ warnings.warn(f"{year=} metadata for cities not constructed!")
+ cities = pd.DataFrame()
+ arm = pd.DataFrame()
+ else:
+ drop = {"CANOV", "CV"} & set(tagc.columns)
+ tagc = tagc.drop(list(drop), axis=1)
+
+ # Add various labels for zoning plans
+ zoning = {
+ "EPCI-FP": ("EPCI", "LIBEPCI"),
+ "EPT": ("EPT", "LIBEPT"),
+ "UNITES-URBAINES": ("UU", "LIBUU"),
+ "BASSINS-VIE": ("BV", "LIBBV"),
+ "AIRES-ATTRACTION-VILLES": ("AAV", "LIBAAV"),
+ "ZONES-EMPLOI": ("ZE", "LIBZE"),
+ }
+ for file_key, (key, label) in zoning.items():
+ labels = ddf[("ZONAGES", file_key)]
+ if not labels.empty:
+ labels = labels.dropna()
+
+ def find_from_pattern(target):
+ found = [
+ x
+ for x in labels.columns
+ if re.match(target + "([0-9]{4})?", x)
+ ]
+ if len(found) > 1 or not found:
+ warnings.warn(
+ f"could not find {target} in zonage {file_key}"
+ )
+ else:
+ return found[0]
+
+ pk_insee = find_from_pattern(key)
+ label_insee = find_from_pattern(label)
+ if not (pk_insee and label_insee):
+ continue
+
+ labels = labels.loc[:, [pk_insee, label_insee]]
+ labels = labels.rename(
+ {
+ label_insee: f"LIBELLE_{file_key.replace('-', '_')}",
+ },
+ axis=1,
+ )
+ try:
+ tagc = tagc.merge(labels, on=pk_insee, how="left")
+            except KeyError:
+ pass
+
+ cities = tagc.merge(
+ cog_metadata, on=["ARR", "DEP", "REG"], how="inner"
+ )
+ cities = cities.rename({"LIBGEO": "LIBELLE_COMMUNE"}, axis=1)
+
+ cog_tom = ddf[("COG", "COMMUNE-OUTRE-MER")]
+ if not cog_tom.empty:
+ keep = ["COM_COMER", "LIBELLE", "COMER", "LIBELLE_COMER"]
+ cog_tom = cog_tom.query("NATURE_ZONAGE=='COM'").loc[:, keep]
+ cog_tom = cog_tom.rename(
+ {
+ "COMER": "DEP",
+ "LIBELLE_COMER": "LIBELLE_DEPARTEMENT",
+ "COM_COMER": "CODGEO",
+ "LIBELLE": "LIBELLE_COMMUNE",
+ },
+ axis=1,
+ )
+ cities = pd.concat([cities, cog_tom], ignore_index=True)
+
+ cog_arm = ddf[("COG", "COMMUNE")].query("TYPECOM=='ARM'")
+ cog_arm = cog_arm.loc[:, ["TYPECOM", "COM", "LIBELLE", "COMPARENT"]]
+
+ arm = cities.merge(
+ cog_arm.drop("TYPECOM", axis=1).rename(
+ {
+ "COM": "CODE_ARM",
+ "LIBELLE": "LIBELLE_ARRONDISSEMENT_MUNICIPAL",
+ },
+ axis=1,
+ ),
+ how="left",
+ left_on="CODGEO",
+ right_on="COMPARENT",
+ ).drop("COMPARENT", axis=1)
+ ix = arm[arm.CODE_ARM.isnull()].index
+ arm.loc[ix, "CODE_ARM"] = arm.loc[ix, "CODGEO"]
+ arm.loc[ix, "LIBELLE_ARRONDISSEMENT_MUNICIPAL"] = arm.loc[
+ ix, "LIBELLE_COMMUNE"
+ ]
+ # Set unique ARR code (as "NumDEP" + "NumARR") to ensure dissolution
+ # is ok
+ for df in arm, cities:
+ ix = df[(df.ARR.notnull())].index
+ df.loc[ix, "INSEE_ARR"] = df.loc[ix, "DEP"] + df.loc[ix, "ARR"]
+
+ siren = ddf[("BANATIC", "CORRESPONDANCE-SIREN-INSEE-COMMUNES")]
+ if not siren.empty:
+ pop_communes = {
+ "PTOT_([0-9]{4})": "POPULATION_TOTALE",
+ "PMUN_[0-9]{4}": "POPULATION_MUNICIPALE",
+ "PCAP_[0-9]{4}": "POPULATION_COMPTEE_A_PART",
+ }
+ rename = {
+ col: f"{new}_" + re.findall("[0-9]{4}", col)[0]
+ for pattern, new in pop_communes.items()
+ for col in siren.columns
+ if re.match(pattern, col)
+ }
+ rename.update({"SIREN": "SIREN_COMMUNE"})
+ siren = siren.drop(
+ ["REG_COM", "DEP_COM", "NOM_COM"], axis=1
+ ).rename(rename, axis=1)
+
+ cities = cities.merge(
+ siren, how="left", left_on="CODGEO", right_on="INSEE"
+ ).drop("INSEE", axis=1)
+
+ # Do not keep populations for ARM (info is not available on ARM
+ # level for LYON or MARSEILLE)
+ drop = {
+ col: f"{new}_" + re.findall("[0-9]{4}", col)[0]
+ for pattern, new in pop_communes.items()
+ for col in siren.columns
+ if re.match(pattern, col)
+ }
+ arm = arm.merge(
+ siren[["SIREN_COMMUNE", "INSEE"]],
+ how="left",
+ left_on="CODGEO",
+ right_on="INSEE",
+ ).drop(
+ ["INSEE"],
+ axis=1,
+ )
+ for df in arm, cities:
+ df["SOURCE_METADATA"] = "Cartiflette, d'après INSEE & DGCL"
+
+ # Compute metadata at IRIS level
+ iris = ddf[("TAGIRIS", "APPARTENANCE")]
+ if iris.empty:
+ warnings.warn(f"{year=} metadata for iris not constructed!")
+ iris = pd.DataFrame()
+ else:
+ iris = iris.drop(columns=["LIBCOM", "UU2020", "REG", "DEP"])
+ rename = {"DEPCOM": "CODE_ARM", "LIB_IRIS": "LIBELLE_IRIS"}
+ iris = iris.rename(rename, axis=1)
+
+ # retrieve populations
+ pop_iris = pd.concat(
+ [
+ ddf[("POPULATION", "POPULATION-IRIS-FRANCE-HORS-MAYOTTE")],
+ ddf[("POPULATION", "POPULATION-IRIS-COM")],
+ ],
+ ignore_index=True,
+ )
+ if pop_iris.empty:
+ # all IRIS population dataframes are empty -> keep an empty placeholder
+ pop_iris = pd.DataFrame()
+ else:
+ pop_iris_field = re.compile("P[0-9]{2}_POP$")
+ pop_iris_field = [
+ x for x in pop_iris.columns if pop_iris_field.match(x)
+ ][0]
+ pop_iris = pop_iris.loc[:, ["IRIS", pop_iris_field]].rename(
+ {
+ pop_iris_field: "POPULATION_"
+ + re.findall("([0-9]{2})", pop_iris_field)[0]
+ },
+ axis=1,
+ )
+
+ iris = arm.merge(iris, on="CODE_ARM", how="left")
+
+ # Compute metadata at CANTON level
+ cantons = ddf[("COG", "CANTON")]
+ if cantons.empty:
+ warnings.warn(f"{year=} metadata for cantons not constructed!")
+ else:
+
+ # Set pure "CANTON" code (without dep part) to prepare for
+ # join with IGN's CANTON geodataset
+ cantons["INSEE_CAN"] = cantons["CAN"].str[-2:]
+
+ # Add Lyon if missing (<2024): single CANTON since creation of the
+ # metropole, not covering the whole dept, so this should be added
+ # before the merge operation like Paris, Martinique, etc.
+ ix = cantons[
+ (cantons.DEP == "69") & (cantons.NCC.str.contains("LYON"))
+ ].index
+ if ix.empty:
+ cantons = pd.concat(
+ [
+ cantons,
+ pd.DataFrame(
+ [
+ {
+ "CAN": "69NR",
+ "DEP": "69",
+ "REG": "84",
+ "INSEE_CAN": "NR",
+ "LIBELLE": "Lyon",
+ }
+ ]
+ ),
+ ],
+ ignore_index=True,
+ )
+
+ # Merge CANTON metadata with COG metadata
+ cantons = cantons.merge(
+ # Nota : we do not have the CANTON -> ARR nesting as of yet
+ # (except of course as a geospatial join...)
+ cog_metadata.drop(
+ ["ARR", "LIBELLE_ARRONDISSEMENT"], axis=1
+ ).drop_duplicates(),
+ on=["REG", "DEP"],
+ # Note : Martinique (972) and Guyane (973) missing from CANTON
+ # as well as Paris (75) for older vintages
+ # -> go for outer join
+ how="outer",
+ )
+ keep = [
+ "INSEE_CAN",
+ "CAN",
+ "DEP",
+ "REG",
+ "BURCENTRAL",
+ "TYPECT",
+ "LIBELLE",
+ "LIBELLE_DEPARTEMENT",
+ "LIBELLE_REGION",
+ ]
+ cantons = cantons.loc[:, keep].rename(
+ {"LIBELLE": "LIBELLE_CANTON"}, axis=1
+ )
+
+ # Hack to set PARIS, GUYANE and MARTINIQUE with the same key as IGN's
+ # dataset (if truly missing)
+ for dep, label in {
+ # Paris missing for year <2024 / for year 2024, key is 99 in IGN
+ # datasets, NR in INSEE's
+ "75": "Paris",
+ "973": "Guyane",
+ "972": "Martinique",
+ }.items():
+ ix = cantons[cantons.DEP == dep].index
+ if dep == "75" or cantons.loc[ix, "CAN"].isnull().all():
+ cantons.loc[ix, "INSEE_CAN"] = "NR"
+ cantons.loc[ix, "CAN"] = (
+ cantons.loc[ix, "DEP"] + cantons.loc[ix, "INSEE_CAN"]
+ )
+ if cantons.loc[ix, "CAN"].isnull().all():
+ cantons.loc[ix, "LIBELLE_CANTON"] = label
+
+ cantons["SOURCE_METADATA"] = "Cartiflette d'après INSEE"
+
+ rename = {
+ "DEP": "INSEE_DEP",
+ "REG": "INSEE_REG",
+ # "ARR": "INSEE_ARR", <- careful, there is an INSEE_ARR already there!
+ "CODGEO": "INSEE_COM",
+ # "CAN": "INSEE_CAN", <- careful, there is an INSEE_CAN already there!
+ "CODE_ARM": "INSEE_ARM",
+ }
+
+ # Prepare field used for bringing IDF closer in further pipeline steps
+ return_dict = {}
+ ile_de_france = pd.DataFrame({"DEP": ["75", "92", "93", "94"]})
+ ile_de_france["IDF"] = 1
- # Merge TAGC metadata with COG metadata
- tagc_metadata = tagc.merge(cog_metadata)
+ for label, df in [
+ ("IRIS", iris),
+ ("COMMUNE", cities),
+ ("CANTON", cantons),
+ ("ARRONDISSEMENT_MUNICIPAL", arm),
+ ]:
+ if not df.empty:
+ df = df.replace(np.nan, pd.NA)
+ df = df.merge(ile_de_france, on="DEP", how="left")
+ df["IDF"] = df["IDF"].fillna(0).astype(int)
+ df = df.rename(rename, axis=1)
+ return_dict[label] = df
- return tagc_metadata
+ return return_dict
diff --git a/cartiflette/pipeline/prepare_geodatasets.py b/cartiflette/pipeline/prepare_geodatasets.py
new file mode 100644
index 00000000..2a1488ba
--- /dev/null
+++ b/cartiflette/pipeline/prepare_geodatasets.py
@@ -0,0 +1,394 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from contextlib import ExitStack, nullcontext
+from copy import deepcopy
+from functools import partial
+from itertools import product
+import logging
+import os
+import re
+from tempfile import TemporaryDirectory
+import traceback
+from typing import Union, List
+import warnings
+
+import geopandas as gpd
+from pebble import ThreadPool
+import s3fs
+
+from cartiflette.config import (
+ FS,
+ BUCKET,
+ PATH_WITHIN_BUCKET,
+ THREADS_DOWNLOAD,
+ INTERMEDIATE_FORMAT,
+ DATASETS_HIGH_RESOLUTION,
+)
+from cartiflette.pipeline_constants import (
+ PIPELINE_DOWNLOAD_ARGS,
+ PIPELINE_SIMPLIFICATION_LEVELS,
+ COG_TERRITOIRE,
+)
+from cartiflette.s3.geodataset import (
+ S3GeoDataset,
+ concat_s3geodataset,
+ from_frame,
+)
+
+logger = logging.getLogger(__name__)
+
+
+COMPILED_TERRITORY = re.compile(r"territory=([a-z\-]*)/", flags=re.IGNORECASE)
+
+
+def make_one_geodataset(
+ dset: S3GeoDataset,
+ with_municipal_district: bool,
+ simplification: int,
+ communal_districts: S3GeoDataset = None,
+) -> str:
+ """
+ Generate one geodataset and upload it to S3FileSystem
+
+ Parameters
+ ----------
+ dset : S3GeoDataset
+ Basic geodataset with full France coverage, already downloaded. Its
+ geometric mesh is read from its `borders` configuration; at the time
+ of writing, the dataset should be composed of cities, cantons or IRIS.
+ with_municipal_district : bool
+ Whether to substitute main cities (Paris, Lyon, Marseille) with
+ their municipal districts. Obviously, this can only be used with a
+ cities dataset.
+ simplification : int
+ Level of desired simplification.
+ communal_districts : S3GeoDataset, optional
+ Geodataset holding communal districts only (no plain cities), already
+ downloaded. Only needed when the mesh is 'COMMUNE' and
+ `with_municipal_district` is True. The default is None.
+
+ Returns
+ -------
+ uploaded : str
+ Uploaded file's path on S3FileSystem
+
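+ Example
+ -------
+ A minimal sketch (`cities_dset` is an illustrative, already available
+ S3GeoDataset of cities; 40 is one of PIPELINE_SIMPLIFICATION_LEVELS):
+
+ >>> with cities_dset as dset:
+ ...     uploaded = make_one_geodataset(
+ ...         dset, with_municipal_district=False, simplification=40
+ ...     )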
+ """
+
+ mesh = dset.config["borders"]
+
+ if mesh != "COMMUNE" and with_municipal_district:
+ raise ValueError(
+ "with_municipal_district is not authorized with this S3GeoDataset "
+ f"(found {mesh=} instead of 'COMMUNE')"
+ )
+
+ log = "Create %s geodatasets with simplification=%s"
+ if with_municipal_district:
+ log += " with municipal districts substitution"
+ logger.info(log, mesh, simplification)
+
+ kwargs = {"format_output": INTERMEDIATE_FORMAT}
+
+ # Note : must escape the ' for the js command in mapshaper,
+ # hence the raw string
+ source = r"Cartiflette d\'après IGN simplifié à " + f"{simplification} %"
+ new_dset = dset.copy()
+ if with_municipal_district:
+ # substitute communal districts
+ districts = new_dset.substitute_municipal_districts(
+ communal_districts=communal_districts.copy(), **kwargs
+ )
+ else:
+ districts = nullcontext()
+
+ with new_dset, districts:
+ processed_dset = districts if with_municipal_district else new_dset
+ processed_dset.simplify(simplification=simplification, **kwargs)
+ processed_dset.add_field("GEODATA_SOURCE", f"'{source}'")
+ processed_dset.to_s3()
+ uploaded = processed_dset.s3_dirpath
+
+ return uploaded
+
+
+def create_one_year_geodataset_batch(
+ year: Union[str, int],
+ format_output: str = "geojson",
+ simplifications_values: List[int] = None,
+ bucket: str = BUCKET,
+ path_within_bucket: str = PATH_WITHIN_BUCKET,
+ fs: s3fs.S3FileSystem = FS,
+) -> dict:
+ """
+ Merge the raw territorial files (cities, cantons, IRIS) into single
+ France-wide layers and generate the derived geodatasets for a given
+ vintage.
+
+ All files are retrieved from S3, projected to 4326 coordinates, then
+ merged using mapshaper. Every computation is done on disk, inside
+ a temporary directory.
+
+ Note that multithreading will be used. To debug, consider deactivating
+ threading by setting `cartiflette.config.THREADS_DOWNLOAD = 1`
+ beforehand.
+
+ Parameters
+ ----------
+ year : Union[str, int]
+ Desired vintage
+ format_output : str, optional
+ Final (and intermediate) formats to use. The default is "geojson"
+ simplifications_values : List[int], optional
+ List of simplification levels to compute (as percentage values
+ cast to integers). The default is None, which falls back to
+ PIPELINE_SIMPLIFICATION_LEVELS.
+ bucket : str, optional
+ Storage bucket on S3 FileSystem. The default is BUCKET.
+ path_within_bucket : str, optional
+ Path within S3 bucket used for storage. The default is
+ PATH_WITHIN_BUCKET.
+ fs : s3fs.S3FileSystem, optional
+ S3 file system used for storage of raw data. The default is FS.
+
+ Returns
+ -------
+ success : dict
+ {"year": True/False}
+
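+ Example
+ -------
+ A minimal sketch (assuming the raw source files for this vintage have
+ already been uploaded to the S3 bucket; returns {2023: True} on
+ success):
+
+ >>> created = create_one_year_geodataset_batch(
+ ...     2023, simplifications_values=[40]
+ ... )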
+ """
+
+ logger.info("-" * 50)
+ logger.info(f"Merging territorial files for {year=}")
+ logger.info("-" * 50)
+
+ if not simplifications_values:
+ simplifications_values = PIPELINE_SIMPLIFICATION_LEVELS
+
+ paths = (
+ f"{bucket}/{path_within_bucket}/"
+ "provider=IGN/dataset_family=*/"
+ "source=*/"
+ f"year={year}/"
+ "administrative_level=None/"
+ "crs=*/"
+ "origin=raw/"
+ "vectorfile_format=*/"
+ "territory=*/**/*.shp"
+ )
+
+ paths = fs.glob(paths)
+ dirs = {os.path.dirname(x) for x in paths}
+ territories = {t for x in dirs for t in COMPILED_TERRITORY.findall(x)}
+ territories = territories - {"france_entiere"}
+
+ if not territories:
+ warnings.warn(f"{year} not constructed (no territories available)")
+ return
+
+ logger.info("Identified territories:\n%s", "\n".join(territories))
+
+ config = {
+ "bucket": bucket,
+ "path_within_bucket": path_within_bucket,
+ "provider": "IGN",
+ "dataset_family": "ADMINEXPRESS",
+ "source": COG_TERRITOIRE[DATASETS_HIGH_RESOLUTION],
+ "borders": None,
+ "crs": "*",
+ "filter_by": "origin",
+ "value": "raw",
+ "vectorfile_format": "shp",
+ "simplification": 0,
+ "year": year,
+ "fs": fs,
+ }
+
+ uploaded = []
+
+ # Construct S3GeoDataset for municipal districts
+ raw_config = deepcopy(config)
+ kwargs = {"territory": "metropole", "filename": "ARRONDISSEMENT_MUNICIPAL"}
+ try:
+ districts = S3GeoDataset(**kwargs, **raw_config)
+ except ValueError:
+ # ARM is missing
+ districts = None
+
+ input_geodatasets = {}
+ # Retrieve raw files of cities, cantons and iris
+ dset_source_configs = {
+ "COMMUNE": PIPELINE_DOWNLOAD_ARGS["ADMIN-EXPRESS"][:3],
+ "CANTON": PIPELINE_DOWNLOAD_ARGS["ADMIN-EXPRESS"][:3],
+ "IRIS": PIPELINE_DOWNLOAD_ARGS["IRIS"][:3],
+ }
+ for mesh in "CANTON", "COMMUNE", "IRIS":
+
+ provider, family, source = dset_source_configs[mesh]
+ # Construct S3GeoDatasets for each territory (Guyane, metropole, ...)
+ # at mesh level (COMMUNE or CANTON)
+ mesh_config = deepcopy(config)
+ mesh_config["provider"] = provider
+ # Nota : filename for IRIS might be CONTOURS-IRIS.shp or IRIS_GE.shp
+ # while COMMUNE and CANTON are COMMUNE.shp and CANTON.shp
+ mesh_config["filename"] = f"*{mesh}*"
+ mesh_config["dataset_family"] = family
+ mesh_config["source"] = source
+ geodatasets = []
+ for territory in territories:
+ try:
+ geodatasets.append(
+ S3GeoDataset(territory=territory, **mesh_config)
+ )
+ except ValueError:
+ # not present for this territory and this mesh
+ logger.warning(
+ "file not found for %s on mesh=%s", territory, mesh
+ )
+ input_geodatasets[mesh] = None
+ continue
+
+ with TemporaryDirectory() as tempdir:
+ with ExitStack() as stack:
+ # download all datasets in context: download at enter
+ if THREADS_DOWNLOAD > 1:
+ threads = min(THREADS_DOWNLOAD, len(geodatasets))
+ with ThreadPool(threads) as pool:
+ geodatasets = list(
+ pool.map(
+ stack.enter_context,
+ geodatasets,
+ timeout=60 * 2,
+ ).result()
+ )
+ else:
+ geodatasets = [
+ stack.enter_context(dset) for dset in geodatasets
+ ]
+
+ if not geodatasets:
+ logger.warning(
+ "base geodataset from mesh=%s was not generated", mesh
+ )
+ continue
+
+ # concat S3GeoDataset
+ mesh_config.update(
+ {
+ "vectorfile_format": format_output,
+ "crs": 4326,
+ "borders": mesh,
+ "filter_by": "preprocessed",
+ "value": "before_cog",
+ "territory": "france",
+ "provider": "Cartiflette",
+ "dataset_family": "geodata",
+ }
+ )
+ dset = concat_s3geodataset(
+ geodatasets,
+ output_dir=tempdir,
+ output_name=mesh,
+ **mesh_config,
+ )
+
+ input_geodatasets[mesh] = dset.copy()
+
+ # clean intermediate datasets from local disk at exit (keep
+ # only concatenated S3GeoDataset, which exists only on local
+ # disk)
+
+ try:
+ # Capture ultramarine territories geometries from IRIS to complete the
+ # COMMUNE geodataset
+ with TemporaryDirectory() as tempdir:
+ with input_geodatasets["IRIS"].copy() as temp:
+ tom_from_iris = temp.only_ultramarine_territories().to_frame()
+ tom_from_iris = tom_from_iris.rename(
+ {"NOM_COM": "NOM"}, axis=1
+ )
+ cities = input_geodatasets["COMMUNE"].to_frame()
+ concat = gpd.pd.concat(
+ [tom_from_iris, cities], ignore_index=True
+ )
+ full_cities = from_frame(
+ concat, fs=fs, **input_geodatasets["COMMUNE"].config
+ )
+ input_geodatasets["COMMUNE"] = full_cities
+ except (AttributeError, KeyError, IndexError):
+ # AttributeError : input_geodatasets["IRIS"] is None
+ # IndexError : INSEE_COM field was not found in the dataset
+ pass
+
+ with (
+ input_geodatasets["COMMUNE"]
+ if input_geodatasets["COMMUNE"]
+ else nullcontext()
+ ) as commune, (
+ input_geodatasets["CANTON"]
+ if input_geodatasets["CANTON"]
+ else nullcontext()
+ ) as canton, (
+ input_geodatasets["IRIS"]
+ if input_geodatasets["IRIS"]
+ else nullcontext()
+ ) as iris, (
+ districts if districts else nullcontext()
+ ) as districts:
+ # download communal_districts and enter context for commune/canton/iris
+
+ if districts:
+ with_districts = [False, True]
+ else:
+ with_districts = [False]
+ warnings.warn("ARM could not be fetched")
+ args = (
+ list(product([commune], with_districts, simplifications_values))
+ + list((product([canton], [False], simplifications_values)))
+ + list((product([iris], [False], simplifications_values)))
+ )
+ args = [x for x in args if x[0]] # remove dsets with nullcontext
+
+ func = partial(
+ make_one_geodataset,
+ communal_districts=districts,
+ )
+
+ if THREADS_DOWNLOAD > 1:
+ # create geodatasets with multithreading
+ threads = min(THREADS_DOWNLOAD, len(args))
+ logger.info(
+ "Parallelizing simplifications with %s threads", threads
+ )
+ with ThreadPool(threads) as pool:
+ iterator = pool.map(func, *list(zip(*args))).result()
+
+ while True:
+ try:
+ uploaded.append(next(iterator))
+ except StopIteration:
+ break
+ except Exception:
+ logger.error(traceback.format_exc())
+ else:
+ # create geodatasets using a simple loop
+ for dset, with_municipal_district, simplification in args:
+ try:
+ uploaded.append(
+ func(
+ dset=dset,
+ with_municipal_district=with_municipal_district,
+ simplification=simplification,
+ )
+ )
+ except Exception:
+ logger.error(traceback.format_exc())
+
+ logger.info(f"Created files are : {uploaded}")
+
+ success = bool(uploaded)
+
+ return {year: success}
+
+
+if __name__ == "__main__":
+ logging.basicConfig(level=logging.INFO)
+ created = create_one_year_geodataset_batch(2023)
diff --git a/cartiflette/pipeline/prepare_mapshaper.py b/cartiflette/pipeline/prepare_mapshaper.py
deleted file mode 100644
index 3aae9457..00000000
--- a/cartiflette/pipeline/prepare_mapshaper.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import os
-
-from cartiflette.config import FS
-from cartiflette.s3 import list_raw_files_level, download_files_from_list
-
-
-def prepare_local_directory_mapshaper(
- path_bucket,
- borders="COMMUNE",
- territory="metropole",
- niveau_agreg="DEPARTEMENT",
- format_output="topojson",
- simplification=0,
- local_dir="temp",
- fs=FS,
-):
- """
- Prepares the local directory for processing with Mapshaper.
-
- This function creates a local directory structure and downloads
- raw shapefiles from the specified path in the file system.
-
- Parameters
- ----------
- path_bucket : str
- The path to the bucket in the file system.
- borders : str, optional
- The type of borders, by default "COMMUNE".
- niveau_agreg : str, optional
- The level of aggregation, by default "DEPARTEMENT".
- format_output : str, optional
- The output format, by default "topojson".
- simplification : int, optional
- The degree of simplification, by default 0.
- local_dir : str, optional
- The local directory for file storage, by default "temp".
- fs : FileSystem, optional
- The file system object, by default fs.
-
- Returns
- -------
- dict
- A dictionary containing paths for the original and destination directories.
-
- """
- local_dir = f"{local_dir}/{territory}"
- os.makedirs(local_dir, exist_ok=True)
- # Get all raw shapefiles from Minio
- list_raw_files = list_raw_files_level(fs, path_bucket, borders=borders)
- download_files_from_list(fs, list_raw_files, local_dir=local_dir)
- local_path_destination = (
- f"{local_dir}/{niveau_agreg}/{format_output}/{simplification=}"
- )
- os.makedirs(local_path_destination, exist_ok=True)
- paths = {"path_origin": local_dir, "path_destination": local_path_destination}
- return paths
diff --git a/cartiflette/pipeline_constants.py b/cartiflette/pipeline_constants.py
new file mode 100644
index 00000000..e3347d87
--- /dev/null
+++ b/cartiflette/pipeline_constants.py
@@ -0,0 +1,268 @@
+# -*- coding: utf-8 -*-
+
+import os
+from cartiflette.config import DATASETS_HIGH_RESOLUTION
+
+# Keys of COG_TERRITOIRE and IRIS are booleans: True selects the high
+# resolution dataset, False the low resolution one
+COG_TERRITOIRE = {
+ False: "EXPRESS-COG-CARTO-TERRITOIRE",
+ True: "EXPRESS-COG-TERRITOIRE",
+}
+IRIS = {
+ # Keys are DATASETS_HIGH_RESOLUTION's potential value
+ False: "CONTOUR-IRIS",
+ True: "IRIS-GE",
+}
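+
+# Illustrative lookup (a sketch; the actual value of DATASETS_HIGH_RESOLUTION
+# is set in cartiflette.config). With DATASETS_HIGH_RESOLUTION = False:
+#   COG_TERRITOIRE[DATASETS_HIGH_RESOLUTION] -> "EXPRESS-COG-CARTO-TERRITOIRE"
+#   IRIS[DATASETS_HIGH_RESOLUTION] -> "CONTOUR-IRIS"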
+
+PIPELINE_DOWNLOAD_ARGS = {
+ "ADMIN-EXPRESS": [
+ "IGN",
+ "ADMINEXPRESS",
+ COG_TERRITOIRE[DATASETS_HIGH_RESOLUTION],
+ [
+ "guadeloupe",
+ "martinique",
+ "guyane",
+ "reunion",
+ "mayotte",
+ "metropole",
+ ],
+ ],
+ "IRIS": [
+ "IGN",
+ "IRIS",
+ IRIS[DATASETS_HIGH_RESOLUTION],
+ [
+ "guadeloupe",
+ "martinique",
+ "guyane",
+ "reunion",
+ "mayotte",
+ "metropole",
+ "saint-pierre-et-miquelon",
+ "saint-barthelemy",
+ "saint-martin",
+ ],
+ ],
+ "COG": [
+ "Insee",
+ "COG",
+ [
+ "COMMUNE",
+ "CANTON",
+ "ARRONDISSEMENT",
+ "DEPARTEMENT",
+ "REGION",
+ "COMMUNE-OUTRE-MER",
+ ],
+ "france_entiere",
+ ],
+ "TAGC": ["Insee", "TAGC", "APPARTENANCE"],
+ "TAGIRIS": ["Insee", "TAGIRIS", "APPARTENANCE"],
+ "CORRESPONDANCE-SIREN-INSEE-COMMUNES": [
+ "DGCL",
+ "BANATIC",
+ "CORRESPONDANCE-SIREN-INSEE-COMMUNES",
+ ],
+ "EPCI-FP": ["Insee", "ZONAGES", "EPCI-FP"],
+ "EPT": ["Insee", "ZONAGES", "EPT"],
+ "UNITES-URBAINES": ["Insee", "ZONAGES", "UNITES-URBAINES"],
+ "BASSINS-VIE": ["Insee", "ZONAGES", "BASSINS-VIE"],
+ "AIRES-ATTRACTION-VILLES": ["Insee", "ZONAGES", "AIRES-ATTRACTION-VILLES"],
+ "ZONES-EMPLOI": ["Insee", "ZONAGES", "ZONES-EMPLOI"],
+ "POPULATION": [
+ "Insee",
+ "POPULATION",
+ "POPULATION-IRIS-FRANCE-HORS-MAYOTTE",
+ ],
+ "POPULATION-COM": ["Insee", "POPULATION", "POPULATION-IRIS-COM"],
+}
+
+if os.environ.get("ENVIRONMENT", "test") != "test":
+ PIPELINE_CRS = [2154, 4326, 3857]
+ PIPELINE_SIMPLIFICATION_LEVELS = [100, 40]
+ PIPELINE_FORMATS = ["geojson", "topojson", "gpkg"]
+else:
+ PIPELINE_CRS = [4326]
+ PIPELINE_SIMPLIFICATION_LEVELS = [40]
+ PIPELINE_FORMATS = ["topojson"]
+
+
+# which dissolutions can be operated from a given raw geodataset, depending
+# on its source (either IRIS or COMMUNE)
+AVAILABLE_DISSOLUTIONS_FROM_RAW_MESH = {
+ "IRIS": [
+ "IRIS",
+ "COMMUNE",
+ "ARRONDISSEMENT_MUNICIPAL",
+ "EPCI",
+ "EPT",
+ "UNITE_URBAINE",
+ "ZONE_EMPLOI",
+ "BASSIN_VIE",
+ "AIRE_ATTRACTION_VILLES",
+ "ARRONDISSEMENT",
+ "DEPARTEMENT",
+ "REGION",
+ "TERRITOIRE",
+ ],
+ "ARRONDISSEMENT_MUNICIPAL": [
+ "ARRONDISSEMENT_MUNICIPAL",
+ ],
+ "COMMUNE": [
+ "COMMUNE",
+ "EPCI",
+ "EPT",
+ "UNITE_URBAINE",
+ "ZONE_EMPLOI",
+ "BASSIN_VIE",
+ "AIRE_ATTRACTION_VILLES",
+ "ARRONDISSEMENT",
+ "DEPARTEMENT",
+ "REGION",
+ "TERRITOIRE",
+ ],
+ "CANTON": [
+ "CANTON",
+ ],
+}
+
+# which territorial splits can be derived from a given geodataset (which
+# borders' levels has been deduced from raw sources by dissolution)
+AVAILABLE_TERRITORIAL_SPLITS_FOR_BORDERS = {
+ # borders -> [filter_by1, filter_by2, ... ]
+ "IRIS": [
+ # "COMMUNE" -> too many files generated, enable this only if a use case arises
+ # CANTON -> only if INSEE can provide a junction table between IRIS and CANTON
+ "BASSIN_VIE",
+ "ZONE_EMPLOI",
+ "UNITE_URBAINE",
+ "AIRE_ATTRACTION_VILLES",
+ "EPT",
+ "ARRONDISSEMENT",
+ "DEPARTEMENT",
+ "REGION",
+ "TERRITOIRE",
+ "FRANCE_ENTIERE",
+ "FRANCE_ENTIERE_IDF_DROM_RAPPROCHES",
+ ],
+ "ARRONDISSEMENT_MUNICIPAL": [
+ "BASSIN_VIE",
+ "ZONE_EMPLOI",
+ "UNITE_URBAINE",
+ "AIRE_ATTRACTION_VILLES",
+ "EPT",
+ "ARRONDISSEMENT",
+ "DEPARTEMENT",
+ "REGION",
+ "TERRITOIRE",
+ "FRANCE_ENTIERE",
+ "FRANCE_ENTIERE_IDF_DROM_RAPPROCHES",
+ ],
+ "COMMUNE": [
+ "BASSIN_VIE",
+ "ZONE_EMPLOI",
+ "UNITE_URBAINE",
+ "AIRE_ATTRACTION_VILLES",
+ "EPT",
+ "ARRONDISSEMENT",
+ "DEPARTEMENT",
+ "REGION",
+ "TERRITOIRE",
+ "FRANCE_ENTIERE",
+ "FRANCE_ENTIERE_IDF_DROM_RAPPROCHES",
+ ],
+ "EPCI": [
+ "DEPARTEMENT",
+ "REGION",
+ "TERRITOIRE",
+ "FRANCE_ENTIERE",
+ "FRANCE_ENTIERE_DROM_RAPPROCHES",
+ ],
+ "EPT": [
+ "DEPARTEMENT",
+ "REGION",
+ "TERRITOIRE",
+ "FRANCE_ENTIERE",
+ "FRANCE_ENTIERE_DROM_RAPPROCHES",
+ ],
+ "CANTON": [
+ "DEPARTEMENT",
+ "REGION",
+ "TERRITOIRE",
+ "FRANCE_ENTIERE",
+ "FRANCE_ENTIERE_IDF_DROM_RAPPROCHES",
+ ],
+ "ARRONDISSEMENT": [
+ "DEPARTEMENT",
+ "REGION",
+ "TERRITOIRE",
+ "FRANCE_ENTIERE",
+ "FRANCE_ENTIERE_IDF_DROM_RAPPROCHES",
+ ],
+ "DEPARTEMENT": [
+ "REGION",
+ "TERRITOIRE",
+ "FRANCE_ENTIERE",
+ "FRANCE_ENTIERE_IDF_DROM_RAPPROCHES",
+ ],
+ "REGION": [
+ "TERRITOIRE",
+ "FRANCE_ENTIERE",
+ "FRANCE_ENTIERE_DROM_RAPPROCHES",
+ ],
+ "BASSIN_VIE": [
+ "TERRITOIRE",
+ "FRANCE_ENTIERE",
+ "FRANCE_ENTIERE_DROM_RAPPROCHES",
+ ],
+ "ZONE_EMPLOI": [
+ "TERRITOIRE",
+ "FRANCE_ENTIERE",
+ "FRANCE_ENTIERE_DROM_RAPPROCHES",
+ ],
+ "UNITE_URBAINE": [
+ "TERRITOIRE",
+ "FRANCE_ENTIERE",
+ "FRANCE_ENTIERE_DROM_RAPPROCHES",
+ ],
+ "AIRE_ATTRACTION_VILLES": [
+ "TERRITOIRE",
+ "FRANCE_ENTIERE",
+ "FRANCE_ENTIERE_DROM_RAPPROCHES",
+ ],
+}
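+
+# Illustrative read (a sketch): starting from the raw "COMMUNE" mesh, one can
+# dissolve it to "DEPARTEMENT" borders
+# (see AVAILABLE_DISSOLUTIONS_FROM_RAW_MESH["COMMUNE"]), then split the result
+# by "REGION" (see AVAILABLE_TERRITORIAL_SPLITS_FOR_BORDERS["DEPARTEMENT"]).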
+
+# Check integrity
+all_dissolutions = {
+ dissolution
+ for key, dissolutions in AVAILABLE_DISSOLUTIONS_FROM_RAW_MESH.items()
+ for dissolution in dissolutions
+}
+
+all_borders = {
+ split
+ for key, splits in AVAILABLE_TERRITORIAL_SPLITS_FOR_BORDERS.items()
+ for split in splits
+} | {
+ # unwanted splits (too many files without a due use case)
+ "IRIS", # -> will never need to make a map for a given IRIS
+ "COMMUNE", # -> should never need to make a map for a given COMMUNE
+ "ARRONDISSEMENT_MUNICIPAL", # -> should never need to make a map for a given ARM
+ "CANTON", # -> might need it ?
+ "EPCI", # -> might need it ?
+}
+
+differences = (all_borders ^ all_dissolutions) - {
+ "FRANCE_ENTIERE",
+ "FRANCE_ENTIERE_DROM_RAPPROCHES",
+ "FRANCE_ENTIERE_IDF_DROM_RAPPROCHES",
+}
+if differences:
+ raise ValueError(
+ "every available dissolution from "
+ "AVAILABLE_DISSOLUTIONS_FROM_RAW_MESH must also appear as a "
+ "territorial split in AVAILABLE_TERRITORIAL_SPLITS_FOR_BORDERS "
+ "(and vice versa). Found the following "
+ f"differences : {differences}"
+ )
diff --git a/cartiflette/s3/__init__.py b/cartiflette/s3/__init__.py
index 1f124522..6a12633b 100644
--- a/cartiflette/s3/__init__.py
+++ b/cartiflette/s3/__init__.py
@@ -1,8 +1,12 @@
-from .upload_raw_s3 import upload_s3_raw
-from .list_files_s3 import download_files_from_list, list_raw_files_level
from .download_vectorfile import download_vectorfile_url_all
+from .geodataset import S3GeoDataset, concat_s3geodataset
+from .dataset import S3Dataset
+from .inventory import make_s3_inventory
__all__ = [
- "upload_s3_raw", "download_files_from_list", "list_raw_files_level",
- "download_vectorfile_url_all"
+ "download_vectorfile_url_all",
+ "S3GeoDataset",
+ "S3Dataset",
+ "concat_s3geodataset",
+ "make_s3_inventory",
]
diff --git a/cartiflette/s3/dataset.py b/cartiflette/s3/dataset.py
new file mode 100644
index 00000000..5f9b7820
--- /dev/null
+++ b/cartiflette/s3/dataset.py
@@ -0,0 +1,299 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Generic class for working with a dataset stored on S3
+"""
+
+import logging
+import os
+import shutil
+import tempfile
+from typing import List
+import warnings
+
+from diskcache import Cache
+import pandas as pd
+from s3fs import S3FileSystem
+from retrying import retry
+
+
+from cartiflette.config import FS, RETRYING, MAPSHAPER_QUIET
+from cartiflette.utils import (
+ create_path_bucket,
+ ConfigDict,
+ DICT_CORRESP_ADMINEXPRESS,
+)
+
+logger = logging.getLogger(__name__)
+cache = Cache("cartiflette-s3-cache", timeout=3600)
+
+if not RETRYING:
+ # patch retrying
+ def retry(*args, **kwargs):
+ def decorator(func):
+ return func
+
+ return decorator
+
+
+class S3Dataset:
+ """
+ Base class representing a dataset stored on the S3
+
+ This class is used on it's own only for tabular datasets (to be joined to
+ S3GeoDataset for enrichment)
+ """
+
+ files = None
+ main_filename = None
+ s3_dirpath = None
+ local_dir = None
+
+ def __init__(
+ self,
+ fs: S3FileSystem = FS,
+ filename: str = "*",
+ build_from_local: str = None,
+ **config: ConfigDict,
+ ):
+ """
+ Create a S3Dataset.
+
+ Parameters
+ ----------
+ fs : S3FileSystem, optional
+ S3FileSystem used for storage. The default is FS.
+ filename : str, optional
+ In case there are multiple files in the same folder, set this to
+ avoid catching the wrong file from the S3FileSystem
+ (this should only occur with the download of raw datasets where
+ COMMUNE.shp and ARRONDISSEMENT_MUNICIPAL.shp are stored in the
+ same directory).
+ The default is "*". For instance, "COMMUNE.shp".
+ build_from_local : str, optional
+ If the object is generated from local files, should be the path
+ to the main file of the dataset.
+ If None, both the path and the main filename will be deduced from
+ the S3FileSystem.
+ **config : ConfigDict
+ Other arguments to define the path on the S3 to the dataset.
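+
+ Example
+ -------
+ A minimal sketch (the configuration values are illustrative and must
+ point to a file actually stored on the S3 bucket):
+
+ >>> dset = S3Dataset(
+ ...     filename="COG_metadata.csv",
+ ...     provider="Insee",
+ ...     dataset_family="COG",
+ ...     source="COMMUNE",
+ ...     year=2023,
+ ... )
+ >>> with dset:
+ ...     df = dset.to_frame()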
+ """
+ self.fs = fs
+ self.config = config
+ self.build_from_local = build_from_local
+ self.local_files = []
+
+ self.filename = filename.rsplit(".", maxsplit=1)[0]
+
+ self.source = (
+ f"{config.get('provider', '')}:{config.get('source', '')}"
+ )
+
+ if build_from_local and not os.path.exists(build_from_local):
+ raise ValueError(f"File not found at {build_from_local}")
+
+ self.get_path_of_dataset()
+
+ def __str__(self):
+ return f"<S3Dataset: {self.source}>"
+
+ def __repr__(self):
+ return self.__str__()
+
+ def __enter__(self):
+ "download file into local folder at enter"
+ if not self.build_from_local:
+ self.local_dir = tempfile.mkdtemp()
+ self.to_local_folder_for_mapshaper()
+ return self
+
+ def _get_columns(self, **kwargs):
+ "return the current dataset's columns"
+ df = self.to_frame(**kwargs, nrows=5)
+ return df.columns.tolist()
+
+ @staticmethod
+ def find_column_name(column: str, columns: List[str]) -> str:
+ """
+ Retrieve a column's full name among available columns, using a
+ compiled regex expression from DICT_CORRESP_ADMINEXPRESS.
+
+ Parameters
+ ----------
+ column : str
+ The searched column
+ columns : List[str]
+ The list of columns to search into.
+
+ Raises
+ ------
+ ValueError
+ If the searched column corresponds to more than one result.
+ IndexError
+ If the searched column is not found.
+
+
+ Returns
+ -------
+ str
+ The column's full name.
+
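+ Example
+ -------
+ A minimal sketch (column names are illustrative; the exact match
+ depends on the regex stored in DICT_CORRESP_ADMINEXPRESS):
+
+ >>> cols = ["INSEE_DEP", "INSEE_REG", "geometry"]
+ >>> S3Dataset.find_column_name("DEPARTEMENT", cols)  # expected "INSEE_DEP"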
+ """
+ compiled = DICT_CORRESP_ADMINEXPRESS[column]
+ founds = [col for col in columns if compiled.match(col)]
+ if len(founds) > 1:
+ raise ValueError(f"{column=} matched multiple columns : {founds=}")
+ try:
+ return founds[0]
+ except IndexError as exc:
+ raise IndexError(
+ f"{column=}/{compiled=} not found among {columns=}"
+ ) from exc
+
+ def to_frame(self, **kwargs) -> pd.DataFrame:
+ return pd.read_csv(
+ os.path.join(self.local_dir, self.main_filename), **kwargs
+ )
+
+ def clean(self):
+ "remove files from local dir"
+ try:
+ try:
+ shutil.rmtree(self.local_dir)
+ except FileNotFoundError:
+ pass
+ except Exception as exc:
+ warnings.warn(exc)
+
+ def __exit__(self, *args, **kwargs):
+ "remove tempfiles at exit"
+ self.clean()
+
+ def get_path_of_dataset(self):
+ "retrieve dataset's full paths on S3"
+ path = os.path.dirname(create_path_bucket(self.config))
+ search = f"{path}/**/{self.filename}"
+ if self.filename != "*":
+ search += ".*"
+
+ init_level = logging.getLogger("botocore.credentials").level
+ if MAPSHAPER_QUIET:
+ logging.getLogger("botocore.credentials").setLevel(
+ logging.CRITICAL
+ )
+ self.s3_files = self.fs.glob(search)
+ logging.getLogger("botocore.credentials").setLevel(init_level)
+
+ if self.build_from_local:
+ # This S3Dataset has been created from a local file
+ self.s3_dirpath = path
+ self.local_dir = os.path.dirname(self.build_from_local)
+ self.main_filename = os.path.basename(self.build_from_local)
+
+ return
+
+ if not self.s3_files:
+ raise ValueError(
+ f"this dataset is not available on S3 on {search}"
+ )
+
+ if len(self.s3_files) > 1:
+ main_filename = (
+ self.s3_files[0].rsplit(".", maxsplit=1)[0] + ".shp"
+ )
+ else:
+ main_filename = self.s3_files[0]
+
+ self.main_filename = os.path.basename(main_filename)
+ self.s3_dirpath = os.path.dirname(main_filename)
+
+ @retry(stop_max_attempt_number=3, wait_fixed=2000)
+ def to_s3(self):
+ "upload file to S3"
+ target = self.s3_dirpath
+ if not target.endswith("/"):
+ target += "/"
+ logger.debug("sending %s -> %s", self.local_dir, target)
+
+ init_level = logging.getLogger("botocore.credentials").level
+ if MAPSHAPER_QUIET:
+ logging.getLogger("botocore.credentials").setLevel(
+ logging.CRITICAL
+ )
+ self.fs.put(self.local_dir + "/*", target, recursive=True)
+ logging.getLogger("botocore.credentials").setLevel(init_level)
+
+ def _read(self, src: str) -> bytes:
+ """
+ Read bytes from a file on S3FileSystem with disk cache support
+
+ Parameters
+ ----------
+ src : str
+ Source of file
+
+ Returns
+ -------
+ bytes
+ File content
+
+ """
+ try:
+ return cache[src]
+ except KeyError:
+ init_level = logging.getLogger("botocore.credentials").level
+ if MAPSHAPER_QUIET:
+ logging.getLogger("botocore.credentials").setLevel(
+ logging.CRITICAL
+ )
+ with self.fs.open(src, "rb") as f:
+ content = f.read()
+ logging.getLogger("botocore.credentials").setLevel(init_level)
+ cache[src] = content
+ return content
+
+ def download(self, src: str, dest: str):
+ """
+ Download a file from S3FileSystem to localdir with cache support
+
+ Parameters
+ ----------
+ src : str
+ Path of source file on S3FileSystem
+ dest : str
+ Path to write the file's content on local directory.
+
+ Returns
+ -------
+ None.
+
+ """
+ content = self._read(src)
+ with open(dest, "wb") as f:
+ f.write(content)
+
+ def to_local_folder_for_mapshaper(self):
+ "download to local dir and prepare for use with mapshaper"
+
+ if not self.s3_files:
+ raise ValueError(
+ f"this dataset is not available on S3 : {self.s3_dirpath}"
+ )
+
+ files = []
+
+ # Get all files (plural in case of shapefile) from Minio
+ logger.debug("downloading %s to %s", self.s3_files, self.local_dir)
+ for file in self.s3_files:
+ path = f"{self.local_dir}/{file.rsplit('/', maxsplit=1)[-1]}"
+ self.download(file, path)
+ logger.info("file written to %s", path)
+ files.append(path)
+
+ self.local_files = files
+
+ def update_s3_path_evaluation(self):
+ path = os.path.dirname(create_path_bucket(self.config))
+ self.s3_dirpath = path
diff --git a/cartiflette/s3/geodataset.py b/cartiflette/s3/geodataset.py
new file mode 100644
index 00000000..3379ab2a
--- /dev/null
+++ b/cartiflette/s3/geodataset.py
@@ -0,0 +1,1077 @@
+# -*- coding: utf-8 -*-
+
+from contextlib import ExitStack
+from copy import deepcopy
+from glob import glob
+from itertools import product
+import logging
+import os
+import re
+import shutil
+import tempfile
+from typing import List
+
+try:
+ from typing import Self
+except ImportError:
+ # python < 3.11
+ Self = "S3GeoDataset"
+
+import fiona
+import geopandas as gpd
+from pebble import ThreadPool
+from s3fs import S3FileSystem
+
+from .dataset import S3Dataset
+from cartiflette.mapshaper import (
+ mapshaper_convert_reproject,
+ mapshaper_enrich,
+ mapshaper_bring_closer,
+ mapshaper_split,
+ mapshaper_dissolve,
+ mapshaper_concat,
+ mapshaper_remove_cities_with_districts,
+ mapshaper_process_communal_districts,
+ mapshaper_combine_districts_and_cities,
+ mapshaper_simplify,
+ mapshaper_add_field,
+ mapshaper_capture_cities_from_ultramarine_territories,
+)
+from cartiflette.utils import ConfigDict
+from cartiflette.config import (
+ FS,
+ THREADS_DOWNLOAD,
+ INTERMEDIATE_FORMAT,
+ MAPSHAPER_QUIET,
+)
+from cartiflette.pipeline_constants import PIPELINE_CRS, PIPELINE_FORMATS
+from cartiflette.utils.dict_correspondance import (
+ create_format_driver,
+ create_format_standardized,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class S3GeoDataset(S3Dataset):
+ """
+ Base class representing a geodataset stored on the S3
+
+ An instance can either be an existing file loaded from the S3 or a new
+ geodataset in the process of creation. In that case, a warning will be
+ displayed at creation to alert that the file is not present on the S3
+ (yet).
+ """
+
+ def __str__(self):
+ return f"<S3GeoDataset: {self.source}>"
+
+ def __copy__(self):
+ """
+ Copy a S3GeoDataset. If the original S3GeoDataset has already a
+ local_dir attribute, this will create a new tempdir inside it.
+ Note that this new tempdir will be removed at the primary S3GeoDataset
+ object's __exit__ method execution.
+
+ Returns
+ -------
+ new : S3GeoDataset
+ Copied S3GeoDataset.
+
+ """
+
+ if os.path.exists(os.path.join(self.local_dir, self.main_filename)):
+ # file is already on local disk -> create a new tempdir that should
+ # be cleaned on __exit__method anyway
+ new_tempdir = tempfile.mkdtemp()
+ target_name = self.main_filename.rsplit(".", maxsplit=1)[0]
+ for file in glob(os.path.join(self.local_dir, f"{target_name}.*")):
+ shutil.copy(file, new_tempdir)
+
+ new = S3GeoDataset(
+ self.fs,
+ self.filename,
+ build_from_local=os.path.join(
+ self.local_dir, self.main_filename
+ ),
+ **deepcopy(self.config),
+ )
+ new.local_dir = new_tempdir
+
+ else:
+ new = S3GeoDataset(
+ self.fs,
+ self.filename,
+ self.build_from_local,
+ **deepcopy(self.config),
+ )
+
+ new.main_filename = self.main_filename
+
+ return new
+
+ def to_format(self, format_output: str, epsg: int):
+ if format_output == INTERMEDIATE_FORMAT and epsg == 4326:
+ return self
+
+ if format_output in {
+ "shapefile",
+ "geojson",
+ "topojson",
+ "json",
+ "dbf",
+ "csv",
+ "tsv",
+ "svg",
+ }:
+ self.reproject(epsg=epsg, format_output=format_output)
+ else:
+ getattr(self, f"to_{format_output}")(epsg)
+ return self
+
+ def to_gpkg(self, epsg: int):
+ """
+ Replace the current main_file by a geopackage format (not handled by
+ mapshaper, needs geopandas)
+ """
+
+ # init_level = logging.getLogger("pyogrio").level
+ # if MAPSHAPER_QUIET:
+ # logging.getLogger("pyogrio._io").setLevel(logging.CRITICAL)
+
+ # try:
+ path = os.path.join(self.local_dir, self.main_filename)
+ path = path.rsplit(".", maxsplit=1)[0] + ".gpkg"
+ gdf = self.to_frame()
+ if epsg != 4326:
+ gdf = gdf.to_crs(epsg)
+ gdf.to_file(path, driver="GPKG", engine="fiona")
+ self._substitute_main_file(path)
+ self.config["vectorfile_format"] = "gpkg"
+ self.config["crs"] = epsg
+ self.update_s3_path_evaluation()
+ # except Exception:
+ # raise
+ # finally:
+ # logging.getLogger("pyogrio").setLevel(init_level)
+
+ # def to_shapefile(self):
+ # """
+ # TODO Quick and dirty hack, to be removed to handle native mapshaper
+ # output
+ # Replace the current main_file by a shapefile format (using geopandas)
+ # """
+ # path = os.path.join(self.local_dir, self.main_filename)
+ # path = path.rsplit(".", maxsplit=1)[0] + ".shp"
+ # self.to_frame().to_file(path)
+ # self._substitute_main_file(path)
+ # self.config["vectorfile_format"] = "shp"
+ # self.update_s3_path_evaluation()
+
+ def to_frame(self, **kwargs) -> gpd.GeoDataFrame:
+ "Read the geodataset from local file"
+ with fiona.Env(OGR_GEOJSON_MAX_OBJ_SIZE="0"):
+ return gpd.read_file(
+ os.path.join(self.local_dir, self.main_filename),
+ engine="fiona",
+ **kwargs,
+ )
+
+ def _get_columns(self, **kwargs):
+ "Get the columns of the dataset"
+ df = self.to_frame(**kwargs, rows=5)
+ return df.columns.tolist()
+
+ def copy(self):
+ """
+ Create a deepcopy of the S3GeoDataset (with a copy of initial file on
+ a new local dir if the initial object has a local file)
+ """
+ return self.__copy__()
+
+ def _substitute_main_file(self, new_file: str):
+ "Set a new file as reference for the S3GeoDataset from local disk"
+ if not os.path.dirname(new_file) == self.local_dir:
+ raise ValueError(
+ f"cannot substitute main_file with {new_file=} and "
+ f"{self.local_dir=} : directories are not identical"
+ )
+
+ if os.path.basename(new_file) == self.main_filename:
+ return
+
+ os.unlink(f"{self.local_dir}/{self.main_filename}")
+ self.main_filename = os.path.basename(new_file)
+
+ def reproject(
+ self,
+ epsg: int = 4326,
+ format_output: str = "geojson",
+ quiet: bool = MAPSHAPER_QUIET,
+ ):
+ "project to a given EPSG using mapshaper"
+ input_file = f"{self.local_dir}/{self.main_filename}"
+
+ new_file = mapshaper_convert_reproject(
+ input_file=input_file,
+ epsg=epsg,
+ output_dir=self.local_dir,
+ output_name=self.main_filename.rsplit(".", maxsplit=1)[0],
+ output_format=format_output,
+ filter_by=self.config["territory"],
+ quiet=quiet,
+ )
+ self._substitute_main_file(new_file)
+ self.config["crs"] = epsg
+ self.config["vectorfile_format"] = format_output
+ self.update_s3_path_evaluation()
+ return new_file
+
+ def add_field(
+ self,
+ label: str,
+ value: str,
+ format_output: str = "geojson",
+ quiet: bool = MAPSHAPER_QUIET,
+ ):
+ "add a static/dynamic field using mapshaper"
+ input_geodata = f"{self.local_dir}/{self.main_filename}"
+ output = mapshaper_add_field(
+ input_file=input_geodata,
+ label=label,
+ value=value,
+ output_dir=self.local_dir,
+ output_name=self.main_filename.rsplit(".", maxsplit=1)[0],
+ output_format=format_output,
+ quiet=quiet,
+ )
+ self._substitute_main_file(output)
+
+ def enrich(
+ self,
+ metadata_file: S3Dataset,
+ keys: list,
+ dtype: dict,
+ drop: list,
+ rename: dict,
+ format_output: str = "geojson",
+ quiet: bool = MAPSHAPER_QUIET,
+ ):
+ "enrich with metadata using mapshaper"
+ input_metadata = (
+ f"{metadata_file.local_dir}/{metadata_file.main_filename}"
+ )
+ input_geodata = f"{self.local_dir}/{self.main_filename}"
+ output = mapshaper_enrich(
+ input_geodata_file=input_geodata,
+ input_metadata_file=input_metadata,
+ keys=keys,
+ dtype=dtype,
+ drop=drop,
+ rename=rename,
+ output_dir=self.local_dir,
+ output_name=self.main_filename.rsplit(".", maxsplit=1)[0],
+ output_format=format_output,
+ quiet=quiet,
+ )
+ self._substitute_main_file(output)
+
+ def simplify(
+ self,
+ format_output: str,
+ simplification: int = 0,
+ quiet: bool = MAPSHAPER_QUIET,
+ ):
+ "simplify the geometries"
+ simplification = simplification if simplification else 0
+ if simplification != 0:
+ option_simplify = (
+ f"-simplify {simplification}% interval=.5 -clean "
+ )
+ else:
+ option_simplify = ""
+
+ input_geodata = f"{self.local_dir}/{self.main_filename}"
+ output = mapshaper_simplify(
+ input_geodata,
+ option_simplify=option_simplify,
+ output_dir=self.local_dir,
+ output_name=self.main_filename.rsplit(".", maxsplit=1)[0],
+ output_format=format_output,
+ quiet=quiet,
+ )
+
+ # update path on S3
+ self.config["simplification"] = simplification
+ self._substitute_main_file(output)
+ self.update_s3_path_evaluation()
+
+ if format_output.lower() == "topojson":
+ # cannot fix geometries with geopandas anyway
+ return
+
+ format_standardized = create_format_standardized()
+ gpd_driver = create_format_driver()
+ format_write = format_standardized[format_output.lower()]
+ driver = gpd_driver[format_write]
+
+ # Ensure geometries' validity
+ gdf = gpd.read_file(output, engine="fiona")
+ if not gdf["geometry"].is_valid.all():
+ gdf["geometry"] = gdf["geometry"].buffer(0)
+ gdf.to_file(output, driver=driver, engine="fiona")
+
+ def dissolve(
+ self,
+ by: List[str],
+ copy_fields: List[str] = None,
+ calc: List[str] = None,
+ format_output: str = "geojson",
+ quiet: bool = MAPSHAPER_QUIET,
+ ):
+ """
+ Dissolve geometries and rename local file using mapshaper.
+
+ Dissolve geometries on fields `by`, keeping fields `copy_fields`. Other
+ fields can be computed using javascript functions through the `calc`
+ argument. The original file will be overwritten, then renamed to
+ {by}.{format_output}. self.main_filename will be updated.
+
+
+ Parameters
+ ----------
+ by : List[str]
+ Fields used to dissolve
+ calc : List[str], optional
+ Fields to compute, described as valid js
+ functions. For instance ["POPULATION=sum(POPULATION)"]. The default
+ is None.
+ copy_fields : List[str], optional
+ Copies values from the first feature in each group of dissolved
+ features. The default is None.
+ format_output : str, optional
+ Output format. The default is geojson
+ quiet : bool, optional
+ If True, inhibits console messages. The default is MAPSHAPER_QUIET.
+
+ Returns
+ -------
+ None.
+
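+ Example
+ -------
+ A minimal sketch, run on an already downloaded geodataset (field
+ names are illustrative and must exist in the current layer):
+
+ >>> dset.dissolve(
+ ...     by=["INSEE_DEP"],
+ ...     copy_fields=["LIBELLE_DEPARTEMENT"],
+ ...     calc=["POPULATION=sum(POPULATION)"],
+ ...     format_output="geojson",
+ ... )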
+ """
+ init = f"{self.local_dir}/{self.main_filename}"
+ out = mapshaper_dissolve(
+ input_file=init,
+ by=by,
+ copy_fields=copy_fields,
+ calc=calc,
+ output_dir=self.local_dir,
+ output_name="_".join(by),
+ output_format=format_output,
+ quiet=quiet,
+ )
+ self._substitute_main_file(out)
+
+ def bring_drom_closer(
+ self,
+ level_agreg: str = "DEPARTEMENT",
+ format_output: str = "geojson",
+ bring_out_idf: bool = True,
+ quiet: bool = MAPSHAPER_QUIET,
+ ):
+ """
+ Bring ultramarine territories closer to France. This method is executed
+ **IN PLACE** and the attribute self.main_file will reference the new
+ geodataset.
+
+ Parameters
+ ----------
+ level_agreg : str, optional
+ The desired aggregation level. The default is "DEPARTEMENT".
+ Should be among ['AIRE_ATTRACTION_VILLES', 'BASSIN_VIE',
+ 'DEPARTEMENT', 'EMPRISES', 'REGION', 'UNITE_URBAINE',
+ 'ZONE_EMPLOI']
+ format_output : str, optional
+ The desired output format (which will also be used for intermediate
+ files creation). The default is "geojson".
+ bring_out_idf : bool, optional
+ If True, will extract IdF and zoom on it. The default is True.
+ quiet : bool, optional
+ If True, inhibits console messages. The default is MAPSHAPER_QUIET.
+
+ Returns
+ -------
+ None.
+
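+ Example
+ -------
+ A minimal sketch, run on an already downloaded, France-wide geodataset:
+
+ >>> dset.bring_drom_closer(
+ ...     level_agreg="DEPARTEMENT",
+ ...     format_output="geojson",
+ ...     bring_out_idf=False,
+ ... )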
+ """
+
+ out = mapshaper_bring_closer(
+ input_file=f"{self.local_dir}/{self.main_filename}",
+ bring_out_idf=bring_out_idf,
+ output_dir=self.local_dir,
+ output_name="idf_combined",
+ output_format=format_output,
+ level_agreg=level_agreg,
+ quiet=quiet,
+ )
+ self._substitute_main_file(out)
+
+ def split_file(
+ self,
+ split_variable: str,
+ crs: int = 4326,
+ format_output: str = "geojson",
+ simplification: int = 0,
+ quiet: bool = MAPSHAPER_QUIET,
+ **kwargs,
+ ) -> list[Self]:
+ """
+ Split a file into singletons, based on one field (including
+ reprojection, simplification and format conversion if need be)
+
+ Parameters
+ ----------
+ split_variable : str
+ Variable to split files onto
+ crs : int, optional
+ EPSG to project the split files onto. The default is 4326.
+ format_output : str, optional
+ Chosen format to write the output in. The default is "geojson".
+ simplification : int, optional
+ Degree of simplification. The default is 0.
+ quiet : bool, optional
+ If True, inhibits console messages. The default is MAPSHAPER_QUIET.
+ kwargs :
+ Optional values for ConfigDict to ensure the correct generation of
+ the resulting geodatasets. For instance, `borders='DEPARTEMENT'`
+
+ Returns
+ -------
+ list[S3GeoDataset]
+ return a list of S3GeoDataset objects
+
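+ Example
+ -------
+ A minimal sketch, run on an already downloaded geodataset (field and
+ parameter values are illustrative):
+
+ >>> subsets = dset.split_file(
+ ...     split_variable="INSEE_DEP",
+ ...     crs=4326,
+ ...     format_output="geojson",
+ ...     simplification=40,
+ ...     filter_by="DEPARTEMENT",
+ ...     borders="COMMUNE",
+ ... )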
+ """
+
+ if simplification != 0:
+ option_simplify = (
+ f"-simplify {simplification}% interval=.5 -clean "
+ )
+ else:
+ option_simplify = ""
+
+ files = mapshaper_split(
+ input_file=f"{self.local_dir}/{self.main_filename}",
+ layer_name="",
+ split_variable=split_variable,
+ output_dir=f"{self.local_dir}/splitted",
+ output_format=format_output,
+ crs=crs,
+ option_simplify=option_simplify,
+ quiet=quiet,
+ )
+
+ geodatasets = []
+
+ for file in files:
+ new_config = deepcopy(self.config)
+ new_config.update(kwargs)
+ new_config.update(
+ {
+ "crs": crs,
+ "value": os.path.basename(file).replace(
+ f".{format_output}", ""
+ ),
+ "vectorfile_format": format_output,
+ "simplification": simplification,
+ }
+ )
+
+ geodatasets.append(
+ from_file(
+ file_path=file,
+ fs=self.fs,
+ **new_config,
+ ).copy()
+ )
+
+ return geodatasets
+
+ def create_downstream_geodatasets(
+ self,
+ metadata: S3Dataset,
+ init_geometry_level="IRIS",
+ dissolve_by="COMMUNE",
+ niveau_agreg="DEPARTEMENT",
+ simplification=0,
+ ) -> List[Self]:
+ """
+ TODO : update docstring (arguments also)
+ Create "children" geodatasets based on arguments and send them to S3.
+
+ Do the following processes:
+ - join the current geodataset with the metadata to enrich it;
+ - dissolve geometries if init_geometry_level != dissolve_by
+ - bring ultramarine territories closer
+ if niveau_agreg == "FRANCE_ENTIERE_DROM_RAPPROCHES"
+ - extract IDF if niveau_agreg=="FRANCE_ENTIERE_IDF_DROM_RAPPROCHES"
+ - split the geodataset based on niveau_agreg
+ - project the geodataset into the given CRS
+ - convert the file into the chosen output
+ - upload those datasets to S3 storage system
+
+ The "children" may amount to a single file depending on niveau_agreg.
+
+ Note that some of those steps are done **IN PLACE** on the parent
+ geodataset (enrichment, dissolution, aggregation). Therefore, the
+ geodataset should not be re-used after a call to this method.
+
+ Parameters
+ ----------
+ metadata : S3Dataset
+ The metadata file to use to enrich the geodataset
+ init_geometry_level : str, optional
+ The level of basic mesh for the geometries. The default is IRIS.
+ Should be among ['IRIS', 'CANTON', 'ARRONDISSEMENT_MUNICIPAL']
+ dissolve_by : str, optional
+ The level to dissolve the geometries to. The default is COMMUNE.
+ Should be among [
+ 'REGION', 'DEPARTEMENT', 'BASSIN_VIE',
+ 'AIRE_ATTRACTION_VILLES', 'UNITE_URBAINE', 'ZONE_EMPLOI',
+ 'TERRITOIRE', 'ARRONDISSEMENT_MUNICIPAL', 'EPCI', 'EPT',
+ ]
+ niveau_agreg : str, optional
+ The level of aggregation for splitting the dataset into singletons,
+ by default "DEPARTEMENT".
+ Should be among ['REGION', 'DEPARTEMENT', 'FRANCE_ENTIERE',
+ 'FRANCE_ENTIERE_DROM_RAPPROCHES', 'LIBELLE_REGION',
+ 'LIBELLE_DEPARTEMENT', 'BASSIN_VIE', 'AIRE_ATTRACTION_VILLES',
+ 'UNITE_URBAINE', 'ZONE_EMPLOI', 'TERRITOIRE']
+ simplification : int, optional
+ The degree of wanted simplification, by default 0.
+
+ Returns
+ -------
+ List[S3GeoDataset]
+ The output path of the processed and split shapefiles.
+
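+ Example
+ -------
+ A minimal sketch (`iris_dset` and `metadata` are illustrative,
+ already downloaded S3GeoDataset/S3Dataset objects):
+
+ >>> children = iris_dset.create_downstream_geodatasets(
+ ...     metadata,
+ ...     init_geometry_level="IRIS",
+ ...     dissolve_by="COMMUNE",
+ ...     niveau_agreg="DEPARTEMENT",
+ ...     simplification=40,
+ ... )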
+ """
+
+ output_crs_conf = [
+ {"epsg": x[0], "format_output": x[1]}
+ for x in product(PIPELINE_CRS, PIPELINE_FORMATS)
+ ]
+
+ niveau_agreg = niveau_agreg.upper()
+ init_geometry_level = init_geometry_level.upper()
+
+ simplification = simplification if simplification else 0
+
+ # Enrich files with metadata (COG, etc.)
+
+ available_columns = set(self._get_columns()) | set(
+ metadata._get_columns()
+ )
+
+ if init_geometry_level == "IRIS":
+ keys = ["CODE_IRIS", "CODE_IRIS"]
+ drop = ["ID", "NOM_COM"]
+ elif init_geometry_level == "ARRONDISSEMENT_MUNICIPAL":
+ keys = ["INSEE_ARM", "INSEE_ARM"]
+ drop = [
+ "POPULATION",
+ "ID",
+ "NOM_M",
+ ]
+ elif init_geometry_level == "COMMUNE":
+ keys = ["INSEE_COM", "INSEE_COM"]
+ drop = [
+ "POPULATION",
+ "ID",
+ "NOM_M",
+ ]
+ elif init_geometry_level == "CANTON":
+ keys = ["CAN", "CAN"]
+ drop = ["ID"]
+ self.add_field("CAN", "INSEE_DEP+INSEE_CAN")
+ else:
+ # TODO if new base mesh
+ pass
+
+ if len(set(keys) & available_columns) < len(set(keys)):
+ raise ValueError(
+ f"keys must be among {available_columns}, "
+ f"found {set(keys)} instead"
+ )
+
+ if len(set(drop) & available_columns) < len(drop):
+ missing = set(drop) - available_columns
+ raise ValueError(
+ f"drop must be among {available_columns}, following columns "
+ f"are missing : {missing}"
+ )
+
+ dtype = set(keys) | {
+ "SIREN_EPCI",
+ "SIREN_COMMUNE",
+ "INSEE_DEP",
+ "INSEE_REG",
+ "CAN",
+ "BURCENTRAL",
+ "REG",
+ "ZE[0-9]{4}",
+ "TUU[0-9]{4}",
+ "TDUU[0-9]{4}",
+ "TAAV[0-9]{4}",
+ "TDAAV[0-9]{4}",
+ "CATEAAV[0-9]{4}",
+ }
+ dtype = {
+ col: "str"
+ for x in dtype
+ for col in available_columns
+ if re.match(x, col)
+ }
+
+ self.enrich(
+ metadata_file=metadata,
+ keys=keys,
+ dtype=dtype,
+ drop=drop,
+ rename={},
+ format_output=INTERMEDIATE_FORMAT,
+ )
+
+ logger.info("new columns are %s", self._get_columns())
+
+ if init_geometry_level != dissolve_by:
+ # Dissolve geometries if desired (will replace the local file
+ # geodata file based on a communal mesh with one using the desired
+ # mesh
+
+ # Dissolve by both dissolve_by AND niveau_agreg to ensure both
+ # dissolution and splitability
+ gdf = self.to_frame()
+ available_columns = gdf.columns.tolist()
+ by = self.find_column_name(dissolve_by, available_columns)
+ if niveau_agreg not in (
+ "FRANCE_ENTIERE_DROM_RAPPROCHES",
+ "FRANCE_ENTIERE_IDF_DROM_RAPPROCHES",
+ ):
+ aggreg_col = self.find_column_name(
+ niveau_agreg, available_columns
+ )
+ else:
+ aggreg_col = "AREA"
+ keys = [by, aggreg_col]
+
+ # And keep all columns which are identical in each subgroup after
+ # dissolution + summable columns
+ keep = (
+ gdf.drop("geometry", axis=1)
+ .groupby(keys, dropna=False)
+ .nunique()
+ == 1
+ ).all()
+ keep = keep[keep].index.tolist()
+
+ calc = []
+ pops = [
+ x for x in available_columns if re.match("POPULATION.*", x)
+ ]
+ if pops:
+ calc += [f"{x}=sum({x})" for x in pops]
+ if "IDF" in available_columns:
+ calc += ["IDF=max(IDF)"]
+
+ by_keys = [by, aggreg_col]
+
+ self.dissolve(
+ by=by_keys,
+ copy_fields=keep,
+ calc=calc,
+ format_output=INTERMEDIATE_FORMAT,
+ )
+
+ # Bring ultramarine territories closer to France if needed
+ if niveau_agreg in (
+ "FRANCE_ENTIERE_DROM_RAPPROCHES",
+ "FRANCE_ENTIERE_IDF_DROM_RAPPROCHES",
+ ):
+ self.bring_drom_closer(
+ level_agreg=dissolve_by,
+ format_output=INTERMEDIATE_FORMAT,
+ bring_out_idf=(
+ niveau_agreg == "FRANCE_ENTIERE_IDF_DROM_RAPPROCHES"
+ ),
+ )
+
+ # Split datasets, based on the desired "niveau_agreg" and proceed to
+ # desired level of simplification
+ columns = self._get_columns()
+ split_by = self.find_column_name(niveau_agreg, columns)
+
+ new_datasets = self.split_file(
+ crs=4326,
+ format_output=INTERMEDIATE_FORMAT,
+ simplification=simplification,
+ split_variable=split_by,
+ filter_by=niveau_agreg,
+ borders=dissolve_by,
+ )
+
+ # fix config for storage on S3
+ dataset_family = {"dataset_family": "production"}
+ [dset.config.update(dataset_family) for dset in new_datasets]
+
+ new_datasets = [
+ dset.copy().to_format(**config)
+ for dset in new_datasets
+ for config in output_crs_conf
+ ]
+ [dset.update_s3_path_evaluation() for dset in new_datasets]
+
+ # Upload new datasets to S3
+ with ExitStack() as stack:
+ # enter context for each new dataset instead of looping to allow
+ # for multithreading (cleaned locally at exitstack anyway)
+ [stack.enter_context(dset) for dset in new_datasets]
+
+ if THREADS_DOWNLOAD > 1:
+ threads = min(THREADS_DOWNLOAD, len(new_datasets))
+ with ThreadPool(threads) as pool:
+
+ def upload(dset):
+ return dset.to_s3()
+
+ list(pool.map(upload, new_datasets).result())
+ else:
+ [dset.to_s3() for dset in new_datasets]
+
+ return new_datasets
+
+ def only_ultramarine_territories(
+ self, quiet: bool = MAPSHAPER_QUIET
+ ) -> Self:
+ """
+ Extract only ultramarine territories from the given IRIS file and
+ dissolve them to cities.
+
+ Parameters
+ ----------
+ quiet : bool, optional
+ If True, inhibits console messages. The default is MAPSHAPER_QUIET.
+
+ Returns
+ -------
+ S3GeoDataset : new object with only the subset for COM
+
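+ Example
+ -------
+ A minimal sketch (`iris_dset` is an illustrative, already downloaded
+ IRIS geodataset):
+
+ >>> with iris_dset.copy() as temp:
+ ...     com_cities = temp.only_ultramarine_territories()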
+ """
+ iris_file = f"{self.local_dir}/{self.main_filename}"
+ tom = mapshaper_capture_cities_from_ultramarine_territories(
+ input_city_file=iris_file,
+ output_dir=f"{self.local_dir}/tom",
+ output_name="TOM",
+ output_format=INTERMEDIATE_FORMAT,
+ quiet=quiet,
+ )
+ new_config = deepcopy(self.config)
+ new_config.update(
+ {"filter_by": "COLLECTIVITE_OUTRE_MER", "value": "France"}
+ )
+ tom = from_file(file_path=tom, **new_config)
+
+ gdf = tom.to_frame()
+ available_columns = gdf.columns.tolist()
+ by = self.find_column_name("COMMUNE", available_columns)
+ # keep all columns which are identical in each subgroup after
+ # dissolution + summable columns (like pop)
+ keep = (
+ gdf.drop("geometry", axis=1).groupby([by]).nunique() == 1
+ ).all()
+ keep = keep[keep].index.tolist()
+
+ calc = []
+ pops = [x for x in available_columns if re.match("POPULATION.*", x)]
+ if pops:
+ calc += [f"{x}=sum({x})" for x in pops]
+ if "IDF" in available_columns:
+ calc += ["IDF=max(IDF)"]
+
+ tom.dissolve(
+ by=[by],
+ copy_fields=keep,
+ calc=calc,
+ format_output=INTERMEDIATE_FORMAT,
+ )
+ return tom
+
+ def substitute_municipal_districts(
+ self,
+ communal_districts: Self,
+ format_output: str = "geojson",
+ quiet: bool = MAPSHAPER_QUIET,
+ ) -> Self:
+ """
+ Create a new composite S3GeoDataset from communal districts (Paris,
+ Lyon and Marseille) and other "classical" cities (having no communal
+ districts)
+
+ Parameters
+ ----------
+ communal_districts : S3GeoDataset
+ S3GeoDataset representing the communal districts (should be
+ already downloaded, so this should be generated through a with
+ statement).
+ format_output : str, optional
+ Desired output format. The default is "geojson".
+ quiet : bool, optional
+ If True, inhibits console messages. The default is MAPSHAPER_QUIET.
+
+ Returns
+ -------
+ S3GeoDataset
+ New S3GeoDataset object representing the dataset. This dataset is
+ **NOT** sent to S3.
+
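+ Example
+ -------
+ A minimal sketch (`cities` and `districts` are illustrative, already
+ downloaded S3GeoDataset objects):
+
+ >>> with cities.copy() as com, districts.copy() as arm:
+ ...     combined = com.substitute_municipal_districts(
+ ...         communal_districts=arm, format_output="geojson"
+ ...     )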
+ """
+
+ # preprocess cities : remove cities having communal districts
+ city_file = f"{self.local_dir}/{self.main_filename}"
+ city_file = mapshaper_remove_cities_with_districts(
+ input_city_file=city_file,
+ output_dir=f"{self.local_dir}/singles",
+ output_name="COMMUNE",
+ output_format=INTERMEDIATE_FORMAT,
+ quiet=quiet,
+ )
+
+ # note : communal_districts has its own local_dir which should be
+ # in f"{self.local_dir}/{communal_districts.config['territory']}" !
+ communal_districts.reproject(format_output=format_output, epsg=4326)
+ communal_districts_file = (
+ f"{communal_districts.local_dir}/"
+ f"{communal_districts.main_filename}"
+ )
+
+ communal_districts_file = mapshaper_process_communal_districts(
+ input_communal_districts_file=communal_districts_file,
+ output_dir=f"{self.local_dir}/districts",
+ output_name="ARRONDISSEMENT_MUNICIPAL",
+ output_format=INTERMEDIATE_FORMAT,
+ quiet=quiet,
+ )
+
+ # MERGE CITIES AND ARRONDISSEMENT
+ composite = mapshaper_combine_districts_and_cities(
+ input_city_file=city_file,
+ input_communal_districts_file=communal_districts_file,
+ output_dir=self.local_dir,
+ output_name="ARRONDISSEMENT_MUNICIPAL",
+ output_format=format_output,
+ quiet=quiet,
+ )
+
+ # move file to new tempdir to isolate this file for new S3GeoDataset
+ new_tempdir = tempfile.mkdtemp()
+ shutil.move(composite, composite.replace(self.local_dir, new_tempdir))
+ composite = composite.replace(self.local_dir, new_tempdir)
+
+ os.unlink(city_file)
+ os.unlink(os.path.join(self.local_dir, self.main_filename))
+
+ new_config = deepcopy(self.config)
+ new_config.update({"borders": "ARRONDISSEMENT_MUNICIPAL"})
+ new_dataset = from_file(file_path=composite, **new_config)
+
+ return new_dataset
+
+
+def from_frame(
+ gdf: gpd.GeoDataFrame,
+ fs: S3FileSystem = FS,
+ **config: ConfigDict,
+) -> S3GeoDataset:
+ """
+ Create a new S3GeoDataset from a GeoDataFrame, config and fs.
+
+ The new object will write the geodataframe into a new tempdir; this
+ tempdir will be cleaned at __exit__ method's execution. Therefore, the new
+ object should be used with a with statement, for instance:
+ >>> new_dset = from_frame(gdf, fs, **config)
+ >>> with new_dset as dset:
+ ...     print(dset)
+
+ Parameters
+ ----------
+ gdf : gpd.GeoDataFrame
+ GeoDataFrame to construct the S3GeoDataset from.
+ fs : S3FileSystem, optional
+ The S3FileSytem to use for storage. The default is FS.
+ **config : ConfigDict
+ Other arguments to define the path on the S3 to the dataset.
+
+ Returns
+ -------
+ dset : S3GeoDataset
+ New S3GeoDataset object.
+
+ """
+
+ extension = config.get("vectorfile_format", INTERMEDIATE_FORMAT)
+ filename = config.get("filename", None)
+ if not filename:
+ filename = config.get("borders", "file")
+ if "." not in filename:
+ filename = f"{filename}.{extension}"
+ with tempfile.TemporaryDirectory() as tempdir:
+ gdf.to_file(f"{tempdir}/{filename}", engine="fiona")
+ dset = from_file(f"{tempdir}/{filename}", fs, **config)
+
+ return dset
+
+
+def from_file(
+ file_path: str,
+ fs: S3FileSystem = FS,
+ **config: ConfigDict,
+) -> S3GeoDataset:
+ """
+ Create a new S3GeoDataset from a local file, config and fs.
+
+ The new object will copy the file(s) into a new tempdir; this tempdir will
+ be cleaned at __exit__ method's execution. Therefore, the new object should
+ be created with a with statement, for instance:
+ >>> new_dset = geodataset.from_file("blah.txt", fs, **config)
+ >>> with new_dset as new_file:
+ >>>     print(new_file)
+
+ Parameters
+ ----------
+ file_path : str
+ Path to the geodataset file to instantiate the new S3GeoDataset.
+ fs : S3FileSystem, optional
+ The S3FileSystem to use for storage. The default is FS.
+ **config : ConfigDict
+ Other arguments used to define the dataset's path on the S3 file system.
+
+ Returns
+ -------
+ dset : S3GeoDataset
+ New S3GeoDataset object.
+
+ """
+ if not os.path.exists(file_path):
+ raise ValueError(f"file not found at local path {file_path}")
+
+ local_dir = os.path.dirname(file_path)
+ filename = os.path.basename(file_path)
+ vectorfile_format = filename.rsplit(".", maxsplit=1)[1]
+
+ for key in "filename", "vectorfile_format":
+ try:
+ del config[key]
+ except KeyError:
+ pass
+
+ # Create a new S3GeoDataset
+ dset = S3GeoDataset(
+ fs=fs,
+ filename=filename,
+ vectorfile_format=vectorfile_format,
+ build_from_local=file_path,
+ **config,
+ )
+ dset.local_dir = local_dir
+ dset.main_filename = filename
+
+ # Then create a copy to ensure the creation of a new tempdir
+ dset = dset.copy()
+
+ return dset
+
+
+def concat_s3geodataset(
+ datasets: List[S3GeoDataset],
+ output_name: str = "COMMUNE",
+ vectorfile_format: str = "geojson",
+ output_dir: str = "temp",
+ fs: S3FileSystem = FS,
+ quiet: bool = MAPSHAPER_QUIET,
+ **config_new_dset: ConfigDict,
+) -> S3GeoDataset:
+ """
+ Concatenate S3GeoDataset objects (in the manner of pandas.concat) using
+ mapshaper. The result is a new S3GeoDataset which will **NOT** be uploaded
+ to S3.
+
+ Parameters
+ ----------
+ datasets : List[S3GeoDataset]
+ The list of S3GeoDataset instances to concatenate.
+ output_name: str, optional
+ The name of the output layer. The default is 'COMMUNE'.
+ vectorfile_format : str, optional
+ The file format to use for creating the new S3GeoDataset. The default
+ is "geojson".
+ output_dir : str, optional
+ The temporary directory used for processing the concatenation (it must
+ already exist). The default is "temp".
+ fs : S3FileSystem, optional
+ The S3FileSystem used ultimately to upload the new S3GeoDataset. The
+ default is FS.
+ quiet : bool, optional
+ If True, inhibits console messages. The default is MAPSHAPER_QUIET.
+ **config_new_dset : ConfigDict
+ Configuration representing the new S3GeoDataset (used for initialization).
+ This will determine the path on the S3FileSystem during storage.
+
+ Returns
+ -------
+ S3GeoDataset
+ New concatenated S3GeoDataset
+
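+ Example
+ -------
+ A minimal usage sketch; dset_a and dset_b stand for existing
+ S3GeoDataset instances and the output directory is assumed to already
+ exist:
+ >>> combined = concat_s3geodataset(
+ >>>     [dset_a, dset_b],
+ >>>     output_name="COMMUNE",
+ >>>     vectorfile_format="geojson",
+ >>>     output_dir="temp",
+ >>> )
+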
+ """
+
+ for k, dset in enumerate(datasets):
+ destination = os.path.join(output_dir, f"{k}.{vectorfile_format}")
+
+ if os.path.exists(os.path.join(dset.local_dir, dset.main_filename)):
+ # already downloaded, but not sure of the current projection
+ dset.reproject(format_output=vectorfile_format, epsg=4326)
+
+ shutil.copy(
+ os.path.join(dset.local_dir, dset.main_filename), destination
+ )
+ else:
+ with dset:
+ dset.reproject(format_output=vectorfile_format, epsg=4326)
+ shutil.copy(
+ os.path.join(dset.local_dir, dset.main_filename),
+ destination,
+ )
+
+ old_files = glob(f"{output_dir}/*.{vectorfile_format}")
+
+ output_path = mapshaper_concat(
+ input_dir=output_dir,
+ input_format=vectorfile_format,
+ output_dir=f"{output_dir}/preprocessed_combined",
+ output_name=output_name,
+ output_format=vectorfile_format,
+ quiet=quiet,
+ )
+
+ logger.info("new S3GeoDataset created at %s", output_path)
+
+ for file in old_files:
+ os.unlink(file)
+
+ file = glob(f"{output_dir}/preprocessed_combined/*.{vectorfile_format}")[0]
+ new_dset = from_file(file_path=file, fs=fs, **config_new_dset)
+
+ return new_dset
diff --git a/cartiflette/s3/inventory.py b/cartiflette/s3/inventory.py
new file mode 100644
index 00000000..7558d7f5
--- /dev/null
+++ b/cartiflette/s3/inventory.py
@@ -0,0 +1,236 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Dec 14 19:18:05 2024
+"""
+
+import json
+import logging
+import re
+
+import pandas as pd
+from s3fs import S3FileSystem
+
+from cartiflette.config import FS, BUCKET, PATH_WITHIN_BUCKET
+from cartiflette.pipeline_constants import COG_TERRITOIRE, IRIS
+
+logger = logging.getLogger(__name__)
+
+
+def nested_dict_from_multiindex(df: pd.DataFrame) -> dict:
+ """
+ Convenience function to transform a multiindexed DataFrame into a nested
+ dict, minimizing the dict's size.
+
+ Parameters
+ ----------
+ df : pd.DataFrame
+ Multiindexed DataFrame with a "simplification" column holding the
+ values stored at the leaves of the nested dict.
+
+ Returns
+ -------
+ dict
+ Nested dictionary
+
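+ Example
+ -------
+ A minimal sketch; the index levels and values below are purely
+ illustrative:
+ >>> df = pd.DataFrame(
+ >>>     {"simplification": [40, 40]},
+ >>>     index=pd.MultiIndex.from_tuples(
+ >>>         [("2023", "COMMUNE"), ("2023", "IRIS")]
+ >>>     ),
+ >>> )
+ >>> nested_dict_from_multiindex(df)
+ {'2023': {'COMMUNE': 40, 'IRIS': 40}}
+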
+ """
+
+ result = {}
+ for idx, value in df["simplification"].items():
+ d_ref = result
+ for key in idx[:-1]:
+ if key not in d_ref:
+ d_ref[key] = {}
+ d_ref = d_ref[key]
+ d_ref[idx[-1]] = value
+ return result
+
+
+def flatten_dict(d: dict, parent_key: tuple = ()) -> dict:
+ """
+ Convenience function, flattens a nested dictionary into a flat dictionary
+ keyed by tuples (which can then be converted back to a multiindexed
+ DataFrame).
+
+ Parameters
+ ----------
+ d : dict
+ Nested dictionary
+ parent_key : tuple, optional
+ Optional key, used for recursive purposes. The default is ().
+
+ Returns
+ -------
+ dict
+ Flattened dictionary
+
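+ Example
+ -------
+ A minimal sketch, inverting the output of nested_dict_from_multiindex
+ (values are purely illustrative):
+ >>> flatten_dict({"2023": {"COMMUNE": 40, "IRIS": 40}})
+ {('2023', 'COMMUNE'): 40, ('2023', 'IRIS'): 40}
+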
+ """
+ items = []
+ for k, v in d.items():
+ new_key = parent_key + (k,)
+ if isinstance(v, dict):
+ items.extend(flatten_dict(v, new_key).items())
+ else:
+ items.append((new_key, v))
+ return dict(items)
+
+
+def make_s3_inventory(
+ fs: S3FileSystem = FS,
+ bucket: str = BUCKET,
+ path_within_bucket: str = PATH_WITHIN_BUCKET,
+):
+ """
+ Compute an inventory of all datasets generated by Cartiflette and push it
+ to the S3 File System as a single json file.
+
+ The json is pushed to f"{bucket}/{path_within_bucket}/inventory.json". It
+ uses a nested dictionary format to keep the json small and speed up
+ downloads.
+
+ Parameters
+ ----------
+ fs : S3FileSystem, optional
+ S3 File System. The default is FS.
+ bucket : str, optional
+ Used bucket (both for inventory querying and json storage). The default
+ is BUCKET.
+ path_within_bucket : str, optional
+ Path used within bucket. The default is PATH_WITHIN_BUCKET.
+
+ Returns
+ -------
+ None.
+
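+ Example
+ -------
+ A minimal usage sketch relying on the default configuration (the call
+ crawls the bucket, so valid S3 credentials are required):
+ >>> make_s3_inventory()
+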
+ """
+
+ paths = (
+ f"{bucket}/{path_within_bucket}/"
+ "provider=Cartiflette/dataset_family=production/"
+ "**/*"
+ )
+ # debug
+ # paths = (
+ # f"{bucket}/{path_within_bucket}/"
+ # "provider=Cartiflette/dataset_family=production/"
+ # "source=CONTOUR-IRIS/"
+ # "year=2023/"
+ # "administrative_level=IRIS/"
+ # "crs=4326/"
+ # "**/*"
+ # )
+
+ paths = fs.glob(paths)
+
+ compiled = re.compile(
+ ".*?/"
+ "source=(?P