From d525d5957c7989414c11c59b69bfcfe9603ec981 Mon Sep 17 00:00:00 2001
From: Daniel Wiesmann
Date: Thu, 16 Nov 2023 15:46:27 +0000
Subject: [PATCH] Landcover based sampling strategy. (#29)

* Add landcover based sampling scripts

Closes #28

* Drop duplicates, fix typo, uncomment compute_stats function.

* Fix comment that was out of sync with code
---
 scripts/landcover.py | 206 +++++++++++++++++++++++++++++++++++++++++++
 scripts/landcover.sh |  27 ++++++
 2 files changed, 233 insertions(+)
 create mode 100644 scripts/landcover.py
 create mode 100644 scripts/landcover.sh

diff --git a/scripts/landcover.py b/scripts/landcover.py
new file mode 100644
index 00000000..ecb81202
--- /dev/null
+++ b/scripts/landcover.py
@@ -0,0 +1,206 @@
+from pathlib import Path
+
+import click
+import fiona
+import geopandas
+import numpy
+import pandas
+import rasterio
+from fiona.crs import CRS
+from rasterio.windows import from_bounds
+from shapely.geometry import shape
+
+WGS84 = CRS.from_epsg(4326)
+NODATA = 0
+WATER = 80
+WATER_LOWER_TH = 0.2
+WATER_UPPER_TH = 0.7
+RANDOM_SEED = 42
+CLASSES = {
+    10: "Tree cover",
+    20: "Shrubland",
+    30: "Grassland",
+    40: "Cropland",
+    50: "Built-up",
+    60: "Bare / sparse vegetation",
+    70: "Snow and Ice",
+    80: "Permanent water bodies",
+    90: "Herbaceous wetland",
+    95: "Mangroves",
+    100: "Moss and lichen",
+}
+SCHEMA = {
+    "geometry": "MultiPolygon",
+    "properties": {
+        "name": "str",
+        "count": "int",
+        "Tree cover": "int",
+        "Shrubland": "int",
+        "Grassland": "int",
+        "Cropland": "int",
+        "Built-up": "int",
+        "Bare / sparse vegetation": "int",
+        "Snow and Ice": "int",
+        "Permanent water bodies": "int",
+        "Herbaceous wetland": "int",
+        "Mangroves": "int",
+        "Moss and lichen": "int",
+    },
+}
+
+
+@click.command()
+@click.option(
+    "--wd",
+    required=True,
+    type=str,
+)
+@click.option(
+    "--worldcover",
+    required=True,
+    type=str,
+)
+@click.option(
+    "--mgrs",
+    required=True,
+    type=str,
+)
+def process(wd, worldcover, mgrs):
+    """
+    Run statistics and sampling.
+    """
+    compute_stats(wd, worldcover, mgrs)
+    sample(wd)
+
+
+def compute_stats(wd, worldcover, mgrs):
+    """
+    Compute statistics of Worldcover data over MGRS tiles.
+    """
+    result = []
+    with rasterio.open(worldcover) as cover:
+        with fiona.open(mgrs, "r") as tiles:
+            assert cover.crs.to_epsg() == tiles.crs.to_epsg()
+            for tile in tiles:
+                print(tile.properties.get("Name"))
+                # Split polygons into parts; MGRS tiles at the dateline
+                # are split into two parts in the MGRS source file.
+                parts = shape(tile["geometry"]).geoms
+                pixels = []
+                for polygon in parts:
+                    bounds = from_bounds(*polygon.bounds, cover.transform)
+                    pixels.append(
+                        cover.read(
+                            1,
+                            window=bounds,
+                        ).ravel()
+                    )
+                pixels = numpy.hstack(pixels)
+
+                pixels = pixels[pixels != NODATA]
+
+                if not pixels.size:
+                    continue
+                elif numpy.all(pixels == WATER):
+                    continue
+                else:
+                    props = {}
+                    for key, classname in CLASSES.items():
+                        props[str(classname)] = int(numpy.sum(pixels == key))
+                    props["name"] = tile.properties.get("Name")
+                    props["count"] = int(len(numpy.unique(pixels)))
+
+                    result.append(
+                        {
+                            "geometry": dict(tile["geometry"]),
+                            "properties": props,
+                        }
+                    )
+
+    with fiona.open(
+        Path(wd, "mgrs_stats.fgb"),
+        "w",
+        driver="FlatGeobuf",
+        crs=WGS84,
+        schema=SCHEMA,
+    ) as colxn:
+        colxn.writerecords(result)
+
+
+def split_highest(data, column, size, pool=1000, seed=RANDOM_SEED):
+    """
+    Split highest values of a column from a dataframe.
+    """
+    data.sort_values(column, ascending=False, inplace=True)
+    return data[:pool].sample(size, random_state=seed)
+
+
+def percentages(data):
+    """
+    Normalize all numerical columns to fractions of the row total.
+    """
+    data_num = data.select_dtypes(include="number")
+    data_norm = data_num.div(data_num.sum(axis=1), axis=0)
+    data[data_norm.columns] = data_norm
+
+    return data
+
+
+def sample(wd):
+    """
+    Sample the MGRS tiles based on landcover statistics.
+
+    Target: ~1000 tiles
+    Set very small counts to zero. Exclude high latitudes.
+    200 samples from the 2000 most diverse, 200 from the 1000 most built-up
+    50 samples from the 1000 highest tiles for each other category except water
+    100 samples from all tiles with water between 20% and 70% (making sure we
+    capture some coasts, but exclude tiles that are almost purely water)
+    """
+    data = geopandas.read_file(Path(wd, "mgrs_stats.fgb"))
+
+    data_norm = percentages(data.loc[:, data.columns != "count"])
+    data[data_norm.columns] = data_norm
+
+    diversity = split_highest(data, "count", 200, 2000)
+    urban = split_highest(data, "Built-up", 200)
+    wetland = split_highest(data, "Herbaceous wetland", 50)
+    mangroves = split_highest(data, "Mangroves", 50)
+    moss = split_highest(data, "Moss and lichen", 50)
+    cropland = split_highest(data, "Cropland", 50)
+    trees = split_highest(data, "Tree cover", 50)
+    shrubland = split_highest(data, "Shrubland", 50)
+    grassland = split_highest(data, "Grassland", 50)
+    bare = split_highest(data, "Bare / sparse vegetation", 50)
+    snow = split_highest(data, "Snow and Ice", 50)
+
+    selector = numpy.logical_and(
+        data["Permanent water bodies"] > WATER_LOWER_TH,
+        data["Permanent water bodies"] < WATER_UPPER_TH,
+    )
+    water = data[selector].sample(100, random_state=RANDOM_SEED)
+
+    result = pandas.concat(
+        [
+            diversity,
+            urban,
+            wetland,
+            mangroves,
+            moss,
+            cropland,
+            trees,
+            shrubland,
+            grassland,
+            bare,
+            snow,
+            water,
+        ]
+    )
+
+    result = result.drop_duplicates(subset=["name"])
+
+    result.to_file(Path(wd, "mgrs_sample.geojson"), driver="GeoJSON")
+
+
+if __name__ == "__main__":
+    process()
diff --git a/scripts/landcover.sh b/scripts/landcover.sh
new file mode 100644
index 00000000..8483a2e1
--- /dev/null
+++ b/scripts/landcover.sh
@@ -0,0 +1,27 @@
+wd=/datadisk
+
+# Download Worldcover layer
+aws s3 sync s3://esa-worldcover/v200/2021/map $wd/esa-worldcover-v200-2021-map --no-sign-request
+
+# Download MGRS grid kml and convert to fgb
+curl -o $wd/S2A_OPER_GIP_TILPAR_MPC__20151209T095117_V20150622T000000_21000101T000000_B00.kml https://hls.gsfc.nasa.gov/wp-content/uploads/2016/03/S2A_OPER_GIP_TILPAR_MPC__20151209T095117_V20150622T000000_21000101T000000_B00.kml
+ogr2ogr \
+    -overwrite\
+    $wd/mgrs.fgb\
+    $wd/S2A_OPER_GIP_TILPAR_MPC__20151209T095117_V20150622T000000_21000101T000000_B00.kml \
+    -nlt multipolygon\
+    -sql "select Name, ExtractMultiPolygon(geometry) as geometry from Features"\
+    -dialect sqlite
+
+# Reduce resolution and merge
+counter=0
+for file in $wd/esa-worldcover-v200-2021-map/*.tif; do
+  counter=$((counter+1))
+  echo "$counter $file"
+  gdal_translate -ovr 3 $file $wd/tmp/tmp_$counter.tif
+done
+
+gdal_merge.py -o $wd/worldcover.tif $wd/tmp/*.tif
+
+# Intersect worldcover with mgrs and sample based on landcover
+python landcover.py --wd=$wd --worldcover=$wd/worldcover.tif --mgrs=$wd/mgrs.fgb
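
For a quick feel of the sampling strategy without downloading the Worldcover data, here is a minimal, self-contained sketch of the pool-then-sample logic from split_highest() and the water-fraction filter in sample(). The toy DataFrame below is synthetic; only the column names ("name", "count", "Built-up", "Permanent water bodies") are taken from the statistics file the script writes.

# Illustrative sketch only; synthetic stand-in for mgrs_stats.fgb.
import numpy
import pandas

rng = numpy.random.default_rng(42)
toy = pandas.DataFrame(
    {
        "name": [f"tile_{i}" for i in range(5000)],
        "count": rng.integers(1, 12, 5000),
        "Built-up": rng.random(5000),
        "Permanent water bodies": rng.random(5000),
    }
)


def split_highest(data, column, size, pool=1000, seed=42):
    # Keep the `pool` rows with the highest value in `column`, then draw
    # `size` of them at random (same idea as in landcover.py, but without
    # the in-place sort).
    data = data.sort_values(column, ascending=False)
    return data[:pool].sample(size, random_state=seed)


urban = split_highest(toy, "Built-up", 200)
water = toy[
    (toy["Permanent water bodies"] > 0.2) & (toy["Permanent water bodies"] < 0.7)
].sample(100, random_state=42)

sample = pandas.concat([urban, water]).drop_duplicates(subset=["name"])
print(len(sample))  # at most 300; tiles picked by both criteria are dropped

The same pattern scales to the full run: one split_highest() call per landcover category, concatenated and de-duplicated on the tile name, which is why the final sample lands somewhat below the nominal ~1000 tiles.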