Skip to content

Commit

Permalink
ENH: keep the attributes around (#132)
Browse files Browse the repository at this point in the history
* basic passthrough using 'first'

* fix split

* fix issues

* lint

* overwritten _status in agg func

* fix lost attributes when there are no doubles to deal with

* assert presence of highway

* fix mypy typing

---------

Co-authored-by: James Gaboardi <[email protected]>
  • Loading branch information
martinfleis and jGaboardi authored Dec 4, 2024
1 parent 903a253 commit 59344c5
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 20 deletions.
35 changes: 29 additions & 6 deletions sgeop/nodes.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import typing

import geopandas as gpd
import momepy
import networkx as nx
Expand Down Expand Up @@ -35,17 +37,21 @@ def split(
split_points = gpd.GeoSeries(split_points, crs=crs)
for split in split_points.drop_duplicates():
_, ix = cleaned_roads.sindex.nearest(split, max_distance=eps)
edge = cleaned_roads.geometry.iloc[ix]
row = cleaned_roads.iloc[ix]
edge = row.geometry
if edge.shape[0] == 1:
row = row.iloc[0]
lines_split = _snap_n_split(edge.item(), split, eps)
if lines_split.shape[0] > 1:
gdf_split = gpd.GeoDataFrame(geometry=lines_split, crs=crs)
for c in row.index.drop(["geometry", "_status"], errors="ignore"):
gdf_split[c] = row[c]
gdf_split["_status"] = "changed"
cleaned_roads = pd.concat(
[cleaned_roads.drop(edge.index[0]), gdf_split],
ignore_index=True,
)
else:
elif edge.shape[0] > 1:
to_be_dropped = []
to_be_added = []
for i, e in edge.items():
Expand All @@ -55,14 +61,26 @@ def split(
to_be_added.append(lines_split)

if to_be_added:
gdf_split = gpd.GeoDataFrame(
geometry=np.concatenate(to_be_added), crs=crs
gdf_split = pd.DataFrame(
{"geometry": to_be_added, "_orig": to_be_dropped}
).explode("geometry")
gdf_split = pd.concat(
[
gdf_split.drop(columns="_orig").reset_index(drop=True),
row.drop(columns="geometry")
.loc[gdf_split["_orig"]]
.reset_index(drop=True),
],
axis=1,
)
gdf_split["_status"] = "changed"
cleaned_roads = pd.concat(
[cleaned_roads.drop(to_be_dropped), gdf_split],
ignore_index=True,
)
cleaned_roads = gpd.GeoDataFrame(
cleaned_roads, geometry="geometry", crs=crs
)

return cleaned_roads.reset_index(drop=True)

Expand Down Expand Up @@ -485,7 +503,7 @@ def consolidate_nodes(
# TODO: It is temporarily fixed by that explode in return
geom.iloc[inds] = geom.iloc[inds].difference(cookie)

status.iloc[inds] = "snapped"
status.iloc[inds] = "changed"
midpoint = np.mean(shapely.get_coordinates(cluster), axis=0)
midpoints.append(midpoint)
mids = np.array([midpoint] * len(pts))
Expand All @@ -504,7 +522,12 @@ def consolidate_nodes(
geoms = np.hstack(spiders)
gdf = pd.concat([gdf, gpd.GeoDataFrame(geometry=geoms, crs=geom.crs)])

agg: dict[str, str | typing.Callable] = {"_status": _status}
for c in gdf.columns.drop(gdf.active_geometry_name):
if c != "_status":
agg[c] = "first"
return remove_false_nodes(
gdf[~gdf.geometry.is_empty].explode(),
aggfunc={"_status": _status},
# NOTE: this aggfunc needs to be able to process all the columns
aggfunc=agg,
)
51 changes: 37 additions & 14 deletions sgeop/simplify.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import typing
import warnings

import geopandas as gpd
Expand Down Expand Up @@ -253,7 +254,8 @@ def simplify_singletons(
stacklevel=2,
)

# Split lines on new nodes
cleaned_roads = roads.drop(to_drop)
# split lines on new nodes
cleaned_roads = split(split_points, roads.drop(to_drop), roads.crs)

if to_add:
Expand All @@ -266,12 +268,20 @@ def simplify_singletons(
new["_status"] = "new"
new.geometry = new.simplify(max_segment_length * simplification_factor)
new_roads = pd.concat([cleaned_roads, new], ignore_index=True)
agg: dict[str, str | typing.Callable] = {"_status": _status}
for c in cleaned_roads.columns.drop(cleaned_roads.active_geometry_name):
if c != "_status":
agg[c] = "first"
non_empties = new_roads[~(new_roads.is_empty | new_roads.geometry.isna())]
new_roads = remove_false_nodes(non_empties, aggfunc={"_status": _status})
new_roads = remove_false_nodes(non_empties, aggfunc=agg)

return new_roads
final = new_roads
else:
return cleaned_roads
final = cleaned_roads

if "coins_group" in final.columns:
final = final.drop(columns=[c for c in roads.columns if c.startswith("coins_")])
return final


def simplify_pairs(
Expand Down Expand Up @@ -358,6 +368,17 @@ def simplify_pairs(

# Determine typology dispatch if artifacts are present
if not artifacts_w_info.empty:
agg = {
"coins_group": "first",
"coins_end": lambda x: x.any(),
"_status": _status,
}
for c in roads.columns.drop(
[roads.active_geometry_name, "coins_count"], errors="ignore"
):
if c not in agg:
agg[c] = "first"

sol_drop = "solution == 'drop_interline'"
sol_iter = "solution == 'iterate'"

Expand All @@ -368,11 +389,7 @@ def simplify_pairs(
# Re-run node cleaning on subset of fresh road edges
roads_cleaned = remove_false_nodes(
_drop_roads,
aggfunc={
"coins_group": "first",
"coins_end": lambda x: x.any(),
"_status": _status,
},
aggfunc=agg,
)

# Isolate drops to create merged pairs
Expand All @@ -398,9 +415,7 @@ def simplify_pairs(
_1st = pd.DataFrame()
_2nd = pd.DataFrame()
for_skeleton = pd.DataFrame()
roads_cleaned = roads[
["coins_group", "coins_end", "_status", roads.geometry.name]
]
roads_cleaned = roads

# Generate counts of COINs groups for edges
coins_count = (
Expand Down Expand Up @@ -536,8 +551,12 @@ def simplify_clusters(
max_segment_length * simplification_factor
)
new_roads = pd.concat([cleaned_roads, new], ignore_index=True).explode()
agg: dict[str, str | typing.Callable] = {"_status": _status}
for c in new_roads.columns.drop(new_roads.active_geometry_name):
if c != "_status":
agg[c] = "first"
new_roads = remove_false_nodes(
new_roads[~new_roads.is_empty], aggfunc={"_status": _status}
new_roads[~new_roads.is_empty], aggfunc=agg
).drop_duplicates("geometry")

return new_roads
Expand Down Expand Up @@ -718,9 +737,10 @@ def simplify_network(
geopandas.GeoDataFrame
The final, simplified road network line data.
"""

# NOTE: this keeps attributes but resets index
roads = fix_topology(roads, eps=eps)
# Merge nearby nodes (up to double of distance used in skeleton).
# NOTE: this drops attributes and resets index
roads = consolidate_nodes(roads, tolerance=max_segment_length * 2.1)

# Identify artifacts
Expand Down Expand Up @@ -859,6 +879,7 @@ def simplify_loop(
clusters = artifacts.loc[artifacts["comp"].isin(counts[counts > 2].index)].copy()

if not singles.empty:
# NOTE: this drops attributes
roads = simplify_singletons(
singles,
roads,
Expand Down Expand Up @@ -887,4 +908,6 @@ def simplify_loop(
consolidation_tolerance=consolidation_tolerance,
)

if "coins_group" in roads.columns:
roads = roads.drop(columns=[c for c in roads.columns if c.startswith("coins_")])
return roads
Binary file modified sgeop/tests/data/apalachicola_simplified_exclusion_mask.parquet
Binary file not shown.
Binary file modified sgeop/tests/data/apalachicola_simplified_standard.parquet
Binary file not shown.
1 change: 1 addition & 0 deletions sgeop/tests/test_simplify.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def test_simplify_network_full_fua(aoi, tol, known_length):
geopandas.read_parquet(full_fua_data / aoi / "original.parquet")
)
observed_length = observed.geometry.length.sum()
assert "highway" in observed.columns

# storing GH artifacts
artifact_dir = ci_artifacts / aoi
Expand Down

0 comments on commit 59344c5

Please sign in to comment.