refactor(NHDPlus HR waterbody preprocessing, preprocessing.py::preprocess_nhdplus_hr_waterbodies): represent NHDPlusIDs as strings, consistent with the rest of SFRmaker.

* Include waterbodies whose area equals min_areasqkm when culling (the comparison is now '>=' instead of '>').
DOI-USGS · Jan 16, 2025 · 036ea3e · 036ea3e
1 parent ad8cb97
commit 036ea3e
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 8 deletions.
diff --git a/sfrmaker/preprocessing.py b/sfrmaker/preprocessing.py
@@ -1957,7 +1957,7 @@ def preprocess_nhdplus_hr_waterbodies(nhdplus_path, active_area,
         df = gpd.read_file(f, **kwargs)
         dfs.append(df)
     df = pd.concat(dfs)
-    wb_crs = df.crs
+    df['NHDPlusID'] = df['NHDPlusID'].astype(int).astype(str)
     if dest_crs is None:
         dest_crs = df.crs
     else:
@@ -1969,9 +1969,9 @@ def preprocess_nhdplus_hr_waterbodies(nhdplus_path, active_area,
         extent_poly = read_polygon_feature(
             active_area, dest_crs=dest_crs)
         intersects = np.array([g.intersects(extent_poly) for g in df.geometry])
-    loc = intersects & ~df['NHDPlusID'].isin(drop_waterbodies) & (df['AreaSqKm'] > min_areasqkm)
+    loc = intersects & ~df['NHDPlusID'].isin(drop_waterbodies) & (df['AreaSqKm'] >= min_areasqkm)
     df = df.loc[loc].copy()
-    df['NHDPlusID'] = df['NHDPlusID'].astype(int)
+
     df['FDate'] = pd.to_datetime(df['FDate']).dt.strftime('%Y-%m-%d')
     df.to_file(outfile)
     print(f'wrote {outfile}')
diff --git a/sfrmaker/test/test_preprocessing.py b/sfrmaker/test/test_preprocessing.py
@@ -420,15 +420,22 @@ def test_preprocess_nhdplus_hr_waterbodies(project_root_path, outdir):
     outfile = outdir / 'preprocessed_waterbodies.shp'
 
     # drop these waterbodies, regardless of size
-    drop_waterbodies = set()
+    drop_waterbodies = {'75004400013339'}
+
+    expected_lakes = {#'75004400013854', 
+                      '75004400011923', 
+                      '75004400012773'}
 
     preprocess_nhdplus_hr_waterbodies(nhdplus_path, 
-                                      active_area=(-151.00350, 60.64855, -150.96778, 60.67559), 
+                                      active_area=(-151.02, 60.64855, -150.96778, 60.67559), 
                                       drop_waterbodies=drop_waterbodies,
                                       min_areasqkm=0.05,
                                       dest_crs=26905, outfile=outfile)
     df = gpd.read_file(outfile)
     df.crs == 26905
-    assert 'Beaver Lake' in df['GNIS_Name'].values
-    # the next line shouldn't be there either
-    assert 75004400012864 not in df['NHDPlusID'].values
+    df['NHDPlusID'] = df['NHDPlusID'].astype(int).astype(str)
+    assert set(df['NHDPlusID']) == expected_lakes
+    # this lake is < 0.05 km2; should have been culled
+    assert '75004400012864' not in df['NHDPlusID'].values
+    for nhdplusid in drop_waterbodies:
+        assert int(nhdplusid) not in df['NHDPlusID'].values