Skip to content

Commit

Permalink
refactor(NHDPlus HR waterbody preprocessing (preprocessing.py::preprocess_nhdplus_hr_waterbodies)): represent NHDPlusIDs as strings, consistent with the rest of SFRmaker.
Browse files Browse the repository at this point in the history

* when culling waterbodies, keep those whose AreaSqKm equals min_areasqkm (compare with '>=' instead of '>')
  • Loading branch information
aleaf committed Jan 16, 2025
1 parent ad8cb97 commit 036ea3e
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 8 deletions.
6 changes: 3 additions & 3 deletions sfrmaker/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1957,7 +1957,7 @@ def preprocess_nhdplus_hr_waterbodies(nhdplus_path, active_area,
df = gpd.read_file(f, **kwargs)
dfs.append(df)
df = pd.concat(dfs)
wb_crs = df.crs
df['NHDPlusID'] = df['NHDPlusID'].astype(int).astype(str)
if dest_crs is None:
dest_crs = df.crs
else:
Expand All @@ -1969,9 +1969,9 @@ def preprocess_nhdplus_hr_waterbodies(nhdplus_path, active_area,
extent_poly = read_polygon_feature(
active_area, dest_crs=dest_crs)
intersects = np.array([g.intersects(extent_poly) for g in df.geometry])
loc = intersects & ~df['NHDPlusID'].isin(drop_waterbodies) & (df['AreaSqKm'] > min_areasqkm)
loc = intersects & ~df['NHDPlusID'].isin(drop_waterbodies) & (df['AreaSqKm'] >= min_areasqkm)
df = df.loc[loc].copy()
df['NHDPlusID'] = df['NHDPlusID'].astype(int)

df['FDate'] = pd.to_datetime(df['FDate']).dt.strftime('%Y-%m-%d')
df.to_file(outfile)
print(f'wrote {outfile}')
17 changes: 12 additions & 5 deletions sfrmaker/test/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,15 +420,22 @@ def test_preprocess_nhdplus_hr_waterbodies(project_root_path, outdir):
outfile = outdir / 'preprocessed_waterbodies.shp'

# drop these waterbodies, regardless of size
drop_waterbodies = set()
drop_waterbodies = {'75004400013339'}

expected_lakes = {#'75004400013854',
'75004400011923',
'75004400012773'}

preprocess_nhdplus_hr_waterbodies(nhdplus_path,
active_area=(-151.00350, 60.64855, -150.96778, 60.67559),
active_area=(-151.02, 60.64855, -150.96778, 60.67559),
drop_waterbodies=drop_waterbodies,
min_areasqkm=0.05,
dest_crs=26905, outfile=outfile)
df = gpd.read_file(outfile)
df.crs == 26905
assert 'Beaver Lake' in df['GNIS_Name'].values
# the next line shouldn't be there either
assert 75004400012864 not in df['NHDPlusID'].values
df['NHDPlusID'] = df['NHDPlusID'].astype(int).astype(str)
assert set(df['NHDPlusID']) == expected_lakes
# this lake is < 0.05 km2; should have been culled
assert '75004400012864' not in df['NHDPlusID'].values
for nhdplusid in drop_waterbodies:
assert int(nhdplusid) not in df['NHDPlusID'].values

0 comments on commit 036ea3e

Please sign in to comment.