From 149d7da400b4e82be8fd43d3bcd41dd6511eb152 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Tue, 15 Aug 2023 15:14:55 -0700 Subject: [PATCH 01/43] chore(deps): add flox dependency >=0.7.2 --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index eb5ca32c8..11fe3b2ec 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,4 @@ xarray-datatree==0.0.6 psutil>=5.9.1 more-itertools==8.13.0 geopy +flox>=0.7.2,<1.0.0 From d911c45dda2cbee7f973fba1d7589e2c24b911de Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Tue, 15 Aug 2023 15:18:10 -0700 Subject: [PATCH 02/43] fix(commongrid): fixes 'compute_MVBS' so it can work better and scale Under the hood, binning along ping time and echo range now uses flox. This allows for scalability and more community-maintained. --- echopype/commongrid/api.py | 30 +++++++++------- echopype/commongrid/mvbs.py | 72 +++++++++++++++++++++++-------------- 2 files changed, 63 insertions(+), 39 deletions(-) diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index f05cdb96b..0969e5af2 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -94,32 +94,38 @@ def compute_MVBS(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): A dataset containing bin-averaged Sv """ + # Clean up filenames dimension if it exists + # not needed here + if "filenames" in ds_Sv.dims: + ds_Sv = ds_Sv.drop_dims("filenames") + # create bin information for echo_range range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) # create bin information needed for ping_time - ping_interval = ( - ds_Sv.ping_time.resample(ping_time=ping_time_bin, skipna=True).asfreq().ping_time.values + d_index = ( + ds_Sv["ping_time"] + .resample(ping_time=ping_time_bin, skipna=True) + .asfreq() + .to_dataframe() + .index ) + ping_interval = d_index.union([d_index[-1] + pd.Timedelta(ping_time_bin)]) # calculate the MVBS along each channel - MVBS_values = get_MVBS_along_channels(ds_Sv, range_interval, ping_interval) + raw_MVBS = get_MVBS_along_channels(ds_Sv, range_interval, ping_interval) # create MVBS dataset + # by transforming the binned dimensions to regular coords ds_MVBS = xr.Dataset( - data_vars={"Sv": (["channel", "ping_time", "echo_range"], MVBS_values)}, + data_vars={"Sv": (["channel", "ping_time", "echo_range"], raw_MVBS.data)}, coords={ - "ping_time": ping_interval, - "channel": ds_Sv.channel, - "echo_range": range_interval[:-1], + "ping_time": np.array([v.left for v in raw_MVBS.ping_time_bins.values]), + "channel": raw_MVBS.channel.values, + "echo_range": np.array([v.left for v in raw_MVBS.echo_range_bins.values]), }, ) - # TODO: look into why 'filenames' exist here as a variable - # Added this check to support the test in test_process.py::test_compute_MVBS - if "filenames" in ds_MVBS.variables: - ds_MVBS = ds_MVBS.drop_vars("filenames") - # ping_time_bin parsing and conversions # Need to convert between pd.Timedelta and np.timedelta64 offsets/frequency strings # https://xarray.pydata.org/en/stable/generated/xarray.Dataset.resample.html diff --git a/echopype/commongrid/mvbs.py b/echopype/commongrid/mvbs.py index 2c0006f35..81095c9f0 100644 --- a/echopype/commongrid/mvbs.py +++ b/echopype/commongrid/mvbs.py @@ -8,6 +8,7 @@ import dask.array import numpy as np import xarray as xr +from flox.xarray import xarray_reduce def get_bin_indices( @@ -404,9 +405,32 @@ def bin_and_mean_2d( return final +def _linear_transform( + data: Union[dask.array.Array, np.ndarray], inverse: bool = False +) -> Union[dask.array.Array, np.ndarray]: + """ + Perform linear transform on data + + Parameters + ---------- + data : dask.array.Array or np.ndarray + The data to be transformed + inverse : bool + If True, perform inverse transform + + Returns + ------- + dask.array.Array or np.ndarray + The transformed data + """ + if inverse: + return 10 * dask.array.log10(data) + return 10 ** (data / 10) + + def get_MVBS_along_channels( ds_Sv: xr.Dataset, echo_range_interval: np.ndarray, ping_interval: np.ndarray -) -> np.ndarray: +) -> xr.Dataset: """ Computes the MVBS of ``ds_Sv`` along each channel for the given intervals. @@ -417,45 +441,39 @@ def get_MVBS_along_channels( A Dataset containing ``Sv`` and ``echo_range`` data with coordinates ``channel``, ``ping_time``, and ``range_sample`` echo_range_interval: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``echo_range`` + 1D array (used by np.digitize) representing + the binning required for ``echo_range`` ping_interval: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``ping_time`` + 1D array (used by np.digitize) representing + the binning required for ``ping_time`` Returns ------- - np.ndarray - The MVBS value of the input ``ds_Sv`` for all channels - - Notes - ----- - If the values in ``ds_Sv`` are delayed then the binning and mean of ``Sv`` with - respect to ``echo_range`` will take place, then the delayed result will be computed, - and lastly the binning and mean with respect to ``ping_time`` will be completed. It - is necessary to apply a compute midway through this method because Dask graph layers - get too large and this makes downstream operations very inefficient. + xr.Dataset + The MVBS dataset of the input ``ds_Sv`` for all channels """ all_MVBS = [] for chan in ds_Sv.channel: - # squeeze to remove "channel" dim if present - # TODO: not sure why not already removed for the AZFP case. Investigate. ds = ds_Sv.sel(channel=chan).squeeze() # average should be done in linear domain - sv = 10 ** (ds["Sv"] / 10) - - # get MVBS for channel in linear domain - chan_MVBS = bin_and_mean_2d( - sv.data, - bins_time=ping_interval, - bins_er=echo_range_interval, - times=sv.ping_time.data, - echo_range=ds["echo_range"], - comprehensive_er_check=True, + sv = ds["Sv"].pipe(_linear_transform) + + res = xarray_reduce( + sv, + ds["ping_time"], + ds["echo_range"], + func="mean", + expected_groups=(ping_interval, echo_range_interval), + isbin=True, + method="map-reduce", + engine="flox", ) # apply inverse mapping to get back to the original domain and store values - all_MVBS.append(10 * np.log10(chan_MVBS)) + chan_MVBS = res.pipe(_linear_transform, inverse=True) + all_MVBS.append(chan_MVBS) # collect the MVBS values for each channel - return np.stack(all_MVBS, axis=0) + return xr.concat(all_MVBS, dim="channel") From a052854f33e7c982978733fd13919b28c80dda39 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Tue, 15 Aug 2023 15:23:15 -0700 Subject: [PATCH 03/43] docs: add small code comment --- echopype/commongrid/mvbs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/echopype/commongrid/mvbs.py b/echopype/commongrid/mvbs.py index 81095c9f0..00f6b8cc6 100644 --- a/echopype/commongrid/mvbs.py +++ b/echopype/commongrid/mvbs.py @@ -460,6 +460,8 @@ def get_MVBS_along_channels( # average should be done in linear domain sv = ds["Sv"].pipe(_linear_transform) + # reduce along ping_time and echo_range + # by binning and averaging res = xarray_reduce( sv, ds["ping_time"], From 364ebce5f7d0a7d118475e1266cffdb26af75310 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Tue, 15 Aug 2023 15:38:57 -0700 Subject: [PATCH 04/43] refactor: change how ping_time index is retrieved --- echopype/commongrid/api.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index 0969e5af2..65b0cfa99 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -107,8 +107,7 @@ def compute_MVBS(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): ds_Sv["ping_time"] .resample(ping_time=ping_time_bin, skipna=True) .asfreq() - .to_dataframe() - .index + .indexes["ping_time"] ) ping_interval = d_index.union([d_index[-1] + pd.Timedelta(ping_time_bin)]) From a0fb46a53f209657fd012b6b3ccc05cc93e34136 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Tue, 15 Aug 2023 15:52:41 -0700 Subject: [PATCH 05/43] refactor: remove for loop for channel --- echopype/commongrid/mvbs.py | 43 ++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/echopype/commongrid/mvbs.py b/echopype/commongrid/mvbs.py index 00f6b8cc6..d58caab87 100644 --- a/echopype/commongrid/mvbs.py +++ b/echopype/commongrid/mvbs.py @@ -453,29 +453,22 @@ def get_MVBS_along_channels( The MVBS dataset of the input ``ds_Sv`` for all channels """ - all_MVBS = [] - for chan in ds_Sv.channel: - ds = ds_Sv.sel(channel=chan).squeeze() - - # average should be done in linear domain - sv = ds["Sv"].pipe(_linear_transform) - - # reduce along ping_time and echo_range - # by binning and averaging - res = xarray_reduce( - sv, - ds["ping_time"], - ds["echo_range"], - func="mean", - expected_groups=(ping_interval, echo_range_interval), - isbin=True, - method="map-reduce", - engine="flox", - ) - - # apply inverse mapping to get back to the original domain and store values - chan_MVBS = res.pipe(_linear_transform, inverse=True) - all_MVBS.append(chan_MVBS) + # average should be done in linear domain + sv = ds_Sv["Sv"].pipe(_linear_transform) + + # reduce along ping_time and echo_range + # by binning and averaging + mvbs = xarray_reduce( + sv, + sv["channel"], + ds_Sv["ping_time"], + ds_Sv["echo_range"], + func="mean", + expected_groups=(None, ping_interval, echo_range_interval), + isbin=[False, True, True], + method="map-reduce", + engine="flox", + ) - # collect the MVBS values for each channel - return xr.concat(all_MVBS, dim="channel") + # apply inverse mapping to get back to the original domain and store values + return mvbs.pipe(_linear_transform, inverse=True) From d5558e8ca5fc07f76b285591b5a7b08af6258b04 Mon Sep 17 00:00:00 2001 From: Wu-Jung Lee Date: Thu, 24 Aug 2023 14:27:10 -0400 Subject: [PATCH 06/43] test(mvbs): add mock Sv datasets and tests for dims (#2) Note that @leewujung also changed mean to nanmean for skipping NaNs in each bin. --- echopype/commongrid/mvbs.py | 2 +- echopype/tests/commongrid/test_mvbs.py | 113 +++++++++++++++++++++++++ 2 files changed, 114 insertions(+), 1 deletion(-) diff --git a/echopype/commongrid/mvbs.py b/echopype/commongrid/mvbs.py index d58caab87..189c24983 100644 --- a/echopype/commongrid/mvbs.py +++ b/echopype/commongrid/mvbs.py @@ -463,7 +463,7 @@ def get_MVBS_along_channels( sv["channel"], ds_Sv["ping_time"], ds_Sv["echo_range"], - func="mean", + func="nanmean", expected_groups=(None, ping_interval, echo_range_interval), isbin=[False, True, True], method="map-reduce", diff --git a/echopype/tests/commongrid/test_mvbs.py b/echopype/tests/commongrid/test_mvbs.py index 449fe0b9c..5eee3d498 100644 --- a/echopype/tests/commongrid/test_mvbs.py +++ b/echopype/tests/commongrid/test_mvbs.py @@ -833,3 +833,116 @@ def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: # compare known MVBS solution against its calculated counterpart assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10, equal_nan=True) + + +def _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms=0): + ping_time = pd.date_range("2018-07-01", periods=ping_time_len, freq=ping_time_interval) + if ping_time_jitter_max_ms != 0: # if to add jitter + jitter = np.random.randint(ping_time_jitter_max_ms, size=ping_time_len) / 1000 # convert to seconds + ping_time = pd.to_datetime(ping_time.astype(int)/1e9 + jitter, unit="s") + return ping_time + + +def _gen_Sv_er_regular( + channel_len=2, + depth_len=100, depth_interval=0.5, + ping_time_len=600, ping_time_interval="0.3S", ping_time_jitter_max_ms=0, +): + """ + Generate a Sv dataset with uniform echo_range across all ping_time. + + ping_time_jitter_max_ms controled jitter in milliseonds in ping_time. + """ + # regular echo_range + echo_range = np.array([[np.arange(depth_len)] * ping_time_len] * channel_len) * depth_interval + + # generate dataset + ds_Sv = xr.Dataset( + data_vars={ + "Sv": (["channel", "ping_time", "range_sample"], np.random.rand(channel_len, ping_time_len, depth_len)), + "echo_range": (["channel", "ping_time", "range_sample"], echo_range), + "frequency_nominal": (["channel"], np.arange(channel_len)), + }, + coords={ + "channel": [f"ch_{ch}" for ch in range(channel_len)], + "ping_time": _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms), + "range_sample": np.arange(depth_len), + } + ) + + return ds_Sv + + +def _gen_Sv_er_irregular( + channel_len=2, + depth_len=100, depth_interval=[0.5, 0.32, 0.13], depth_ping_time_len=[100, 300, 200], + ping_time_len=600, ping_time_interval="0.3S", ping_time_jitter_max_ms=0, +): + """ + Generate a Sv dataset with uniform echo_range across all ping_time. + + ping_time_jitter_max_ms controled jitter in milliseonds in ping_time. + """ + # check input + if len(depth_interval) != len(depth_ping_time_len): + raise ValueError("The number of depth_interval and depth_ping_time_len must be equal!") + + if ping_time_len != np.array(depth_ping_time_len).sum(): + raise ValueError("The number of total pings does not match!") + + # irregular echo_range + echo_range_list = [] + for d, dp in zip(depth_interval, depth_ping_time_len): + echo_range_list.append(np.array([[np.arange(depth_len)] * dp] * channel_len) * d) + echo_range = np.hstack(echo_range_list) + + # generate dataset + ds_Sv = xr.Dataset( + data_vars={ + "Sv": (["channel", "ping_time", "range_sample"], np.random.rand(channel_len, ping_time_len, depth_len)), + "echo_range": (["channel", "ping_time", "range_sample"], echo_range), + "frequency_nominal": (["channel"], np.arange(channel_len)), + }, + coords={ + "channel": [f"ch_{ch}" for ch in range(channel_len)], + "ping_time": _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms), + "range_sample": np.arange(depth_len), + } + ) + + return ds_Sv + + +@pytest.mark.parametrize( + ("er_type"), + [ + ("regular"), + ("irregular"), + ], +) +def test_compute_MVBS_er_output(er_type): + # set jitter=0 to get predictable number of ping within each echo_range groups + if er_type == "regular": + ds_Sv = _gen_Sv_er_regular(ping_time_jitter_max_ms=0) + else: + depth_interval=[0.5, 0.32, 0.13] + depth_ping_time_len=[100, 300, 200] + ping_time_len=600 + ping_time_interval="0.3S" + ds_Sv = _gen_Sv_er_irregular(depth_interval=depth_interval, depth_ping_time_len=depth_ping_time_len, + ping_time_len=ping_time_len, ping_time_interval=ping_time_interval, + ping_time_jitter_max_ms=0) + + ds_MVBS = ep.commongrid.compute_MVBS(ds_Sv, range_meter_bin=5, ping_time_bin="10S") + + if er_type == "regular": + expected_len = ( + ds_Sv["channel"].size, # channel + np.ceil(np.diff(ds_Sv["ping_time"][[0, -1]].astype(int))/1e9 / 10), # ping_time + np.ceil(ds_Sv["echo_range"].max()/5), # depth + ) + assert ds_MVBS["Sv"].shape == expected_len + else: + assert ds_MVBS["Sv"].isel(ping_time=slice(None, 3)).dropna(dim="echo_range").shape == (2, 3, 10) # full array, no NaN + assert ds_MVBS["Sv"].isel(ping_time=slice(3, 12)).dropna(dim="echo_range").shape == (2, 9, 7) # bottom bins contain NaN + assert ds_MVBS["Sv"].isel(ping_time=slice(12, None)).dropna(dim="echo_range").shape == (2, 6, 3) # bottom bins contain NaN From a1e5ec1dbfb557c23f2bb6eeec536ed9c39397a3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Aug 2023 18:27:39 +0000 Subject: [PATCH 07/43] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopype/tests/commongrid/test_mvbs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/echopype/tests/commongrid/test_mvbs.py b/echopype/tests/commongrid/test_mvbs.py index 5eee3d498..1d616bd49 100644 --- a/echopype/tests/commongrid/test_mvbs.py +++ b/echopype/tests/commongrid/test_mvbs.py @@ -851,7 +851,7 @@ def _gen_Sv_er_regular( """ Generate a Sv dataset with uniform echo_range across all ping_time. - ping_time_jitter_max_ms controled jitter in milliseonds in ping_time. + ping_time_jitter_max_ms controlled jitter in milliseconds in ping_time. """ # regular echo_range echo_range = np.array([[np.arange(depth_len)] * ping_time_len] * channel_len) * depth_interval @@ -881,7 +881,7 @@ def _gen_Sv_er_irregular( """ Generate a Sv dataset with uniform echo_range across all ping_time. - ping_time_jitter_max_ms controled jitter in milliseonds in ping_time. + ping_time_jitter_max_ms controlled jitter in milliseconds in ping_time. """ # check input if len(depth_interval) != len(depth_ping_time_len): From f30df737451a39208328d6f3a852ea7dc3b2496c Mon Sep 17 00:00:00 2001 From: Don Setiawan Date: Thu, 24 Aug 2023 11:29:55 -0700 Subject: [PATCH 08/43] fix: change dask to numpy Changed the use of dask for log10 to numpy instead since numpy can also handle dask array inputs properly. --- echopype/commongrid/mvbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/commongrid/mvbs.py b/echopype/commongrid/mvbs.py index 189c24983..19d7b6ce7 100644 --- a/echopype/commongrid/mvbs.py +++ b/echopype/commongrid/mvbs.py @@ -424,7 +424,7 @@ def _linear_transform( The transformed data """ if inverse: - return 10 * dask.array.log10(data) + return 10 * np.log10(data) return 10 ** (data / 10) From 99f2b651529f1d3604a81751458861137fdefa02 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Fri, 25 Aug 2023 15:10:11 -0700 Subject: [PATCH 09/43] feat: Add method argument Added 'method' argument to 'get_MVBS_along_channels' and also expose additional keyword arguments control for flox. --- echopype/commongrid/mvbs.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/echopype/commongrid/mvbs.py b/echopype/commongrid/mvbs.py index 19d7b6ce7..e540b86e9 100644 --- a/echopype/commongrid/mvbs.py +++ b/echopype/commongrid/mvbs.py @@ -429,7 +429,11 @@ def _linear_transform( def get_MVBS_along_channels( - ds_Sv: xr.Dataset, echo_range_interval: np.ndarray, ping_interval: np.ndarray + ds_Sv: xr.Dataset, + echo_range_interval: np.ndarray, + ping_interval: np.ndarray, + method: str = "map-reduce", + **kwargs ) -> xr.Dataset: """ Computes the MVBS of ``ds_Sv`` along each channel for the given @@ -446,6 +450,13 @@ def get_MVBS_along_channels( ping_interval: np.ndarray 1D array (used by np.digitize) representing the binning required for ``ping_time`` + method: str + The flox strategy for reduction of dask arrays only. + See flox `documentation `_ + for more details. + **kwargs + Additional keyword arguments to be passed + to flox reduction function Returns ------- @@ -466,8 +477,9 @@ def get_MVBS_along_channels( func="nanmean", expected_groups=(None, ping_interval, echo_range_interval), isbin=[False, True, True], - method="map-reduce", engine="flox", + method=method, + **kwargs ) # apply inverse mapping to get back to the original domain and store values From a536d22dc42535adb5b83fca30d2cfe330e0dee2 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Tue, 29 Aug 2023 11:57:22 -0700 Subject: [PATCH 10/43] fix(commongrid): Fixed to include lat lon Fixed 'compute_MVBS' function to now include latitude and longitude if the variables exists in the Sv dataset. Additionally, the flox method and keyword arguments are now exposed within the 'compute_MVBS' function. Ref: Issue #1002 --- echopype/commongrid/api.py | 23 ++++++++++++++++++++--- echopype/commongrid/mvbs.py | 20 +++++++++++++++++++- echopype/consolidate/api.py | 4 +++- 3 files changed, 42 insertions(+), 5 deletions(-) diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index 65b0cfa99..157945e9d 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -7,6 +7,7 @@ import pandas as pd import xarray as xr +from ..consolidate.api import POSITION_VARIABLES from ..utils.prov import add_processing_level, echopype_prov_attrs, insert_input_processing_level from .mvbs import get_MVBS_along_channels from .nasc import ( @@ -71,7 +72,9 @@ def _set_MVBS_attrs(ds): @add_processing_level("L3*") -def compute_MVBS(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): +def compute_MVBS( + ds_Sv, range_meter_bin=20, ping_time_bin="20S", method="map-reduce", **flox_kwargs +): """ Compute Mean Volume Backscattering Strength (MVBS) based on intervals of range (``echo_range``) and ``ping_time`` specified in physical units. @@ -88,6 +91,13 @@ def compute_MVBS(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): bin size along ``echo_range`` in meters, default to ``20`` ping_time_bin : str bin size along ``ping_time``, default to ``20S`` + method: str + The flox strategy for reduction of dask arrays only. + See flox `documentation `_ + for more details. + **kwargs + Additional keyword arguments to be passed + to flox reduction function. Returns ------- @@ -112,12 +122,14 @@ def compute_MVBS(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): ping_interval = d_index.union([d_index[-1] + pd.Timedelta(ping_time_bin)]) # calculate the MVBS along each channel - raw_MVBS = get_MVBS_along_channels(ds_Sv, range_interval, ping_interval) + raw_MVBS = get_MVBS_along_channels( + ds_Sv, range_interval, ping_interval, method=method, **flox_kwargs + ) # create MVBS dataset # by transforming the binned dimensions to regular coords ds_MVBS = xr.Dataset( - data_vars={"Sv": (["channel", "ping_time", "echo_range"], raw_MVBS.data)}, + data_vars={"Sv": (["channel", "ping_time", "echo_range"], raw_MVBS["Sv"].data)}, coords={ "ping_time": np.array([v.left for v in raw_MVBS.ping_time_bins.values]), "channel": raw_MVBS.channel.values, @@ -125,6 +137,11 @@ def compute_MVBS(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): }, ) + # Add the position variables + if raw_MVBS.attrs.get("has_positions", False): + for var in POSITION_VARIABLES: + ds_MVBS[var] = (["ping_time"], raw_MVBS[var].data) + # ping_time_bin parsing and conversions # Need to convert between pd.Timedelta and np.timedelta64 offsets/frequency strings # https://xarray.pydata.org/en/stable/generated/xarray.Dataset.resample.html diff --git a/echopype/commongrid/mvbs.py b/echopype/commongrid/mvbs.py index e540b86e9..316437c8f 100644 --- a/echopype/commongrid/mvbs.py +++ b/echopype/commongrid/mvbs.py @@ -10,6 +10,8 @@ import xarray as xr from flox.xarray import xarray_reduce +from ..consolidate.api import POSITION_VARIABLES + def get_bin_indices( echo_range: np.ndarray, bins_er: np.ndarray, times: np.ndarray, bins_time: np.ndarray @@ -467,6 +469,21 @@ def get_MVBS_along_channels( # average should be done in linear domain sv = ds_Sv["Sv"].pipe(_linear_transform) + # Get positions if exists + # otherwise just use an empty dataset + ds_Pos = xr.Dataset(attrs={"has_positions": False}) + if all(v in ds_Sv for v in POSITION_VARIABLES): + ds_Pos = xarray_reduce( + ds_Sv[POSITION_VARIABLES], + ds_Sv["ping_time"], + func="nanmean", + expected_groups=(ping_interval), + isbin=True, + engine="flox", + method=method, + ) + ds_Pos.attrs["has_positions"] = True + # reduce along ping_time and echo_range # by binning and averaging mvbs = xarray_reduce( @@ -483,4 +500,5 @@ def get_MVBS_along_channels( ) # apply inverse mapping to get back to the original domain and store values - return mvbs.pipe(_linear_transform, inverse=True) + da_MVBS = mvbs.pipe(_linear_transform, inverse=True) + return xr.merge([ds_Pos, da_MVBS]) diff --git a/echopype/consolidate/api.py b/echopype/consolidate/api.py index a08144a5c..364c76be3 100644 --- a/echopype/consolidate/api.py +++ b/echopype/consolidate/api.py @@ -12,6 +12,8 @@ from ..utils.prov import add_processing_level from .split_beam_angle import add_angle_to_ds, get_angle_complex_samples, get_angle_power_samples +POSITION_VARIABLES = ["latitude", "longitude"] + def swap_dims_channel_frequency(ds: xr.Dataset) -> xr.Dataset: """ @@ -185,7 +187,7 @@ def sel_interp(var, time_dim_name): f"{datetime.datetime.utcnow()} +00:00. " "Interpolated or propagated from Platform latitude/longitude." # noqa ) - for da_name in ["latitude", "longitude"]: + for da_name in POSITION_VARIABLES: interp_ds[da_name] = interp_ds[da_name].assign_attrs({"history": history_attr}) if time_dim_name in interp_ds: From 3cdfebf29a0cc7e634f732f8e811a9d4b9f797f2 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Tue, 29 Aug 2023 15:11:48 -0700 Subject: [PATCH 11/43] refactor: Set defaults to recommended After some investigation, @lsetiawan concluded that at this time the method 'map-reduce', engine 'numpy', and reindex True works the best, so this is now set as default. Also, getting echo range maximum is through direct data slicing rather than computation. --- echopype/commongrid/api.py | 14 ++++++++++++-- echopype/commongrid/mvbs.py | 2 -- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index 157945e9d..7760608b5 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -73,7 +73,11 @@ def _set_MVBS_attrs(ds): @add_processing_level("L3*") def compute_MVBS( - ds_Sv, range_meter_bin=20, ping_time_bin="20S", method="map-reduce", **flox_kwargs + ds_Sv: xr.Dataset, + range_meter_bin: int = 20, + ping_time_bin: str = "20S", + method="map-reduce", + **flox_kwargs, ): """ Compute Mean Volume Backscattering Strength (MVBS) @@ -110,7 +114,10 @@ def compute_MVBS( ds_Sv = ds_Sv.drop_dims("filenames") # create bin information for echo_range - range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) + # get the max echo range value + # assuming that it's a grid so just take the last value + echo_range_max = ds_Sv["echo_range"].isel(range_sample=-1, ping_time=0, channel=0).to_numpy() + range_interval = np.arange(0, echo_range_max + range_meter_bin, range_meter_bin) # create bin information needed for ping_time d_index = ( @@ -122,6 +129,9 @@ def compute_MVBS( ping_interval = d_index.union([d_index[-1] + pd.Timedelta(ping_time_bin)]) # calculate the MVBS along each channel + if method == "map-reduce": + # set to the faster compute if not specified + flox_kwargs.setdefault("reindex", True) raw_MVBS = get_MVBS_along_channels( ds_Sv, range_interval, ping_interval, method=method, **flox_kwargs ) diff --git a/echopype/commongrid/mvbs.py b/echopype/commongrid/mvbs.py index 316437c8f..376c5a5a4 100644 --- a/echopype/commongrid/mvbs.py +++ b/echopype/commongrid/mvbs.py @@ -479,7 +479,6 @@ def get_MVBS_along_channels( func="nanmean", expected_groups=(ping_interval), isbin=True, - engine="flox", method=method, ) ds_Pos.attrs["has_positions"] = True @@ -494,7 +493,6 @@ def get_MVBS_along_channels( func="nanmean", expected_groups=(None, ping_interval, echo_range_interval), isbin=[False, True, True], - engine="flox", method=method, **kwargs ) From afa418da7a8a4a22665d2dd3ee4599f383820642 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Tue, 29 Aug 2023 15:41:54 -0700 Subject: [PATCH 12/43] feat(commongrid): Add 'range_var' argument to 'compute_MVBS' Added a new argument 'range_var' so that user can set the range variable to perform binning with. There are 2 options of 'echo_range' and 'depth': - 'echo_range': When this is set, variable 'water_level' is now included in the resulting MVBS dataset - 'depth': A check is in place to ensure that this variable exists before moving forward and use this to perform range binning. Ref: Issue #1002 --- echopype/commongrid/api.py | 40 +++++++++++++++++++++++++++---------- echopype/commongrid/mvbs.py | 10 +++++++--- 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index 7760608b5..01c7582c4 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -1,7 +1,7 @@ """ Functions for enhancing the spatial and temporal coherence of data. """ -from typing import Union +from typing import Literal, Union import numpy as np import pandas as pd @@ -74,6 +74,7 @@ def _set_MVBS_attrs(ds): @add_processing_level("L3*") def compute_MVBS( ds_Sv: xr.Dataset, + range_var: Literal["echo_range", "depth"] = "echo_range", range_meter_bin: int = 20, ping_time_bin: str = "20S", method="map-reduce", @@ -81,7 +82,8 @@ def compute_MVBS( ): """ Compute Mean Volume Backscattering Strength (MVBS) - based on intervals of range (``echo_range``) and ``ping_time`` specified in physical units. + based on intervals of range (``echo_range``) or depth (``depth``) + and ``ping_time`` specified in physical units. Output of this function differs from that of ``compute_MVBS_index_binning``, which computes bin-averaged Sv according to intervals of ``echo_range`` and ``ping_time`` specified as @@ -91,8 +93,14 @@ def compute_MVBS( ---------- ds_Sv : xr.Dataset dataset containing Sv and ``echo_range`` [m] + range_var: str + The variable to use for range binning. + Must be one of ``echo_range`` or ``depth``. + Note that ``depth`` is only available if the input dataset contains + ``depth`` as a data variable. range_meter_bin : Union[int, float] - bin size along ``echo_range`` in meters, default to ``20`` + bin size along ``echo_range`` or ``depth`` in meters, + default to ``20`` ping_time_bin : str bin size along ``ping_time``, default to ``20S`` method: str @@ -113,10 +121,18 @@ def compute_MVBS( if "filenames" in ds_Sv.dims: ds_Sv = ds_Sv.drop_dims("filenames") + # Check if range_var is valid + if range_var not in ["echo_range", "depth"]: + raise ValueError("range_var must be one of 'echo_range' or 'depth'.") + + # Check if range_var exists in ds_Sv + if range_var not in ds_Sv.data_vars: + raise ValueError(f"range_var '{range_var}' does not exist in the input dataset.") + # create bin information for echo_range # get the max echo range value # assuming that it's a grid so just take the last value - echo_range_max = ds_Sv["echo_range"].isel(range_sample=-1, ping_time=0, channel=0).to_numpy() + echo_range_max = ds_Sv[range_var].isel(range_sample=-1, ping_time=0, channel=0).to_numpy() range_interval = np.arange(0, echo_range_max + range_meter_bin, range_meter_bin) # create bin information needed for ping_time @@ -133,17 +149,17 @@ def compute_MVBS( # set to the faster compute if not specified flox_kwargs.setdefault("reindex", True) raw_MVBS = get_MVBS_along_channels( - ds_Sv, range_interval, ping_interval, method=method, **flox_kwargs + ds_Sv, range_interval, ping_interval, range_var=range_var, method=method, **flox_kwargs ) # create MVBS dataset # by transforming the binned dimensions to regular coords ds_MVBS = xr.Dataset( - data_vars={"Sv": (["channel", "ping_time", "echo_range"], raw_MVBS["Sv"].data)}, + data_vars={"Sv": (["channel", "ping_time", range_var], raw_MVBS["Sv"].data)}, coords={ "ping_time": np.array([v.left for v in raw_MVBS.ping_time_bins.values]), "channel": raw_MVBS.channel.values, - "echo_range": np.array([v.left for v in raw_MVBS.echo_range_bins.values]), + range_var: np.array([v.left for v in raw_MVBS[f"{range_var}_bins"].values]), }, ) @@ -152,6 +168,10 @@ def compute_MVBS( for var in POSITION_VARIABLES: ds_MVBS[var] = (["ping_time"], raw_MVBS[var].data) + # Add water level if uses echo_range + if range_var == "echo_range": + ds_MVBS["water_level"] = ds_Sv["water_level"] + # ping_time_bin parsing and conversions # Need to convert between pd.Timedelta and np.timedelta64 offsets/frequency strings # https://xarray.pydata.org/en/stable/generated/xarray.Dataset.resample.html @@ -183,14 +203,14 @@ def compute_MVBS( # Attach attributes _set_MVBS_attrs(ds_MVBS) - ds_MVBS["echo_range"].attrs = {"long_name": "Range distance", "units": "m"} + ds_MVBS[range_var].attrs = {"long_name": "Range distance", "units": "m"} ds_MVBS["Sv"] = ds_MVBS["Sv"].assign_attrs( { "cell_methods": ( f"ping_time: mean (interval: {ping_time_bin_resvalue} {ping_time_bin_resunit_label} " # noqa "comment: ping_time is the interval start) " - f"echo_range: mean (interval: {range_meter_bin} meter " - "comment: echo_range is the interval start)" + f"{range_var}: mean (interval: {range_meter_bin} meter " + f"comment: {range_var} is the interval start)" ), "binning_mode": "physical units", "range_meter_interval": str(range_meter_bin) + "m", diff --git a/echopype/commongrid/mvbs.py b/echopype/commongrid/mvbs.py index 376c5a5a4..b7914ea0b 100644 --- a/echopype/commongrid/mvbs.py +++ b/echopype/commongrid/mvbs.py @@ -3,7 +3,7 @@ """ import warnings -from typing import Tuple, Union +from typing import Literal, Tuple, Union import dask.array import numpy as np @@ -434,6 +434,7 @@ def get_MVBS_along_channels( ds_Sv: xr.Dataset, echo_range_interval: np.ndarray, ping_interval: np.ndarray, + range_var: Literal["echo_range", "depth"] = "echo_range", method: str = "map-reduce", **kwargs ) -> xr.Dataset: @@ -452,6 +453,9 @@ def get_MVBS_along_channels( ping_interval: np.ndarray 1D array (used by np.digitize) representing the binning required for ``ping_time`` + range_var: str + The variable to use for range binning. + Either ``echo_range`` or ``depth``. method: str The flox strategy for reduction of dask arrays only. See flox `documentation `_ @@ -483,13 +487,13 @@ def get_MVBS_along_channels( ) ds_Pos.attrs["has_positions"] = True - # reduce along ping_time and echo_range + # reduce along ping_time and echo_range or depth # by binning and averaging mvbs = xarray_reduce( sv, sv["channel"], ds_Sv["ping_time"], - ds_Sv["echo_range"], + ds_Sv[range_var], func="nanmean", expected_groups=(None, ping_interval, echo_range_interval), isbin=[False, True, True], From 7a861b449c072cbd6a2b815318ebe7b67df013f4 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Tue, 29 Aug 2023 15:52:23 -0700 Subject: [PATCH 13/43] fix: Add missing attributes for lat lon --- echopype/commongrid/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index 01c7582c4..a8294f974 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -166,7 +166,7 @@ def compute_MVBS( # Add the position variables if raw_MVBS.attrs.get("has_positions", False): for var in POSITION_VARIABLES: - ds_MVBS[var] = (["ping_time"], raw_MVBS[var].data) + ds_MVBS[var] = (["ping_time"], raw_MVBS[var].data, ds_Sv[var].attrs) # Add water level if uses echo_range if range_var == "echo_range": From 53cebd4a9f12f008edd553e2322c7bdcaa757396 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Tue, 29 Aug 2023 16:07:39 -0700 Subject: [PATCH 14/43] test: Update test to use random generator --- echopype/tests/commongrid/test_mvbs.py | 34 ++++++++++++++++++++------ 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/echopype/tests/commongrid/test_mvbs.py b/echopype/tests/commongrid/test_mvbs.py index 1d616bd49..e16b1a73c 100644 --- a/echopype/tests/commongrid/test_mvbs.py +++ b/echopype/tests/commongrid/test_mvbs.py @@ -9,6 +9,11 @@ import echopype as ep from echopype.commongrid.mvbs import bin_and_mean_2d +@pytest.fixture +def random_number_generator(): + """Random number generator for tests""" + return default_rng() + @pytest.fixture( params=[ @@ -846,20 +851,29 @@ def _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms=0) def _gen_Sv_er_regular( channel_len=2, depth_len=100, depth_interval=0.5, - ping_time_len=600, ping_time_interval="0.3S", ping_time_jitter_max_ms=0, + ping_time_len=600, ping_time_interval="0.3S", + ping_time_jitter_max_ms=0, + random_number_generator=None, ): """ Generate a Sv dataset with uniform echo_range across all ping_time. ping_time_jitter_max_ms controlled jitter in milliseconds in ping_time. """ + + if random_number_generator is None: + random_number_generator = default_rng() + # regular echo_range echo_range = np.array([[np.arange(depth_len)] * ping_time_len] * channel_len) * depth_interval # generate dataset ds_Sv = xr.Dataset( data_vars={ - "Sv": (["channel", "ping_time", "range_sample"], np.random.rand(channel_len, ping_time_len, depth_len)), + "Sv": ( + ["channel", "ping_time", "range_sample"], + random_number_generator.random(size=(channel_len, ping_time_len, depth_len)) + ), "echo_range": (["channel", "ping_time", "range_sample"], echo_range), "frequency_nominal": (["channel"], np.arange(channel_len)), }, @@ -876,13 +890,16 @@ def _gen_Sv_er_regular( def _gen_Sv_er_irregular( channel_len=2, depth_len=100, depth_interval=[0.5, 0.32, 0.13], depth_ping_time_len=[100, 300, 200], - ping_time_len=600, ping_time_interval="0.3S", ping_time_jitter_max_ms=0, + ping_time_len=600, ping_time_interval="0.3S", ping_time_jitter_max_ms=0, random_number_generator=None, ): """ Generate a Sv dataset with uniform echo_range across all ping_time. ping_time_jitter_max_ms controlled jitter in milliseconds in ping_time. """ + if random_number_generator is None: + random_number_generator = default_rng() + # check input if len(depth_interval) != len(depth_ping_time_len): raise ValueError("The number of depth_interval and depth_ping_time_len must be equal!") @@ -899,7 +916,10 @@ def _gen_Sv_er_irregular( # generate dataset ds_Sv = xr.Dataset( data_vars={ - "Sv": (["channel", "ping_time", "range_sample"], np.random.rand(channel_len, ping_time_len, depth_len)), + "Sv": ( + ["channel", "ping_time", "range_sample"], + random_number_generator.random(size=(channel_len, ping_time_len, depth_len)) + ), "echo_range": (["channel", "ping_time", "range_sample"], echo_range), "frequency_nominal": (["channel"], np.arange(channel_len)), }, @@ -920,10 +940,10 @@ def _gen_Sv_er_irregular( ("irregular"), ], ) -def test_compute_MVBS_er_output(er_type): +def test_compute_MVBS_er_output(er_type, random_number_generator): # set jitter=0 to get predictable number of ping within each echo_range groups if er_type == "regular": - ds_Sv = _gen_Sv_er_regular(ping_time_jitter_max_ms=0) + ds_Sv = _gen_Sv_er_regular(ping_time_jitter_max_ms=0, random_number_generator=random_number_generator) else: depth_interval=[0.5, 0.32, 0.13] depth_ping_time_len=[100, 300, 200] @@ -931,7 +951,7 @@ def test_compute_MVBS_er_output(er_type): ping_time_interval="0.3S" ds_Sv = _gen_Sv_er_irregular(depth_interval=depth_interval, depth_ping_time_len=depth_ping_time_len, ping_time_len=ping_time_len, ping_time_interval=ping_time_interval, - ping_time_jitter_max_ms=0) + ping_time_jitter_max_ms=0, random_number_generator=random_number_generator) ds_MVBS = ep.commongrid.compute_MVBS(ds_Sv, range_meter_bin=5, ping_time_bin="10S") From 22f03d23cbe15a94c21ed8155f238ebda20e580c Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Tue, 29 Aug 2023 16:07:54 -0700 Subject: [PATCH 15/43] fix: Add case for no 'water_level' Added a case for dataset that doesn't have water level variable. --- echopype/commongrid/api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index c8d849c6e..946972ea5 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -162,8 +162,8 @@ def compute_MVBS( for var in POSITION_VARIABLES: ds_MVBS[var] = (["ping_time"], raw_MVBS[var].data, ds_Sv[var].attrs) - # Add water level if uses echo_range - if range_var == "echo_range": + # Add water level if uses echo_range and it exists in Sv dataset + if range_var == "echo_range" and "water_level" in ds_Sv.data_vars: ds_MVBS["water_level"] = ds_Sv["water_level"] # ping_time_bin parsing and conversions From 0a4dbf3a8be54b374ac846f545c460c1ee03dc19 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Wed, 30 Aug 2023 15:57:52 -0700 Subject: [PATCH 16/43] test(nasc): Remove 'compute_NASC' import to avoid failure --- echopype/tests/commongrid/test_nasc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/tests/commongrid/test_nasc.py b/echopype/tests/commongrid/test_nasc.py index b950e7fd0..0430f99c1 100644 --- a/echopype/tests/commongrid/test_nasc.py +++ b/echopype/tests/commongrid/test_nasc.py @@ -4,7 +4,7 @@ from echopype import open_raw from echopype.calibrate import compute_Sv -from echopype.commongrid import compute_NASC +# from echopype.commongrid import compute_NASC from echopype.commongrid.nasc import ( get_distance_from_latlon, get_depth_bin_info, From 7c52b28a7ba8fcf3d9773460f78a5193904d61d5 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Thu, 31 Aug 2023 09:14:50 -0700 Subject: [PATCH 17/43] fix: Removed assumption on echo range max Reverted back the echo range max computation to computing on the fly since there may be some NaN values. --- echopype/commongrid/api.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index 946972ea5..9b8cf0507 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -124,9 +124,8 @@ def compute_MVBS( raise ValueError(f"range_var '{range_var}' does not exist in the input dataset.") # create bin information for echo_range - # get the max echo range value - # assuming that it's a grid so just take the last value - echo_range_max = ds_Sv[range_var].isel(range_sample=-1, ping_time=0, channel=0).to_numpy() + # this computes the echo range max since there might be missing values + echo_range_max = ds_Sv[range_var].max() range_interval = np.arange(0, echo_range_max + range_meter_bin, range_meter_bin) # create bin information needed for ping_time From 828ab6d24ccd7fa30044722f44f6447a10adc926 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Thu, 31 Aug 2023 11:05:28 -0700 Subject: [PATCH 18/43] test: Extract api test and add markings Extracted fixtures to conftest.py for commongrid. Additionally, clean up unused functions and mark tests b/w unit and integration. Added a new test module called 'test_api.py' for 'commongrid.api'. --- echopype/tests/commongrid/conftest.py | 219 ++++++++++ echopype/tests/commongrid/test_api.py | 142 +++++++ echopype/tests/commongrid/test_mvbs.py | 548 +------------------------ 3 files changed, 362 insertions(+), 547 deletions(-) create mode 100644 echopype/tests/commongrid/conftest.py create mode 100644 echopype/tests/commongrid/test_api.py diff --git a/echopype/tests/commongrid/conftest.py b/echopype/tests/commongrid/conftest.py new file mode 100644 index 000000000..68fe59373 --- /dev/null +++ b/echopype/tests/commongrid/conftest.py @@ -0,0 +1,219 @@ +import pytest + +import xarray as xr +import numpy as np +import pandas as pd + + +@pytest.fixture +def random_number_generator(): + """Random number generator for tests""" + return np.random.default_rng() + +@pytest.fixture( + params=[ + ( + ("EK60", "ncei-wcsd", "Summer2017-D20170719-T211347.raw"), + "EK60", + None, + {}, + ), + ( + ("EK80_NEW", "echopype-test-D20211004-T235930.raw"), + "EK80", + None, + {'waveform_mode': 'BB', 'encode_mode': 'complex'}, + ), + ( + ("EK80_NEW", "D20211004-T233354.raw"), + "EK80", + None, + {'waveform_mode': 'CW', 'encode_mode': 'power'}, + ), + ( + ("EK80_NEW", "D20211004-T233115.raw"), + "EK80", + None, + {'waveform_mode': 'CW', 'encode_mode': 'complex'}, + ), + (("ES70", "D20151202-T020259.raw"), "ES70", None, {}), + (("AZFP", "17082117.01A"), "AZFP", ("AZFP", "17041823.XML"), {}), + ( + ("AD2CP", "raw", "090", "rawtest.090.00001.ad2cp"), + "AD2CP", + None, + {}, + ), + ], + ids=[ + "ek60_cw_power", + "ek80_bb_complex", + "ek80_cw_power", + "ek80_cw_complex", + "es70", + "azfp", + "ad2cp", + ], +) +def test_data_samples(request, test_path): + ( + filepath, + sonar_model, + azfp_xml_path, + range_kwargs, + ) = request.param + if sonar_model.lower() in ['es70', 'ad2cp']: + pytest.xfail( + reason="Not supported at the moment", + ) + path_model, *paths = filepath + filepath = test_path[path_model].joinpath(*paths) + + if azfp_xml_path is not None: + path_model, *paths = azfp_xml_path + azfp_xml_path = test_path[path_model].joinpath(*paths) + return ( + filepath, + sonar_model, + azfp_xml_path, + range_kwargs, + ) + +@pytest.fixture +def regular_data_params(): + return { + "channel_len": 4, + "depth_len": 4000, + "ping_time_len": 100, + "ping_time_jitter_max_ms": 0, + } + + +@pytest.fixture +def ds_Sv_er_regular(regular_data_params, random_number_generator): + return _gen_Sv_er_regular( + **regular_data_params, + random_number_generator=random_number_generator, + ) + + +@pytest.fixture +def ds_Sv_er_irregular(random_number_generator): + depth_interval = [0.5, 0.32, 0.13] + depth_ping_time_len = [100, 300, 200] + ping_time_len = 600 + ping_time_interval = "0.3S" + return _gen_Sv_er_irregular( + depth_interval=depth_interval, + depth_ping_time_len=depth_ping_time_len, + ping_time_len=ping_time_len, + ping_time_interval=ping_time_interval, + ping_time_jitter_max_ms=0, + random_number_generator=random_number_generator, + ) + + +# Helper functions to generate mock Sv dataset +def _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms=0): + ping_time = pd.date_range("2018-07-01", periods=ping_time_len, freq=ping_time_interval) + if ping_time_jitter_max_ms != 0: # if to add jitter + jitter = ( + np.random.randint(ping_time_jitter_max_ms, size=ping_time_len) / 1000 + ) # convert to seconds + ping_time = pd.to_datetime(ping_time.astype(int) / 1e9 + jitter, unit="s") + return ping_time + + +def _gen_Sv_er_regular( + channel_len=2, + depth_len=100, + depth_interval=0.5, + ping_time_len=600, + ping_time_interval="0.3S", + ping_time_jitter_max_ms=0, + random_number_generator=None, +): + """ + Generate a Sv dataset with uniform echo_range across all ping_time. + + ping_time_jitter_max_ms controlled jitter in milliseconds in ping_time. + """ + + if random_number_generator is None: + random_number_generator = np.random.default_rng() + + # regular echo_range + echo_range = np.array([[np.arange(depth_len)] * ping_time_len] * channel_len) * depth_interval + + # generate dataset + ds_Sv = xr.Dataset( + data_vars={ + "Sv": ( + ["channel", "ping_time", "range_sample"], + random_number_generator.random(size=(channel_len, ping_time_len, depth_len)), + ), + "echo_range": (["channel", "ping_time", "range_sample"], echo_range), + "frequency_nominal": (["channel"], np.arange(channel_len)), + }, + coords={ + "channel": [f"ch_{ch}" for ch in range(channel_len)], + "ping_time": _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms), + "range_sample": np.arange(depth_len), + }, + ) + + return ds_Sv + + +def _gen_Sv_er_irregular( + channel_len=2, + depth_len=100, + depth_interval=[0.5, 0.32, 0.13], + depth_ping_time_len=[100, 300, 200], + ping_time_len=600, + ping_time_interval="0.3S", + ping_time_jitter_max_ms=0, + random_number_generator=None, +): + """ + Generate a Sv dataset with uniform echo_range across all ping_time. + + ping_time_jitter_max_ms controlled jitter in milliseconds in ping_time. + """ + if random_number_generator is None: + random_number_generator = np.random.default_rng() + + # check input + if len(depth_interval) != len(depth_ping_time_len): + raise ValueError("The number of depth_interval and depth_ping_time_len must be equal!") + + if ping_time_len != np.array(depth_ping_time_len).sum(): + raise ValueError("The number of total pings does not match!") + + # irregular echo_range + echo_range_list = [] + for d, dp in zip(depth_interval, depth_ping_time_len): + echo_range_list.append(np.array([[np.arange(depth_len)] * dp] * channel_len) * d) + echo_range = np.hstack(echo_range_list) + + # generate dataset + ds_Sv = xr.Dataset( + data_vars={ + "Sv": ( + ["channel", "ping_time", "range_sample"], + random_number_generator.random(size=(channel_len, ping_time_len, depth_len)), + ), + "echo_range": (["channel", "ping_time", "range_sample"], echo_range), + "frequency_nominal": (["channel"], np.arange(channel_len)), + }, + coords={ + "channel": [f"ch_{ch}" for ch in range(channel_len)], + "ping_time": _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms), + "range_sample": np.arange(depth_len), + }, + ) + + return ds_Sv + + +# End helper functions diff --git a/echopype/tests/commongrid/test_api.py b/echopype/tests/commongrid/test_api.py new file mode 100644 index 000000000..265096b19 --- /dev/null +++ b/echopype/tests/commongrid/test_api.py @@ -0,0 +1,142 @@ +import pytest +import echopype as ep +import numpy as np +import pandas as pd + +# NASC Tests +@pytest.mark.integration +@pytest.mark.skip(reason="NASC is not implemented yet") +def test_compute_NASC(test_data_samples): + pass + +# MVBS Tests +@pytest.mark.integration +def test_compute_MVBS_index_binning(ds_Sv_er_regular, regular_data_params): + """Test compute_MVBS_index_binning on mock data""" + + ping_num = 3 # number of pings to average over + range_sample_num = 7 # number of range_samples to average over + nchan = regular_data_params["channel_len"] + npings = regular_data_params["ping_time_len"] + nrange_samples = regular_data_params["depth_len"] + + # Binned MVBS test + ds_MVBS = ep.commongrid.compute_MVBS_index_binning( + ds_Sv_er_regular, range_sample_num=range_sample_num, ping_num=ping_num + ) + + # Shape test + data_binned_shape = np.ceil( + (nchan, npings / ping_num, nrange_samples / range_sample_num) + ).astype(int) + assert np.all(ds_MVBS.Sv.shape == data_binned_shape) + + # Expected values compute + # average should be done in linear domain + da_sv = 10 ** (ds_Sv_er_regular["Sv"] / 10) + expected = 10 * np.log10( + da_sv + .coarsen(ping_time=ping_num, range_sample=range_sample_num, boundary="pad") + .mean(skipna=True) + ) + + # Test all values in MVBS + assert np.array_equal(ds_MVBS.Sv.data, expected.data) + +@pytest.mark.unit +@pytest.mark.parametrize("range_var", ["my_range", "echo_range", "depth"]) +def test_compute_MVBS_invalid_range_var(ds_Sv_er_regular, range_var): + """Test compute MVBS range_var on mock data""" + + if range_var == "my_range": + with pytest.raises( + ValueError, + match="range_var must be one of 'echo_range' or 'depth'." + ): + ep.commongrid.compute_MVBS(ds_Sv_er_regular, range_var=range_var) + elif range_var == "depth": + with pytest.raises( + ValueError, + match=f"range_var '{range_var}' does not exist in the input dataset." + ): + ep.commongrid.compute_MVBS(ds_Sv_er_regular, range_var=range_var) + else: + pass + +@pytest.mark.integration +def test_compute_MVBS(test_data_samples): + """ + Test running through from open_raw to compute_MVBS. + """ + ( + filepath, + sonar_model, + azfp_xml_path, + range_kwargs, + ) = test_data_samples + ed = ep.open_raw(filepath, sonar_model, azfp_xml_path) + if ed.sonar_model.lower() == 'azfp': + avg_temperature = ed["Environment"]['temperature'].values.mean() + env_params = { + 'temperature': avg_temperature, + 'salinity': 27.9, + 'pressure': 59, + } + range_kwargs['env_params'] = env_params + if 'azfp_cal_type' in range_kwargs: + range_kwargs.pop('azfp_cal_type') + Sv = ep.calibrate.compute_Sv(ed, **range_kwargs) + ping_time_bin = "20S" + ds_MVBS = ep.commongrid.compute_MVBS(Sv, ping_time_bin=ping_time_bin) + assert ds_MVBS is not None + + # Test to see if ping_time was resampled correctly + expected_ping_time = ( + Sv["ping_time"] + .resample(ping_time=ping_time_bin, skipna=True) + .asfreq() + .indexes["ping_time"] + ) + assert np.array_equal(ds_MVBS.ping_time.data, expected_ping_time.values) + +@pytest.mark.integration +@pytest.mark.parametrize( + ("er_type"), + [ + ("regular"), + ("irregular"), + ], +) +def test_compute_MVBS_er_output(request, er_type): + """Tests the output of compute_MVBS on regular and irregular data.""" + # set jitter=0 to get predictable number of ping within each echo_range groups + if er_type == "regular": + ds_Sv = request.getfixturevalue("ds_Sv_er_regular") + else: + ds_Sv = request.getfixturevalue("ds_Sv_er_irregular") + + ds_MVBS = ep.commongrid.compute_MVBS(ds_Sv, range_meter_bin=5, ping_time_bin="10S") + + if er_type == "regular": + expected_len = ( + ds_Sv["channel"].size, # channel + np.ceil(np.diff(ds_Sv["ping_time"][[0, -1]].astype(int))/1e9 / 10), # ping_time + np.ceil(ds_Sv["echo_range"].max()/5), # depth + ) + assert ds_MVBS["Sv"].shape == expected_len + else: + assert ( + ds_MVBS["Sv"] + .isel(ping_time=slice(None, 3)) + .dropna(dim="echo_range").shape + ) == (2, 3, 10) # full array, no NaN + assert ( + ds_MVBS["Sv"] + .isel(ping_time=slice(3, 12)) + .dropna(dim="echo_range").shape + ) == (2, 9, 7) # bottom bins contain NaN + assert ( + ds_MVBS["Sv"] + .isel(ping_time=slice(12, None)) + .dropna(dim="echo_range").shape + ) == (2, 6, 3) # bottom bins contain NaN diff --git a/echopype/tests/commongrid/test_mvbs.py b/echopype/tests/commongrid/test_mvbs.py index e16b1a73c..cf677edf8 100644 --- a/echopype/tests/commongrid/test_mvbs.py +++ b/echopype/tests/commongrid/test_mvbs.py @@ -1,428 +1,10 @@ import dask.array import numpy as np from numpy.random import default_rng -import pandas as pd import pytest from typing import Tuple, Iterable, Union -import xarray as xr - -import echopype as ep from echopype.commongrid.mvbs import bin_and_mean_2d -@pytest.fixture -def random_number_generator(): - """Random number generator for tests""" - return default_rng() - - -@pytest.fixture( - params=[ - ( - ("EK60", "ncei-wcsd", "Summer2017-D20170719-T211347.raw"), - "EK60", - None, - {}, - ), - ( - ("EK80_NEW", "echopype-test-D20211004-T235930.raw"), - "EK80", - None, - {'waveform_mode': 'BB', 'encode_mode': 'complex'}, - ), - ( - ("EK80_NEW", "D20211004-T233354.raw"), - "EK80", - None, - {'waveform_mode': 'CW', 'encode_mode': 'power'}, - ), - ( - ("EK80_NEW", "D20211004-T233115.raw"), - "EK80", - None, - {'waveform_mode': 'CW', 'encode_mode': 'complex'}, - ), - (("ES70", "D20151202-T020259.raw"), "ES70", None, {}), - (("AZFP", "17082117.01A"), "AZFP", ("AZFP", "17041823.XML"), {}), - ( - ("AD2CP", "raw", "090", "rawtest.090.00001.ad2cp"), - "AD2CP", - None, - {}, - ), - ], - ids=[ - "ek60_cw_power", - "ek80_bb_complex", - "ek80_cw_power", - "ek80_cw_complex", - "es70", - "azfp", - "ad2cp", - ], -) -def test_data_samples(request, test_path): - ( - filepath, - sonar_model, - azfp_xml_path, - range_kwargs, - ) = request.param - if sonar_model.lower() in ['es70', 'ad2cp']: - pytest.xfail( - reason="Not supported at the moment", - ) - path_model, *paths = filepath - filepath = test_path[path_model].joinpath(*paths) - - if azfp_xml_path is not None: - path_model, *paths = azfp_xml_path - azfp_xml_path = test_path[path_model].joinpath(*paths) - return ( - filepath, - sonar_model, - azfp_xml_path, - range_kwargs, - ) - - -def _construct_MVBS_toy_data( - nchan, npings, nrange_samples, ping_size, range_sample_size -): - """Construct data with values that increase every ping_num and ``range_sample_num`` - so that the result of computing MVBS is a smaller array - that increases regularly for each resampled ``ping_time`` and ``range_sample`` - - Parameters - ---------- - nchan : int - number of channels - npings : int - number of pings - nrange_samples : int - number of range samples - ping_size : int - number of pings with the same value - range_sample_size : int - number of range samples with the same value - - Returns - ------- - np.ndarray - Array with blocks of ``ping_time`` and ``range_sample`` with the same value, - so that computing the MVBS will result in regularly increasing values - every row and column - """ - data = np.ones((nchan, npings, nrange_samples)) - for p_i, ping in enumerate(range(0, npings, ping_size)): - for r_i, rb in enumerate(range(0, nrange_samples, range_sample_size)): - data[0, ping : ping + ping_size, rb : rb + range_sample_size] += ( - r_i + p_i - ) - # First channel increases by 1 each row and column, second increases by 2, third by 3, etc. - for f in range(nchan): - data[f] = data[0] * (f + 1) - - return data - - -def _construct_MVBS_test_data(nchan, npings, nrange_samples): - """Construct data for testing the toy data from - `_construct_MVBS_toy_data` after it has gone through the - MVBS calculation. - - Parameters - ---------- - nchan : int - number of channels - npings : int - number of pings - nrange_samples : int - number of range samples - - Returns - ------- - np.ndarray - Array with values that increases regularly - every ping and range sample - """ - - # Construct test array - test_array = np.add(*np.indices((npings, nrange_samples))) - return np.array([(test_array + 1) * (f + 1) for f in range(nchan)]) - - -def test_compute_MVBS_index_binning(): - """Test compute_MVBS_index_binning on toy data""" - - # Parameters for toy data - nchan, npings, nrange_samples = 4, 40, 400 - ping_num = 3 # number of pings to average over - range_sample_num = 7 # number of range_samples to average over - - # Construct toy data that increases regularly every ping_num and range_sample_num - data = _construct_MVBS_toy_data( - nchan=nchan, - npings=npings, - nrange_samples=nrange_samples, - ping_size=ping_num, - range_sample_size=range_sample_num, - ) - - data_log = 10 * np.log10(data) # Convert to log domain - chan_index = np.arange(nchan).astype(str) - ping_index = np.arange(npings) - range_sample = np.arange(nrange_samples) - Sv = xr.DataArray( - data_log, - coords=[ - ('channel', chan_index), - ('ping_time', ping_index), - ('range_sample', range_sample), - ], - ) - Sv.name = "Sv" - ds_Sv = Sv.to_dataset() - ds_Sv["frequency_nominal"] = chan_index # just so there's value in freq_nominal - ds_Sv = ds_Sv.assign( - echo_range=xr.DataArray( - np.array([[np.linspace(0, 10, nrange_samples)] * npings] * nchan), - coords=Sv.coords, - ) - ) - - # Binned MVBS test - ds_MVBS = ep.commongrid.compute_MVBS_index_binning( - ds_Sv, range_sample_num=range_sample_num, ping_num=ping_num - ) - data_test = 10 ** (ds_MVBS.Sv / 10) # Convert to linear domain - - # Shape test - data_binned_shape = np.ceil( - (nchan, npings / ping_num, nrange_samples / range_sample_num) - ).astype(int) - assert np.all(data_test.shape == data_binned_shape) - - # Construct test array that increases by 1 for each range_sample and ping_time - test_array = _construct_MVBS_test_data( - nchan, data_binned_shape[1], data_binned_shape[2] - ) - - # Test all values in MVBS - assert np.allclose(data_test, test_array, rtol=0, atol=1e-12) - - -def _coll_test_comp_MVBS(ds_Sv, nchan, ping_num, - range_sample_num, ping_time_bin, - total_range, range_meter_bin): - """A collection of tests for test_compute_MVBS""" - - ds_MVBS = ep.commongrid.compute_MVBS( - ds_Sv, - range_meter_bin=range_meter_bin, - ping_time_bin=f'{ping_time_bin}S', - ) - - data_test = 10 ** (ds_MVBS.Sv / 10) # Convert to linear domain - - # Shape test - data_binned_shape = np.ceil((nchan, ping_num, range_sample_num)).astype(int) - assert np.all(data_test.shape == data_binned_shape) - - # Construct test array that increases by 1 for each range_sample and ping_time - test_array = _construct_MVBS_test_data( - nchan, data_binned_shape[1], data_binned_shape[2] - ) - - # Test all values in MVBS - assert np.allclose(data_test, test_array, rtol=0, atol=1e-12) - - # Test to see if ping_time was resampled correctly - test_ping_time = pd.date_range( - '1/1/2020', periods=np.ceil(ping_num), freq=f'{ping_time_bin}S' - ) - assert np.array_equal(data_test.ping_time, test_ping_time) - - # Test to see if range was resampled correctly - test_range = np.arange(0, total_range, range_meter_bin) - assert np.array_equal(data_test.echo_range, test_range) - - -def _fill_w_nans(narr, nan_ping_time, nan_range_sample): - """ - A routine that fills a numpy array with nans. - - Parameters - ---------- - narr : numpy array - Array of dimensions (ping_time, range_sample) - nan_ping_time : list - ping times to fill with nans - nan_range_sample: list - range samples to fill with nans - """ - if len(nan_ping_time) != len(nan_range_sample): - raise ValueError('These lists must be the same size!') - - # fill in nans according to the provided lists - for i, j in zip(nan_ping_time, nan_range_sample): - narr[i, j] = np.nan - - return narr - - -def _nan_cases_comp_MVBS(ds_Sv, chan): - """ - For a single channel, obtains numpy array - filled with nans for various cases - """ - - # get echo_range values for a single channel - one_chan_er = ds_Sv.echo_range.sel(channel=chan).copy().values - - # ping times to fill with NaNs - nan_ping_time_1 = [slice(None), slice(None)] - # range samples to fill with NaNs - nan_range_sample_1 = [3, 4] - # pad all ping_times with nans for a certain range_sample - case_1 = _fill_w_nans(one_chan_er, nan_ping_time_1, nan_range_sample_1) - - # get echo_range values for a single channel - one_chan_er = ds_Sv.echo_range.sel(channel=chan).copy().values - # ping times to fill with NaNs - nan_ping_time_2 = [1, 3, 5, 9] - # range samples to fill with NaNs - nan_range_sample_2 = [slice(None), slice(None), slice(None), slice(None)] - # pad all range_samples of certain ping_times - case_2 = _fill_w_nans(one_chan_er, nan_ping_time_2, nan_range_sample_2) - - # get echo_range values for a single channel - one_chan_er = ds_Sv.echo_range.sel(channel=chan).copy().values - # ping times to fill with NaNs - nan_ping_time_3 = [0, 2, 5, 7] - # range samples to fill with NaNs - nan_range_sample_3 = [slice(0, 2), slice(None), slice(None), slice(0, 3)] - # pad all range_samples of certain ping_times and - # pad some ping_times with nans for a certain range_sample - case_3 = _fill_w_nans(one_chan_er, nan_ping_time_3, nan_range_sample_3) - - return case_1, case_2, case_3 - - -def test_compute_MVBS(): - """Test compute_MVBS on toy data""" - - # Parameters for fake data - nchan, npings, nrange_samples = 4, 100, 4000 - range_meter_bin = 7 # range in meters to average over - ping_time_bin = 3 # number of seconds to average over - ping_rate = 2 # Number of pings per second - range_sample_per_meter = 30 # Number of range_samples per meter - - # Useful conversions - ping_num = ( - npings / ping_rate / ping_time_bin - ) # number of pings to average over - range_sample_num = ( - nrange_samples / range_sample_per_meter / range_meter_bin - ) # number of range_samples to average over - total_range = nrange_samples / range_sample_per_meter # total range in meters - - # Construct data with values that increase with range and time - # so that when compute_MVBS is performed, the result is a smaller array - # that increases by a constant for each meter_bin and time_bin - data = _construct_MVBS_toy_data( - nchan=nchan, - npings=npings, - nrange_samples=nrange_samples, - ping_size=ping_rate * ping_time_bin, - range_sample_size=range_sample_per_meter * range_meter_bin, - ) - - data_log = 10 * np.log10(data) # Convert to log domain - chan_index = np.arange(nchan).astype(str) - freq_nom = np.arange(nchan) - # Generate a date range with `npings` number of pings with the frequency of the ping_rate - ping_time = pd.date_range( - '1/1/2020', periods=npings, freq=f'{1/ping_rate}S' - ) - range_sample = np.arange(nrange_samples) - Sv = xr.DataArray( - data_log, - coords=[ - ('channel', chan_index), - ('ping_time', ping_time), - ('range_sample', range_sample), - ], - ) - Sv.name = "Sv" - ds_Sv = Sv.to_dataset() - ds_Sv = ds_Sv.assign( - frequency_nominal=xr.DataArray(freq_nom, coords={'channel': chan_index}), - echo_range=xr.DataArray( - np.array( - [[np.linspace(0, total_range, nrange_samples)] * npings] * nchan - ), - coords=Sv.coords, - ) - ) - - # initial test of compute_MVBS - _coll_test_comp_MVBS(ds_Sv, nchan, ping_num, - range_sample_num, ping_time_bin, - total_range, range_meter_bin) - - # TODO: use @pytest.fixture params/ids - # for multiple similar tests using the same set of parameters - # different nan cases for a single channel - case_1, case_2, case_3 = _nan_cases_comp_MVBS(ds_Sv, chan='0') - - # pad all ping_times with nans for a certain range_sample - ds_Sv['echo_range'].loc[{'channel': '0'}] = case_1 - - _coll_test_comp_MVBS(ds_Sv, nchan, ping_num, - range_sample_num, ping_time_bin, - total_range, range_meter_bin) - - # pad all range_samples of certain ping_times - ds_Sv['echo_range'].loc[{'channel': '0'}] = case_2 - - _coll_test_comp_MVBS(ds_Sv, nchan, ping_num, - range_sample_num, ping_time_bin, - total_range, range_meter_bin) - - # pad all range_samples of certain ping_times and - # pad some ping_times with nans for a certain range_sample - ds_Sv['echo_range'].loc[{'channel': '0'}] = case_3 - - _coll_test_comp_MVBS(ds_Sv, nchan, ping_num, - range_sample_num, ping_time_bin, - total_range, range_meter_bin) - - -def test_commongrid_mvbs(test_data_samples): - """ - Test running through from open_raw to compute_MVBS. - """ - ( - filepath, - sonar_model, - azfp_xml_path, - range_kwargs, - ) = test_data_samples - ed = ep.open_raw(filepath, sonar_model, azfp_xml_path) - if ed.sonar_model.lower() == 'azfp': - avg_temperature = ed["Environment"]['temperature'].values.mean() - env_params = { - 'temperature': avg_temperature, - 'salinity': 27.9, - 'pressure': 59, - } - range_kwargs['env_params'] = env_params - if 'azfp_cal_type' in range_kwargs: - range_kwargs.pop('azfp_cal_type') - Sv = ep.calibrate.compute_Sv(ed, **range_kwargs) - assert ep.commongrid.compute_MVBS(Sv) is not None - def create_bins(csum_array: np.ndarray) -> Iterable: """ @@ -777,7 +359,7 @@ def bin_and_mean_2d_params(request): return list(request.param.values()) - +@pytest.mark.unit def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: """ Tests the function ``bin_and_mean_2d``, which is the core @@ -838,131 +420,3 @@ def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: # compare known MVBS solution against its calculated counterpart assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10, equal_nan=True) - - -def _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms=0): - ping_time = pd.date_range("2018-07-01", periods=ping_time_len, freq=ping_time_interval) - if ping_time_jitter_max_ms != 0: # if to add jitter - jitter = np.random.randint(ping_time_jitter_max_ms, size=ping_time_len) / 1000 # convert to seconds - ping_time = pd.to_datetime(ping_time.astype(int)/1e9 + jitter, unit="s") - return ping_time - - -def _gen_Sv_er_regular( - channel_len=2, - depth_len=100, depth_interval=0.5, - ping_time_len=600, ping_time_interval="0.3S", - ping_time_jitter_max_ms=0, - random_number_generator=None, -): - """ - Generate a Sv dataset with uniform echo_range across all ping_time. - - ping_time_jitter_max_ms controlled jitter in milliseconds in ping_time. - """ - - if random_number_generator is None: - random_number_generator = default_rng() - - # regular echo_range - echo_range = np.array([[np.arange(depth_len)] * ping_time_len] * channel_len) * depth_interval - - # generate dataset - ds_Sv = xr.Dataset( - data_vars={ - "Sv": ( - ["channel", "ping_time", "range_sample"], - random_number_generator.random(size=(channel_len, ping_time_len, depth_len)) - ), - "echo_range": (["channel", "ping_time", "range_sample"], echo_range), - "frequency_nominal": (["channel"], np.arange(channel_len)), - }, - coords={ - "channel": [f"ch_{ch}" for ch in range(channel_len)], - "ping_time": _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms), - "range_sample": np.arange(depth_len), - } - ) - - return ds_Sv - - -def _gen_Sv_er_irregular( - channel_len=2, - depth_len=100, depth_interval=[0.5, 0.32, 0.13], depth_ping_time_len=[100, 300, 200], - ping_time_len=600, ping_time_interval="0.3S", ping_time_jitter_max_ms=0, random_number_generator=None, -): - """ - Generate a Sv dataset with uniform echo_range across all ping_time. - - ping_time_jitter_max_ms controlled jitter in milliseconds in ping_time. - """ - if random_number_generator is None: - random_number_generator = default_rng() - - # check input - if len(depth_interval) != len(depth_ping_time_len): - raise ValueError("The number of depth_interval and depth_ping_time_len must be equal!") - - if ping_time_len != np.array(depth_ping_time_len).sum(): - raise ValueError("The number of total pings does not match!") - - # irregular echo_range - echo_range_list = [] - for d, dp in zip(depth_interval, depth_ping_time_len): - echo_range_list.append(np.array([[np.arange(depth_len)] * dp] * channel_len) * d) - echo_range = np.hstack(echo_range_list) - - # generate dataset - ds_Sv = xr.Dataset( - data_vars={ - "Sv": ( - ["channel", "ping_time", "range_sample"], - random_number_generator.random(size=(channel_len, ping_time_len, depth_len)) - ), - "echo_range": (["channel", "ping_time", "range_sample"], echo_range), - "frequency_nominal": (["channel"], np.arange(channel_len)), - }, - coords={ - "channel": [f"ch_{ch}" for ch in range(channel_len)], - "ping_time": _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms), - "range_sample": np.arange(depth_len), - } - ) - - return ds_Sv - - -@pytest.mark.parametrize( - ("er_type"), - [ - ("regular"), - ("irregular"), - ], -) -def test_compute_MVBS_er_output(er_type, random_number_generator): - # set jitter=0 to get predictable number of ping within each echo_range groups - if er_type == "regular": - ds_Sv = _gen_Sv_er_regular(ping_time_jitter_max_ms=0, random_number_generator=random_number_generator) - else: - depth_interval=[0.5, 0.32, 0.13] - depth_ping_time_len=[100, 300, 200] - ping_time_len=600 - ping_time_interval="0.3S" - ds_Sv = _gen_Sv_er_irregular(depth_interval=depth_interval, depth_ping_time_len=depth_ping_time_len, - ping_time_len=ping_time_len, ping_time_interval=ping_time_interval, - ping_time_jitter_max_ms=0, random_number_generator=random_number_generator) - - ds_MVBS = ep.commongrid.compute_MVBS(ds_Sv, range_meter_bin=5, ping_time_bin="10S") - - if er_type == "regular": - expected_len = ( - ds_Sv["channel"].size, # channel - np.ceil(np.diff(ds_Sv["ping_time"][[0, -1]].astype(int))/1e9 / 10), # ping_time - np.ceil(ds_Sv["echo_range"].max()/5), # depth - ) - assert ds_MVBS["Sv"].shape == expected_len - else: - assert ds_MVBS["Sv"].isel(ping_time=slice(None, 3)).dropna(dim="echo_range").shape == (2, 3, 10) # full array, no NaN - assert ds_MVBS["Sv"].isel(ping_time=slice(3, 12)).dropna(dim="echo_range").shape == (2, 9, 7) # bottom bins contain NaN - assert ds_MVBS["Sv"].isel(ping_time=slice(12, None)).dropna(dim="echo_range").shape == (2, 6, 3) # bottom bins contain NaN From db53a72efac0998a57ad859690f0c4b8e3ef34c0 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Thu, 31 Aug 2023 13:44:10 -0700 Subject: [PATCH 19/43] test: Add latlon test for 'compute_MVBS' Added a test for Sv dataset that contains latitude and longitude going through 'compute_MVBS' to ensure that those variables gets propagated through. Ref: #1002 --- echopype/tests/commongrid/conftest.py | 42 +++++++++++++++++++++++++++ echopype/tests/commongrid/test_api.py | 22 ++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/echopype/tests/commongrid/conftest.py b/echopype/tests/commongrid/conftest.py index 68fe59373..2c9e5b555 100644 --- a/echopype/tests/commongrid/conftest.py +++ b/echopype/tests/commongrid/conftest.py @@ -95,6 +95,48 @@ def ds_Sv_er_regular(regular_data_params, random_number_generator): **regular_data_params, random_number_generator=random_number_generator, ) + +@pytest.fixture +def latlon_history_attr(): + return ( + "2023-08-31 12:00:00.000000 +00:00. " + "Interpolated or propagated from Platform latitude/longitude." # noqa + ) + +@pytest.fixture +def lat_attrs(latlon_history_attr): + """Latitude attributes""" + return {'long_name': 'Platform latitude', + 'standard_name': 'latitude', + 'units': 'degrees_north', + 'valid_range': '(-90.0, 90.0)', + 'history': latlon_history_attr + } + +@pytest.fixture +def lon_attrs(latlon_history_attr): + """Longitude attributes""" + return {'long_name': 'Platform longitude', + 'standard_name': 'longitude', + 'units': 'degrees_east', + 'valid_range': '(-180.0, 180.0)', + 'history': latlon_history_attr + } + + +@pytest.fixture +def ds_Sv_er_regular_w_latlon(ds_Sv_er_regular, lat_attrs, lon_attrs): + """Sv dataset with latitude and longitude""" + n_pings = ds_Sv_er_regular.ping_time.shape[0] + latitude = np.linspace(42, 43, num=n_pings) + longitude = np.linspace(-124, -125, num=n_pings) + + ds_Sv_er_regular['latitude'] = (["ping_time"], latitude, lat_attrs) + ds_Sv_er_regular['longitude'] = (["ping_time"], longitude, lon_attrs) + + # Need processing level code for compute MVBS to work! + ds_Sv_er_regular.attrs["processing_level"] = "Level 2A" + return ds_Sv_er_regular @pytest.fixture diff --git a/echopype/tests/commongrid/test_api.py b/echopype/tests/commongrid/test_api.py index 265096b19..2c5418050 100644 --- a/echopype/tests/commongrid/test_api.py +++ b/echopype/tests/commongrid/test_api.py @@ -43,6 +43,28 @@ def test_compute_MVBS_index_binning(ds_Sv_er_regular, regular_data_params): # Test all values in MVBS assert np.array_equal(ds_MVBS.Sv.data, expected.data) +@pytest.mark.unit +def test_compute_MVBS_w_latlon(ds_Sv_er_regular_w_latlon, lat_attrs, lon_attrs): + """Testing for compute_MVBS with latitude and longitude""" + from echopype.consolidate.api import POSITION_VARIABLES + ds_MVBS = ep.commongrid.compute_MVBS( + ds_Sv_er_regular_w_latlon, + range_meter_bin=5, + ping_time_bin="10S" + ) + for var in POSITION_VARIABLES: + # Check to ensure variable is in dataset + assert var in ds_MVBS.data_vars + # Check for correct shape, which is just ping time + assert ds_MVBS[var].shape == ds_MVBS.ping_time.shape + + # Check if attributes match + if var == "latitude": + assert ds_MVBS[var].attrs == lat_attrs + elif var == "longitude": + assert ds_MVBS[var].attrs == lon_attrs + + @pytest.mark.unit @pytest.mark.parametrize("range_var", ["my_range", "echo_range", "depth"]) def test_compute_MVBS_invalid_range_var(ds_Sv_er_regular, range_var): From 3f24590fdd575ad0373e4bcc00c419c82b7a3220 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Thu, 31 Aug 2023 14:46:15 -0700 Subject: [PATCH 20/43] test: Add small get_MVBS_along_channels test Added test for 'get_MVBS_along_channels' with either 'depth' as the 'range_var' or checking for 'has_positions' is True or False. --- echopype/tests/commongrid/conftest.py | 12 ++++++ echopype/tests/commongrid/test_mvbs.py | 51 +++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/echopype/tests/commongrid/conftest.py b/echopype/tests/commongrid/conftest.py index 2c9e5b555..789513cd6 100644 --- a/echopype/tests/commongrid/conftest.py +++ b/echopype/tests/commongrid/conftest.py @@ -4,6 +4,8 @@ import numpy as np import pandas as pd +from echopype.consolidate import add_depth + @pytest.fixture def random_number_generator(): @@ -123,6 +125,11 @@ def lon_attrs(latlon_history_attr): 'history': latlon_history_attr } +@pytest.fixture +def depth_offset(): + """Depth offset for calculating depth""" + return 2.5 + @pytest.fixture def ds_Sv_er_regular_w_latlon(ds_Sv_er_regular, lat_attrs, lon_attrs): @@ -138,6 +145,11 @@ def ds_Sv_er_regular_w_latlon(ds_Sv_er_regular, lat_attrs, lon_attrs): ds_Sv_er_regular.attrs["processing_level"] = "Level 2A" return ds_Sv_er_regular +@pytest.fixture +def ds_Sv_er_regular_w_depth(ds_Sv_er_regular, depth_offset): + """Sv dataset with depth""" + return ds_Sv_er_regular.pipe(add_depth, depth_offset=depth_offset) + @pytest.fixture def ds_Sv_er_irregular(random_number_generator): diff --git a/echopype/tests/commongrid/test_mvbs.py b/echopype/tests/commongrid/test_mvbs.py index cf677edf8..b3bcec5eb 100644 --- a/echopype/tests/commongrid/test_mvbs.py +++ b/echopype/tests/commongrid/test_mvbs.py @@ -1,9 +1,10 @@ import dask.array import numpy as np +import pandas as pd from numpy.random import default_rng import pytest from typing import Tuple, Iterable, Union -from echopype.commongrid.mvbs import bin_and_mean_2d +from echopype.commongrid.mvbs import bin_and_mean_2d, get_MVBS_along_channels def create_bins(csum_array: np.ndarray) -> Iterable: @@ -420,3 +421,51 @@ def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: # compare known MVBS solution against its calculated counterpart assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10, equal_nan=True) + + +@pytest.mark.unit +@pytest.mark.parametrize(["range_var", "lat_lon"], [("depth", False), ("echo_range", True)]) +def test_get_MVBS_along_channels(request, range_var, lat_lon): + """Testing the underlying function of compute_MVBS""" + range_meter_bin = 20 + ping_time_bin = "20S" + method = "map-reduce" + + flox_kwargs = { + "reindex": True + } + + # Retrieve the correct dataset + if range_var == "depth": + ds_Sv = request.getfixturevalue("ds_Sv_er_regular_w_depth") + elif range_var == "echo_range" and lat_lon is True: + ds_Sv = request.getfixturevalue("ds_Sv_er_regular_w_latlon") + else: + ds_Sv = request.getfixturevalue("ds_Sv_er_regular") + + # compute range interval + echo_range_max = ds_Sv[range_var].max() + range_interval = np.arange(0, echo_range_max + range_meter_bin, range_meter_bin) + + # create bin information needed for ping_time + d_index = ( + ds_Sv["ping_time"] + .resample(ping_time=ping_time_bin, skipna=True) + .asfreq() + .indexes["ping_time"] + ) + ping_interval = d_index.union([d_index[-1] + pd.Timedelta(ping_time_bin)]) + + raw_MVBS = get_MVBS_along_channels( + ds_Sv, range_interval, ping_interval, + range_var=range_var, method=method, **flox_kwargs + ) + + # Check that the range_var is in the dimension + assert f"{range_var}_bins" in raw_MVBS.dims + + # When it's echo_range and lat_lon, the dataset should have positions + if range_var == "echo_range" and lat_lon is True: + assert raw_MVBS.attrs["has_positions"] is True + else: + assert raw_MVBS.attrs["has_positions"] is False From f3c3503407a51b565c93fd22d23c533251a10fe2 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Fri, 1 Sep 2023 15:36:12 -0700 Subject: [PATCH 21/43] refactor: Integrate suggested changes Integrated suggested changes from review such as additional comments in code, fixing some variable names, and extracting out the lin2log and log2lin functions. Additionally, now echopype imports pint library to start having unit checks in the input for compute_MVBS. --- echopype/commongrid/api.py | 50 +++++++++++++++++++------ echopype/commongrid/mvbs.py | 42 +++++---------------- echopype/tests/commongrid/test_api.py | 26 ++++++++++++- echopype/tests/convert/utils/test_ek.py | 14 +++++++ echopype/utils/compute.py | 41 ++++++++++++++++++++ requirements.txt | 1 + 6 files changed, 130 insertions(+), 44 deletions(-) create mode 100644 echopype/tests/convert/utils/test_ek.py create mode 100644 echopype/utils/compute.py diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index 9b8cf0507..6a4ff1744 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -5,12 +5,17 @@ import numpy as np import pandas as pd +import pint import xarray as xr from ..consolidate.api import POSITION_VARIABLES from ..utils.prov import add_processing_level, echopype_prov_attrs, insert_input_processing_level from .mvbs import get_MVBS_along_channels +# Setup pint unit registry +ureg = pint.UnitRegistry() +Q_ = ureg.Quantity + def _set_var_attrs(da, long_name, units, round_digits, standard_name=None): """ @@ -69,7 +74,7 @@ def _set_MVBS_attrs(ds): def compute_MVBS( ds_Sv: xr.Dataset, range_var: Literal["echo_range", "depth"] = "echo_range", - range_meter_bin: int = 20, + range_meter_bin: str = "20m", ping_time_bin: str = "20S", method="map-reduce", **flox_kwargs, @@ -92,9 +97,9 @@ def compute_MVBS( Must be one of ``echo_range`` or ``depth``. Note that ``depth`` is only available if the input dataset contains ``depth`` as a data variable. - range_meter_bin : Union[int, float] + range_meter_bin : str bin size along ``echo_range`` or ``depth`` in meters, - default to ``20`` + default to ``20m`` ping_time_bin : str bin size along ``ping_time``, default to ``20S`` method: str @@ -110,6 +115,31 @@ def compute_MVBS( A dataset containing bin-averaged Sv """ + # First check for bin types + if not isinstance(range_meter_bin, str): + raise TypeError("range_meter_bin must be a string") + + if not isinstance(ping_time_bin, str): + raise TypeError("ping_time_bin must be a string") + + # Parse range_meter sizes w/units + range_meter_bin = Q_(range_meter_bin) + max_range_meter = Q_("1km") + + # Do some checks on range meter inputs + if not range_meter_bin.is_compatible_with("meter"): + # This shouldn't be other units + raise ValueError( + f"Found incompatible units ({range_meter_bin.units}) " + "for 'range_meter_bin'. Must be in meters." + ) + elif range_meter_bin >= max_range_meter: + # Raise error if above 1km bin + raise ValueError(f"range_meter_bin must be less than {max_range_meter}.") + + # Convert back to int or float + range_meter_bin = range_meter_bin.to("meter").magnitude + # Clean up filenames dimension if it exists # not needed here if "filenames" in ds_Sv.dims: @@ -124,7 +154,7 @@ def compute_MVBS( raise ValueError(f"range_var '{range_var}' does not exist in the input dataset.") # create bin information for echo_range - # this computes the echo range max since there might be missing values + # this computes the echo range max since there might NaNs in the data echo_range_max = ds_Sv[range_var].max() range_interval = np.arange(0, echo_range_max + range_meter_bin, range_meter_bin) @@ -132,15 +162,11 @@ def compute_MVBS( d_index = ( ds_Sv["ping_time"] .resample(ping_time=ping_time_bin, skipna=True) - .asfreq() + .first() # Not actually being used, but needed to get the bin groups .indexes["ping_time"] ) - ping_interval = d_index.union([d_index[-1] + pd.Timedelta(ping_time_bin)]) + ping_interval = d_index.union([d_index[-1] + pd.Timedelta(ping_time_bin)]).values - # calculate the MVBS along each channel - if method == "map-reduce": - # set to the faster compute if not specified - flox_kwargs.setdefault("reindex", True) raw_MVBS = get_MVBS_along_channels( ds_Sv, range_interval, ping_interval, range_var=range_var, method=method, **flox_kwargs ) @@ -156,7 +182,9 @@ def compute_MVBS( }, ) - # Add the position variables + # "has_positions" attribute is inserted in get_MVBS_along_channels + # when the dataset has position information + # propagate this to the final MVBS dataset if raw_MVBS.attrs.get("has_positions", False): for var in POSITION_VARIABLES: ds_MVBS[var] = (["ping_time"], raw_MVBS[var].data, ds_Sv[var].attrs) diff --git a/echopype/commongrid/mvbs.py b/echopype/commongrid/mvbs.py index b7914ea0b..615b06e55 100644 --- a/echopype/commongrid/mvbs.py +++ b/echopype/commongrid/mvbs.py @@ -11,6 +11,7 @@ from flox.xarray import xarray_reduce from ..consolidate.api import POSITION_VARIABLES +from ..utils.compute import _lin2log, _log2lin def get_bin_indices( @@ -407,32 +408,9 @@ def bin_and_mean_2d( return final -def _linear_transform( - data: Union[dask.array.Array, np.ndarray], inverse: bool = False -) -> Union[dask.array.Array, np.ndarray]: - """ - Perform linear transform on data - - Parameters - ---------- - data : dask.array.Array or np.ndarray - The data to be transformed - inverse : bool - If True, perform inverse transform - - Returns - ------- - dask.array.Array or np.ndarray - The transformed data - """ - if inverse: - return 10 * np.log10(data) - return 10 ** (data / 10) - - def get_MVBS_along_channels( ds_Sv: xr.Dataset, - echo_range_interval: np.ndarray, + range_interval: np.ndarray, ping_interval: np.ndarray, range_var: Literal["echo_range", "depth"] = "echo_range", method: str = "map-reduce", @@ -447,12 +425,12 @@ def get_MVBS_along_channels( ds_Sv: xr.Dataset A Dataset containing ``Sv`` and ``echo_range`` data with coordinates ``channel``, ``ping_time``, and ``range_sample`` - echo_range_interval: np.ndarray - 1D array (used by np.digitize) representing - the binning required for ``echo_range`` + range_interval: np.ndarray + 1D array representing + the bins required for ``range_var`` ping_interval: np.ndarray - 1D array (used by np.digitize) representing - the binning required for ``ping_time`` + 1D array representing + the bins required for ``ping_time`` range_var: str The variable to use for range binning. Either ``echo_range`` or ``depth``. @@ -471,7 +449,7 @@ def get_MVBS_along_channels( """ # average should be done in linear domain - sv = ds_Sv["Sv"].pipe(_linear_transform) + sv = ds_Sv["Sv"].pipe(_log2lin) # Get positions if exists # otherwise just use an empty dataset @@ -495,12 +473,12 @@ def get_MVBS_along_channels( ds_Sv["ping_time"], ds_Sv[range_var], func="nanmean", - expected_groups=(None, ping_interval, echo_range_interval), + expected_groups=(None, ping_interval, range_interval), isbin=[False, True, True], method=method, **kwargs ) # apply inverse mapping to get back to the original domain and store values - da_MVBS = mvbs.pipe(_linear_transform, inverse=True) + da_MVBS = mvbs.pipe(_lin2log) return xr.merge([ds_Pos, da_MVBS]) diff --git a/echopype/tests/commongrid/test_api.py b/echopype/tests/commongrid/test_api.py index 2c5418050..084ca2bd1 100644 --- a/echopype/tests/commongrid/test_api.py +++ b/echopype/tests/commongrid/test_api.py @@ -43,6 +43,30 @@ def test_compute_MVBS_index_binning(ds_Sv_er_regular, regular_data_params): # Test all values in MVBS assert np.array_equal(ds_MVBS.Sv.data, expected.data) +@pytest.mark.unit +@pytest.mark.parametrize(["range_meter_bin", "ping_time_bin"], [ + (5, "10S"), + ("10m", 10), + ("10km", "10S"), + ("10", "10S") +]) +def test_compute_MVBS_bin_inputs_fail(ds_Sv_er_regular, range_meter_bin, ping_time_bin): + expected_error = ValueError + if isinstance(range_meter_bin, int) or isinstance(ping_time_bin, int): + expected_error = TypeError + match = r'\w+ must be a string' + elif "m" not in range_meter_bin: + match = r'Found incompatible units \(\w+\) *' + elif "km" in range_meter_bin: + match = "range_meter_bin must be less than 1 kilometer." + + with pytest.raises(expected_error, match=match): + ep.commongrid.compute_MVBS( + ds_Sv_er_regular, + range_meter_bin=range_meter_bin, + ping_time_bin=ping_time_bin + ) + @pytest.mark.unit def test_compute_MVBS_w_latlon(ds_Sv_er_regular_w_latlon, lat_attrs, lon_attrs): """Testing for compute_MVBS with latitude and longitude""" @@ -129,7 +153,7 @@ def test_compute_MVBS(test_data_samples): ("irregular"), ], ) -def test_compute_MVBS_er_output(request, er_type): +def test_compute_MVBS_range_output(request, er_type): """Tests the output of compute_MVBS on regular and irregular data.""" # set jitter=0 to get predictable number of ping within each echo_range groups if er_type == "regular": diff --git a/echopype/tests/convert/utils/test_ek.py b/echopype/tests/convert/utils/test_ek.py new file mode 100644 index 000000000..6589e5c6e --- /dev/null +++ b/echopype/tests/convert/utils/test_ek.py @@ -0,0 +1,14 @@ +import pytest +from echopype.convert.utils.ek import COMPLEX_VAR, _get_power_dims + +@pytest.fixture() +def dgram_zarr_vars(): + return {'power': ['timestamp', 'channel'], 'angle': ['timestamp', 'channel']} + +def test__get_power_dims(dgram_zarr_vars): + power_dims = _get_power_dims(dgram_zarr_vars) + assert isinstance(power_dims, list) + assert sorted(power_dims) == sorted(['timestamp', 'channel']) + +def test__extract_datagram_dfs(): + ... \ No newline at end of file diff --git a/echopype/utils/compute.py b/echopype/utils/compute.py new file mode 100644 index 000000000..936a1187d --- /dev/null +++ b/echopype/utils/compute.py @@ -0,0 +1,41 @@ +"""compute.py + +Module containing various helper functions +for performing computations within echopype. +""" +from typing import Union + +import dask.array +import numpy as np + + +def _log2lin(data: Union[dask.array.Array, np.ndarray]) -> Union[dask.array.Array, np.ndarray]: + """Perform log to linear transform on data + + Parameters + ---------- + data : dask.array.Array or np.ndarray + The data to be transformed + + Returns + ------- + dask.array.Array or np.ndarray + The transformed data + """ + return 10 ** (data / 10) + + +def _lin2log(data: Union[dask.array.Array, np.ndarray]) -> Union[dask.array.Array, np.ndarray]: + """Perform linear to log transform on data + + Parameters + ---------- + data : dask.array.Array or np.ndarray + The data to be transformed + + Returns + ------- + dask.array.Array or np.ndarray + The transformed data + """ + return 10 * np.log10(data) diff --git a/requirements.txt b/requirements.txt index 11fe3b2ec..1b13898d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ dask[array,distributed] jinja2 netCDF4>1.6 numpy +pint pynmea2 pytz scipy From 7d457d531a5ee2d5a1a8b5f7c2cf8bc2e18bd7f7 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Fri, 1 Sep 2023 15:50:46 -0700 Subject: [PATCH 22/43] test: Added check for position values --- echopype/tests/commongrid/test_mvbs.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/echopype/tests/commongrid/test_mvbs.py b/echopype/tests/commongrid/test_mvbs.py index b3bcec5eb..2484aac61 100644 --- a/echopype/tests/commongrid/test_mvbs.py +++ b/echopype/tests/commongrid/test_mvbs.py @@ -5,6 +5,8 @@ import pytest from typing import Tuple, Iterable, Union from echopype.commongrid.mvbs import bin_and_mean_2d, get_MVBS_along_channels +from echopype.consolidate.api import POSITION_VARIABLES +from flox.xarray import xarray_reduce def create_bins(csum_array: np.ndarray) -> Iterable: @@ -469,3 +471,19 @@ def test_get_MVBS_along_channels(request, range_var, lat_lon): assert raw_MVBS.attrs["has_positions"] is True else: assert raw_MVBS.attrs["has_positions"] is False + + if range_var == "echo_range": + assert all(v in raw_MVBS for v in POSITION_VARIABLES) + + # Compute xarray reduce manually for this + expected_Pos = xarray_reduce( + ds_Sv[POSITION_VARIABLES], + ds_Sv["ping_time"], + func="nanmean", + expected_groups=(ping_interval), + isbin=True, + method=method, + ) + + for v in POSITION_VARIABLES: + assert np.array_equal(raw_MVBS[v].data, expected_Pos[v].data) From 6a33efa45486044c9b38f3cfbf662a8bad5bd2c0 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Fri, 1 Sep 2023 15:54:08 -0700 Subject: [PATCH 23/43] test: Update range_meter_bin to strings --- echopype/tests/commongrid/test_api.py | 4 ++-- echopype/tests/utils/test_processinglevels_integration.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/echopype/tests/commongrid/test_api.py b/echopype/tests/commongrid/test_api.py index 084ca2bd1..1a64ab241 100644 --- a/echopype/tests/commongrid/test_api.py +++ b/echopype/tests/commongrid/test_api.py @@ -73,7 +73,7 @@ def test_compute_MVBS_w_latlon(ds_Sv_er_regular_w_latlon, lat_attrs, lon_attrs): from echopype.consolidate.api import POSITION_VARIABLES ds_MVBS = ep.commongrid.compute_MVBS( ds_Sv_er_regular_w_latlon, - range_meter_bin=5, + range_meter_bin="5m", ping_time_bin="10S" ) for var in POSITION_VARIABLES: @@ -161,7 +161,7 @@ def test_compute_MVBS_range_output(request, er_type): else: ds_Sv = request.getfixturevalue("ds_Sv_er_irregular") - ds_MVBS = ep.commongrid.compute_MVBS(ds_Sv, range_meter_bin=5, ping_time_bin="10S") + ds_MVBS = ep.commongrid.compute_MVBS(ds_Sv, range_meter_bin="5m", ping_time_bin="10S") if er_type == "regular": expected_len = ( diff --git a/echopype/tests/utils/test_processinglevels_integration.py b/echopype/tests/utils/test_processinglevels_integration.py index b04af8b1e..48a0a4296 100644 --- a/echopype/tests/utils/test_processinglevels_integration.py +++ b/echopype/tests/utils/test_processinglevels_integration.py @@ -129,5 +129,5 @@ def _freqdiff_applymask(test_ds): # No product level is assigned because at present compute_MVBS drops the lat/lon data # associated with the input Sv dataset # ds = ds.rename_vars(name_dict={"Sv": "Sv_unmasked", "Sv_ch0": "Sv"}) - mvbs_ds = ep.commongrid.compute_MVBS(ds, range_meter_bin=30, ping_time_bin='1min') + mvbs_ds = ep.commongrid.compute_MVBS(ds, range_meter_bin="30m", ping_time_bin='1min') _absence_test(mvbs_ds) From b64eca341cb27bdf9c24eb6a5d308b0ed4ea5194 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Fri, 1 Sep 2023 17:34:22 -0700 Subject: [PATCH 24/43] test: Added 'compute_MVBS' values test --- echopype/tests/commongrid/conftest.py | 154 ++++++++++++++++++++++---- echopype/tests/commongrid/test_api.py | 144 ++++++++++++++---------- 2 files changed, 214 insertions(+), 84 deletions(-) diff --git a/echopype/tests/commongrid/conftest.py b/echopype/tests/commongrid/conftest.py index 789513cd6..b120c485f 100644 --- a/echopype/tests/commongrid/conftest.py +++ b/echopype/tests/commongrid/conftest.py @@ -12,6 +12,104 @@ def random_number_generator(): """Random number generator for tests""" return np.random.default_rng() + +@pytest.fixture +def mock_parameters(): + """Small mock parameters""" + return { + "channel_len": 2, + "ping_time_len": 10, + "depth_len": 20, + "ping_time_interval": "0.3S", + } + + +@pytest.fixture +def mock_sv_sample(mock_parameters): + """ + Mock Sv sample + + Dimension: (2, 10, 20) + """ + channel_len = mock_parameters["channel_len"] + ping_time_len = mock_parameters["ping_time_len"] + depth_len = mock_parameters["depth_len"] + + depth_data = np.linspace(0, 1, num=depth_len) + return np.tile(depth_data, (channel_len, ping_time_len, 1)) + + +@pytest.fixture +def mock_sv_dataset_regular(mock_parameters, mock_sv_sample): + ds_Sv = _gen_Sv_er_regular(**mock_parameters, ping_time_jitter_max_ms=0) + ds_Sv["Sv"].data = mock_sv_sample + return ds_Sv + + +@pytest.fixture +def mock_sv_dataset_irregular(mock_parameters, mock_sv_sample): + depth_interval = [0.5, 0.32, np.nan] # Added nans + depth_ping_time_len = [2, 3, 5] + ds_Sv = _gen_Sv_er_irregular( + **mock_parameters, + depth_interval=depth_interval, + depth_ping_time_len=depth_ping_time_len, + ping_time_jitter_max_ms=30, # Added jitter to ping_time + ) + ds_Sv["Sv"].data = mock_sv_sample + return ds_Sv + + +@pytest.fixture +def mock_mvbs_array_regular(): + """ + Mock Sv sample result from compute_MVBS + + Dimension: (2, 3, 5) + Ping time bin: 1s + Range bin: 2m + """ + return np.array( + [ + [ + [0.13197759, 0.3425039, 0.55303022, 0.76355653, 0.94758103], + [0.13197759, 0.3425039, 0.55303022, 0.76355653, 0.94758103], + [0.13197759, 0.3425039, 0.55303022, 0.76355653, 0.94758103], + ], + [ + [0.13197759, 0.3425039, 0.55303022, 0.76355653, 0.94758103], + [0.13197759, 0.3425039, 0.55303022, 0.76355653, 0.94758103], + [0.13197759, 0.3425039, 0.55303022, 0.76355653, 0.94758103], + ], + ] + ) + + +@pytest.fixture +def mock_mvbs_array_irregular(): + """ + Mock Sv sample irregular result from compute_MVBS + + Dimension: (2, 3, 5) + Ping time bin: 1s + Range bin: 2m + """ + return np.array( + [ + [ + [0.16395346, 0.43825143, 0.71315706, 0.81188627, 0.94758103], + [0.18514066, 0.50093013, 0.81671961, 1.0, np.nan], + [np.nan, np.nan, np.nan, np.nan, np.nan], + ], + [ + [0.16395346, 0.43825143, 0.71315706, 0.81188627, 0.94758103], + [0.18514066, 0.50093013, 0.81671961, 1.0, np.nan], + [np.nan, np.nan, np.nan, np.nan, np.nan], + ], + ] + ) + + @pytest.fixture( params=[ ( @@ -24,19 +122,19 @@ def random_number_generator(): ("EK80_NEW", "echopype-test-D20211004-T235930.raw"), "EK80", None, - {'waveform_mode': 'BB', 'encode_mode': 'complex'}, + {"waveform_mode": "BB", "encode_mode": "complex"}, ), ( ("EK80_NEW", "D20211004-T233354.raw"), "EK80", None, - {'waveform_mode': 'CW', 'encode_mode': 'power'}, + {"waveform_mode": "CW", "encode_mode": "power"}, ), ( ("EK80_NEW", "D20211004-T233115.raw"), "EK80", None, - {'waveform_mode': 'CW', 'encode_mode': 'complex'}, + {"waveform_mode": "CW", "encode_mode": "complex"}, ), (("ES70", "D20151202-T020259.raw"), "ES70", None, {}), (("AZFP", "17082117.01A"), "AZFP", ("AZFP", "17041823.XML"), {}), @@ -64,7 +162,7 @@ def test_data_samples(request, test_path): azfp_xml_path, range_kwargs, ) = request.param - if sonar_model.lower() in ['es70', 'ad2cp']: + if sonar_model.lower() in ["es70", "ad2cp"]: pytest.xfail( reason="Not supported at the moment", ) @@ -80,7 +178,8 @@ def test_data_samples(request, test_path): azfp_xml_path, range_kwargs, ) - + + @pytest.fixture def regular_data_params(): return { @@ -97,39 +196,45 @@ def ds_Sv_er_regular(regular_data_params, random_number_generator): **regular_data_params, random_number_generator=random_number_generator, ) - + + @pytest.fixture def latlon_history_attr(): return ( "2023-08-31 12:00:00.000000 +00:00. " "Interpolated or propagated from Platform latitude/longitude." # noqa ) - + + @pytest.fixture def lat_attrs(latlon_history_attr): """Latitude attributes""" - return {'long_name': 'Platform latitude', - 'standard_name': 'latitude', - 'units': 'degrees_north', - 'valid_range': '(-90.0, 90.0)', - 'history': latlon_history_attr + return { + "long_name": "Platform latitude", + "standard_name": "latitude", + "units": "degrees_north", + "valid_range": "(-90.0, 90.0)", + "history": latlon_history_attr, } - + + @pytest.fixture def lon_attrs(latlon_history_attr): """Longitude attributes""" - return {'long_name': 'Platform longitude', - 'standard_name': 'longitude', - 'units': 'degrees_east', - 'valid_range': '(-180.0, 180.0)', - 'history': latlon_history_attr + return { + "long_name": "Platform longitude", + "standard_name": "longitude", + "units": "degrees_east", + "valid_range": "(-180.0, 180.0)", + "history": latlon_history_attr, } - + + @pytest.fixture def depth_offset(): """Depth offset for calculating depth""" return 2.5 - + @pytest.fixture def ds_Sv_er_regular_w_latlon(ds_Sv_er_regular, lat_attrs, lon_attrs): @@ -137,14 +242,15 @@ def ds_Sv_er_regular_w_latlon(ds_Sv_er_regular, lat_attrs, lon_attrs): n_pings = ds_Sv_er_regular.ping_time.shape[0] latitude = np.linspace(42, 43, num=n_pings) longitude = np.linspace(-124, -125, num=n_pings) - - ds_Sv_er_regular['latitude'] = (["ping_time"], latitude, lat_attrs) - ds_Sv_er_regular['longitude'] = (["ping_time"], longitude, lon_attrs) - + + ds_Sv_er_regular["latitude"] = (["ping_time"], latitude, lat_attrs) + ds_Sv_er_regular["longitude"] = (["ping_time"], longitude, lon_attrs) + # Need processing level code for compute MVBS to work! ds_Sv_er_regular.attrs["processing_level"] = "Level 2A" return ds_Sv_er_regular + @pytest.fixture def ds_Sv_er_regular_w_depth(ds_Sv_er_regular, depth_offset): """Sv dataset with depth""" diff --git a/echopype/tests/commongrid/test_api.py b/echopype/tests/commongrid/test_api.py index 1a64ab241..a051d149b 100644 --- a/echopype/tests/commongrid/test_api.py +++ b/echopype/tests/commongrid/test_api.py @@ -3,12 +3,14 @@ import numpy as np import pandas as pd + # NASC Tests @pytest.mark.integration @pytest.mark.skip(reason="NASC is not implemented yet") def test_compute_NASC(test_data_samples): pass + # MVBS Tests @pytest.mark.integration def test_compute_MVBS_index_binning(ds_Sv_er_regular, regular_data_params): @@ -35,80 +37,73 @@ def test_compute_MVBS_index_binning(ds_Sv_er_regular, regular_data_params): # average should be done in linear domain da_sv = 10 ** (ds_Sv_er_regular["Sv"] / 10) expected = 10 * np.log10( - da_sv - .coarsen(ping_time=ping_num, range_sample=range_sample_num, boundary="pad") - .mean(skipna=True) + da_sv.coarsen(ping_time=ping_num, range_sample=range_sample_num, boundary="pad").mean( + skipna=True + ) ) # Test all values in MVBS assert np.array_equal(ds_MVBS.Sv.data, expected.data) - + + @pytest.mark.unit -@pytest.mark.parametrize(["range_meter_bin", "ping_time_bin"], [ - (5, "10S"), - ("10m", 10), - ("10km", "10S"), - ("10", "10S") -]) +@pytest.mark.parametrize( + ["range_meter_bin", "ping_time_bin"], [(5, "10S"), ("10m", 10), ("10km", "10S"), ("10", "10S")] +) def test_compute_MVBS_bin_inputs_fail(ds_Sv_er_regular, range_meter_bin, ping_time_bin): expected_error = ValueError if isinstance(range_meter_bin, int) or isinstance(ping_time_bin, int): expected_error = TypeError - match = r'\w+ must be a string' + match = r"\w+ must be a string" elif "m" not in range_meter_bin: - match = r'Found incompatible units \(\w+\) *' + match = r"Found incompatible units \(\w+\) *" elif "km" in range_meter_bin: match = "range_meter_bin must be less than 1 kilometer." - + with pytest.raises(expected_error, match=match): ep.commongrid.compute_MVBS( - ds_Sv_er_regular, - range_meter_bin=range_meter_bin, - ping_time_bin=ping_time_bin + ds_Sv_er_regular, range_meter_bin=range_meter_bin, ping_time_bin=ping_time_bin ) - + + @pytest.mark.unit def test_compute_MVBS_w_latlon(ds_Sv_er_regular_w_latlon, lat_attrs, lon_attrs): """Testing for compute_MVBS with latitude and longitude""" from echopype.consolidate.api import POSITION_VARIABLES + ds_MVBS = ep.commongrid.compute_MVBS( - ds_Sv_er_regular_w_latlon, - range_meter_bin="5m", - ping_time_bin="10S" + ds_Sv_er_regular_w_latlon, range_meter_bin="5m", ping_time_bin="10S" ) for var in POSITION_VARIABLES: # Check to ensure variable is in dataset assert var in ds_MVBS.data_vars # Check for correct shape, which is just ping time assert ds_MVBS[var].shape == ds_MVBS.ping_time.shape - + # Check if attributes match if var == "latitude": assert ds_MVBS[var].attrs == lat_attrs elif var == "longitude": assert ds_MVBS[var].attrs == lon_attrs - - + + @pytest.mark.unit @pytest.mark.parametrize("range_var", ["my_range", "echo_range", "depth"]) def test_compute_MVBS_invalid_range_var(ds_Sv_er_regular, range_var): """Test compute MVBS range_var on mock data""" if range_var == "my_range": - with pytest.raises( - ValueError, - match="range_var must be one of 'echo_range' or 'depth'." - ): + with pytest.raises(ValueError, match="range_var must be one of 'echo_range' or 'depth'."): ep.commongrid.compute_MVBS(ds_Sv_er_regular, range_var=range_var) elif range_var == "depth": with pytest.raises( - ValueError, - match=f"range_var '{range_var}' does not exist in the input dataset." + ValueError, match=f"range_var '{range_var}' does not exist in the input dataset." ): ep.commongrid.compute_MVBS(ds_Sv_er_regular, range_var=range_var) else: pass + @pytest.mark.integration def test_compute_MVBS(test_data_samples): """ @@ -121,30 +116,28 @@ def test_compute_MVBS(test_data_samples): range_kwargs, ) = test_data_samples ed = ep.open_raw(filepath, sonar_model, azfp_xml_path) - if ed.sonar_model.lower() == 'azfp': - avg_temperature = ed["Environment"]['temperature'].values.mean() + if ed.sonar_model.lower() == "azfp": + avg_temperature = ed["Environment"]["temperature"].values.mean() env_params = { - 'temperature': avg_temperature, - 'salinity': 27.9, - 'pressure': 59, + "temperature": avg_temperature, + "salinity": 27.9, + "pressure": 59, } - range_kwargs['env_params'] = env_params - if 'azfp_cal_type' in range_kwargs: - range_kwargs.pop('azfp_cal_type') + range_kwargs["env_params"] = env_params + if "azfp_cal_type" in range_kwargs: + range_kwargs.pop("azfp_cal_type") Sv = ep.calibrate.compute_Sv(ed, **range_kwargs) ping_time_bin = "20S" ds_MVBS = ep.commongrid.compute_MVBS(Sv, ping_time_bin=ping_time_bin) assert ds_MVBS is not None - + # Test to see if ping_time was resampled correctly expected_ping_time = ( - Sv["ping_time"] - .resample(ping_time=ping_time_bin, skipna=True) - .asfreq() - .indexes["ping_time"] + Sv["ping_time"].resample(ping_time=ping_time_bin, skipna=True).asfreq().indexes["ping_time"] ) assert np.array_equal(ds_MVBS.ping_time.data, expected_ping_time.values) + @pytest.mark.integration @pytest.mark.parametrize( ("er_type"), @@ -160,29 +153,60 @@ def test_compute_MVBS_range_output(request, er_type): ds_Sv = request.getfixturevalue("ds_Sv_er_regular") else: ds_Sv = request.getfixturevalue("ds_Sv_er_irregular") - + ds_MVBS = ep.commongrid.compute_MVBS(ds_Sv, range_meter_bin="5m", ping_time_bin="10S") if er_type == "regular": expected_len = ( ds_Sv["channel"].size, # channel - np.ceil(np.diff(ds_Sv["ping_time"][[0, -1]].astype(int))/1e9 / 10), # ping_time - np.ceil(ds_Sv["echo_range"].max()/5), # depth + np.ceil(np.diff(ds_Sv["ping_time"][[0, -1]].astype(int)) / 1e9 / 10), # ping_time + np.ceil(ds_Sv["echo_range"].max() / 5), # depth ) assert ds_MVBS["Sv"].shape == expected_len else: - assert ( - ds_MVBS["Sv"] - .isel(ping_time=slice(None, 3)) - .dropna(dim="echo_range").shape - ) == (2, 3, 10) # full array, no NaN - assert ( - ds_MVBS["Sv"] - .isel(ping_time=slice(3, 12)) - .dropna(dim="echo_range").shape - ) == (2, 9, 7) # bottom bins contain NaN - assert ( - ds_MVBS["Sv"] - .isel(ping_time=slice(12, None)) - .dropna(dim="echo_range").shape - ) == (2, 6, 3) # bottom bins contain NaN + assert (ds_MVBS["Sv"].isel(ping_time=slice(None, 3)).dropna(dim="echo_range").shape) == ( + 2, + 3, + 10, + ) # full array, no NaN + assert (ds_MVBS["Sv"].isel(ping_time=slice(3, 12)).dropna(dim="echo_range").shape) == ( + 2, + 9, + 7, + ) # bottom bins contain NaN + assert (ds_MVBS["Sv"].isel(ping_time=slice(12, None)).dropna(dim="echo_range").shape) == ( + 2, + 6, + 3, + ) # bottom bins contain NaN + + +@pytest.mark.integration +@pytest.mark.parametrize( + ("er_type"), + [ + ("regular"), + ("irregular"), + ], +) +def test_compute_MVBS_values(request, er_type): + """Tests for the values of compute_MVBS on regular and irregular data.""" + range_meter_bin = "2m" + ping_time_bin = "1s" + + if er_type == "regular": + ds_Sv = request.getfixturevalue("mock_sv_dataset_regular") + expected_mvbs = request.getfixturevalue("mock_mvbs_array_regular") + else: + # Mock irregular dataset contains jitter + # and NaN values in the bottom echo_range + ds_Sv = request.getfixturevalue("mock_sv_dataset_irregular") + expected_mvbs = request.getfixturevalue("mock_mvbs_array_irregular") + + ds_MVBS = ep.commongrid.compute_MVBS( + ds_Sv, range_meter_bin=range_meter_bin, ping_time_bin=ping_time_bin + ) + + assert ds_MVBS.Sv.shape == expected_mvbs.shape + # Floating digits need to check with all close not equal + assert np.allclose(ds_MVBS.Sv.values, expected_mvbs, atol=1e-8, equal_nan=True) From f00c29105a1da8c62e5395df46f8b77e068d784d Mon Sep 17 00:00:00 2001 From: Emilio Mayorga Date: Fri, 1 Sep 2023 20:52:38 -0700 Subject: [PATCH 25/43] Update echopype/tests/utils/test_processinglevels_integration.py compute_MVBS now should preserve the processing level attributes. So, test for presence rather than absence --- echopype/tests/utils/test_processinglevels_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/tests/utils/test_processinglevels_integration.py b/echopype/tests/utils/test_processinglevels_integration.py index 48a0a4296..03ef59cfb 100644 --- a/echopype/tests/utils/test_processinglevels_integration.py +++ b/echopype/tests/utils/test_processinglevels_integration.py @@ -130,4 +130,4 @@ def _freqdiff_applymask(test_ds): # associated with the input Sv dataset # ds = ds.rename_vars(name_dict={"Sv": "Sv_unmasked", "Sv_ch0": "Sv"}) mvbs_ds = ep.commongrid.compute_MVBS(ds, range_meter_bin="30m", ping_time_bin='1min') - _absence_test(mvbs_ds) + _presence_test(mvbs_ds, "Level 3B") From 456fb501c438868b91f6667a38717d1d29d6fd2a Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Fri, 8 Sep 2023 10:32:30 -0700 Subject: [PATCH 26/43] test: Add 'nan' sprinkles Sprinkled 'nan' values all over 'echo_range' to ensure that computed values from 'compute_MVBS' doesn't take into account the 'nan'. Added check for the expected distribution of 'nan' in the resulting array. Ref: https://github.com/OSOceanAcoustics/echopype/pull/1124#issuecomment-1703643268 --- echopype/tests/commongrid/conftest.py | 60 ++++++++++++++++++++++++--- echopype/tests/commongrid/test_api.py | 47 +++++++++++++++++++++ 2 files changed, 102 insertions(+), 5 deletions(-) diff --git a/echopype/tests/commongrid/conftest.py b/echopype/tests/commongrid/conftest.py index b120c485f..ee88e8513 100644 --- a/echopype/tests/commongrid/conftest.py +++ b/echopype/tests/commongrid/conftest.py @@ -13,6 +13,53 @@ def random_number_generator(): return np.random.default_rng() +@pytest.fixture +def mock_nan_ilocs(): + """NaN i locations for irregular Sv dataset + + It's a list of tuples, each tuple contains + (channel, ping_time, range_sample) + + Notes + ----- + This was created with the following code: + + ``` + import numpy as np + + random_positions = [] + for i in range(20): + random_positions.append(( + np.random.randint(0, 2), + np.random.randint(0, 5), + np.random.randint(0, 20)) + ) + ``` + """ + return [ + (1, 1, 10), + (1, 0, 16), + (0, 3, 6), + (0, 2, 11), + (0, 2, 6), + (1, 1, 14), + (0, 1, 17), + (1, 4, 19), + (0, 3, 3), + (0, 0, 19), + (0, 1, 5), + (1, 2, 9), + (1, 4, 18), + (0, 1, 5), + (0, 4, 4), + (0, 1, 6), + (1, 2, 2), + (0, 1, 2), + (0, 4, 8), + (0, 1, 1), + ] + + @pytest.fixture def mock_parameters(): """Small mock parameters""" @@ -47,7 +94,7 @@ def mock_sv_dataset_regular(mock_parameters, mock_sv_sample): @pytest.fixture -def mock_sv_dataset_irregular(mock_parameters, mock_sv_sample): +def mock_sv_dataset_irregular(mock_parameters, mock_sv_sample, mock_nan_ilocs): depth_interval = [0.5, 0.32, np.nan] # Added nans depth_ping_time_len = [2, 3, 5] ds_Sv = _gen_Sv_er_irregular( @@ -57,6 +104,9 @@ def mock_sv_dataset_irregular(mock_parameters, mock_sv_sample): ping_time_jitter_max_ms=30, # Added jitter to ping_time ) ds_Sv["Sv"].data = mock_sv_sample + # Sprinkle nans around echo_range + for pos in mock_nan_ilocs: + ds_Sv["echo_range"][pos] = np.nan return ds_Sv @@ -97,13 +147,13 @@ def mock_mvbs_array_irregular(): return np.array( [ [ - [0.16395346, 0.43825143, 0.71315706, 0.81188627, 0.94758103], - [0.18514066, 0.50093013, 0.81671961, 1.0, np.nan], + [0.15495845, 0.44702859, 0.71315706, 0.81188627, 0.94752788], + [0.18004567, 0.51673084, 0.81671961, 1.0, np.nan], [np.nan, np.nan, np.nan, np.nan, np.nan], ], [ - [0.16395346, 0.43825143, 0.71315706, 0.81188627, 0.94758103], - [0.18514066, 0.50093013, 0.81671961, 1.0, np.nan], + [0.16702056, 0.43637851, 0.72277163, 0.81739217, 0.94758103], + [0.18514066, 0.50093013, 0.7901115, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan, np.nan], ], ] diff --git a/echopype/tests/commongrid/test_api.py b/echopype/tests/commongrid/test_api.py index a051d149b..2c03fecb2 100644 --- a/echopype/tests/commongrid/test_api.py +++ b/echopype/tests/commongrid/test_api.py @@ -191,6 +191,46 @@ def test_compute_MVBS_range_output(request, er_type): ) def test_compute_MVBS_values(request, er_type): """Tests for the values of compute_MVBS on regular and irregular data.""" + + def _parse_nans(mvbs, ds_Sv) -> np.ndarray: + """Go through and figure out nan values in result""" + echo_range_step = np.unique(np.diff(mvbs.Sv.echo_range.values))[0] + expected_outs = [] + # Loop over channels + for chan in mvbs.Sv.channel.values: + # Get ping times for this channel + ping_times = mvbs.Sv.ping_time.values + # Compute the total number of pings + ping_len = len(ping_times) + # Variable to store the expected output for this channel + chan_expected = [] + for idx in range(ping_len): + # Loop over pings and create slices + if idx < ping_len - 1: + ping_slice = slice(ping_times[idx], ping_times[idx + 1]) + else: + ping_slice = slice(ping_times[idx], None) + + # Get the original echo_range data for this channel and ping slice + da = ds_Sv.echo_range.sel(channel=chan, ping_time=ping_slice) + # Drop the nan values since this shouldn't be included in actual + # computation for compute_MVBS, a.k.a. 'nanmean' + mean_values = da.dropna(dim="ping_time", how="all").values + # Compute the histogram of the mean values to get distribution + hist, _ = np.histogram( + mean_values[~np.isnan(mean_values)], + bins=np.append( + mvbs.Sv.echo_range.values, + # Add one more bin to account for the last value + mvbs.Sv.echo_range.values.max() + echo_range_step + ), + ) + # Convert any non-zero values to False, and zero values to True + # to imitate having nan values since there's no value for that bin + chan_expected.append([False if v > 0 else True for v in hist]) + expected_outs.append(chan_expected) + return np.array(expected_outs) + range_meter_bin = "2m" ping_time_bin = "1s" @@ -206,7 +246,14 @@ def test_compute_MVBS_values(request, er_type): ds_MVBS = ep.commongrid.compute_MVBS( ds_Sv, range_meter_bin=range_meter_bin, ping_time_bin=ping_time_bin ) + + expected_outputs = _parse_nans(ds_MVBS, ds_Sv) assert ds_MVBS.Sv.shape == expected_mvbs.shape # Floating digits need to check with all close not equal + # Compare the values of the MVBS array with the expected values assert np.allclose(ds_MVBS.Sv.values, expected_mvbs, atol=1e-8, equal_nan=True) + + # Ensures that the computation of MVBS takes doesn't take into account NaN values + # that are sporadically placed in the echo_range values + assert np.array_equal(np.isnan(ds_MVBS.Sv.values), expected_outputs) From 85784323a2bcd6c42863cf0f27d72ffb2c26f8c7 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Fri, 8 Sep 2023 11:44:21 -0700 Subject: [PATCH 27/43] revert: Revert the use of 'pint' Removed dependency to 'pint' and use simple regex to ensure that 'range_bin' input is unit 'm'. Renamed 'range_meter_bin' argument to 'range_bin'. Ref: https://github.com/OSOceanAcoustics/echopype/pull/1124#issuecomment-1711629755 --- echopype/commongrid/api.py | 42 ++++++++----------- echopype/tests/commongrid/test_api.py | 22 +++++----- echopype/tests/commongrid/test_mvbs.py | 4 +- .../test_processinglevels_integration.py | 2 +- requirements.txt | 1 - 5 files changed, 30 insertions(+), 41 deletions(-) diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index 6a4ff1744..d9cd94e75 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -1,21 +1,17 @@ """ Functions for enhancing the spatial and temporal coherence of data. """ +import re from typing import Literal import numpy as np import pandas as pd -import pint import xarray as xr from ..consolidate.api import POSITION_VARIABLES from ..utils.prov import add_processing_level, echopype_prov_attrs, insert_input_processing_level from .mvbs import get_MVBS_along_channels -# Setup pint unit registry -ureg = pint.UnitRegistry() -Q_ = ureg.Quantity - def _set_var_attrs(da, long_name, units, round_digits, standard_name=None): """ @@ -74,7 +70,7 @@ def _set_MVBS_attrs(ds): def compute_MVBS( ds_Sv: xr.Dataset, range_var: Literal["echo_range", "depth"] = "echo_range", - range_meter_bin: str = "20m", + range_bin: str = "20m", ping_time_bin: str = "20S", method="map-reduce", **flox_kwargs, @@ -97,7 +93,7 @@ def compute_MVBS( Must be one of ``echo_range`` or ``depth``. Note that ``depth`` is only available if the input dataset contains ``depth`` as a data variable. - range_meter_bin : str + range_bin : str bin size along ``echo_range`` or ``depth`` in meters, default to ``20m`` ping_time_bin : str @@ -116,29 +112,25 @@ def compute_MVBS( """ # First check for bin types - if not isinstance(range_meter_bin, str): - raise TypeError("range_meter_bin must be a string") + if not isinstance(range_bin, str): + raise TypeError("range_bin must be a string") if not isinstance(ping_time_bin, str): raise TypeError("ping_time_bin must be a string") - # Parse range_meter sizes w/units - range_meter_bin = Q_(range_meter_bin) - max_range_meter = Q_("1km") + # normalize to lower case + # for range_bin + range_bin = range_bin.strip().lower() + # Only matches meters + match_obj = re.match(r"(\d+)(m)", range_bin) # Do some checks on range meter inputs - if not range_meter_bin.is_compatible_with("meter"): + if match_obj is None: # This shouldn't be other units - raise ValueError( - f"Found incompatible units ({range_meter_bin.units}) " - "for 'range_meter_bin'. Must be in meters." - ) - elif range_meter_bin >= max_range_meter: - # Raise error if above 1km bin - raise ValueError(f"range_meter_bin must be less than {max_range_meter}.") + raise ValueError("Found incompatible units. Must be in meters.") - # Convert back to int or float - range_meter_bin = range_meter_bin.to("meter").magnitude + # Convert back to float + range_bin = float(match_obj.group(1)) # Clean up filenames dimension if it exists # not needed here @@ -156,7 +148,7 @@ def compute_MVBS( # create bin information for echo_range # this computes the echo range max since there might NaNs in the data echo_range_max = ds_Sv[range_var].max() - range_interval = np.arange(0, echo_range_max + range_meter_bin, range_meter_bin) + range_interval = np.arange(0, echo_range_max + range_bin, range_bin) # create bin information needed for ping_time d_index = ( @@ -230,11 +222,11 @@ def compute_MVBS( "cell_methods": ( f"ping_time: mean (interval: {ping_time_bin_resvalue} {ping_time_bin_resunit_label} " # noqa "comment: ping_time is the interval start) " - f"{range_var}: mean (interval: {range_meter_bin} meter " + f"{range_var}: mean (interval: {range_bin} meter " f"comment: {range_var} is the interval start)" ), "binning_mode": "physical units", - "range_meter_interval": str(range_meter_bin) + "m", + "range_meter_interval": str(range_bin) + "m", "ping_time_interval": ping_time_bin, "actual_range": [ round(float(ds_MVBS["Sv"].min().values), 2), diff --git a/echopype/tests/commongrid/test_api.py b/echopype/tests/commongrid/test_api.py index 2c03fecb2..a223aea55 100644 --- a/echopype/tests/commongrid/test_api.py +++ b/echopype/tests/commongrid/test_api.py @@ -48,21 +48,19 @@ def test_compute_MVBS_index_binning(ds_Sv_er_regular, regular_data_params): @pytest.mark.unit @pytest.mark.parametrize( - ["range_meter_bin", "ping_time_bin"], [(5, "10S"), ("10m", 10), ("10km", "10S"), ("10", "10S")] + ["range_bin", "ping_time_bin"], [(5, "10S"), ("10m", 10), ("10km", "10S"), ("10", "10S")] ) -def test_compute_MVBS_bin_inputs_fail(ds_Sv_er_regular, range_meter_bin, ping_time_bin): +def test_compute_MVBS_bin_inputs_fail(ds_Sv_er_regular, range_bin, ping_time_bin): expected_error = ValueError - if isinstance(range_meter_bin, int) or isinstance(ping_time_bin, int): + if isinstance(range_bin, int) or isinstance(ping_time_bin, int): expected_error = TypeError match = r"\w+ must be a string" - elif "m" not in range_meter_bin: - match = r"Found incompatible units \(\w+\) *" - elif "km" in range_meter_bin: - match = "range_meter_bin must be less than 1 kilometer." + else: + match = r"Found incompatible units. Must be in meters." with pytest.raises(expected_error, match=match): ep.commongrid.compute_MVBS( - ds_Sv_er_regular, range_meter_bin=range_meter_bin, ping_time_bin=ping_time_bin + ds_Sv_er_regular, range_bin=range_bin, ping_time_bin=ping_time_bin ) @@ -72,7 +70,7 @@ def test_compute_MVBS_w_latlon(ds_Sv_er_regular_w_latlon, lat_attrs, lon_attrs): from echopype.consolidate.api import POSITION_VARIABLES ds_MVBS = ep.commongrid.compute_MVBS( - ds_Sv_er_regular_w_latlon, range_meter_bin="5m", ping_time_bin="10S" + ds_Sv_er_regular_w_latlon, range_bin="5m", ping_time_bin="10S" ) for var in POSITION_VARIABLES: # Check to ensure variable is in dataset @@ -154,7 +152,7 @@ def test_compute_MVBS_range_output(request, er_type): else: ds_Sv = request.getfixturevalue("ds_Sv_er_irregular") - ds_MVBS = ep.commongrid.compute_MVBS(ds_Sv, range_meter_bin="5m", ping_time_bin="10S") + ds_MVBS = ep.commongrid.compute_MVBS(ds_Sv, range_bin="5m", ping_time_bin="10S") if er_type == "regular": expected_len = ( @@ -231,7 +229,7 @@ def _parse_nans(mvbs, ds_Sv) -> np.ndarray: expected_outs.append(chan_expected) return np.array(expected_outs) - range_meter_bin = "2m" + range_bin = "2m" ping_time_bin = "1s" if er_type == "regular": @@ -244,7 +242,7 @@ def _parse_nans(mvbs, ds_Sv) -> np.ndarray: expected_mvbs = request.getfixturevalue("mock_mvbs_array_irregular") ds_MVBS = ep.commongrid.compute_MVBS( - ds_Sv, range_meter_bin=range_meter_bin, ping_time_bin=ping_time_bin + ds_Sv, range_bin=range_bin, ping_time_bin=ping_time_bin ) expected_outputs = _parse_nans(ds_MVBS, ds_Sv) diff --git a/echopype/tests/commongrid/test_mvbs.py b/echopype/tests/commongrid/test_mvbs.py index 2484aac61..ade292d26 100644 --- a/echopype/tests/commongrid/test_mvbs.py +++ b/echopype/tests/commongrid/test_mvbs.py @@ -429,7 +429,7 @@ def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: @pytest.mark.parametrize(["range_var", "lat_lon"], [("depth", False), ("echo_range", True)]) def test_get_MVBS_along_channels(request, range_var, lat_lon): """Testing the underlying function of compute_MVBS""" - range_meter_bin = 20 + range_bin = 20 ping_time_bin = "20S" method = "map-reduce" @@ -447,7 +447,7 @@ def test_get_MVBS_along_channels(request, range_var, lat_lon): # compute range interval echo_range_max = ds_Sv[range_var].max() - range_interval = np.arange(0, echo_range_max + range_meter_bin, range_meter_bin) + range_interval = np.arange(0, echo_range_max + range_bin, range_bin) # create bin information needed for ping_time d_index = ( diff --git a/echopype/tests/utils/test_processinglevels_integration.py b/echopype/tests/utils/test_processinglevels_integration.py index 03ef59cfb..d3fa979b5 100644 --- a/echopype/tests/utils/test_processinglevels_integration.py +++ b/echopype/tests/utils/test_processinglevels_integration.py @@ -129,5 +129,5 @@ def _freqdiff_applymask(test_ds): # No product level is assigned because at present compute_MVBS drops the lat/lon data # associated with the input Sv dataset # ds = ds.rename_vars(name_dict={"Sv": "Sv_unmasked", "Sv_ch0": "Sv"}) - mvbs_ds = ep.commongrid.compute_MVBS(ds, range_meter_bin="30m", ping_time_bin='1min') + mvbs_ds = ep.commongrid.compute_MVBS(ds, range_bin="30m", ping_time_bin='1min') _presence_test(mvbs_ds, "Level 3B") diff --git a/requirements.txt b/requirements.txt index 1b13898d7..11fe3b2ec 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,6 @@ dask[array,distributed] jinja2 netCDF4>1.6 numpy -pint pynmea2 pytz scipy From 31032afebce726b4ea13d326d66ad9c7fdd0c397 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Fri, 8 Sep 2023 11:53:30 -0700 Subject: [PATCH 28/43] feat: Allow 'range_bin' to have space --- echopype/commongrid/api.py | 2 +- echopype/tests/commongrid/test_api.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index d9cd94e75..876bb0f87 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -122,7 +122,7 @@ def compute_MVBS( # for range_bin range_bin = range_bin.strip().lower() # Only matches meters - match_obj = re.match(r"(\d+)(m)", range_bin) + match_obj = re.match(r"(\d+)(\s+)?(m)", range_bin) # Do some checks on range meter inputs if match_obj is None: diff --git a/echopype/tests/commongrid/test_api.py b/echopype/tests/commongrid/test_api.py index a223aea55..9d202f2b3 100644 --- a/echopype/tests/commongrid/test_api.py +++ b/echopype/tests/commongrid/test_api.py @@ -1,7 +1,6 @@ import pytest import echopype as ep import numpy as np -import pandas as pd # NASC Tests From d8dd4f8c958bfead0f2bde95bbf24ba000bb382c Mon Sep 17 00:00:00 2001 From: Don Setiawan Date: Fri, 8 Sep 2023 16:41:02 -0700 Subject: [PATCH 29/43] fix: Apply suggestions from code review Applied fix for regex not capturing decimal values by @emiliom Ref: https://github.com/OSOceanAcoustics/echopype/pull/1124/files#r1320422121 Co-authored-by: Emilio Mayorga --- echopype/commongrid/api.py | 7 ++++--- echopype/tests/utils/test_processinglevels_integration.py | 2 -- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index 876bb0f87..9bc7c5c0a 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -99,7 +99,8 @@ def compute_MVBS( ping_time_bin : str bin size along ``ping_time``, default to ``20S`` method: str - The flox strategy for reduction of dask arrays only. + The flox strategy for reduction of dask arrays only, + default to ``map-reduce`` See flox `documentation `_ for more details. **kwargs @@ -122,12 +123,12 @@ def compute_MVBS( # for range_bin range_bin = range_bin.strip().lower() # Only matches meters - match_obj = re.match(r"(\d+)(\s+)?(m)", range_bin) + match_obj = re.match(r"([\d+]*[.,]{0,1}[\d+]*)(\s+)?(m)", range_bin) # Do some checks on range meter inputs if match_obj is None: # This shouldn't be other units - raise ValueError("Found incompatible units. Must be in meters.") + raise ValueError("range_bin must be in meters (e.g., '10m').") # Convert back to float range_bin = float(match_obj.group(1)) diff --git a/echopype/tests/utils/test_processinglevels_integration.py b/echopype/tests/utils/test_processinglevels_integration.py index d3fa979b5..e1525bd43 100644 --- a/echopype/tests/utils/test_processinglevels_integration.py +++ b/echopype/tests/utils/test_processinglevels_integration.py @@ -126,8 +126,6 @@ def _freqdiff_applymask(test_ds): # ---- Compute MVBS # compute_MVBS expects a variable named "Sv" - # No product level is assigned because at present compute_MVBS drops the lat/lon data - # associated with the input Sv dataset # ds = ds.rename_vars(name_dict={"Sv": "Sv_unmasked", "Sv_ch0": "Sv"}) mvbs_ds = ep.commongrid.compute_MVBS(ds, range_bin="30m", ping_time_bin='1min') _presence_test(mvbs_ds, "Level 3B") From 5e5e19e22006869c98d638497d09521ef74d85e5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 Sep 2023 23:41:13 +0000 Subject: [PATCH 30/43] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopype/commongrid/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index 9bc7c5c0a..74f75bc87 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -99,7 +99,7 @@ def compute_MVBS( ping_time_bin : str bin size along ``ping_time``, default to ``20S`` method: str - The flox strategy for reduction of dask arrays only, + The flox strategy for reduction of dask arrays only, default to ``map-reduce`` See flox `documentation `_ for more details. From 343b31bc8ab4b6de1597655f24cffd751c00b959 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Fri, 8 Sep 2023 16:49:26 -0700 Subject: [PATCH 31/43] test: Fix test text for wrong unit --- echopype/tests/commongrid/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/tests/commongrid/test_api.py b/echopype/tests/commongrid/test_api.py index 9d202f2b3..2bb98c8cf 100644 --- a/echopype/tests/commongrid/test_api.py +++ b/echopype/tests/commongrid/test_api.py @@ -55,7 +55,7 @@ def test_compute_MVBS_bin_inputs_fail(ds_Sv_er_regular, range_bin, ping_time_bin expected_error = TypeError match = r"\w+ must be a string" else: - match = r"Found incompatible units. Must be in meters." + match = "range_bin must be in meters (e.g., '10m')." with pytest.raises(expected_error, match=match): ep.commongrid.compute_MVBS( From ab89fba6119b5ef7f699d6cf932ab49f60f82de4 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Fri, 8 Sep 2023 16:58:10 -0700 Subject: [PATCH 32/43] test: Remove the 'e.g.' part on pytest Removed the part with '(e.g., '10m')' since it's messing up pytests regex matching. --- echopype/tests/commongrid/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/tests/commongrid/test_api.py b/echopype/tests/commongrid/test_api.py index 2bb98c8cf..ec6617637 100644 --- a/echopype/tests/commongrid/test_api.py +++ b/echopype/tests/commongrid/test_api.py @@ -55,7 +55,7 @@ def test_compute_MVBS_bin_inputs_fail(ds_Sv_er_regular, range_bin, ping_time_bin expected_error = TypeError match = r"\w+ must be a string" else: - match = "range_bin must be in meters (e.g., '10m')." + match = r"range_bin must be in meters" with pytest.raises(expected_error, match=match): ep.commongrid.compute_MVBS( From d510d253bd998e801fd53c901e7f55c2ac58b928 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Wed, 20 Sep 2023 14:34:50 -0700 Subject: [PATCH 33/43] test: Remove remnant for test_ek.py --- echopype/tests/convert/utils/test_ek.py | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 echopype/tests/convert/utils/test_ek.py diff --git a/echopype/tests/convert/utils/test_ek.py b/echopype/tests/convert/utils/test_ek.py deleted file mode 100644 index 6589e5c6e..000000000 --- a/echopype/tests/convert/utils/test_ek.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest -from echopype.convert.utils.ek import COMPLEX_VAR, _get_power_dims - -@pytest.fixture() -def dgram_zarr_vars(): - return {'power': ['timestamp', 'channel'], 'angle': ['timestamp', 'channel']} - -def test__get_power_dims(dgram_zarr_vars): - power_dims = _get_power_dims(dgram_zarr_vars) - assert isinstance(power_dims, list) - assert sorted(power_dims) == sorted(['timestamp', 'channel']) - -def test__extract_datagram_dfs(): - ... \ No newline at end of file From 8aa920162711b94d43f8c677c9de4b9b4bcda460 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Wed, 20 Sep 2023 14:37:45 -0700 Subject: [PATCH 34/43] refactor: Extract range_bin parsing and add close arg Extracts out the 'range_bin' string to float into a private function. Additionally now there's a fine tune argument for bin close edges so user can specify either close is 'left' or 'right'. Bins are converted to pandas interval index before passing into 'get_MVBS_along_channels'. --- echopype/commongrid/api.py | 98 +++++++++++++++++++++++++++++++------- 1 file changed, 80 insertions(+), 18 deletions(-) diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index 74f75bc87..0dabc779a 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -66,6 +66,69 @@ def _set_MVBS_attrs(ds): ) +def _convert_bins_to_interval_index( + bins: list, closed: Literal["left", "right"] = "left" +) -> pd.IntervalIndex: + """ + Convert bins to sorted pandas IntervalIndex + with specified closed end + + Parameters + ---------- + bins : list + The bin edges + closed : {'left', 'right'}, default 'left' + Which side of bin interval is closed + + Returns + ------- + pd.IntervalIndex + The resulting IntervalIndex + """ + return pd.IntervalIndex.from_breaks(bins, closed=closed).sort_values() + + +def _parse_range_bin(range_bin: str) -> float: + """ + Parses range bin string, check unit, + and returns range bin in meters. + + Parameters + ---------- + range_bin : str + Range bin string, e.g., "10m" + + Returns + ------- + float + The resulting range bin value in meters. + + Raises + ------ + ValueError + If the range bin string doesn't include 'm' for meters. + TypeError + If the range bin is not a type string. + """ + # First check for bin types + if not isinstance(range_bin, str): + raise TypeError("range_bin must be a string") + # normalize to lower case + # for range_bin + range_bin = range_bin.strip().lower() + # Only matches meters + match_obj = re.match(r"([\d+]*[.,]{0,1}[\d+]*)(\s+)?(m)", range_bin) + + # Do some checks on range meter inputs + if match_obj is None: + # This shouldn't be other units + raise ValueError("range_bin must be in meters (e.g., '10m').") + + # Convert back to float + range_bin = float(match_obj.group(1)) + return range_bin + + @add_processing_level("L3*") def compute_MVBS( ds_Sv: xr.Dataset, @@ -73,6 +136,7 @@ def compute_MVBS( range_bin: str = "20m", ping_time_bin: str = "20S", method="map-reduce", + closed: Literal["left", "right"] = "left", **flox_kwargs, ): """ @@ -103,6 +167,8 @@ def compute_MVBS( default to ``map-reduce`` See flox `documentation `_ for more details. + closed: {'left', 'right'}, default 'left' + Which side of bin interval is closed. **kwargs Additional keyword arguments to be passed to flox reduction function. @@ -112,26 +178,10 @@ def compute_MVBS( A dataset containing bin-averaged Sv """ - # First check for bin types - if not isinstance(range_bin, str): - raise TypeError("range_bin must be a string") - if not isinstance(ping_time_bin, str): raise TypeError("ping_time_bin must be a string") - # normalize to lower case - # for range_bin - range_bin = range_bin.strip().lower() - # Only matches meters - match_obj = re.match(r"([\d+]*[.,]{0,1}[\d+]*)(\s+)?(m)", range_bin) - - # Do some checks on range meter inputs - if match_obj is None: - # This shouldn't be other units - raise ValueError("range_bin must be in meters (e.g., '10m').") - - # Convert back to float - range_bin = float(match_obj.group(1)) + range_bin = _parse_range_bin(range_bin) # Clean up filenames dimension if it exists # not needed here @@ -146,6 +196,10 @@ def compute_MVBS( if range_var not in ds_Sv.data_vars: raise ValueError(f"range_var '{range_var}' does not exist in the input dataset.") + # Check for closed values + if closed not in ["right", "left"]: + raise ValueError(f"{closed} is not a valid option. Options are 'left' or 'right'.") + # create bin information for echo_range # this computes the echo range max since there might NaNs in the data echo_range_max = ds_Sv[range_var].max() @@ -160,8 +214,16 @@ def compute_MVBS( ) ping_interval = d_index.union([d_index[-1] + pd.Timedelta(ping_time_bin)]).values + # Set interval index for groups + ping_interval = _convert_bins_to_interval_index(ping_interval, closed=closed) + range_interval = _convert_bins_to_interval_index(range_interval, closed=closed) raw_MVBS = get_MVBS_along_channels( - ds_Sv, range_interval, ping_interval, range_var=range_var, method=method, **flox_kwargs + ds_Sv, + range_interval, + ping_interval, + range_var=range_var, + method=method, + **flox_kwargs, ) # create MVBS dataset From 73fb4461979703f9ccac2606aea28c32d8c65816 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Wed, 20 Sep 2023 14:42:09 -0700 Subject: [PATCH 35/43] refactor: Update arg types to include interval index Added argument type options for 'range_interval' and 'ping_interval' to also be interval index. --- echopype/commongrid/mvbs.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/echopype/commongrid/mvbs.py b/echopype/commongrid/mvbs.py index 615b06e55..57890b100 100644 --- a/echopype/commongrid/mvbs.py +++ b/echopype/commongrid/mvbs.py @@ -7,6 +7,7 @@ import dask.array import numpy as np +import pandas as pd import xarray as xr from flox.xarray import xarray_reduce @@ -410,8 +411,8 @@ def bin_and_mean_2d( def get_MVBS_along_channels( ds_Sv: xr.Dataset, - range_interval: np.ndarray, - ping_interval: np.ndarray, + range_interval: Union[pd.IntervalIndex, np.ndarray], + ping_interval: Union[pd.IntervalIndex, np.ndarray], range_var: Literal["echo_range", "depth"] = "echo_range", method: str = "map-reduce", **kwargs @@ -425,11 +426,11 @@ def get_MVBS_along_channels( ds_Sv: xr.Dataset A Dataset containing ``Sv`` and ``echo_range`` data with coordinates ``channel``, ``ping_time``, and ``range_sample`` - range_interval: np.ndarray - 1D array representing + range_interval: pd.IntervalIndex or np.ndarray + 1D array or interval index representing the bins required for ``range_var`` - ping_interval: np.ndarray - 1D array representing + ping_interval: pd.IntervalIndex or np.ndarray + 1D array or interval index representing the bins required for ``ping_time`` range_var: str The variable to use for range binning. From 744228d4d87ba89c2c8017854163967461546d4d Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Wed, 20 Sep 2023 14:44:04 -0700 Subject: [PATCH 36/43] test: Update tests to have brute force creation Changed mock mvbs to be created by doing brute force rather than hard coding. --- echopype/tests/commongrid/conftest.py | 98 ++++++++++++++++++-------- echopype/tests/commongrid/test_api.py | 37 ++++++++-- echopype/tests/commongrid/test_mvbs.py | 68 +----------------- 3 files changed, 102 insertions(+), 101 deletions(-) diff --git a/echopype/tests/commongrid/conftest.py b/echopype/tests/commongrid/conftest.py index ee88e8513..1705a32e3 100644 --- a/echopype/tests/commongrid/conftest.py +++ b/echopype/tests/commongrid/conftest.py @@ -5,6 +5,7 @@ import pandas as pd from echopype.consolidate import add_depth +import echopype as ep @pytest.fixture @@ -111,7 +112,12 @@ def mock_sv_dataset_irregular(mock_parameters, mock_sv_sample, mock_nan_ilocs): @pytest.fixture -def mock_mvbs_array_regular(): +def mock_mvbs_inputs(): + return dict(range_meter_bin=2, ping_time_bin="1s") + + +@pytest.fixture +def mock_mvbs_array_regular(mock_sv_dataset_regular, mock_mvbs_inputs, mock_parameters): """ Mock Sv sample result from compute_MVBS @@ -119,24 +125,19 @@ def mock_mvbs_array_regular(): Ping time bin: 1s Range bin: 2m """ - return np.array( - [ - [ - [0.13197759, 0.3425039, 0.55303022, 0.76355653, 0.94758103], - [0.13197759, 0.3425039, 0.55303022, 0.76355653, 0.94758103], - [0.13197759, 0.3425039, 0.55303022, 0.76355653, 0.94758103], - ], - [ - [0.13197759, 0.3425039, 0.55303022, 0.76355653, 0.94758103], - [0.13197759, 0.3425039, 0.55303022, 0.76355653, 0.94758103], - [0.13197759, 0.3425039, 0.55303022, 0.76355653, 0.94758103], - ], - ] + ds_Sv = mock_sv_dataset_regular + ping_time_bin = mock_mvbs_inputs["ping_time_bin"] + range_bin = mock_mvbs_inputs["range_meter_bin"] + channel_len = mock_parameters["channel_len"] + expected_mvbs_val = _get_expected_mvbs_val( + ds_Sv, ping_time_bin, range_bin, channel_len ) + return expected_mvbs_val + @pytest.fixture -def mock_mvbs_array_irregular(): +def mock_mvbs_array_irregular(mock_sv_dataset_irregular, mock_mvbs_inputs, mock_parameters): """ Mock Sv sample irregular result from compute_MVBS @@ -144,21 +145,16 @@ def mock_mvbs_array_irregular(): Ping time bin: 1s Range bin: 2m """ - return np.array( - [ - [ - [0.15495845, 0.44702859, 0.71315706, 0.81188627, 0.94752788], - [0.18004567, 0.51673084, 0.81671961, 1.0, np.nan], - [np.nan, np.nan, np.nan, np.nan, np.nan], - ], - [ - [0.16702056, 0.43637851, 0.72277163, 0.81739217, 0.94758103], - [0.18514066, 0.50093013, 0.7901115, np.nan, np.nan], - [np.nan, np.nan, np.nan, np.nan, np.nan], - ], - ] + ds_Sv = mock_sv_dataset_irregular + ping_time_bin = mock_mvbs_inputs["ping_time_bin"] + range_bin = mock_mvbs_inputs["range_meter_bin"] + channel_len = mock_parameters["channel_len"] + expected_mvbs_val = _get_expected_mvbs_val( + ds_Sv, ping_time_bin, range_bin, channel_len ) + return expected_mvbs_val + @pytest.fixture( params=[ @@ -324,6 +320,52 @@ def ds_Sv_er_irregular(random_number_generator): # Helper functions to generate mock Sv dataset +def _get_expected_mvbs_val(ds_Sv, ping_time_bin, range_bin, channel_len=2): + """Helper function to brute force compute_MVBS""" + # create bin information needed for ping_time + d_index = ( + ds_Sv["ping_time"] + .resample(ping_time=ping_time_bin, skipna=True) + .first() # Not actually being used, but needed to get the bin groups + .indexes["ping_time"] + ) + ping_interval = d_index.union([d_index[-1] + pd.Timedelta(ping_time_bin)]).values + + # create bin information for echo_range + # this computes the echo range max since there might NaNs in the data + echo_range_max = ds_Sv["echo_range"].max() + range_interval = np.arange(0, echo_range_max + 2, range_bin) + + sv = ds_Sv["Sv"].pipe(ep.utils.compute._log2lin) + + expected_mvbs_val = np.ones((2, len(ping_interval) - 1, len(range_interval) - 1)) * np.nan + + for ch_idx in range(channel_len): + for p_idx in range(len(ping_interval) - 1): + for r_idx in range(len(range_interval) - 1): + echo_range = ( + ds_Sv["echo_range"] + .isel(channel=ch_idx) + .sel(ping_time=slice(ping_interval[p_idx], ping_interval[p_idx + 1])) + .max(dim="ping_time", skipna=True) + .data + ) + cur_sv = sv.assign_coords({"range_sample": echo_range}) + r_idx_active = np.logical_and( + echo_range.data >= range_interval[r_idx], + echo_range.data < range_interval[r_idx + 1], + ) + sv_tmp = cur_sv.isel(channel=ch_idx).sel( + ping_time=slice(ping_interval[p_idx], ping_interval[p_idx + 1]), + range_sample=r_idx_active, + ) + if 0 in sv_tmp.shape: + expected_mvbs_val[ch_idx, p_idx, r_idx] = np.nan + else: + expected_mvbs_val[ch_idx, p_idx, r_idx] = np.mean(sv_tmp) + return ep.utils.compute._lin2log(expected_mvbs_val) + + def _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms=0): ping_time = pd.date_range("2018-07-01", periods=ping_time_len, freq=ping_time_interval) if ping_time_jitter_max_ms != 0: # if to add jitter diff --git a/echopype/tests/commongrid/test_api.py b/echopype/tests/commongrid/test_api.py index ec6617637..ef1165d0a 100644 --- a/echopype/tests/commongrid/test_api.py +++ b/echopype/tests/commongrid/test_api.py @@ -3,6 +3,31 @@ import numpy as np +# Utilties Tests +@pytest.mark.parametrize( + ["range_bin", "expected_result"], + [ + (5, TypeError), + ("0.2m", 0.2), + ("10m", 10.0), + ("10km", ValueError), + ("10", ValueError) + ], +) +def test__parse_range_bin(range_bin, expected_result): + expected_error_msg = r"range_bin must be in meters" + if isinstance(range_bin, int): + expected_error_msg = r"range_bin must be a string" + elif range_bin in ["10km", "10"]: + expected_error_msg = r"range_bin must be in meters" + + if not isinstance(expected_result, float): + with pytest.raises(expected_result, match=expected_error_msg): + ep.commongrid.api._parse_range_bin(range_bin) + else: + assert ep.commongrid.api._parse_range_bin(range_bin) == expected_result + + # NASC Tests @pytest.mark.integration @pytest.mark.skip(reason="NASC is not implemented yet") @@ -219,7 +244,7 @@ def _parse_nans(mvbs, ds_Sv) -> np.ndarray: bins=np.append( mvbs.Sv.echo_range.values, # Add one more bin to account for the last value - mvbs.Sv.echo_range.values.max() + echo_range_step + mvbs.Sv.echo_range.values.max() + echo_range_step, ), ) # Convert any non-zero values to False, and zero values to True @@ -240,17 +265,15 @@ def _parse_nans(mvbs, ds_Sv) -> np.ndarray: ds_Sv = request.getfixturevalue("mock_sv_dataset_irregular") expected_mvbs = request.getfixturevalue("mock_mvbs_array_irregular") - ds_MVBS = ep.commongrid.compute_MVBS( - ds_Sv, range_bin=range_bin, ping_time_bin=ping_time_bin - ) - + ds_MVBS = ep.commongrid.compute_MVBS(ds_Sv, range_bin=range_bin, ping_time_bin=ping_time_bin) + expected_outputs = _parse_nans(ds_MVBS, ds_Sv) assert ds_MVBS.Sv.shape == expected_mvbs.shape # Floating digits need to check with all close not equal # Compare the values of the MVBS array with the expected values - assert np.allclose(ds_MVBS.Sv.values, expected_mvbs, atol=1e-8, equal_nan=True) - + assert np.allclose(ds_MVBS.Sv.values, expected_mvbs, atol=1e-10, rtol=1e-10, equal_nan=True) + # Ensures that the computation of MVBS takes doesn't take into account NaN values # that are sporadically placed in the echo_range values assert np.array_equal(np.isnan(ds_MVBS.Sv.values), expected_outputs) diff --git a/echopype/tests/commongrid/test_mvbs.py b/echopype/tests/commongrid/test_mvbs.py index ade292d26..143152da4 100644 --- a/echopype/tests/commongrid/test_mvbs.py +++ b/echopype/tests/commongrid/test_mvbs.py @@ -1,10 +1,9 @@ import dask.array import numpy as np import pandas as pd -from numpy.random import default_rng import pytest from typing import Tuple, Iterable, Union -from echopype.commongrid.mvbs import bin_and_mean_2d, get_MVBS_along_channels +from echopype.commongrid.mvbs import get_MVBS_along_channels from echopype.consolidate.api import POSITION_VARIABLES from flox.xarray import xarray_reduce @@ -363,70 +362,7 @@ def bin_and_mean_2d_params(request): return list(request.param.values()) @pytest.mark.unit -def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: - """ - Tests the function ``bin_and_mean_2d``, which is the core - method for ``compute_MVBS``. This is done by creating mock - data (which can have varying number of ``echo_range`` values - for each ``ping_time``) with known means. - - Parameters - ---------- - create_dask: bool - If True the ``Sv`` and ``echo_range`` values produced will be - dask arrays, else they will be numpy arrays. - final_num_ping_bins: int - The total number of ping time bins - final_num_er_bins: int - The total number of echo range bins - ping_range: list - A list whose first element is the lowest and second element is - the highest possible number of ping time values in a given bin - er_range: list - A list whose first element is the lowest and second element is - the highest possible number of echo range values in a given bin - """ - - # get all parameters needed to create the mock data - create_dask, final_num_ping_bins, final_num_er_bins, ping_range, er_range = bin_and_mean_2d_params - - # randomly generate a seed - seed = np.random.randint(low=10, high=100000, size=1)[0] - - print(f"seed used to generate mock data: {seed}") - - # establish generator for random integers - rng = default_rng(seed=seed) - - # seed dask random generator - if create_dask: - dask.array.random.seed(seed=seed) - - # create echo_range, ping_time, and Sv arrays where the MVBS is known - known_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time = create_known_mean_data(final_num_ping_bins, - final_num_er_bins, - ping_range, er_range, - create_dask, - rng) - - # put the created ping bins into a form that works with bin_and_mean_2d - digitize_ping_bin = np.array([*ping_bins[0]] + [bin_val[1] for bin_val in ping_bins[1:-1]]) - digitize_ping_bin = digitize_ping_bin.astype('datetime64[ns]') - - # put the created echo range bins into a form that works with bin_and_mean_2d - digitize_er_bin = np.array([*er_bins[0]] + [bin_val[1] for bin_val in er_bins[1:]]) - - # calculate MVBS for mock data set - calc_MVBS = bin_and_mean_2d(arr=final_sv, bins_time=digitize_ping_bin, - bins_er=digitize_er_bin, times=final_ping_time, - echo_range=final_er, comprehensive_er_check=True) - - # compare known MVBS solution against its calculated counterpart - assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10, equal_nan=True) - - -@pytest.mark.unit -@pytest.mark.parametrize(["range_var", "lat_lon"], [("depth", False), ("echo_range", True)]) +@pytest.mark.parametrize(["range_var", "lat_lon"], [("depth", False), ("echo_range", True), ("echo_range", False)]) def test_get_MVBS_along_channels(request, range_var, lat_lon): """Testing the underlying function of compute_MVBS""" range_bin = 20 From 48573fe68874603c4fce0c463bd4e0a0410c0dc9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 20 Sep 2023 21:45:40 +0000 Subject: [PATCH 37/43] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopype/tests/commongrid/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopype/tests/commongrid/test_api.py b/echopype/tests/commongrid/test_api.py index ef1165d0a..3a75dbcd5 100644 --- a/echopype/tests/commongrid/test_api.py +++ b/echopype/tests/commongrid/test_api.py @@ -3,7 +3,7 @@ import numpy as np -# Utilties Tests +# Utilities Tests @pytest.mark.parametrize( ["range_bin", "expected_result"], [ From f1df1d669e95566cf4671d7b161a1ec9333d9121 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Wed, 20 Sep 2023 15:23:55 -0700 Subject: [PATCH 38/43] test: Fix brute force mvbs gen Fixes the generation of expected mvbs with brute force as well as tweaks to mvbs_along_channel test. --- echopype/tests/commongrid/conftest.py | 17 ++++++----------- echopype/tests/commongrid/test_mvbs.py | 6 ++---- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/echopype/tests/commongrid/conftest.py b/echopype/tests/commongrid/conftest.py index 1705a32e3..19510f476 100644 --- a/echopype/tests/commongrid/conftest.py +++ b/echopype/tests/commongrid/conftest.py @@ -96,7 +96,7 @@ def mock_sv_dataset_regular(mock_parameters, mock_sv_sample): @pytest.fixture def mock_sv_dataset_irregular(mock_parameters, mock_sv_sample, mock_nan_ilocs): - depth_interval = [0.5, 0.32, np.nan] # Added nans + depth_interval = [0.5, 0.32, 0.2] depth_ping_time_len = [2, 3, 5] ds_Sv = _gen_Sv_er_irregular( **mock_parameters, @@ -344,21 +344,16 @@ def _get_expected_mvbs_val(ds_Sv, ping_time_bin, range_bin, channel_len=2): for p_idx in range(len(ping_interval) - 1): for r_idx in range(len(range_interval) - 1): echo_range = ( - ds_Sv["echo_range"] + ds_Sv['echo_range'] .isel(channel=ch_idx) - .sel(ping_time=slice(ping_interval[p_idx], ping_interval[p_idx + 1])) - .max(dim="ping_time", skipna=True) - .data + .sel(ping_time=slice(ping_interval[p_idx], ping_interval[p_idx+1])) ) - cur_sv = sv.assign_coords({"range_sample": echo_range}) r_idx_active = np.logical_and( echo_range.data >= range_interval[r_idx], - echo_range.data < range_interval[r_idx + 1], - ) - sv_tmp = cur_sv.isel(channel=ch_idx).sel( - ping_time=slice(ping_interval[p_idx], ping_interval[p_idx + 1]), - range_sample=r_idx_active, + echo_range.data < range_interval[r_idx+1] ) + sv_tmp = sv.isel(channel=ch_idx).sel( + ping_time=slice(ping_interval[p_idx], ping_interval[p_idx+1])).data[r_idx_active] if 0 in sv_tmp.shape: expected_mvbs_val[ch_idx, p_idx, r_idx] = np.nan else: diff --git a/echopype/tests/commongrid/test_mvbs.py b/echopype/tests/commongrid/test_mvbs.py index 143152da4..49e9e2d8f 100644 --- a/echopype/tests/commongrid/test_mvbs.py +++ b/echopype/tests/commongrid/test_mvbs.py @@ -405,10 +405,6 @@ def test_get_MVBS_along_channels(request, range_var, lat_lon): # When it's echo_range and lat_lon, the dataset should have positions if range_var == "echo_range" and lat_lon is True: assert raw_MVBS.attrs["has_positions"] is True - else: - assert raw_MVBS.attrs["has_positions"] is False - - if range_var == "echo_range": assert all(v in raw_MVBS for v in POSITION_VARIABLES) # Compute xarray reduce manually for this @@ -423,3 +419,5 @@ def test_get_MVBS_along_channels(request, range_var, lat_lon): for v in POSITION_VARIABLES: assert np.array_equal(raw_MVBS[v].data, expected_Pos[v].data) + else: + assert raw_MVBS.attrs["has_positions"] is False From 93b17b67c23620b33001f4fd510ef5dae822c0c4 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Wed, 20 Sep 2023 15:36:36 -0700 Subject: [PATCH 39/43] chore: Clean up old code for doing compute MVBS Removes old code that perfoms compute_MVBS since now we've switched over to flox --- echopype/commongrid/mvbs.py | 398 +------------------------ echopype/tests/commongrid/test_mvbs.py | 356 ---------------------- 2 files changed, 1 insertion(+), 753 deletions(-) diff --git a/echopype/commongrid/mvbs.py b/echopype/commongrid/mvbs.py index 57890b100..2fbc1b38f 100644 --- a/echopype/commongrid/mvbs.py +++ b/echopype/commongrid/mvbs.py @@ -2,10 +2,8 @@ Contains core functions needed to compute the MVBS of an input dataset. """ -import warnings -from typing import Literal, Tuple, Union +from typing import Literal, Union -import dask.array import numpy as np import pandas as pd import xarray as xr @@ -15,400 +13,6 @@ from ..utils.compute import _lin2log, _log2lin -def get_bin_indices( - echo_range: np.ndarray, bins_er: np.ndarray, times: np.ndarray, bins_time: np.ndarray -) -> Tuple[np.ndarray, np.ndarray]: - """ - Obtains the bin index of ``echo_range`` and ``times`` based - on the binning ``bins_er`` and ``bins_time``, respectively. - - Parameters - ---------- - echo_range: np.ndarray - 2D array of echo range values - bins_er: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``echo_range`` - times: np.ndarray - 1D array corresponding to the time values that should be binned - bins_time: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``times`` - - Returns - ------- - digitized_echo_range: np.ndarray - 2D array of bin indices for ``echo_range`` - bin_time_ind: np.ndarray - 1D array of bin indices for ``times`` - """ - - # get bin index for each echo range value - digitized_echo_range = np.digitize(echo_range, bins_er, right=False) - - # turn datetime into integers, so we can use np.digitize - if isinstance(times, dask.array.Array): - times_i8 = times.compute().data.view("i8") - else: - times_i8 = times.view("i8") - - # turn datetime into integers, so we can use np.digitize - bins_time_i8 = bins_time.view("i8") - - # get bin index for each time - bin_time_ind = np.digitize(times_i8, bins_time_i8, right=False) - - return digitized_echo_range, bin_time_ind - - -def bin_and_mean_echo_range( - arr: Union[np.ndarray, dask.array.Array], digitized_echo_range: np.ndarray, n_bin_er: int -) -> Union[np.ndarray, dask.array.Array]: - """ - Bins and means ``arr`` with respect to the ``echo_range`` bins. - - Parameters - ---------- - arr: np.ndarray or dask.array.Array - 2D array (dimension: [``echo_range`` x ``ping_time``]) to bin along ``echo_range`` - and compute mean of each bin - digitized_echo_range: np.ndarray - 2D array of bin indices for ``echo_range`` - n_bin_er: int - The number of echo range bins - - Returns - ------- - er_means: np.ndarray or dask.array.Array - 2D array representing the bin and mean of ``arr`` along ``echo_range`` - """ - - binned_means = [] - for bin_er in range(1, n_bin_er): - # Catch a known warning that can occur, which does not impact the results - with warnings.catch_warnings(): - # ignore warnings caused by taking a mean of an array filled with NaNs - warnings.filterwarnings(action="ignore", message="Mean of empty slice") - - # bin and mean echo_range dimension - er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) - - # collect all echo_range bins - binned_means.append(er_selected_data) - - # create full echo_range binned array - er_means = np.vstack(binned_means) - - return er_means - - -def get_unequal_rows(mat: np.ndarray, row: np.ndarray) -> np.ndarray: - """ - Obtains those row indices of ``mat`` that are not equal - to ``row``. - - Parameters - ---------- - mat: np.ndarray - 2D array with the same column dimension as the number - of elements in ``row`` - row: np.ndarray - 1D array with the same number of element elements as - the column dimension of ``mat`` - - Returns - ------- - row_ind_not_equal: np.ndarray - The row indices of ``mat`` that are not equal to ``row`` - - Notes - ----- - Elements with NaNs are considered equal if they are in the same position. - """ - - # compare row against all rows in mat (allowing for NaNs to be equal) - element_nan_equal = (mat == row) | (np.isnan(mat) & np.isnan(row)) - - # determine if mat row is equal to row - row_not_equal = np.logical_not(np.all(element_nan_equal, axis=1)) - - if isinstance(row_not_equal, dask.array.Array): - row_not_equal = row_not_equal.compute() - - # get those row indices that are not equal to row - row_ind_not_equal = np.argwhere(row_not_equal).flatten() - - return row_ind_not_equal - - -def if_all_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: - """ - A comprehensive check that determines if all ``echo_range`` values - along ``ping_time`` have the same step size. If they do not have - the same step sizes, then grouping of the ``echo_range`` values - will be necessary. - - Parameters - ---------- - er_chan: xr.DataArray or np.ndarray - 2D array containing the ``echo_range`` values for each ``ping_time`` - - Returns - ------- - bool - True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False - - Notes - ----- - ``er_chan`` should have rows corresponding to ``ping_time`` and columns - corresponding to ``range_sample`` - """ - - # grab the in-memory numpy echo_range values, if necessary - if isinstance(er_chan, xr.DataArray): - er_chan = er_chan.values - - # grab the first ping_time that is not filled with NaNs - ping_index = 0 - while np.all(np.isnan(er_chan[ping_index, :])): - ping_index += 1 - - # determine those rows of er_chan that are not equal to the row ping_index - unequal_ping_ind = get_unequal_rows(er_chan, er_chan[ping_index, :]) - - if len(unequal_ping_ind) > 0: - # see if all unequal_ping_ind are filled with NaNs - all_nans = np.all(np.all(np.isnan(er_chan[unequal_ping_ind, :]), axis=1)) - - if all_nans: - # All echo_range values have the same step size - return False - else: - # Some echo_range values have different step sizes - return True - else: - # All echo_range values have the same step size - return False - - -def if_last_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: - """ - An alternative (less comprehensive) check that determines if all - ``echo_range`` values along ``ping_time`` have the same step size. - If they do not have the same step sizes, then grouping of the - ``echo_range`` values will be necessary. - - Parameters - ---------- - er_chan: xr.DataArray or np.ndarray - 2D array containing the ``echo_range`` values for each ``ping_time`` - - Returns - ------- - bool - True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False - - Notes - ----- - It is possible that this method will incorrectly determine if grouping - is necessary. - - ``er_chan`` should have rows corresponding to ``ping_time`` and columns - corresponding to ``range_sample`` - """ - - # determine the number of NaNs in each ping and find the unique number of NaNs - unique_num_nans = np.unique(np.isnan(er_chan.data).sum(axis=1)) - - # compute the results, if necessary, to allow for downstream checks - if isinstance(unique_num_nans, dask.array.Array): - unique_num_nans = unique_num_nans.compute() - - # determine if any value is not 0 or er_chan.shape[1] - unexpected_num_nans = False in np.logical_or( - unique_num_nans == 0, unique_num_nans == er_chan.shape[1] - ) - - if unexpected_num_nans: - # echo_range varies with ping_time - return True - else: - # make sure that the final echo_range value for each ping_time is the same (account for NaN) - num_non_nans = np.logical_not(np.isnan(np.unique(er_chan.data[:, -1]))).sum() - - # compute the results, if necessary, to allow for downstream checks - if isinstance(num_non_nans, dask.array.Array): - num_non_nans = num_non_nans.compute() - - if num_non_nans > 1: - # echo_range varies with ping_time - return True - else: - # echo_range does not vary with ping_time - return False - - -def is_er_grouping_needed( - echo_range: Union[xr.DataArray, np.ndarray], comprehensive_er_check: bool -) -> bool: - """ - Determines if ``echo_range`` values along ``ping_time`` can change and - thus need to be grouped. - - Parameters - ---------- - echo_range: xr.DataArray or np.ndarray - 2D array containing the ``echo_range`` values for each ``ping_time`` - comprehensive_er_check: bool - If True, a more comprehensive check will be completed to determine if ``echo_range`` - grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done - - Returns - ------- - bool - If True grouping of ``echo_range`` will be required, else it will not - be necessary - """ - - if comprehensive_er_check: - return if_all_er_steps_identical(echo_range) - else: - return if_last_er_steps_identical(echo_range) - - -def group_dig_er_bin_mean_echo_range( - arr: Union[np.ndarray, dask.array.Array], - digitized_echo_range: Union[np.ndarray, dask.array.Array], - n_bin_er: int, -) -> Union[np.ndarray, dask.array.Array]: - """ - Groups the rows of ``arr`` such that they have the same corresponding - row values in ``digitized_echo_range``, then applies ``bin_and_mean_echo_range`` - on each group, and lastly assembles the correctly ordered ``er_means`` array - representing the bin and mean of ``arr`` with respect to ``echo_range``. - - Parameters - ---------- - arr: dask.array.Array or np.ndarray - The 2D array whose values should be binned - digitized_echo_range: dask.array.Array or np.ndarray - 2D array of bin indices for ``echo_range`` - n_bin_er: int - The number of echo range bins - - Returns - ------- - er_means: dask.array.Array or np.ndarray - The bin and mean of ``arr`` with respect to ``echo_range`` - """ - - # compute bin indices to allow for downstream processes (mainly axis argument in unique) - if isinstance(digitized_echo_range, dask.array.Array): - digitized_echo_range = digitized_echo_range.compute() - - # determine the unique rows of digitized_echo_range and the inverse - unique_er_bin_ind, unique_inverse = np.unique(digitized_echo_range, axis=0, return_inverse=True) - - # create groups of row indices using the unique inverse - grps_same_ind = [ - np.argwhere(unique_inverse == grp).flatten() for grp in np.unique(unique_inverse) - ] - - # for each group bin and mean arr along echo_range - # note: the values appended may not be in the correct final order - binned_er = [] - for count, grp in enumerate(grps_same_ind): - binned_er.append( - bin_and_mean_echo_range(arr[grp, :], unique_er_bin_ind[count, :], n_bin_er) - ) - - # construct er_means and put the columns in the correct order - binned_er_array = np.hstack(binned_er) - correct_column_ind = np.argsort(np.concatenate(grps_same_ind)) - er_means = binned_er_array[:, correct_column_ind] - - return er_means - - -def bin_and_mean_2d( - arr: Union[dask.array.Array, np.ndarray], - bins_time: np.ndarray, - bins_er: np.ndarray, - times: np.ndarray, - echo_range: np.ndarray, - comprehensive_er_check: bool = True, -) -> np.ndarray: - """ - Bins and means ``arr`` based on ``times`` and ``echo_range``, - and their corresponding bins. If ``arr`` is ``Sv`` then this - will compute the MVBS. - - Parameters - ---------- - arr: dask.array.Array or np.ndarray - The 2D array whose values should be binned - bins_time: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``times`` - bins_er: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``echo_range`` - times: np.ndarray - 1D array corresponding to the time values that should be binned - echo_range: np.ndarray - 2D array of echo range values - comprehensive_er_check: bool - If True, a more comprehensive check will be completed to determine if ``echo_range`` - grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done - - Returns - ------- - final_reduced: np.ndarray - The final binned and mean ``arr``, if ``arr`` is ``Sv`` then this is the MVBS - - Notes - ----- - This function assumes that ``arr`` has rows corresponding to - ``ping_time`` and columns corresponding to ``echo_range``. - - This function should not be run if the number of ``echo_range`` values - vary amongst ``ping_times``. This should not occur for our current use - of echopype-generated Sv data. - """ - - # get the number of echo range and time bins - n_bin_er = len(bins_er) - n_bin_time = len(bins_time) - - # obtain the bin indices for echo_range and times - digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) - - # determine if grouping of echo_range values with the same step size is necessary - er_grouping_needed = is_er_grouping_needed(echo_range, comprehensive_er_check) - - if er_grouping_needed: - # groups, bins, and means arr with respect to echo_range - er_means = group_dig_er_bin_mean_echo_range(arr, digitized_echo_range, n_bin_er) - else: - # bin and mean arr with respect to echo_range - er_means = bin_and_mean_echo_range(arr, digitized_echo_range[0, :], n_bin_er) - - # if er_means is a dask array we compute it so the graph does not get too large - if isinstance(er_means, dask.array.Array): - er_means = er_means.compute() - - # create final reduced array i.e. MVBS - final = np.empty((n_bin_time, n_bin_er - 1)) - for bin_time in range(1, n_bin_time + 1): - # obtain er_mean indices corresponding to the time bin - indices = np.argwhere(bin_time_ind == bin_time).flatten() - - if len(indices) == 0: - # fill values with NaN, if there are no values in the bin - final[bin_time - 1, :] = np.nan - else: - # bin and mean the er_mean time bin - final[bin_time - 1, :] = np.nanmean(er_means[:, indices], axis=1) - - return final - - def get_MVBS_along_channels( ds_Sv: xr.Dataset, range_interval: Union[pd.IntervalIndex, np.ndarray], diff --git a/echopype/tests/commongrid/test_mvbs.py b/echopype/tests/commongrid/test_mvbs.py index 49e9e2d8f..b08048c66 100644 --- a/echopype/tests/commongrid/test_mvbs.py +++ b/echopype/tests/commongrid/test_mvbs.py @@ -1,366 +1,10 @@ -import dask.array import numpy as np import pandas as pd import pytest -from typing import Tuple, Iterable, Union from echopype.commongrid.mvbs import get_MVBS_along_channels from echopype.consolidate.api import POSITION_VARIABLES from flox.xarray import xarray_reduce - -def create_bins(csum_array: np.ndarray) -> Iterable: - """ - Constructs bin ranges based off of a cumulative - sum array. - - Parameters - ---------- - csum_array: np.ndarray - 1D array representing a cumulative sum - - Returns - ------- - bins: list - A list whose elements are the lower and upper bin ranges - """ - - bins = [] - - # construct bins - for count, csum in enumerate(csum_array): - - if count == 0: - - bins.append([0, csum]) - - else: - - # add 0.01 so that left bins don't overlap - bins.append([csum_array[count-1] + 0.01, csum]) - - return bins - - -def create_echo_range_related_data(ping_bins: Iterable, - num_pings_in_bin: np.ndarray, - er_range: list, er_bins: Iterable, - final_num_er_bins: int, - create_dask: bool, - rng: np.random.Generator, - ping_bin_nan_ind: int) -> Tuple[list, list, list]: - """ - Creates ``echo_range`` values and associated bin information. - - Parameters - ---------- - ping_bins: list - A list whose elements are the lower and upper ping time bin ranges - num_pings_in_bin: np.ndarray - Specifies the number of pings in each ping time bin - er_range: list - A list whose first element is the lowest and second element is - the highest possible number of echo range values in a given bin - er_bins: list - A list whose elements are the lower and upper echo range bin ranges - final_num_er_bins: int - The total number of echo range bins - create_dask: bool - If True ``final_arrays`` values will be - dask arrays, else they will be numpy arrays - rng: np.random.Generator - The generator for random values - ping_bin_nan_ind: int - The ping bin index to fill with NaNs - - Returns - ------- - all_er_bin_nums: list of np.ndarray - A list whose elements are the number of values in each echo_range - bin, for each ping bin - ping_times_in_bin: list of np.ndarray - A list whose elements are the ping_time values for each corresponding bin - final_arrays: list of np.ndarray or dask.array.Array - A list whose elements are the echo_range values for a given ping and - echo range bin block - """ - - final_arrays = [] - all_er_bin_nums = [] - ping_times_in_bin = [] - - # build echo_range array - for ping_ind, ping_bin in enumerate(ping_bins): - - # create the ping times associated with each ping bin - ping_times_in_bin.append(rng.uniform(ping_bin[0], ping_bin[1], (num_pings_in_bin[ping_ind],))) - - # randomly determine the number of values in each echo_range bin - num_er_in_bin = rng.integers(low=er_range[0], high=er_range[1], size=final_num_er_bins) - - # store the number of values in each echo_range bin - all_er_bin_nums.append(num_er_in_bin) - - er_row_block = [] - for count, bin_val in enumerate(er_bins): - - # create a block of echo_range values - if create_dask: - a = dask.array.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], - num_er_in_bin[count])) - else: - a = rng.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], - num_er_in_bin[count])) - - # store the block of echo_range values - er_row_block.append(a) - - # set all echo_range values at ping index to NaN - if ping_ind == ping_bin_nan_ind: - a[:, :] = np.nan - - # collect and construct echo_range row block - final_arrays.append(np.concatenate(er_row_block, axis=1)) - - return all_er_bin_nums, ping_times_in_bin, final_arrays - - -def construct_2d_echo_range_array(final_arrays: Iterable[np.ndarray], - ping_csum: np.ndarray, - create_dask: bool) -> Tuple[Union[np.ndarray, dask.array.Array], int]: - """ - Creates the final 2D ``echo_range`` array with appropriate padding. - - Parameters - ---------- - final_arrays: list of np.ndarray - A list whose elements are the echo_range values for a given ping and - echo range bin block - ping_csum: np.ndarray - 1D array representing the cumulative sum for the number of ping times - in each ping bin - create_dask: bool - If True ``final_er`` will be a dask array, else it will be a numpy array - - Returns - ------- - final_er: np.ndarray or dask.array.Array - The final 2D ``echo_range`` array - max_num_er_elem: int - The maximum number of ``echo_range`` elements amongst all times - """ - - # get maximum number of echo_range elements amongst all times - max_num_er_elem = max([arr.shape[1] for arr in final_arrays]) - - # total number of ping times - tot_num_times = ping_csum[-1] - - # pad echo_range dimension with nans and create final echo_range - if create_dask: - final_er = dask.array.ones(shape=(tot_num_times, max_num_er_elem)) * np.nan - else: - final_er = np.empty((tot_num_times, max_num_er_elem)) - final_er[:] = np.nan - - for count, arr in enumerate(final_arrays): - - if count == 0: - final_er[0:ping_csum[count], 0:arr.shape[1]] = arr - else: - final_er[ping_csum[count - 1]:ping_csum[count], 0:arr.shape[1]] = arr - - return final_er, max_num_er_elem - - -def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, - all_er_bin_nums: Iterable[np.ndarray], - num_pings_in_bin: np.ndarray, - create_dask: bool, - ping_bin_nan_ind: int) -> Tuple[Union[np.ndarray, dask.array.Array], - np.ndarray]: - """ - Creates the final 2D Sv array with appropriate padding. - - Parameters - ---------- - max_num_er_elem: int - The maximum number of ``echo_range`` elements amongst all times - ping_csum: np.ndarray - 1D array representing the cumulative sum for the number of ping times - in each ping bin - all_er_bin_nums: list of np.ndarray - A list whose elements are the number of values in each echo_range - bin, for each ping bin - num_pings_in_bin: np.ndarray - Specifies the number of pings in each ping time bin - create_dask: bool - If True ``final_sv`` will be a dask array, else it will be a numpy array - ping_bin_nan_ind: int - The ping bin index to fill with NaNs - - Returns - ------- - final_sv: np.ndarray or dask.array.Array - The final 2D Sv array - final_MVBS: np.ndarray - The final 2D known MVBS array - """ - - # total number of ping times - tot_num_times = ping_csum[-1] - - # pad echo_range dimension with nans and create final sv - if create_dask: - final_sv = dask.array.ones(shape=(tot_num_times, max_num_er_elem)) * np.nan - else: - final_sv = np.empty((tot_num_times, max_num_er_elem)) - final_sv[:] = np.nan - - final_means = [] - for count, arr in enumerate(all_er_bin_nums): - - # create sv row values using natural numbers - sv_row_list = [np.arange(1, num_elem + 1, 1, dtype=np.float64) for num_elem in arr] - - # create final sv row - sv_row = np.concatenate(sv_row_list) - - # get final mean which is n+1/2 (since we are using natural numbers) - ping_mean = [(len(elem) + 1) / 2.0 for elem in sv_row_list] - - # create sv row block - sv_row_block = np.tile(sv_row, (num_pings_in_bin[count], 1)) - - if count == ping_bin_nan_ind: - - # fill values with NaNs - ping_mean = [np.nan]*len(sv_row_list) - sv_row_block[:, :] = np.nan - - # store means for ping - final_means.append(ping_mean) - - if count == 0: - final_sv[0:ping_csum[count], 0:sv_row_block.shape[1]] = sv_row_block - else: - final_sv[ping_csum[count - 1]:ping_csum[count], 0:sv_row_block.shape[1]] = sv_row_block - - # create final sv MVBS - final_MVBS = np.vstack(final_means) - - return final_sv, final_MVBS - - -def create_known_mean_data(final_num_ping_bins: int, - final_num_er_bins: int, - ping_range: list, - er_range: list, create_dask: bool, - rng: np.random.Generator) -> Tuple[np.ndarray, np.ndarray, Iterable, - Iterable, np.ndarray, np.ndarray]: - """ - Orchestrates the creation of ``echo_range``, ``ping_time``, and ``Sv`` arrays - where the MVBS is known. - - Parameters - ---------- - final_num_ping_bins: int - The total number of ping time bins - final_num_er_bins: int - The total number of echo range bins - ping_range: list - A list whose first element is the lowest and second element is - the highest possible number of ping time values in a given bin - er_range: list - A list whose first element is the lowest and second element is - the highest possible number of echo range values in a given bin - create_dask: bool - If True the ``Sv`` and ``echo_range`` values produced will be - dask arrays, else they will be numpy arrays. - rng: np.random.Generator - generator for random integers - - Returns - ------- - final_MVBS: np.ndarray - The final 2D known MVBS array - final_sv: np.ndarray - The final 2D Sv array - ping_bins: Iterable - A list whose elements are the lower and upper ping time bin ranges - er_bins: Iterable - A list whose elements are the lower and upper echo range bin ranges - final_er: np.ndarray - The final 2D ``echo_range`` array - final_ping_time: np.ndarray - The final 1D ``ping_time`` array - """ - - # randomly generate the number of pings in each ping bin - num_pings_in_bin = rng.integers(low=ping_range[0], high=ping_range[1], size=final_num_ping_bins) - - # create bins for ping_time dimension - ping_csum = np.cumsum(num_pings_in_bin) - ping_bins = create_bins(ping_csum) - - # create bins for echo_range dimension - num_er_in_bin = rng.integers(low=er_range[0], high=er_range[1], size=final_num_er_bins) - er_csum = np.cumsum(num_er_in_bin) - er_bins = create_bins(er_csum) - - # randomly select one ping bin to fill with NaNs - ping_bin_nan_ind = rng.choice(len(ping_bins)) - - # create the echo_range data and associated bin information - all_er_bin_nums, ping_times_in_bin, final_er_arrays = create_echo_range_related_data(ping_bins, num_pings_in_bin, - er_range, er_bins, - final_num_er_bins, - create_dask, - rng, - ping_bin_nan_ind) - - # create the final echo_range array using created data and padding - final_er, max_num_er_elem = construct_2d_echo_range_array(final_er_arrays, ping_csum, create_dask) - - # get final ping_time dimension - final_ping_time = np.concatenate(ping_times_in_bin).astype('datetime64[ns]') - - # create the final sv array - final_sv, final_MVBS = construct_2d_sv_array(max_num_er_elem, ping_csum, - all_er_bin_nums, num_pings_in_bin, - create_dask, ping_bin_nan_ind) - - return final_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time - - -@pytest.fixture( - params=[ - { - "create_dask": True, - "final_num_ping_bins": 10, - "final_num_er_bins": 10, - "ping_range": [10, 1000], - "er_range": [10, 1000] - }, - { - "create_dask": False, - "final_num_ping_bins": 10, - "final_num_er_bins": 10, - "ping_range": [10, 1000], - "er_range": [10, 1000] - }, - ], - ids=[ - "delayed_data", - "in_memory_data" - ], -) -def bin_and_mean_2d_params(request): - """ - Obtains all necessary parameters for ``test_bin_and_mean_2d``. - """ - - return list(request.param.values()) - @pytest.mark.unit @pytest.mark.parametrize(["range_var", "lat_lon"], [("depth", False), ("echo_range", True), ("echo_range", False)]) def test_get_MVBS_along_channels(request, range_var, lat_lon): From 9443c2a4c6753f4d1eccda8de4dbda95e080639e Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Wed, 20 Sep 2023 15:37:37 -0700 Subject: [PATCH 40/43] chore(pytest): Added custom markers 'unit' and 'integration' --- pytest.ini | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index 7a3a8bfd2..9ad97f3a7 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,5 +1,7 @@ # test directory [pytest] testpaths = echopype/tests - cache_dir = .cache +markers = + unit: marks tests as unit tests + integration: marks tests as integration tests From 6eec56bb338e02273253d57ea4102f33f447ec1a Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Wed, 20 Sep 2023 15:40:55 -0700 Subject: [PATCH 41/43] docs: Update docstring for `compute_MVBS` Added options for literal arguments --- echopype/commongrid/api.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index 0dabc779a..b4cddadc6 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -152,19 +152,17 @@ def compute_MVBS( ---------- ds_Sv : xr.Dataset dataset containing Sv and ``echo_range`` [m] - range_var: str + range_var: {'echo_range', 'depth'}, default 'echo_range' The variable to use for range binning. Must be one of ``echo_range`` or ``depth``. Note that ``depth`` is only available if the input dataset contains ``depth`` as a data variable. - range_bin : str - bin size along ``echo_range`` or ``depth`` in meters, - default to ``20m`` - ping_time_bin : str - bin size along ``ping_time``, default to ``20S`` - method: str - The flox strategy for reduction of dask arrays only, - default to ``map-reduce`` + range_bin : str, default '20m' + bin size along ``echo_range`` or ``depth`` in meters. + ping_time_bin : str, default '20S' + bin size along ``ping_time`` + method: str, default 'map-reduce' + The flox strategy for reduction of dask arrays only. See flox `documentation `_ for more details. closed: {'left', 'right'}, default 'left' From 56795a57a9571c4ee3207f96aa2221ba4f109870 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Wed, 20 Sep 2023 17:20:32 -0700 Subject: [PATCH 42/43] refactor: Change 'parse_range_bin' to 'parse_x_bin' Make bin parsing to be more general by making it to 'parse_x_bin'. --- echopype/commongrid/api.py | 62 +++++++++++++++++++++------ echopype/tests/commongrid/test_api.py | 34 ++++++++------- 2 files changed, 67 insertions(+), 29 deletions(-) diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index b4cddadc6..38a5f7a4b 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -88,41 +88,75 @@ def _convert_bins_to_interval_index( return pd.IntervalIndex.from_breaks(bins, closed=closed).sort_values() -def _parse_range_bin(range_bin: str) -> float: +def _parse_x_bin(x_bin: str, x_label="range_bin") -> float: """ - Parses range bin string, check unit, - and returns range bin in meters. + Parses x bin string, check unit, + and returns x bin in x unit. + + Currently only available for: + range_bin: meters (m) + dist_bin: nautical miles (nmi) Parameters ---------- - range_bin : str - Range bin string, e.g., "10m" + x_bin : str + X bin string, e.g., "0.5nmi" or "10m" + x_label : {"range_bin", "dist_bin"}, default "range_bin" + The label of the x bin. Returns ------- float - The resulting range bin value in meters. + The resulting x bin value in x unit, + based on label. Raises ------ ValueError - If the range bin string doesn't include 'm' for meters. + If the x bin string doesn't include unit value. TypeError - If the range bin is not a type string. + If the x bin is not a type string. + KeyError + If the x label is not one of the available labels. """ + x_bin_map = { + "range_bin": { + "name": "Range bin", + "unit": "m", + "ex": "10m", + "unit_label": "meters", + "pattern": r"([\d+]*[.,]{0,1}[\d+]*)(\s+)?(m)", + }, + "dist_bin": { + "name": "Distance bin", + "unit": "nmi", + "ex": "0.5nmi", + "unit_label": "nautical miles", + "pattern": r"([\d+]*[.,]{0,1}[\d+]*)(\s+)?(nmi)", + }, + } + x_bin_info = x_bin_map.get(x_label, None) + + if x_bin_info is None: + raise KeyError(f"x_label must be one of {list(x_bin_map.keys())}") + # First check for bin types - if not isinstance(range_bin, str): - raise TypeError("range_bin must be a string") + if not isinstance(x_bin, str): + raise TypeError("'x_bin' must be a string") # normalize to lower case # for range_bin - range_bin = range_bin.strip().lower() + x_bin = x_bin.strip().lower() # Only matches meters - match_obj = re.match(r"([\d+]*[.,]{0,1}[\d+]*)(\s+)?(m)", range_bin) + match_obj = re.match(x_bin_info["pattern"], x_bin) # Do some checks on range meter inputs if match_obj is None: # This shouldn't be other units - raise ValueError("range_bin must be in meters (e.g., '10m').") + raise ValueError( + f"{x_bin_info['name']} must be in " + f"{x_bin_info['unit_label']} " + f"(e.g., '{x_bin_info['ex']}')." + ) # Convert back to float range_bin = float(match_obj.group(1)) @@ -179,7 +213,7 @@ def compute_MVBS( if not isinstance(ping_time_bin, str): raise TypeError("ping_time_bin must be a string") - range_bin = _parse_range_bin(range_bin) + range_bin = _parse_x_bin(range_bin, "range_bin") # Clean up filenames dimension if it exists # not needed here diff --git a/echopype/tests/commongrid/test_api.py b/echopype/tests/commongrid/test_api.py index 3a75dbcd5..b0276ca72 100644 --- a/echopype/tests/commongrid/test_api.py +++ b/echopype/tests/commongrid/test_api.py @@ -5,28 +5,32 @@ # Utilities Tests @pytest.mark.parametrize( - ["range_bin", "expected_result"], + ["x_bin", "x_label", "expected_result"], [ - (5, TypeError), - ("0.2m", 0.2), - ("10m", 10.0), - ("10km", ValueError), - ("10", ValueError) + # Success + ("10m", "range_bin", 10.0), + ("0.2m", "range_bin", 0.2), + ("0.5nmi", "dist_bin", 0.5), + # Errored + (10, "range_bin", TypeError), + ("10km", "range_bin", ValueError), + ("10", "range_bin", ValueError), + ("10m", "invalid_label", KeyError), ], ) -def test__parse_range_bin(range_bin, expected_result): - expected_error_msg = r"range_bin must be in meters" - if isinstance(range_bin, int): - expected_error_msg = r"range_bin must be a string" - elif range_bin in ["10km", "10"]: - expected_error_msg = r"range_bin must be in meters" +def test__parse_x_bin(x_bin, x_label, expected_result): + if x_label == "invalid_label": + expected_error_msg = r"x_label must be one of" + elif isinstance(x_bin, int): + expected_error_msg = r"must be a string" + elif x_bin in ["10km", "10"]: + expected_error_msg = r"must be in" if not isinstance(expected_result, float): with pytest.raises(expected_result, match=expected_error_msg): - ep.commongrid.api._parse_range_bin(range_bin) + ep.commongrid.api._parse_x_bin(x_bin, x_label) else: - assert ep.commongrid.api._parse_range_bin(range_bin) == expected_result - + assert ep.commongrid.api._parse_x_bin(x_bin, x_label) == expected_result # NASC Tests @pytest.mark.integration From 70a4dd8898ccc5c451113ae1041e82612add8382 Mon Sep 17 00:00:00 2001 From: Landung 'Don' Setiawan Date: Thu, 21 Sep 2023 12:56:07 -0700 Subject: [PATCH 43/43] chore: Update suggested changes Update some texts from suggested review as discussed. Ref: https://github.com/OSOceanAcoustics/echopype/pull/1124#pullrequestreview-1636819555 --- echopype/commongrid/api.py | 10 +-- echopype/tests/commongrid/conftest.py | 109 ++++++++++++++++++------- echopype/tests/commongrid/test_api.py | 40 +++++---- echopype/tests/commongrid/test_mvbs.py | 6 +- 4 files changed, 112 insertions(+), 53 deletions(-) diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index 38a5f7a4b..8c9132b99 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -91,7 +91,7 @@ def _convert_bins_to_interval_index( def _parse_x_bin(x_bin: str, x_label="range_bin") -> float: """ Parses x bin string, check unit, - and returns x bin in x unit. + and returns x bin in the specified unit. Currently only available for: range_bin: meters (m) @@ -144,12 +144,12 @@ def _parse_x_bin(x_bin: str, x_label="range_bin") -> float: if not isinstance(x_bin, str): raise TypeError("'x_bin' must be a string") # normalize to lower case - # for range_bin + # for x_bin x_bin = x_bin.strip().lower() # Only matches meters match_obj = re.match(x_bin_info["pattern"], x_bin) - # Do some checks on range meter inputs + # Do some checks on x_bin inputs if match_obj is None: # This shouldn't be other units raise ValueError( @@ -159,8 +159,8 @@ def _parse_x_bin(x_bin: str, x_label="range_bin") -> float: ) # Convert back to float - range_bin = float(match_obj.group(1)) - return range_bin + x_bin = float(match_obj.group(1)) + return x_bin @add_processing_level("L3*") diff --git a/echopype/tests/commongrid/conftest.py b/echopype/tests/commongrid/conftest.py index 19510f476..27b166a03 100644 --- a/echopype/tests/commongrid/conftest.py +++ b/echopype/tests/commongrid/conftest.py @@ -73,7 +73,7 @@ def mock_parameters(): @pytest.fixture -def mock_sv_sample(mock_parameters): +def mock_Sv_sample(mock_parameters): """ Mock Sv sample @@ -88,23 +88,23 @@ def mock_sv_sample(mock_parameters): @pytest.fixture -def mock_sv_dataset_regular(mock_parameters, mock_sv_sample): - ds_Sv = _gen_Sv_er_regular(**mock_parameters, ping_time_jitter_max_ms=0) - ds_Sv["Sv"].data = mock_sv_sample +def mock_Sv_dataset_regular(mock_parameters, mock_Sv_sample): + ds_Sv = _gen_Sv_echo_range_regular(**mock_parameters, ping_time_jitter_max_ms=0) + ds_Sv["Sv"].data = mock_Sv_sample return ds_Sv @pytest.fixture -def mock_sv_dataset_irregular(mock_parameters, mock_sv_sample, mock_nan_ilocs): +def mock_Sv_dataset_irregular(mock_parameters, mock_Sv_sample, mock_nan_ilocs): depth_interval = [0.5, 0.32, 0.2] depth_ping_time_len = [2, 3, 5] - ds_Sv = _gen_Sv_er_irregular( + ds_Sv = _gen_Sv_echo_range_irregular( **mock_parameters, depth_interval=depth_interval, depth_ping_time_len=depth_ping_time_len, ping_time_jitter_max_ms=30, # Added jitter to ping_time ) - ds_Sv["Sv"].data = mock_sv_sample + ds_Sv["Sv"].data = mock_Sv_sample # Sprinkle nans around echo_range for pos in mock_nan_ilocs: ds_Sv["echo_range"][pos] = np.nan @@ -117,7 +117,7 @@ def mock_mvbs_inputs(): @pytest.fixture -def mock_mvbs_array_regular(mock_sv_dataset_regular, mock_mvbs_inputs, mock_parameters): +def mock_mvbs_array_regular(mock_Sv_dataset_regular, mock_mvbs_inputs, mock_parameters): """ Mock Sv sample result from compute_MVBS @@ -125,7 +125,7 @@ def mock_mvbs_array_regular(mock_sv_dataset_regular, mock_mvbs_inputs, mock_para Ping time bin: 1s Range bin: 2m """ - ds_Sv = mock_sv_dataset_regular + ds_Sv = mock_Sv_dataset_regular ping_time_bin = mock_mvbs_inputs["ping_time_bin"] range_bin = mock_mvbs_inputs["range_meter_bin"] channel_len = mock_parameters["channel_len"] @@ -137,7 +137,7 @@ def mock_mvbs_array_regular(mock_sv_dataset_regular, mock_mvbs_inputs, mock_para @pytest.fixture -def mock_mvbs_array_irregular(mock_sv_dataset_irregular, mock_mvbs_inputs, mock_parameters): +def mock_mvbs_array_irregular(mock_Sv_dataset_irregular, mock_mvbs_inputs, mock_parameters): """ Mock Sv sample irregular result from compute_MVBS @@ -145,7 +145,7 @@ def mock_mvbs_array_irregular(mock_sv_dataset_irregular, mock_mvbs_inputs, mock_ Ping time bin: 1s Range bin: 2m """ - ds_Sv = mock_sv_dataset_irregular + ds_Sv = mock_Sv_dataset_irregular ping_time_bin = mock_mvbs_inputs["ping_time_bin"] range_bin = mock_mvbs_inputs["range_meter_bin"] channel_len = mock_parameters["channel_len"] @@ -237,8 +237,8 @@ def regular_data_params(): @pytest.fixture -def ds_Sv_er_regular(regular_data_params, random_number_generator): - return _gen_Sv_er_regular( +def ds_Sv_echo_range_regular(regular_data_params, random_number_generator): + return _gen_Sv_echo_range_regular( **regular_data_params, random_number_generator=random_number_generator, ) @@ -283,33 +283,33 @@ def depth_offset(): @pytest.fixture -def ds_Sv_er_regular_w_latlon(ds_Sv_er_regular, lat_attrs, lon_attrs): +def ds_Sv_echo_range_regular_w_latlon(ds_Sv_echo_range_regular, lat_attrs, lon_attrs): """Sv dataset with latitude and longitude""" - n_pings = ds_Sv_er_regular.ping_time.shape[0] + n_pings = ds_Sv_echo_range_regular.ping_time.shape[0] latitude = np.linspace(42, 43, num=n_pings) longitude = np.linspace(-124, -125, num=n_pings) - ds_Sv_er_regular["latitude"] = (["ping_time"], latitude, lat_attrs) - ds_Sv_er_regular["longitude"] = (["ping_time"], longitude, lon_attrs) + ds_Sv_echo_range_regular["latitude"] = (["ping_time"], latitude, lat_attrs) + ds_Sv_echo_range_regular["longitude"] = (["ping_time"], longitude, lon_attrs) # Need processing level code for compute MVBS to work! - ds_Sv_er_regular.attrs["processing_level"] = "Level 2A" - return ds_Sv_er_regular + ds_Sv_echo_range_regular.attrs["processing_level"] = "Level 2A" + return ds_Sv_echo_range_regular @pytest.fixture -def ds_Sv_er_regular_w_depth(ds_Sv_er_regular, depth_offset): +def ds_Sv_echo_range_regular_w_depth(ds_Sv_echo_range_regular, depth_offset): """Sv dataset with depth""" - return ds_Sv_er_regular.pipe(add_depth, depth_offset=depth_offset) + return ds_Sv_echo_range_regular.pipe(add_depth, depth_offset=depth_offset) @pytest.fixture -def ds_Sv_er_irregular(random_number_generator): +def ds_Sv_echo_range_irregular(random_number_generator): depth_interval = [0.5, 0.32, 0.13] depth_ping_time_len = [100, 300, 200] ping_time_len = 600 ping_time_interval = "0.3S" - return _gen_Sv_er_irregular( + return _gen_Sv_echo_range_irregular( depth_interval=depth_interval, depth_ping_time_len=depth_ping_time_len, ping_time_len=ping_time_len, @@ -319,9 +319,25 @@ def ds_Sv_er_irregular(random_number_generator): ) -# Helper functions to generate mock Sv dataset -def _get_expected_mvbs_val(ds_Sv, ping_time_bin, range_bin, channel_len=2): - """Helper function to brute force compute_MVBS""" +# Helper functions to generate mock Sv and MVBS dataset +def _get_expected_mvbs_val( + ds_Sv: xr.Dataset, ping_time_bin: str, range_bin: float, channel_len: int = 2 +) -> np.ndarray: + """ + Helper functions to generate expected MVBS outputs from mock Sv dataset + by brute-force looping and compute the mean + + Parameters + ---------- + ds_Sv : xr.Dataset + Mock Sv dataset + ping_time_bin : str + Ping time bin + range_bin : float + Range bin + channel_len : int, default 2 + Number of channels + """ # create bin information needed for ping_time d_index = ( ds_Sv["ping_time"] @@ -371,7 +387,7 @@ def _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms=0) return ping_time -def _gen_Sv_er_regular( +def _gen_Sv_echo_range_regular( channel_len=2, depth_len=100, depth_interval=0.5, @@ -384,6 +400,21 @@ def _gen_Sv_er_regular( Generate a Sv dataset with uniform echo_range across all ping_time. ping_time_jitter_max_ms controlled jitter in milliseconds in ping_time. + + Parameters + ------------ + channel_len + number of channels + depth_len + number of total depth bins + depth_interval + depth intervals, may have multiple values + ping_time_len + total number of ping_time + ping_time_interval + interval between pings + ping_time_jitter_max_ms + jitter of ping_time in milliseconds """ if random_number_generator is None: @@ -412,7 +443,7 @@ def _gen_Sv_er_regular( return ds_Sv -def _gen_Sv_er_irregular( +def _gen_Sv_echo_range_irregular( channel_len=2, depth_len=100, depth_interval=[0.5, 0.32, 0.13], @@ -426,6 +457,28 @@ def _gen_Sv_er_irregular( Generate a Sv dataset with uniform echo_range across all ping_time. ping_time_jitter_max_ms controlled jitter in milliseconds in ping_time. + + Parameters + ------------ + channel_len + number of channels + depth_len + number of total depth bins + depth_interval + depth intervals, may have multiple values + depth_ping_time_len + the number of pings to use each of the depth_interval + for example, with depth_interval=[0.5, 0.32, 0.13] + and depth_ping_time_len=[100, 300, 200], + the first 100 pings have echo_range with depth intervals of 0.5 m, + the next 300 pings have echo_range with depth intervals of 0.32 m, + and the last 200 pings have echo_range with depth intervals of 0.13 m. + ping_time_len + total number of ping_time + ping_time_interval + interval between pings + ping_time_jitter_max_ms + jitter of ping_time in milliseconds """ if random_number_generator is None: random_number_generator = np.random.default_rng() diff --git a/echopype/tests/commongrid/test_api.py b/echopype/tests/commongrid/test_api.py index b0276ca72..d618d6443 100644 --- a/echopype/tests/commongrid/test_api.py +++ b/echopype/tests/commongrid/test_api.py @@ -41,7 +41,7 @@ def test_compute_NASC(test_data_samples): # MVBS Tests @pytest.mark.integration -def test_compute_MVBS_index_binning(ds_Sv_er_regular, regular_data_params): +def test_compute_MVBS_index_binning(ds_Sv_echo_range_regular, regular_data_params): """Test compute_MVBS_index_binning on mock data""" ping_num = 3 # number of pings to average over @@ -52,7 +52,7 @@ def test_compute_MVBS_index_binning(ds_Sv_er_regular, regular_data_params): # Binned MVBS test ds_MVBS = ep.commongrid.compute_MVBS_index_binning( - ds_Sv_er_regular, range_sample_num=range_sample_num, ping_num=ping_num + ds_Sv_echo_range_regular, range_sample_num=range_sample_num, ping_num=ping_num ) # Shape test @@ -63,7 +63,7 @@ def test_compute_MVBS_index_binning(ds_Sv_er_regular, regular_data_params): # Expected values compute # average should be done in linear domain - da_sv = 10 ** (ds_Sv_er_regular["Sv"] / 10) + da_sv = 10 ** (ds_Sv_echo_range_regular["Sv"] / 10) expected = 10 * np.log10( da_sv.coarsen(ping_time=ping_num, range_sample=range_sample_num, boundary="pad").mean( skipna=True @@ -78,27 +78,27 @@ def test_compute_MVBS_index_binning(ds_Sv_er_regular, regular_data_params): @pytest.mark.parametrize( ["range_bin", "ping_time_bin"], [(5, "10S"), ("10m", 10), ("10km", "10S"), ("10", "10S")] ) -def test_compute_MVBS_bin_inputs_fail(ds_Sv_er_regular, range_bin, ping_time_bin): +def test_compute_MVBS_bin_inputs_fail(ds_Sv_echo_range_regular, range_bin, ping_time_bin): expected_error = ValueError if isinstance(range_bin, int) or isinstance(ping_time_bin, int): expected_error = TypeError - match = r"\w+ must be a string" + match = r"must be a string" else: - match = r"range_bin must be in meters" + match = r"Range bin must be in meters" with pytest.raises(expected_error, match=match): ep.commongrid.compute_MVBS( - ds_Sv_er_regular, range_bin=range_bin, ping_time_bin=ping_time_bin + ds_Sv_echo_range_regular, range_bin=range_bin, ping_time_bin=ping_time_bin ) @pytest.mark.unit -def test_compute_MVBS_w_latlon(ds_Sv_er_regular_w_latlon, lat_attrs, lon_attrs): +def test_compute_MVBS_w_latlon(ds_Sv_echo_range_regular_w_latlon, lat_attrs, lon_attrs): """Testing for compute_MVBS with latitude and longitude""" from echopype.consolidate.api import POSITION_VARIABLES ds_MVBS = ep.commongrid.compute_MVBS( - ds_Sv_er_regular_w_latlon, range_bin="5m", ping_time_bin="10S" + ds_Sv_echo_range_regular_w_latlon, range_bin="5m", ping_time_bin="10S" ) for var in POSITION_VARIABLES: # Check to ensure variable is in dataset @@ -115,17 +115,17 @@ def test_compute_MVBS_w_latlon(ds_Sv_er_regular_w_latlon, lat_attrs, lon_attrs): @pytest.mark.unit @pytest.mark.parametrize("range_var", ["my_range", "echo_range", "depth"]) -def test_compute_MVBS_invalid_range_var(ds_Sv_er_regular, range_var): +def test_compute_MVBS_invalid_range_var(ds_Sv_echo_range_regular, range_var): """Test compute MVBS range_var on mock data""" if range_var == "my_range": with pytest.raises(ValueError, match="range_var must be one of 'echo_range' or 'depth'."): - ep.commongrid.compute_MVBS(ds_Sv_er_regular, range_var=range_var) + ep.commongrid.compute_MVBS(ds_Sv_echo_range_regular, range_var=range_var) elif range_var == "depth": with pytest.raises( ValueError, match=f"range_var '{range_var}' does not exist in the input dataset." ): - ep.commongrid.compute_MVBS(ds_Sv_er_regular, range_var=range_var) + ep.commongrid.compute_MVBS(ds_Sv_echo_range_regular, range_var=range_var) else: pass @@ -173,12 +173,18 @@ def test_compute_MVBS(test_data_samples): ], ) def test_compute_MVBS_range_output(request, er_type): - """Tests the output of compute_MVBS on regular and irregular data.""" + """ + Tests the shape of compute_MVBS output on regular and irregular data. + The irregularity in the input echo_range would cause some rows or columns + of the output Sv to contain NaN. + Here we test for the expected shape after dropping the NaNs + for specific ping_time bins. + """ # set jitter=0 to get predictable number of ping within each echo_range groups if er_type == "regular": - ds_Sv = request.getfixturevalue("ds_Sv_er_regular") + ds_Sv = request.getfixturevalue("ds_Sv_echo_range_regular") else: - ds_Sv = request.getfixturevalue("ds_Sv_er_irregular") + ds_Sv = request.getfixturevalue("ds_Sv_echo_range_irregular") ds_MVBS = ep.commongrid.compute_MVBS(ds_Sv, range_bin="5m", ping_time_bin="10S") @@ -261,12 +267,12 @@ def _parse_nans(mvbs, ds_Sv) -> np.ndarray: ping_time_bin = "1s" if er_type == "regular": - ds_Sv = request.getfixturevalue("mock_sv_dataset_regular") + ds_Sv = request.getfixturevalue("mock_Sv_dataset_regular") expected_mvbs = request.getfixturevalue("mock_mvbs_array_regular") else: # Mock irregular dataset contains jitter # and NaN values in the bottom echo_range - ds_Sv = request.getfixturevalue("mock_sv_dataset_irregular") + ds_Sv = request.getfixturevalue("mock_Sv_dataset_irregular") expected_mvbs = request.getfixturevalue("mock_mvbs_array_irregular") ds_MVBS = ep.commongrid.compute_MVBS(ds_Sv, range_bin=range_bin, ping_time_bin=ping_time_bin) diff --git a/echopype/tests/commongrid/test_mvbs.py b/echopype/tests/commongrid/test_mvbs.py index b08048c66..78a77be0a 100644 --- a/echopype/tests/commongrid/test_mvbs.py +++ b/echopype/tests/commongrid/test_mvbs.py @@ -19,11 +19,11 @@ def test_get_MVBS_along_channels(request, range_var, lat_lon): # Retrieve the correct dataset if range_var == "depth": - ds_Sv = request.getfixturevalue("ds_Sv_er_regular_w_depth") + ds_Sv = request.getfixturevalue("ds_Sv_echo_range_regular_w_depth") elif range_var == "echo_range" and lat_lon is True: - ds_Sv = request.getfixturevalue("ds_Sv_er_regular_w_latlon") + ds_Sv = request.getfixturevalue("ds_Sv_echo_range_regular_w_latlon") else: - ds_Sv = request.getfixturevalue("ds_Sv_er_regular") + ds_Sv = request.getfixturevalue("ds_Sv_echo_range_regular") # compute range interval echo_range_max = ds_Sv[range_var].max()