diff --git a/CHANGES.md b/CHANGES.md
index 9a06cef79..27a9a0336 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -3,6 +3,7 @@
 * Values of scalar variables are now always shown in **VARIABLES** panel in Cate Desktop [#702](https://github.com/CCI-Tools/cate/issues/702)
 * Added information about resources of type `GeoDataFrame` (Shapefiles, GeoJSON) in the details section of the **WORSPACE** panel in Cate Desktop [#705](https://github.com/CCI-Tools/cate/issues/705)
+* Added new operation `merge()` [#740](https://github.com/CCI-Tools/cate/issues/740)
 * Added new operation `data_frame_subset()` [#708](https://github.com/CCI-Tools/cate/issues/708)
 * Fixed display of CCI Sea Level MSLAMPH data [#722](https://github.com/CCI-Tools/cate/issues/722)
 * Improve indexers to first do a validation with respect to the available dimensions and the selected remaining_dims
diff --git a/cate/ops/utility.py b/cate/ops/utility.py
index a4977e096..1d545991c 100644
--- a/cate/ops/utility.py
+++ b/cate/ops/utility.py
@@ -37,7 +37,69 @@
 from cate.util.monitor import Monitor


-@op(tags=['utility', 'internal'])
+@op(tags=['utility'])
+@op_input('ds_1', data_type=DatasetLike)
+@op_input('ds_2', data_type=DatasetLike)
+@op_input('ds_3', data_type=DatasetLike)
+@op_input('ds_4', data_type=DatasetLike)
+@op_input('join', value_set=["outer", "inner", "left", "right", "exact"])
+@op_input('compat', value_set=["identical", "equals", "broadcast_equals", "no_conflicts"])
+def merge(ds_1: DatasetLike.TYPE,
+          ds_2: DatasetLike.TYPE,
+          ds_3: DatasetLike.TYPE = None,
+          ds_4: DatasetLike.TYPE = None,
+          join: str = 'outer',
+          compat: str = 'no_conflicts') -> xr.Dataset:
+    """
+    Merge up to four datasets to produce a new dataset with combined variables from each input dataset.
+
+    This is a wrapper for the ``xarray.merge()`` function.
+
+    For documentation refer to xarray documentation at
+    http://xarray.pydata.org/en/stable/generated/xarray.Dataset.merge.html#xarray.Dataset.merge
+
+    The *compat* argument indicates how to compare variables of the same name for potential conflicts:
+
+    * "broadcast_equals": all values must be equal when variables are broadcast
+      against each other to ensure common dimensions.
+    * "equals": all values and dimensions must be the same.
+    * "identical": all values, dimensions and attributes must be the same.
+    * "no_conflicts": only values which are not null in both datasets must be equal.
+      The returned dataset then contains the combination of all non-null values.
+
+    :param ds_1: The first input dataset.
+    :param ds_2: The second input dataset.
+    :param ds_3: An optional 3rd input dataset.
+    :param ds_4: An optional 4th input dataset.
+    :param join: How to combine objects with different indexes.
+    :param compat: How to compare variables of the same name for potential conflicts.
+    :return: A new dataset with combined variables from each input dataset.
+    """
+
+    ds_1 = DatasetLike.convert(ds_1)
+    ds_2 = DatasetLike.convert(ds_2)
+    ds_3 = DatasetLike.convert(ds_3)
+    ds_4 = DatasetLike.convert(ds_4)
+
+    datasets = []
+    for ds in (ds_1, ds_2, ds_3, ds_4):
+        if ds is not None:
+            included = False
+            for ds2 in datasets:
+                if ds is ds2:
+                    included = True
+            if not included:
+                datasets.append(ds)
+
+    if len(datasets) == 0:
+        raise ValidationError('At least two different datasets must be given')
+    elif len(datasets) == 1:
+        return datasets[0]
+    else:
+        return xr.merge(datasets, compat=compat, join=join)
+
+
+@op(tags=['utility'])
 @op_input('ds', data_type=DatasetLike)
 @op_input('point', data_type=PointLike, units='degree')
 @op_input('time', data_type=TimeLike)
diff --git a/test/cli/test_main.py b/test/cli/test_main.py
index ec516fd84..e0a91a6fe 100644
--- a/test/cli/test_main.py
+++ b/test/cli/test_main.py
@@ -311,7 +311,7 @@ def test_op_list(self):
         self.assert_main(['op', 'list'], expected_stdout=['operations found'])
         self.assert_main(['op', 'list', '-n', 'read'], expected_stdout=['operations found'])
         self.assert_main(['op', 'list', '-n', 'nevermatch'], expected_stdout=['No operations found'])
-        self.assert_main(['op', 'list', '--internal'], expected_stdout=['2 operations found'])
+        self.assert_main(['op', 'list', '--internal'], expected_stdout=['One operation found'])
         self.assert_main(['op', 'list', '--tag', 'input'], expected_stdout=['9 operations found'])
         self.assert_main(['op', 'list', '--tag', 'output'], expected_stdout=['6 operations found'])
         self.assert_main(['op', 'list', '--deprecated'], expected_stdout=['2 operations found'])
diff --git a/test/ops/test_utility.py b/test/ops/test_utility.py
index 887080f26..1195283a3 100644
--- a/test/ops/test_utility.py
+++ b/test/ops/test_utility.py
@@ -9,42 +9,67 @@
 import xarray as xr

 from cate.core.op import OP_REGISTRY
-from cate.ops.utility import sel, from_dataframe, identity, literal, pandas_fillna
+from cate.core.types import ValidationError
+from cate.ops.utility import merge, sel, from_dataframe, identity, literal, pandas_fillna
 from cate.util.misc import object_to_qualified_name


-def new_ds():
-    lon = [10.1, 10.2, 10.3, 10.4]
-    lat = [34.5, 34.6]
-    time = pd.date_range('2014-09-06', periods=10)
-    reference_time = pd.Timestamp('2014-09-05')
-
-    time_res = len(time)
-    lon_res = len(lon)
-    lat_res = len(lat)
-
-    temperature = (15 + 8 * np.random.randn(lon_res, lat_res, time_res)).round(decimals=1)
-    precipitation = (10 * np.random.rand(lon_res, lat_res, time_res)).round(decimals=1)
-
-    ds = xr.Dataset({'temperature': (['lon', 'lat', 'time'], temperature),
-                     'precipitation': (['lon', 'lat', 'time'], precipitation)
-                     },
-                    coords={'lon': lon,
-                            'lat': lat,
-                            'time': time,
-                            'reference_time': reference_time
-                            })
-    return ds
-
-
-def assert_dataset_equal(expected, actual):
-    # this method is functionally equivalent to
-    # `assert expected == actual`, but it checks each aspect
-    # of equality separately for easier debugging
-    assert expected.equals(actual), (expected, actual)
-
-
-class TestSel(TestCase):
+class MergeTest(TestCase):
+    def test_nominal(self):
+        """
+        Test nominal execution
+        """
+        periods = 5
+        time = pd.date_range('2000-01-01', periods=periods)
+
+        ds_1 = xr.Dataset({'A': (['time'], np.random.randn(periods)),
+                           'B': (['time'], np.random.randn(periods)),
+                           'time': time})
+        ds_2 = xr.Dataset({'C': (['time'], np.random.randn(periods)),
+                           'D': (['time'], np.random.randn(periods)),
+                           'time': time})
+        new_ds = merge(ds_1=ds_1, ds_2=ds_2, ds_3=None, ds_4=None)
+        self.assertTrue('A' in new_ds)
+        self.assertTrue('B' in new_ds)
+        self.assertTrue('C' in new_ds)
+        self.assertTrue('D' in new_ds)
+
+        new_ds = merge(ds_1=ds_1, ds_2=ds_1, ds_3=ds_1, ds_4=ds_2)
+        self.assertTrue('A' in new_ds)
+        self.assertTrue('B' in new_ds)
+        self.assertTrue('C' in new_ds)
+        self.assertTrue('D' in new_ds)
+
+        new_ds = merge(ds_1=ds_1, ds_2=ds_1, ds_3=ds_1, ds_4=ds_1)
+        self.assertIs(new_ds, ds_1)
+
+        new_ds = merge(ds_1=ds_2, ds_2=ds_2, ds_3=ds_2, ds_4=ds_2)
+        self.assertIs(new_ds, ds_2)
+
+        ds_3 = xr.Dataset({'E': (['time'], np.random.randn(periods)),
+                           'time': time})
+        new_ds = merge(ds_1=ds_1, ds_2=ds_2, ds_3=ds_3, ds_4=None)
+        self.assertTrue('A' in new_ds)
+        self.assertTrue('B' in new_ds)
+        self.assertTrue('C' in new_ds)
+        self.assertTrue('D' in new_ds)
+        self.assertTrue('E' in new_ds)
+
+        ds_4 = xr.Dataset({'F': (['time'], np.random.randn(periods)),
+                           'time': time})
+        new_ds = merge(ds_1=ds_1, ds_2=ds_2, ds_3=ds_3, ds_4=ds_4)
+        self.assertTrue('A' in new_ds)
+        self.assertTrue('B' in new_ds)
+        self.assertTrue('C' in new_ds)
+        self.assertTrue('D' in new_ds)
+        self.assertTrue('E' in new_ds)
+
+    def test_failures(self):
+        with self.assertRaises(ValidationError):
+            merge(ds_1=None, ds_2=None, ds_3=None, ds_4=None)
+
+
+class SelTest(TestCase):
     def test_nominal(self):
         ds = new_ds()

@@ -173,6 +198,7 @@ class TestFillna(TestCase):
     """
     Test fillna operation
     """
+
     def test_nominal(self):
         """
         Test nominal operation
@@ -214,3 +240,34 @@ def test_registered(self):
         actual = reg_op(df=df, method='ffill')

         self.assertTrue(actual.equals(expected))
+
+
+def new_ds():
+    lon = [10.1, 10.2, 10.3, 10.4]
+    lat = [34.5, 34.6]
+    time = pd.date_range('2014-09-06', periods=10)
+    reference_time = pd.Timestamp('2014-09-05')
+
+    time_res = len(time)
+    lon_res = len(lon)
+    lat_res = len(lat)
+
+    temperature = (15 + 8 * np.random.randn(lon_res, lat_res, time_res)).round(decimals=1)
+    precipitation = (10 * np.random.rand(lon_res, lat_res, time_res)).round(decimals=1)
+
+    ds = xr.Dataset({'temperature': (['lon', 'lat', 'time'], temperature),
+                     'precipitation': (['lon', 'lat', 'time'], precipitation)
+                     },
+                    coords={'lon': lon,
+                            'lat': lat,
+                            'time': time,
+                            'reference_time': reference_time
+                            })
+    return ds
+
+
+def assert_dataset_equal(expected, actual):
+    # this method is functionally equivalent to
+    # `assert expected == actual`, but it checks each aspect
+    # of equality separately for easier debugging
+    assert expected.equals(actual), (expected, actual)
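
Reviewer note, not part of the patch: below is a minimal, hypothetical usage sketch of the new merge() operation, assuming this branch is installed so that cate.ops.utility.merge is importable. The dataset and variable names (ds_a, ds_b, 'A', 'B') are illustrative only; the defaults shown (join='outer', compat='no_conflicts') are those documented in the docstring above.

    # Hypothetical usage sketch of the merge() operation added in this change.
    import numpy as np
    import pandas as pd
    import xarray as xr

    from cate.ops.utility import merge

    # Two small datasets sharing a 'time' coordinate, mirroring MergeTest.test_nominal.
    time = pd.date_range('2000-01-01', periods=5)
    ds_a = xr.Dataset({'A': (['time'], np.random.randn(5))}, coords={'time': time})
    ds_b = xr.Dataset({'B': (['time'], np.random.randn(5))}, coords={'time': time})

    # ds_3 and ds_4 default to None; join='outer' and compat='no_conflicts' are the defaults.
    combined = merge(ds_1=ds_a, ds_2=ds_b)
    print(sorted(combined.data_vars))  # expected: ['A', 'B']

Since merge() is registered via @op, the same call could also be made through the Cate operation registry or the Cate Desktop GUI; the direct function call above is just the simplest way to exercise it.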