diff --git a/README.md b/README.md
index da13683..ddefe9c 100644
--- a/README.md
+++ b/README.md
@@ -66,15 +66,22 @@ By default, a data ID represents one item, which is opened as a dataset, with each
 asset becoming a data variable within the dataset. Additionally, a stack mode is
-available, enabling the stacking of items using [odc-stac](https://odc-stac.readthedocs.io/en/latest/).
-This allows for mosaicking multiple tiles and concatenating the datacube along the
-temporal axis.
+available, enabling the stacking of items using the core functionality of [xcube](https://xcube.readthedocs.io/en/latest/).
+This allows for mosaicking multiple tiles grouped by solar day and concatenating
+the datacube along the temporal axis.
 
-Also, [stackstac](https://stackstac.readthedocs.io/en/latest/) has been
+Also, [odc-stac](https://odc-stac.readthedocs.io/en/latest/) and
+[stackstac](https://stackstac.readthedocs.io/en/latest/) have been
 considered during the evaluation of python libraries supporting stacking of STAC
 items.
-However, the [benchmarking report](https://benchmark-odc-stac-vs-stackstac.netlify.app/)
-comparing stackstac and odc-stac shows that ocd-stac outperforms stackstac. Furthermore,
-stackstac shows an [issue](https://github.com/gjoseph92/stackstac/issues/196) in making
+However, both stacking libraries depend on the GDAL driver for reading the data with
+`rasterio.open`, which prevents reading the data from the
+[CDSE S3 endpoint](https://documentation.dataspace.copernicus.eu/APIs/S3.html), since
+the rasterio AWS environment is blocked.
+Comparing [odc-stac](https://odc-stac.readthedocs.io/en/latest/) and
+[stackstac](https://stackstac.readthedocs.io/en/latest/),
+the [benchmarking report](https://benchmark-odc-stac-vs-stackstac.netlify.app/) shows
+that odc-stac outperforms stackstac. Furthermore, stackstac shows an
+[issue](https://github.com/gjoseph92/stackstac/issues/196) in making
 use of the overview levels of COGs files. Still, stackstac shows high popularity in the
 community and might be supported in the future.
 
@@ -83,6 +90,11 @@ community and might be supported in the future.
 ### Overview of Jupyter notebooks
 
 The following Jupyter notebooks provide some examples:
+* `example/notebooks/cdse_sentinel_2.ipynb`:
+  This notebook shows an example of how to stack multiple tiles of Sentinel-2 L2A data
+  using the [CDSE STAC API](https://documentation.dataspace.copernicus.eu/APIs/STAC.html).
+  It shows stacking of individual tiles and mosaicking of multiple tiles measured on
+  the same solar day.
 * `example/notebooks/earth_search_sentinel2_l2a_stack_mode.ipynb`:
   This notebook shows an example how to stack multiple tiles of Sentinel-2 L2A data
   from Earth Search by Element 84 STAC API. It shows stacking of individual tiles and
@@ -124,8 +136,7 @@ and is specified by the segment of the URL that follows the catalog's URL.
 The `data_type` can be set to `dataset` and `mldataset`, which returns a `xr.Dataset`
 and a [xcube multi-resolution dataset](https://xcube.readthedocs.io/en/latest/mldatasets.html),
 respectively. Note that in the above example, if `data_type` is not assigned,
-a multi-resolution dataset will be returned. This is because the item's asset links to
-GeoTIFFs, which are opened as multi-resolution datasets by default.
+an `xarray.Dataset` will be returned.
 
 To use the stack mode, initiate a stac store with the argument `stack_mode=True`.
@@ -138,50 +149,22 @@ store = new_data_store(
     stack_mode=True
 )
 ds = store.open_data(
-    "sentinel-2-l2a",
-    data_type="dataset",
-    bbox=[9.1, 53.1, 10.7, 54],
-    time_range= ["2020-07-01", "2020-08-01"],
-    query={"s2:processing_baseline": {"eq": "02.14"}},
+    "sentinel-2-l2a",
+    bbox=[506700, 5883400, 611416, 5984840],
+    time_range=["2020-07-15", "2020-08-01"],
+    crs="EPSG:32632",
+    spatial_res=20,
+    asset_names=["red", "green", "blue"],
+    apply_scaling=True,
 )
 ```
 
 In the stacking mode, the data IDs are the collection IDs within the STAC catalog. To
 get Sentinel-2 L2A data, we assign `data_id` to `"sentinel-2-l2a"`. The bounding box and
-time range are assigned to define the temporal and spatial extent of the data cube.
-Additionally, for this example, we need to set a query argument to select a specific
-[Sentinel-2 processing baseline](https://sentiwiki.copernicus.eu/web/s2-processing#S2Processing-L2Aprocessingbaseline),
-as the collection contains multiple items for the same tile with different processing
-procedures. Note that this requirement can vary between collections and must be
-specified by the user. To set query arguments, the STAC catalog needs to be conform with
-the [query extension](https://github.com/stac-api-extensions/query).
-
-The stacking is performed using [odc-stac](https://odc-stac.readthedocs.io/en/latest/).
-All arguments of [odc.stac.load](https://odc-stac.readthedocs.io/en/latest/_api/odc.stac.load.html)
-can be passed into the `open_data(...)` method, which forwards them to the
-`odc.stac.load` function.
-
-To apply mosaicking, we need to assign `groupby="solar_day"`, as shown in the
-[documentation of `odc.stac.load`](https://odc-stac.readthedocs.io/en/latest/_api/odc.stac.load.html).
-The following few lines of code show a small example including mosaicking.
-
-```python
-from xcube.core.store import new_data_store
-
-store = new_data_store(
-    "stac",
-    url="https://earth-search.aws.element84.com/v1",
-    stack_mode=True
-)
-ds = store.open_data(
-    "sentinel-2-l2a",
-    data_type="dataset",
-    bbox=[9.1, 53.1, 10.7, 54],
-    time_range= ["2020-07-01", "2020-08-01"],
-    query={"s2:processing_baseline": {"eq": "02.14"}},
-    groupby="solar_day",
-)
-```
+time range are assigned to define the temporal and spatial extent of the data cube. The
+parameters `crs` and `spatial_res` are required as well and define the coordinate
+reference system (CRS) and the spatial resolution, respectively. Note that the bounding
+box and spatial resolution need to be given in the specified CRS.
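For the dedicated CDSE store, a stacked cube can be requested in the same way. The
following is a minimal sketch whose parameters are taken from the commented-out CDSE
stacking test that this PR removes from `test/test_store.py`; the credential values are
placeholders, and `DATA_STORE_ID_CDSE` is the store identifier constant used by the
tests:

```python
from xcube.core.store import new_data_store

from xcube_stac.constants import DATA_STORE_ID_CDSE

# CDSE S3 credentials are required; the values below are placeholders.
store = new_data_store(
    DATA_STORE_ID_CDSE,
    key="<cdse-s3-key>",
    secret="<cdse-s3-secret>",
    stack_mode=True,
)
# Tiles acquired on the same solar day are mosaicked automatically;
# the bounding box is given in the CRS specified by `crs`.
ds = store.open_data(
    data_id="SENTINEL-2",
    bbox=[659574, 5892990, 659724, 5893140],
    time_range=["2023-11-01", "2023-11-10"],
    processing_level="L2A",
    spatial_res=10,
    crs="EPSG:32635",
    asset_names=["B03", "B04"],
    apply_scaling=True,
)
```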
 ## Testing
 
diff --git a/test/test_accessor.py b/test/test_accessor.py
index beca2aa..8f28822 100644
--- a/test/test_accessor.py
+++ b/test/test_accessor.py
@@ -21,6 +21,7 @@
 import unittest
 from unittest.mock import patch
+from unittest.mock import MagicMock
 
 import dask
 import dask.array as da
@@ -59,18 +60,25 @@ def test_del(self):
     def test_root(self):
         self.assertEqual("eodata", self.accessor.root)
 
+    @patch("rasterio.open")
     @patch("rioxarray.open_rasterio")
-    def test_open_data(self, mock_open_rasterio):
-        # set-up mock
+    def test_open_data(self, mock_rioxarray_open, mock_rasterio_open):
+        # set-up mock for rioxarray.open_rasterio
         mock_data = {
             "band_1": (("y", "x"), da.ones((2048, 2048), chunks=(1024, 1024))),
         }
         mock_ds = xr.Dataset(mock_data)
-        mock_open_rasterio.return_value = mock_ds
+        mock_rioxarray_open.return_value = mock_ds
+        # set-up mock for rasterio.open
+        mock_rio_dataset = MagicMock()
+        mock_rio_dataset.overviews.return_value = [2, 4, 8]
+        mock_rasterio_open.return_value.__enter__.return_value = mock_rio_dataset
+
+        # start tests
         access_params = dict(protocol="s3", root="eodata", fs_path="test.tif")
         ds = self.accessor.open_data(access_params)
-        mock_open_rasterio.assert_called_once_with(
+        mock_rioxarray_open.assert_called_once_with(
             "s3://eodata/test.tif",
             chunks=dict(x=1024, y=1024),
             band_as_variable=True,
@@ -97,8 +105,10 @@ def test_open_data(self, mock_open_rasterio):
         mlds = self.accessor.open_data(access_params, data_type="mldataset")
         self.assertIsInstance(mlds, MultiLevelDataset)
+        self.assertEqual(4, mlds.num_levels)
+        mock_rasterio_open.assert_called_once_with("s3://eodata/test.tif")
         ds = mlds.base_dataset
-        mock_open_rasterio.assert_called_with(
+        mock_rioxarray_open.assert_called_with(
             "s3://eodata/test.tif",
             overview_level=None,
             chunks=dict(x=1024, y=1024),
diff --git a/test/test_helper.py b/test/test_helper.py
new file mode 100644
index 0000000..b422c11
--- /dev/null
+++ b/test/test_helper.py
@@ -0,0 +1,145 @@
+# The MIT License (MIT)
+# Copyright (c) 2024 by the xcube development team and contributors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NON INFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import unittest
+from unittest.mock import patch
+import datetime
+
+import pystac
+
+from xcube_stac.helper import HelperCdse
+
+
+class HelperCdseTest(unittest.TestCase):
+
+    def setUp(self):
+        self.asset = pystac.Asset(
+            href="test_href",
+            media_type="dummy",
+            roles=["data"],
+            extra_fields=dict(
+                alternate=dict(
+                    s3=dict(
+                        href=(
+                            "/eodata/Sentinel-2/MSI/L2A/2024/11/07/S2A_MSIL2A_20241107"
+                            "T113311_N0511_R080_T31VDG_20241107T123948.SAFE"
+                        )
+                    )
+                )
+            ),
+        )
+        self.item = pystac.Item(
+            id="cdse_item_parts",
+            geometry={
+                "type": "Polygon",
+                "coordinates": [
+                    [
+                        [100.0, 0.0],
+                        [101.0, 0.0],
+                        [101.0, 1.0],
+                        [100.0, 1.0],
+                        [100.0, 0.0],
+                    ]
+                ],
+            },
+            bbox=[100.0, 0.0, 101.0, 1.0],
+            datetime=datetime.datetime(2023, 1, 1, 0, 0, 0),
+            properties=dict(
+                tileId="title_id",
+                orbitNumber=0,
+            ),
+        )
+        self.item.add_asset("PRODUCT", self.asset)
+
+    @patch("s3fs.S3FileSystem.glob")
+    def test_parse_item(self, mock_glob):
+        mock_glob.return_value = [
+            "eodata/Sentinel-2/MSI/L2A/2024/11/07/S2A_MSIL2A_20241107T113311_N0511"
+            "_R080_T31VDG_20241107T123948.SAFE/GRANULE/L2A_T32TMT_A017394_"
+            "20200705T101917/IMG_DATA/dummy.jp2"
+        ]
+
+        helper = HelperCdse(
+            client_kwargs=dict(endpoint_url="https://eodata.dataspace.copernicus.eu"),
+            key="xxx",
+            secret="xxx",
+        )
+
+        item = self.item
+        item.properties["processorVersion"] = "02.14"
+        item_parsed = helper.parse_item(
+            self.item, asset_names=["B01", "B02"], crs="EPSG:4326", spatial_res=0.001
+        )
+        self.assertIn("B01", item_parsed.assets)
+        self.assertEqual(
+            0, item_parsed.assets["B01"].extra_fields["raster:bands"][0]["offset"]
+        )
+        self.assertIn("B02", item_parsed.assets)
+        self.assertEqual(
+            (
+                "eodata/Sentinel-2/MSI/L2A/2024/11/07/S2A_MSIL2A_20241107T113311_N0511"
+                "_R080_T31VDG_20241107T123948.SAFE/GRANULE/L2A_Ttitle_id_A000000_"
+                "20200705T101917/IMG_DATA/R60m/Ttitle_id_parts_B02_60m.jp2"
+            ),
+            item_parsed.assets["B02"].href,
+        )
+        item = self.item
+        item.properties["processorVersion"] = "05.00"
+        item_parsed = helper.parse_item(
+            self.item, asset_names=["B01", "B02"], crs="EPSG:4326", spatial_res=0.001
+        )
+        self.assertIn("B01", item_parsed.assets)
+        self.assertEqual(
+            -0.1, item_parsed.assets["B01"].extra_fields["raster:bands"][0]["offset"]
+        )
+        self.assertIn("B02", item_parsed.assets)
+
+    @patch("s3fs.S3FileSystem.glob")
+    def test_get_data_access_params(self, mock_glob):
+        mock_glob.return_value = [
+            "eodata/Sentinel-2/MSI/L2A/2024/11/07/S2A_MSIL2A_20241107T113311_N0511"
+            "_R080_T31VDG_20241107T123948.SAFE/GRANULE/L2A_T32TMT_A017394_"
+            "20200705T101917/IMG_DATA/dummy.jp2"
+        ]
+        helper = HelperCdse(
+            client_kwargs=dict(endpoint_url="https://eodata.dataspace.copernicus.eu"),
+            key="xxx",
+            secret="xxx",
+        )
+        item = self.item
+        item.properties["processorVersion"] = "05.00"
+        item_parsed = helper.parse_item(
+            self.item, asset_names=["B01", "B02"], crs="EPSG:3035", spatial_res=20
+        )
+        data_access_params = helper.get_data_access_params(
+            item_parsed, asset_names=["B01", "B02"], crs="EPSG:3035", spatial_res=20
+        )
+        self.assertEqual("B01", data_access_params["B01"]["name"])
+        self.assertEqual("s3", data_access_params["B01"]["protocol"])
+        self.assertEqual("eodata", data_access_params["B01"]["root"])
+        self.assertEqual(
+            (
+                "Sentinel-2/MSI/L2A/2024/11/07/S2A_MSIL2A_20241107T113311_N0511_R080_"
+                "T31VDG_20241107T123948.SAFE/GRANULE/L2A_T32TMT_A017394_20200705T101917"
+                "/IMG_DATA/dummy.jp2"
+            ),
+            data_access_params["B01"]["fs_path"],
+        )
diff --git a/test/test_store.py b/test/test_store.py
index 2e4d8c5..ca081a7 100644
--- a/test/test_store.py
+++ b/test/test_store.py
@@ -38,7 +38,6 @@
 from xcube_stac.constants import DATA_STORE_ID
 from xcube_stac.constants import DATA_STORE_ID_XCUBE
 from xcube_stac.constants import DATA_STORE_ID_CDSE
-from xcube_stac.sen2.constants import CDSE_SENTINEL_2_LEVEL_BAND_RESOLUTIONS
 
 from xcube_stac.accessor import HttpsDataAccessor
 from xcube_stac.accessor import S3DataAccessor
@@ -253,6 +252,7 @@ def test_get_data_opener_ids(self):
                 data_id=self.data_id_nonsearchable, data_type="dataset"
             ),
         )
+
         # CDSE STAC API Sentinel-2
         store = new_data_store(
             DATA_STORE_ID_CDSE,
@@ -440,13 +440,22 @@ def test_open_data_tiff(self):
         )
 
         # open data with open_params
-        mlds = store.open_data(
-            self.data_id_time_range,
-            asset_names=["blue_p25", "crs"],
-            data_type="mldataset",
+        with self.assertLogs("xcube.stac", level="WARNING") as cm:
+            mlds = store.open_data(
+                self.data_id_time_range,
+                asset_names=["blue_p25"],
+                data_type="mldataset",
+                apply_scaling=True,
+            )
+            ds = mlds.base_dataset
+        self.assertEqual(1, len(cm.output))
+        msg = (
+            "WARNING:xcube.stac:The asset blue_p25 in item "
+            "lcv_blue_landsat.glad.ard_1999.12.02..2000.03.20 is not conform to "
+            "the stac-extension 'raster'. No scaling is applied."
         )
+        self.assertEqual(msg, str(cm.output[-1]))
         self.assertIsInstance(mlds, MultiLevelDataset)
-        ds = mlds.base_dataset
         self.assertCountEqual(["blue_p25", "crs"], list(ds.data_vars))
         self.assertCountEqual([151000, 188000], [ds.sizes["y"], ds.sizes["x"]])
         self.assertCountEqual(
@@ -587,24 +596,30 @@ def test_open_data_xcube_server(self):
             ],
         )
 
-        # open data store in tif format
-        mlds = store.open_data(
-            "collections/datacubes/items/cog_local", data_type="mldataset"
-        )
-        ds = mlds.base_dataset
-        self.assertIsInstance(mlds, MultiLevelDataset)
-        self.assertEqual(3, mlds.num_levels)
-        self.assertIsInstance(ds, xr.Dataset)
-        self.assertCountEqual(
-            [
-                "analytic_multires_band_1",
-                "analytic_multires_band_2",
-                "analytic_multires_band_3",
-                "analytic_multires_spatial_ref",
-            ],
-            list(ds.data_vars),
-        )
-        self.assertCountEqual([343, 343], [ds.sizes["y"], ds.sizes["x"]])
+        # open data as ml dataset
+        mldss = [
+            store.open_data(
+                "collections/datacubes/items/cog_local", data_type="mldataset"
+            ),
+            store.open_data(
+                "collections/datacubes/items/cog_local", opener_id="mldataset:levels:s3"
+            ),
+        ]
+        for mlds in mldss:
+            ds = mlds.base_dataset
+            self.assertIsInstance(mlds, MultiLevelDataset)
+            self.assertEqual(3, mlds.num_levels)
+            self.assertIsInstance(ds, xr.Dataset)
+            self.assertCountEqual(
+                [
+                    "analytic_multires_band_1",
+                    "analytic_multires_band_2",
+                    "analytic_multires_band_3",
+                    "analytic_multires_spatial_ref",
+                ],
+                list(ds.data_vars),
+            )
+            self.assertCountEqual([343, 343], [ds.sizes["y"], ds.sizes["x"]])
 
         # raise error when selecting "analytic" (asset linking to the dataset) and
         # "analytic_multires" (asset linking to the mldataset)
@@ -648,82 +663,6 @@ def test_open_data_stack_mode(self):
             [ds.chunksizes["time"][0], ds.chunksizes["y"][0], ds.chunksizes["x"][0]],
         )
 
-        # TODO: the following two tests work fine locally. However, s3fs is not supported
-        # TODO: by vcrpy. New testing strategy needs to be developed in an upcoming PR.
-    # @pytest.mark.vcr()
-    # def test_open_data_cdse_seninel_2(self):
-    #     store = new_data_store(
-    #         DATA_STORE_ID_CDSE,
-    #         key=CDSE_CREDENTIALS["key"],
-    #         secret=CDSE_CREDENTIALS["secret"],
-    #         apply_scaling=True,
-    #     )
-    #
-    #     # open data as dataset
-    #     ds = store.open_data(self.data_id_cdse_sen2)
-    #     self.assertIsInstance(ds, xr.Dataset)
-    #     data_vars = list(ds.data_vars)
-    #     data_vars.remove("crs")
-    #     self.assertCountEqual(
-    #         CDSE_SENTINEL_2_LEVEL_BAND_RESOLUTIONS["L2A"].keys(),
-    #         data_vars,
-    #     )
-    #     self.assertCountEqual([10980, 10980], [ds.sizes["y"], ds.sizes["x"]])
-    #     self.assertCountEqual(
-    #         [1024, 1024], [ds.B01.chunksizes["x"][0], ds.AOT.chunksizes["y"][0]]
-    #     )
-    #
-    #     # open data as multi-level dataset
-    #     mlds = store.open_data(
-    #         self.data_id_cdse_sen2,
-    #         asset_names=["B04", "B03", "B02"],
-    #         data_type="mldataset",
-    #         apply_scaling=True,
-    #     )
-    #     self.assertIsInstance(mlds, MultiLevelDataset)
-    #     ds = mlds.get_dataset(2)
-    #     data_vars = list(ds.data_vars)
-    #     data_vars.remove("crs")
-    #     self.assertCountEqual(["B04", "B03", "B02"], data_vars)
-    #     self.assertCountEqual([2745, 2745], [ds.sizes["y"], ds.sizes["x"]])
-    #     self.assertCountEqual(
-    #         [1024, 1024], [ds.chunksizes["x"][0], ds.chunksizes["y"][0]]
-    #     )
-
-    # @pytest.mark.vcr()
-    # def test_open_data_cdse_seninel_2_stack_mode(self):
-    #     store = new_data_store(
-    #         DATA_STORE_ID_CDSE,
-    #         key=CDSE_CREDENTIALS["key"],
-    #         secret=CDSE_CREDENTIALS["secret"],
-    #         stack_mode=True,
-    #     )
-    #
-    #     # open data as dataset
-    #     bbox_utm = [659574, 5892990, 659724, 5893140]
-    #     ds = store.open_data(
-    #         data_id="SENTINEL-2",
-    #         bbox=bbox_utm,
-    #         time_range=["2023-11-01", "2023-11-10"],
-    #         processing_level="L2A",
-    #         spatial_res=10,
-    #         crs="EPSG:32635",
-    #         asset_names=["B03", "B04"],
-    #         apply_scaling=True,
-    #     )
-    #     self.assertIsInstance(ds, xr.Dataset)
-    #     data_vars = list(ds.data_vars)
-    #     data_vars.remove("crs")
-    #     self.assertCountEqual(["B03", "B04"], data_vars)
-    #     self.assertCountEqual(
-    #         [4, 16, 16],
-    #         [ds.sizes["time"], ds.sizes["y"], ds.sizes["x"]],
-    #     )
-    #     self.assertCountEqual(
-    #         [1, 16, 16],
-    #         [ds.chunksizes["time"][0], ds.chunksizes["y"][0], ds.chunksizes["x"][0]],
-    #     )
-
     @pytest.mark.vcr()
     def test_open_data_wrong_opener_id(self):
         store = new_data_store(DATA_STORE_ID, url=self.url_nonsearchable)
diff --git a/xcube_stac/helper.py b/xcube_stac/helper.py
index ba39f8d..d03b392 100644
--- a/xcube_stac/helper.py
+++ b/xcube_stac/helper.py
@@ -4,7 +4,6 @@
 import pystac
 import pystac_client.client
 from xcube.core.store import DataStoreError
-from xcube.util.jsonschema import JsonObjectSchema
 import s3fs
 
 from .accessor import S3DataAccessor
@@ -47,12 +46,6 @@ def __init__(self):
     def parse_item(self, item: pystac.Item, **open_params) -> pystac.Item:
         return item
 
-    def parse_items_stack(self, items: dict[list[pystac.Item]], **open_params) -> dict:
-        parsed_items = {}
-        for key, items in items.items():
-            parsed_items[key] = [self.parse_item(item, **open_params) for item in items]
-        return dict(parsed_items)
-
     def get_data_access_params(self, item: pystac.Item, **open_params) -> dict:
         assets = list_assets_from_item(
             item,
@@ -91,19 +84,6 @@ def is_mldataset_available(self, item: pystac.Item, **open_params) -> bool:
         format_ids = self.get_format_ids(item, **open_params)
         return all(format_id in MLDATASET_FORMATS for format_id in format_ids)
 
-    def get_search_params_schema(self) -> JsonObjectSchema:
-        return JsonObjectSchema(
-            properties=dict(**STAC_SEARCH_PARAMETERS),
-            required=[],
-            additional_properties=False,
-        )
-
-    def get_open_data_params_schema(self) -> JsonObjectSchema:
-        return STAC_OPEN_PARAMETERS
-
-    def get_open_data_params_schema_stack(self) -> JsonObjectSchema:
-        return STAC_OPEN_PARAMETERS_STACK_MODE
-
     def search_items(
         self,
         catalog: Union[pystac.Catalog, pystac_client.client.Client],
@@ -163,22 +143,9 @@ def get_data_access_params(self, item: pystac.Item, **open_params) -> dict:
         )
         return data_access_params
 
-    def get_protocols(self, item: pystac.Item, **open_params) -> list[str]:
-        return ["s3"]
-
-    def get_format_ids(self, item: pystac.Item, **open_params) -> list[str]:
-        return ["zarr", "levels"]
-
     def is_mldataset_available(self, item: pystac.Item, **open_params) -> bool:
         return True
 
-    def get_search_params_schema(self) -> JsonObjectSchema:
-        return JsonObjectSchema(
-            properties=dict(**STAC_SEARCH_PARAMETERS),
-            required=[],
-            additional_properties=False,
-        )
-
 
 class HelperCdse(Helper):
 
diff --git a/xcube_stac/mldataset.py b/xcube_stac/mldataset.py
index f79ae78..b90dd0b 100644
--- a/xcube_stac/mldataset.py
+++ b/xcube_stac/mldataset.py
@@ -29,7 +29,6 @@
 from xcube.core.mldataset import MultiLevelDataset, LazyMultiLevelDataset
 from xcube.core.gridmapping import GridMapping
 
-from .constants import LOG
 from ._utils import rename_dataset
 from ._utils import merge_datasets
 from .stac_extension.raster import apply_offset_scaling