From 81ccf856e4d4ffc21ea7d67c33f11a5f773183db Mon Sep 17 00:00:00 2001 From: Damien Ayers Date: Mon, 24 Jul 2023 08:47:45 +1000 Subject: [PATCH 01/26] Update installation/contents documentation --- README.md | 81 ++++++++++++++++++++++++------------------------------- 1 file changed, 35 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index 896bb872e..fc09f6e5a 100644 --- a/README.md +++ b/README.md @@ -4,35 +4,38 @@ DEA Prototype Code ================== -- AWS s3 tools -- Rasterio from S3 investigations -- Utilities for data visualizations in notebooks - -Installation -============ +This repository provides developmental [libraries](https://github.com/opendatacube/odc-tools/tree/develop/libs) +and [CLI tools](https://github.com/opendatacube/odc-tools/tree/develop/apps) for Open Datacube. -This repository provides a number of small [libraries](https://github.com/opendatacube/odc-tools/tree/develop/libs) -and [CLI tools](https://github.com/opendatacube/odc-tools/tree/develop/apps). +- AWS S3 tools +- CLIs for using ODC data from AWS S3 and SQS +- Utilities for data visualizations in notebooks +- Experiments on optimising Rasterio usage on AWS S3 Full list of libraries, and install instructions: - `odc.algo` algorithms (GeoMedian wrapper is here) -- `odc.stats` large scale processing framework (Moved to [odc-stats](http://github.com/opendatacube/odc-stats)) - `odc.ui` tools for data visualization in notebook/lab -- `odc.stac` STAC to ODC conversion tools (Moved to [odc-stac](https://github.com/opendatacube/odc-stac)) -- `odc.dscache` experimental key-value store where `key=UUID`, `value=Dataset` (moved to [odc-dscache](https://github.com/opendatacube/odc-dscache)) - `odc.io` common IO utilities, used by apps mainly - `odc-cloud[ASYNC,AZURE,THREDDS]` cloud crawling support package - `odc.aws` AWS/S3 utilities, used by apps mainly - `odc.aio` faster concurrent fetching from S3 with async, used by apps `odc-cloud[ASYNC]` - `odc.{thredds,azure}` internal libs for cloud IO `odc-cloud[THREDDS,AZURE]` -Pre-release of these libraries is on PyPI now, so can be installed with `pip` -"the normal way". Most recent development versions of `odc-tools` packages are -pushed to `https://packages.dea.ga.gov.au`, and can be installed like so: +## Promoted to their own repositories +- `odc.stats` large scale processing framework (Moved to [odc-stats](http://github.com/opendatacube/odc-stats)) +- `odc.stac` STAC to ODC conversion tools (Moved to [odc-stac](https://github.com/opendatacube/odc-stac)) +- `odc.dscache` experimental key-value store where `key=UUID`, `value=Dataset` (moved to [odc-dscache](https://github.com/opendatacube/odc-dscache)) + +Installation +============ + + +Libraries and applications in this repository are published to PyPI, and can be installed \ +with `pip` like so: ``` -pip install --extra-index-url="https://packages.dea.ga.gov.au" \ +pip install \ odc-ui \ odc-stac \ odc-stats \ @@ -42,14 +45,10 @@ pip install --extra-index-url="https://packages.dea.ga.gov.au" \ odc-dscache ``` -**NOTE**: on Ubuntu 18.04 the default `pip` version is awfully old and does not -support `--extra-index-url` command line option, so make sure to upgrade `pip` -first: `pip3 install --upgrade pip`. - For Conda Users --------------- -Currently there are no `odc-tools` conda packages. But majority of `odc-tools` +`odc-tools` aren't available as conda packages, but the most dependencies can be installed with conda from `conda-forge` channel. 
Use `conda env update -f ` to install all needed dependencies for @@ -150,12 +149,12 @@ CLI Tools Installation ------------ -Cloud tools depend on `aiobotocore` package which has a dependency on a specific -version of `botocore`. Another package we use, `boto3`, also depends on a -specific version of `botocore`. As a result having both `aiobotocore` and -`boto3` in one environment can be a bit tricky. The easiest way to solve this, -is to install `aiobotocore[awscli,boto3]` before anything else, which will pull -in a compatible version of `boto3` and `awscli` into the environment. +Cloud tools depend on the `aiobotocore` package which depend on a specific +versions of `botocore`. Another package we use, `boto3`, also depends on a +specific versions of `botocore`. As a result having both `aiobotocore` and +`boto3` in one environment can be a bit tricky. The way to solve this +is to install `aiobotocore[awscli,boto3]` before anything else, which will install +compatible versions of `boto3` and `awscli` into the environment. ``` pip install -U "aiobotocore[awscli,boto3]==1.3.3" @@ -163,9 +162,6 @@ pip install -U "aiobotocore[awscli,boto3]==1.3.3" conda install "aiobotocore==1.3.3" boto3 awscli ``` -The specific version of `aiobotocore` is not relevant, but it is needed in -practice to limit `pip`/`conda` package resolution search. - 1. For cloud (AWS only) ``` @@ -228,7 +224,7 @@ dc-index-from-tar --protocol gs --env mangroves --ignore-lineage metadata.tar.gz Local Development ================= -The following steps is used in github workflow `main.yml` +The following steps are used in the GitHub Actions workflow `main.yml` ```bash @@ -259,23 +255,16 @@ conda env remove -n odc-tests-py38 Release Process =============== -Development versions of packages are pushed to [DEA packages -repo](https://packages.dea.ga.gov.au/) on every push to `develop` branch, -version is automatically increased by a script that runs before creating wheels -and source distribution tar balls. Right now new dev version is pushed for all -the packages even the ones that have not changed since last push. - -Publishing to [PyPi](https://pypi.org/) happens automatically when changes are -pushed to a protected `pypi/publish` branch. Only members of [Open Datacube -Admins](https://github.com/orgs/opendatacube/teams/admins) group have the -permission to push to this branch. - -Process: - 1. Manually edit `{lib,app}/{pkg}/odc/{pkg}/_version.py` file to increase version number -2. Merge it to `develop` branch via PR -3. Fast forward `pypi/publish` branch to match `develop` -4. Push it to GitHub +2. Merge changes to the `develop` branch via a Pull Request +3. Fast-forward the `pypi/publish` branch to match `develop` +4. Push to GitHub Steps 3 and 4 can be done by an authorized user with `./scripts/sync-publish-branch.sh` script. + + +Publishing to [PyPi](https://pypi.org/) happens automatically when changes are +pushed to the protected `pypi/publish` branch. Only members of [Open Datacube +Admins](https://github.com/orgs/opendatacube/teams/admins) group have the +permission to push to this branch. 
From a680758312b23fa8c55473e9bb56259d56c198bd Mon Sep 17 00:00:00 2001 From: Damien Ayers Date: Tue, 25 Jul 2023 10:37:20 +1000 Subject: [PATCH 02/26] Update README.md Co-authored-by: Ariana-B <40238244+Ariana-B@users.noreply.github.com> --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index fc09f6e5a..4382c04e9 100644 --- a/README.md +++ b/README.md @@ -149,9 +149,9 @@ CLI Tools Installation ------------ -Cloud tools depend on the `aiobotocore` package which depend on a specific -versions of `botocore`. Another package we use, `boto3`, also depends on a -specific versions of `botocore`. As a result having both `aiobotocore` and +Cloud tools depend on the `aiobotocore` package, which depends on specific +versions of `botocore`. Another package we use, `boto3`, also depends on +specific versions of `botocore`. As a result, having both `aiobotocore` and `boto3` in one environment can be a bit tricky. The way to solve this is to install `aiobotocore[awscli,boto3]` before anything else, which will install compatible versions of `boto3` and `awscli` into the environment. From c78b992caff73152e3c40b464255cf6c72018bae Mon Sep 17 00:00:00 2001 From: Damien Ayers Date: Mon, 7 Aug 2023 16:42:56 +1000 Subject: [PATCH 03/26] Mention conda installation in README --- README.md | 191 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 98 insertions(+), 93 deletions(-) diff --git a/README.md b/README.md index 4382c04e9..840b59e5d 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,6 @@ Full list of libraries, and install instructions: Installation ============ - Libraries and applications in this repository are published to PyPI, and can be installed \ with `pip` like so: @@ -48,103 +47,17 @@ pip install \ For Conda Users --------------- -`odc-tools` aren't available as conda packages, but the most -dependencies can be installed with conda from `conda-forge` channel. - -Use `conda env update -f ` to install all needed dependencies for -`odc-tools` libraries and apps. - -
Conda `environment.yaml` (click to expand)
- -```yaml -channels: - - conda-forge -dependencies: - # Datacube - - datacube>=1.8.5 - - # odc.dscache - - python-lmdb - - zstandard - - # odc.algo - - dask-image - - numexpr - - scikit-image - - scipy - - toolz - - # odc.ui - - ipywidgets - - ipyleaflet - - tqdm - - # odc-apps-dc-tools - - pystac>=1 - - pystac-client>=0.2.0 - - azure-storage-blob - - fsspec - - lxml # needed for thredds-crawler - - # odc.{aio,aws}: aiobotocore/boto3 - # pin aiobotocore for easier resolution of dependencies - - aiobotocore==1.3.3 - - boto3 - - # eodatasets3 (used by odc-stats) - - boltons - - ciso8601 - - python-rapidjson - - requests-cache - - ruamel.yaml - - structlog - - url-normalize - - # for dev - - pylint - - autopep8 - - flake8 - - isort - - black - - mypy - - # For tests - - pytest - - pytest-httpserver - - pytest-cov - - pytest-timeout - - moto - - deepdiff - - - pip>=20 - - pip: - # odc.apps.dc-tools - - thredds-crawler - - # odc.stats - - eodatasets3 - - # tests - - pytest-depends +Some **odc-tools** are available via `conda` from the `conda-forge` channel. - # odc.ui - - jupyter-ui-poll - # odc-tools libs - - odc-stac - - odc-algo - - odc-ui - - odc-dscache - - odc-stats +``` +conda install -c conda-forge odc-apps-dc-tools odc-algo odc-io odc-cloud - # odc-tools CLI apps - - odc-apps-cloud - - odc-apps-dc-tools ``` -
-CLI Tools -========= + +Cloud Tools +=========== Installation ------------ @@ -252,6 +165,98 @@ libs apps conda env remove -n odc-tests-py38 ``` +Use `conda env update -f ` to install all needed dependencies for +`odc-tools` libraries and apps. + +
Conda `environment.yaml` (click to expand)
+ +```yaml +channels: + - conda-forge +dependencies: + # Datacube + - datacube>=1.8.5 + + # odc.dscache + - python-lmdb + - zstandard + + # odc.algo + - dask-image + - numexpr + - scikit-image + - scipy + - toolz + + # odc.ui + - ipywidgets + - ipyleaflet + - tqdm + + # odc-apps-dc-tools + - pystac>=1 + - pystac-client>=0.2.0 + - azure-storage-blob + - fsspec + - lxml # needed for thredds-crawler + + # odc.{aio,aws}: aiobotocore/boto3 + # pin aiobotocore for easier resolution of dependencies + - aiobotocore==1.3.3 + - boto3 + + # eodatasets3 (used by odc-stats) + - boltons + - ciso8601 + - python-rapidjson + - requests-cache + - ruamel.yaml + - structlog + - url-normalize + + # for dev + - pylint + - autopep8 + - flake8 + - isort + - black + - mypy + + # For tests + - pytest + - pytest-httpserver + - pytest-cov + - pytest-timeout + - moto + - deepdiff + + - pip>=20 + - pip: + # odc.apps.dc-tools + - thredds-crawler + + # odc.stats + - eodatasets3 + + # tests + - pytest-depends + + # odc.ui + - jupyter-ui-poll + + # odc-tools libs + - odc-stac + - odc-algo + - odc-ui + - odc-dscache + - odc-stats + + # odc-tools CLI apps + - odc-apps-cloud + - odc-apps-dc-tools +``` +
+ Release Process =============== From e05a80ca8c4d40ba56a5089d85cff51ea75d766b Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Wed, 6 Sep 2023 15:59:39 +0930 Subject: [PATCH 04/26] release new version dc tools app (#577) Co-authored-by: Emma Ai --- apps/dc_tools/odc/apps/dc_tools/_version.py | 2 +- apps/dc_tools/setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/dc_tools/odc/apps/dc_tools/_version.py b/apps/dc_tools/odc/apps/dc_tools/_version.py index b5c9b6cb7..11ef09286 100644 --- a/apps/dc_tools/odc/apps/dc_tools/_version.py +++ b/apps/dc_tools/odc/apps/dc_tools/_version.py @@ -1 +1 @@ -__version__ = "0.2.12" +__version__ = "0.2.13" diff --git a/apps/dc_tools/setup.cfg b/apps/dc_tools/setup.cfg index 1e0cbbc5b..d00307864 100644 --- a/apps/dc_tools/setup.cfg +++ b/apps/dc_tools/setup.cfg @@ -27,7 +27,7 @@ install_requires = pystac-client>=0.2.0 toolz pyyaml - datacube>=1.8.13 + datacube>=1.8.15 odc_io odc-cloud[ASYNC]>=0.2.3 pystac>=1.0.0 From baf463cb9588e9aeae60ce66086f85f8945921d3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 11 Sep 2023 18:47:49 +0000 Subject: [PATCH 05/26] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 23.7.0 → 23.9.1](https://github.com/psf/black/compare/23.7.0...23.9.1) - [github.com/PyCQA/flake8: 6.0.0 → 6.1.0](https://github.com/PyCQA/flake8/compare/6.0.0...6.1.0) --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dd97769ca..b7da4ad5f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,7 +26,7 @@ repos: # name: isort (python) # args: [ "--profile", "black", "--filter-files" ] - repo: https://github.com/psf/black - rev: 23.7.0 + rev: 23.9.1 hooks: - id: black - repo: https://github.com/pre-commit/mirrors-pylint @@ -34,6 +34,6 @@ repos: hooks: - id: pylint - repo: https://github.com/PyCQA/flake8 - rev: '6.0.0' + rev: '6.1.0' hooks: - id: flake8 From 3c093d4cc50dfd590b0b99005527a2e0003abf5d Mon Sep 17 00:00:00 2001 From: Damien Ayers Date: Tue, 12 Sep 2023 09:03:40 +1000 Subject: [PATCH 06/26] Fix type checking error in tests --- apps/dc_tools/tests/test_sqs_to_dc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/dc_tools/tests/test_sqs_to_dc.py b/apps/dc_tools/tests/test_sqs_to_dc.py index 122a94bc7..30fdc6dbc 100644 --- a/apps/dc_tools/tests/test_sqs_to_dc.py +++ b/apps/dc_tools/tests/test_sqs_to_dc.py @@ -124,7 +124,7 @@ def test_extract_metadata_from_message(aws_credentials, odc_test_db_with_product uri == "s3://dea-public-data/cemp_insar/insar/displacement/alos/2009/06/17/alos_cumul_2009-06-17.yaml" ) - assert type(data) == dict + assert isinstance(data, dict) doc2ds = Doc2Dataset(dc.index, products=["cemp_insar_alos_displacement"]) index_update_dataset( From af33a23c5c48b1185aa9eaedced5f2f01ec83f0d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Oct 2023 17:04:58 +0000 Subject: [PATCH 07/26] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/pre-commit/pre-commit-hooks: v4.4.0 → v4.5.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.4.0...v4.5.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b7da4ad5f..3c680d4c4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ repos: - id: yamllint args: ['-c', '.yamllint'] - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: end-of-file-fixer - id: check-docstring-first From b66a1267660af897abcc145049db1f7fb03d880d Mon Sep 17 00:00:00 2001 From: Ariana-B <40238244+Ariana-B@users.noreply.github.com> Date: Wed, 11 Oct 2023 11:06:47 +1100 Subject: [PATCH 08/26] update archive_less_mature option to expect int value (#579) * update archive_less_mature option to use int/None instead of bool to be in line with core * fix pylint issues * ensure archive_less_mature is an int * appease pre-commit * specify --archive-less-mature type as int * one last test + update version number --------- Co-authored-by: Ariana Barzinpour --- apps/dc_tools/odc/apps/dc_tools/_version.py | 2 +- .../dc_tools/odc/apps/dc_tools/azure_to_dc.py | 4 +-- .../odc/apps/dc_tools/cop_dem_to_dc.py | 2 +- .../odc/apps/dc_tools/esa_worldcover_to_dc.py | 4 +-- .../odc/apps/dc_tools/stac_api_to_dc.py | 4 +-- apps/dc_tools/odc/apps/dc_tools/utils.py | 26 +++++++++------- apps/dc_tools/tests/test_fs_to_dc.py | 30 +++++++++++++++++++ 7 files changed, 53 insertions(+), 19 deletions(-) diff --git a/apps/dc_tools/odc/apps/dc_tools/_version.py b/apps/dc_tools/odc/apps/dc_tools/_version.py index 11ef09286..f3291e93b 100644 --- a/apps/dc_tools/odc/apps/dc_tools/_version.py +++ b/apps/dc_tools/odc/apps/dc_tools/_version.py @@ -1 +1 @@ -__version__ = "0.2.13" +__version__ = "0.2.14" diff --git a/apps/dc_tools/odc/apps/dc_tools/azure_to_dc.py b/apps/dc_tools/odc/apps/dc_tools/azure_to_dc.py index 2fb90f7ab..52ee4ec4f 100644 --- a/apps/dc_tools/odc/apps/dc_tools/azure_to_dc.py +++ b/apps/dc_tools/odc/apps/dc_tools/azure_to_dc.py @@ -39,7 +39,7 @@ def dump_list_to_odc( update: Optional[bool] = False, update_if_exists: Optional[bool] = False, allow_unsafe: Optional[bool] = False, - archive_less_mature: Optional[bool] = False, + archive_less_mature: Optional[int] = None, publish_action: Optional[str] = None, ): ds_added = 0 @@ -102,7 +102,7 @@ def cli( allow_unsafe: bool, stac: bool, statsd_setting: str, - archive_less_mature: bool, + archive_less_mature: int, publish_action: str, account_url: str, container_name: str, diff --git a/apps/dc_tools/odc/apps/dc_tools/cop_dem_to_dc.py b/apps/dc_tools/odc/apps/dc_tools/cop_dem_to_dc.py index 3a9cc06e5..0acd4b788 100644 --- a/apps/dc_tools/odc/apps/dc_tools/cop_dem_to_dc.py +++ b/apps/dc_tools/odc/apps/dc_tools/cop_dem_to_dc.py @@ -107,7 +107,7 @@ def process_uri_tile( dc: Datacube, doc2ds: Doc2Dataset, update_if_exists: bool = True, - archive_less_mature: bool = False, + archive_less_mature: int = None, publish_action: str = None, ) -> Tuple[pystac.Item, str]: product_name = f"dem_{product}" diff --git a/apps/dc_tools/odc/apps/dc_tools/esa_worldcover_to_dc.py b/apps/dc_tools/odc/apps/dc_tools/esa_worldcover_to_dc.py index a1bd75286..8474bc410 100644 --- a/apps/dc_tools/odc/apps/dc_tools/esa_worldcover_to_dc.py +++ b/apps/dc_tools/odc/apps/dc_tools/esa_worldcover_to_dc.py @@ -117,7 +117,7 @@ def process_uri_tile( dc: Datacube, doc2ds: Doc2Dataset, update_if_exists: bool = True, - archive_less_mature: bool = False, + archive_less_mature: int = None, publish_action: str = None, ) -> Tuple[pystac.Item, str]: product_name = "esa_worldcover_" + map_version["year"] @@ -166,7 +166,7 @@ def esa_wc_to_dc( limit: 
int, update: bool, n_workers: int = 100, - archive_less_mature: bool = False, + archive_less_mature: int = None, publish_action: str = None, ) -> Tuple[int, int]: doc2ds = Doc2Dataset(dc.index) diff --git a/apps/dc_tools/odc/apps/dc_tools/stac_api_to_dc.py b/apps/dc_tools/odc/apps/dc_tools/stac_api_to_dc.py index 34ec66b8d..35863f260 100644 --- a/apps/dc_tools/odc/apps/dc_tools/stac_api_to_dc.py +++ b/apps/dc_tools/odc/apps/dc_tools/stac_api_to_dc.py @@ -125,7 +125,7 @@ def process_item( allow_unsafe: bool, rewrite: Optional[Tuple[str, str]] = None, rename_product: Optional[str] = None, - archive_less_mature: bool = False, + archive_less_mature: int = None, publish_action: bool = False, ): meta, uri, stac = item_to_meta_uri(item, rewrite, rename_product) @@ -150,7 +150,7 @@ def stac_api_to_odc( allow_unsafe: bool = True, rewrite: Optional[Tuple[str, str]] = None, rename_product: Optional[str] = None, - archive_less_mature: bool = False, + archive_less_mature: int = None, publish_action: Optional[str] = None, ) -> Tuple[int, int, int]: doc2ds = Doc2Dataset(dc.index) diff --git a/apps/dc_tools/odc/apps/dc_tools/utils.py b/apps/dc_tools/odc/apps/dc_tools/utils.py index 05409d441..60d78b47d 100644 --- a/apps/dc_tools/odc/apps/dc_tools/utils.py +++ b/apps/dc_tools/odc/apps/dc_tools/utils.py @@ -3,7 +3,7 @@ import importlib_resources from datadog import statsd, initialize from odc.aws.queue import publish_to_topic -from typing import Iterable, Optional, Union +from typing import Optional from datacube import Datacube from datacube.index.hl import Doc2Dataset @@ -113,13 +113,17 @@ class SkippedException(Exception): archive_less_mature = click.option( "--archive-less-mature", - is_flag=True, - default=False, + is_flag=False, + flag_value=500, + default=None, + type=int, help=( "Archive existing any datasets that match product, " "time and region-code, but have lower dataset-maturity." "Note: An error will be raised and the dataset add will " "fail if a matching dataset with higher or equal dataset-maturity." + "Can specify an of leniency for comparing timestamps, provided in milliseconds. " + "Default value is 500ms." ), ) @@ -176,7 +180,7 @@ def index_update_dataset( update: bool = False, update_if_exists: bool = False, allow_unsafe: bool = False, - archive_less_mature: Optional[Union[bool, Iterable[str]]] = None, + archive_less_mature: Optional[int] = None, publish_action: Optional[str] = None, stac_doc: Optional[dict] = None, ) -> int: @@ -191,13 +195,12 @@ def index_update_dataset( :param update_if_exists: If true allow insert or update. :param allow_unsafe: Allow unsafe (arbitrary) dataset updates. :param archive_less_mature: Enforce dataset maturity. - * If None (the default) or False or an empty iterable, ignore dataset maturity. - * If True, enforce dataset maturity by looking for existing datasets with same product, region_code and time + * If None (the default), ignore dataset maturity. + * If int, enforce dataset maturity by looking for existing datasets with same product, region_code and time values. If a less mature match is found, it is archived and replaced with the new dataset being inserted. If a match of the same or greater maturity is found a SkippedException is raised. - * If an iterable of valid search field names is provided, it is used as the "grouping" fields for - identifying dataset maturity matches. - (i.e. 
`archive_less_mature=True` is the same as `archive_less_mature=['region_code', 'time']) + The integer value is used as the timedelta value for allowing a leniency when comparing + timestamp values, for datasets where there is a slight discrepancy. Default is 500ms. :param publish_action: SNS topic arn to publish action to. :param stac_doc: STAC document for publication to SNS topic. :return: Returns nothing. Raises an exception if anything goes wrong. @@ -221,8 +224,9 @@ def index_update_dataset( archive_stacs = [] added = False updated = False - if archive_less_mature and publish_action: - dupes = dc.index.datasets.find_less_mature(ds, 500) + + if isinstance(archive_less_mature, int) and publish_action: + dupes = dc.index.datasets.find_less_mature(ds, archive_less_mature) for dupe in dupes: archive_stacs.append(ds_to_stac(dupe)) diff --git a/apps/dc_tools/tests/test_fs_to_dc.py b/apps/dc_tools/tests/test_fs_to_dc.py index 2daa7a61e..49edadef9 100644 --- a/apps/dc_tools/tests/test_fs_to_dc.py +++ b/apps/dc_tools/tests/test_fs_to_dc.py @@ -51,6 +51,36 @@ def test_archive_less_mature(odc_db, test_data_dir, nrt_dsid, final_dsid): assert dc.index.datasets.get(nrt_dsid).archived_time is not None +def test_dont_archive_less_mature(odc_db, test_data_dir, nrt_dsid, final_dsid): + # no archiving should be done if --archive-less-mature is not set + dc = odc_db + runner = CliRunner() + + # Index NRT dataset + result = runner.invoke( + fs_to_dc_cli, + [ + test_data_dir, + "--glob=**/maturity-nrt.odc-metadata.yaml", + ], + ) + assert result.exit_code == 0 + assert dc.index.datasets.get(final_dsid) is None + assert dc.index.datasets.get(nrt_dsid).archived_time is None + + # Index Final dataset (autoarchiving NRT) + result = runner.invoke( + fs_to_dc_cli, + [ + test_data_dir, + "--glob=**/maturity-final.odc-metadata.yaml", + ], + ) + assert result.exit_code == 0 + assert dc.index.datasets.get(final_dsid).archived_time is None + assert dc.index.datasets.get(nrt_dsid).archived_time is None + + def test_keep_more_mature(odc_db, test_data_dir, nrt_dsid, final_dsid): dc = odc_db runner = CliRunner() From 24f923d78faceb1999fbbf7d96b9dae454e4e79e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 23 Oct 2023 17:04:00 +0000 Subject: [PATCH 09/26] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 23.9.1 → 23.10.0](https://github.com/psf/black/compare/23.9.1...23.10.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3c680d4c4..301e3996d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,7 +26,7 @@ repos: # name: isort (python) # args: [ "--profile", "black", "--filter-files" ] - repo: https://github.com/psf/black - rev: 23.9.1 + rev: 23.10.0 hooks: - id: black - repo: https://github.com/pre-commit/mirrors-pylint From 0738a14babd8f31dcaf67b8393300b270498280a Mon Sep 17 00:00:00 2001 From: Ariana Barzinpour Date: Fri, 27 Oct 2023 04:31:16 +0000 Subject: [PATCH 10/26] update min python version to 3.9, enable list of absolute urls and make product optional in s3-to-dc --- README.md | 8 +-- apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py | 63 ++++++++++++++------- apps/dc_tools/setup.cfg | 2 +- apps/dc_tools/tests/test_s3_to_dc.py | 59 ++++++++++++++++++- tests/{test-env-py38.yml => test-env.yml} | 10 ++-- 5 files changed, 
108 insertions(+), 34 deletions(-) rename tests/{test-env-py38.yml => test-env.yml} (88%) diff --git a/README.md b/README.md index 840b59e5d..43501b109 100644 --- a/README.md +++ b/README.md @@ -142,10 +142,10 @@ The following steps are used in the GitHub Actions workflow `main.yml` ```bash # build environment from file -mamba env create -f tests/test-env-py38.yml +mamba env create -f tests/test-env.yml -# this environment name is defined in tests/test-env-py38.yml file -conda activate odc-tests-py38 +# this environment name is defined in tests/test-env.yml file +conda activate odc-tools-tests # install additional packages ./scripts/dev-install.sh --no-deps @@ -162,7 +162,7 @@ pytest --cov=. \ libs apps # Optional, to delete the environment -conda env remove -n odc-tests-py38 +conda env remove -n odc-tools-tests ``` Use `conda env update -f ` to install all needed dependencies for diff --git a/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py b/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py index c128042de..6b0504a5c 100755 --- a/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py +++ b/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py @@ -48,7 +48,7 @@ def doc_error(uri, doc): def dump_to_odc( document_stream, dc: Datacube, - products: list, + product: list, transform=None, update=False, update_if_exists=False, @@ -57,7 +57,7 @@ def dump_to_odc( publish_action=None, **kwargs, ) -> Tuple[int, int, int]: - doc2ds = Doc2Dataset(dc.index, products=products, **kwargs) + doc2ds = Doc2Dataset(dc.index, products=product, **kwargs) ds_added = 0 ds_failed = 0 @@ -110,7 +110,7 @@ def dump_to_odc( @request_payer @archive_less_mature @publish_action -@click.argument("uri", type=str, nargs=1) +@click.argument("uri", type=str, nargs=-1) @click.argument("product", type=str, nargs=1) def cli( skip_lineage, @@ -139,37 +139,58 @@ def cli( else: transform = stac_transform - candidate_products = product.split() - opts = {} if request_payer: opts["RequestPayer"] = "requester" - # Check datacube connection and products dc = Datacube() - odc_products = dc.list_products().name.values - - odc_products = set(odc_products) - if not set(candidate_products).issubset(odc_products): - missing_products = list(set(candidate_products) - odc_products) - print( - f"Error: Requested Product/s {', '.join(missing_products)} {'is' if len(missing_products) == 1 else 'are'} " - "not present in the ODC Database", - file=sys.stderr, - ) - sys.exit(1) + + # if it's a uri, a product wasn't provided, and 'product' is actually another uri + if product.startswith("s3://"): + candidate_product = [] + if isinstance(uri, str): + uri = [uri, product] + else: + uri = list(uri) + uri.append(product) + else: + # Check datacube connection and products + candidate_product = product.split() + odc_products = dc.list_products().name.values + + odc_products = set(odc_products) + if not set(candidate_product).issubset(odc_products): + print( + f"Error: Requested Product {product} is not present in the ODC Database", + file=sys.stderr, + ) + sys.exit(1) + + is_glob = True + # we assume the uri to be an absolute URL if it contains no wildcards + # or if there are multiple uri values provided + if (len(uri) > 1) or ("*" not in uri[0]): + is_glob = False + for url in uri: + if "*" in url: + logging.warning("A list of uris is assumed to include only absolute URLs. 
" + "Any wildcard characters will be escaped.") # Get a generator from supplied S3 Uri for candidate documents fetcher = S3Fetcher(aws_unsigned=no_sign_request) # Grab the URL from the resulting S3 item - document_stream = ( - url.url for url in s3_find_glob(uri, skip_check=skip_check, s3=fetcher, **opts) - ) + if is_glob: + document_stream = ( + url.url for url in s3_find_glob(uri[0], skip_check=skip_check, s3=fetcher, **opts) + ) + else: + # if working with absolute URLs, no need for all the globbing logic + document_stream = uri added, failed, skipped = dump_to_odc( fetcher(document_stream), dc, - candidate_products, + candidate_product, skip_lineage=skip_lineage, fail_on_missing_lineage=fail_on_missing_lineage, verify_lineage=verify_lineage, diff --git a/apps/dc_tools/setup.cfg b/apps/dc_tools/setup.cfg index d00307864..92ef317aa 100644 --- a/apps/dc_tools/setup.cfg +++ b/apps/dc_tools/setup.cfg @@ -16,7 +16,7 @@ url = https://github.com/opendatacube/odc-tools/ include_package_data = true zip_safe = false packages = find_namespace: -python_requires = >=3.7 +python_requires = >=3.9 tests_require = pytest deepdiff diff --git a/apps/dc_tools/tests/test_s3_to_dc.py b/apps/dc_tools/tests/test_s3_to_dc.py index d202c141c..90745b01c 100644 --- a/apps/dc_tools/tests/test_s3_to_dc.py +++ b/apps/dc_tools/tests/test_s3_to_dc.py @@ -102,9 +102,7 @@ def test_s3_to_dc_fails_to_index_non_dataset_yaml( s3_to_dc, [ "--no-sign-request", - # absolute single file s3 uri won't work with s3-to-dc, only uri string contain * - # absolute path = "s3://dea-public-data/derivative/ga_ls5t_nbart_gm_cyear_3/3-0-0/x08/y23/1994--P1Y/ga_ls5t_nbart_gm_cyear_3_x08y23_1994--P1Y_final.proc-info.yaml", - "s3://odc-tools-test/derivative/ga_ls5t_nbart_gm_cyear_3/3-0-0/x08/y23/1994--P1Y/*.proc-info.yaml", + "s3://dea-public-data/derivative/ga_ls5t_nbart_gm_cyear_3/3-0-0/x08/y23/1994--P1Y/ga_ls5t_nbart_gm_cyear_3_x08y23_1994--P1Y_final.proc-info.yaml", "ga_ls5t_nbart_gm_cyear_3", ], catch_exceptions=False, @@ -133,3 +131,58 @@ def test_s3_to_dc_partially_succeeds_when_given_invalid_and_valid_dataset_yamls( assert ( result.output == "Added 1 datasets, skipped 0 datasets and failed 1 datasets.\n" ) + + +def test_s3_to_dc_list_absolute_urls( + mocked_s3_datasets, odc_test_db_with_products +): + # provide mulitple uris, as absolute URLs + runner = CliRunner() + result = runner.invoke( + s3_to_dc, + [ + "--no-sign-request", + "s3://odc-tools-test/cemp_insar/01/07/alos_cumul_2010-01-07.yaml", + "s3://odc-tools-test/cemp_insar/04/01/alos_cumul_2010-04-01.yaml", + "s3://odc-tools-test/cemp_insar/08/11/alos_cumul_2010-08-11.yaml", + "cemp_insar_alos_displacement", + ], + ) + assert result.exit_code == 0 + assert ( + result.output == "Added 3 datasets, skipped 0 datasets and failed 0 datasets.\n" + ) + + +def test_s3_to_dc_no_product( + mocked_s3_datasets, odc_test_db_with_products +): + # product should not need to be specified + runner = CliRunner() + result = runner.invoke( + s3_to_dc, + [ + "--no-sign-request", + "s3://odc-tools-test/cemp_insar/01/07/alos_cumul_2010-01-07.yaml", + ], + catch_exceptions=False, + ) + assert result.exit_code == 0 + assert ( + result.output == "Added 1 datasets, skipped 0 datasets and failed 0 datasets.\n" + ) + + # test with glob + result2 = CliRunner().invoke( + s3_to_dc, + [ + "--no-sign-request", + "--stac", + "s3://odc-tools-test/sentinel-s2-l2a-cogs/31/Q/GB/2020/8/S2B_31QGB_20200831_0_L2A/*_L2A.json", + ], + catch_exceptions=False, + ) + assert result2.exit_code == 0 + assert ( + result2.output == 
"Added 1 datasets, skipped 0 datasets and failed 0 datasets.\n" + ) diff --git a/tests/test-env-py38.yml b/tests/test-env.yml similarity index 88% rename from tests/test-env-py38.yml rename to tests/test-env.yml index 4ebce4e59..77fa858e4 100644 --- a/tests/test-env-py38.yml +++ b/tests/test-env.yml @@ -1,13 +1,13 @@ # Conda environment for running tests in odc-tools -# conda env create -f test-env-py38.yml -# conda activate odc-tests-py38 +# conda env create -f test-env.yml +# conda activate odc-tools-tests -name: odc-tests-py38 +name: odc-tools-tests channels: - conda-forge dependencies: - - python=3.8 + - python>=3.9 # Datacube - datacube>=1.8.15 @@ -56,7 +56,7 @@ dependencies: - sphinx-autodoc-typehints - nbsphinx - - pip=20 + - pip=23 - pip: # odc.apps.dc-tools - thredds-crawler From 526820dcdbc838b3318c7dc357349cac56031215 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 27 Oct 2023 04:34:52 +0000 Subject: [PATCH 11/26] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py | 9 ++++++--- apps/dc_tools/tests/test_s3_to_dc.py | 11 ++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py b/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py index 6b0504a5c..d399aa1d5 100755 --- a/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py +++ b/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py @@ -173,15 +173,18 @@ def cli( is_glob = False for url in uri: if "*" in url: - logging.warning("A list of uris is assumed to include only absolute URLs. " - "Any wildcard characters will be escaped.") + logging.warning( + "A list of uris is assumed to include only absolute URLs. " + "Any wildcard characters will be escaped." 
+ ) # Get a generator from supplied S3 Uri for candidate documents fetcher = S3Fetcher(aws_unsigned=no_sign_request) # Grab the URL from the resulting S3 item if is_glob: document_stream = ( - url.url for url in s3_find_glob(uri[0], skip_check=skip_check, s3=fetcher, **opts) + url.url + for url in s3_find_glob(uri[0], skip_check=skip_check, s3=fetcher, **opts) ) else: # if working with absolute URLs, no need for all the globbing logic diff --git a/apps/dc_tools/tests/test_s3_to_dc.py b/apps/dc_tools/tests/test_s3_to_dc.py index 90745b01c..7c1a177f9 100644 --- a/apps/dc_tools/tests/test_s3_to_dc.py +++ b/apps/dc_tools/tests/test_s3_to_dc.py @@ -133,9 +133,7 @@ def test_s3_to_dc_partially_succeeds_when_given_invalid_and_valid_dataset_yamls( ) -def test_s3_to_dc_list_absolute_urls( - mocked_s3_datasets, odc_test_db_with_products -): +def test_s3_to_dc_list_absolute_urls(mocked_s3_datasets, odc_test_db_with_products): # provide mulitple uris, as absolute URLs runner = CliRunner() result = runner.invoke( @@ -154,9 +152,7 @@ def test_s3_to_dc_list_absolute_urls( ) -def test_s3_to_dc_no_product( - mocked_s3_datasets, odc_test_db_with_products -): +def test_s3_to_dc_no_product(mocked_s3_datasets, odc_test_db_with_products): # product should not need to be specified runner = CliRunner() result = runner.invoke( @@ -184,5 +180,6 @@ def test_s3_to_dc_no_product( ) assert result2.exit_code == 0 assert ( - result2.output == "Added 1 datasets, skipped 0 datasets and failed 0 datasets.\n" + result2.output + == "Added 1 datasets, skipped 0 datasets and failed 0 datasets.\n" ) From f55c58b707e4fae4e1f3cf21cdbda4f292402ad2 Mon Sep 17 00:00:00 2001 From: Ariana Barzinpour Date: Fri, 27 Oct 2023 04:45:48 +0000 Subject: [PATCH 12/26] improve help message --- apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py b/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py index d399aa1d5..bae0f584f 100755 --- a/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py +++ b/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py @@ -111,7 +111,7 @@ def dump_to_odc( @archive_less_mature @publish_action @click.argument("uri", type=str, nargs=-1) -@click.argument("product", type=str, nargs=1) +@click.argument("product", type=str, nargs=1, required=False) def cli( skip_lineage, fail_on_missing_lineage, @@ -130,7 +130,15 @@ def cli( uri, product, ): - """Iterate through files in an S3 bucket and add them to datacube""" + """ + Iterate through files in an S3 bucket and add them to datacube. + + File uris can be provided as a glob, or as a list of absolute URLs. + If more than one uri is given, all will be treated as absolute URLs. + + Product is optional; if one is provided, it must match all datasets. + Only one product can be provided. 
+ """ transform = None if stac: @@ -144,6 +152,7 @@ def cli( opts["RequestPayer"] = "requester" dc = Datacube() + logging.warning(f"product is: {product}") # if it's a uri, a product wasn't provided, and 'product' is actually another uri if product.startswith("s3://"): From 549e084b45f13c52c8429398526007037a38ef22 Mon Sep 17 00:00:00 2001 From: Ariana Barzinpour Date: Fri, 27 Oct 2023 04:48:28 +0000 Subject: [PATCH 13/26] update test env path in workflow --- .github/workflows/main.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f81e83f11..c4070c9ca 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -76,7 +76,7 @@ jobs: path: | tests/env - key: ${{ runner.os }}-test-env-py38-${{ hashFiles('tests/test-env-py38.yml') }} + key: ${{ runner.os }}-test-env-${{ hashFiles('tests/test-env.yml') }} - uses: conda-incubator/setup-miniconda@v2 if: steps.conda_cache.outputs.cache-hit != 'true' @@ -103,7 +103,7 @@ jobs: shell: bash -l {0} if: steps.conda_cache.outputs.cache-hit != 'true' run: | - mamba env create -f tests/test-env-py38.yml -p tests/env + mamba env create -f tests/test-env.yml -p tests/env - name: Check Python Env shell: bash -l {0} @@ -128,7 +128,7 @@ jobs: path: | tests/env - key: ${{ runner.os }}-test-env-py38-${{ hashFiles('tests/test-env-py38.yml') }} + key: ${{ runner.os }}-test-env-${{ hashFiles('tests/test-env.yml') }} - name: Update PATH shell: bash @@ -211,7 +211,7 @@ jobs: path: | tests/env - key: ${{ runner.os }}-test-env-py38-${{ hashFiles('tests/test-env-py38.yml') }} + key: ${{ runner.os }}-test-env-${{ hashFiles('tests/test-env.yml') }} - name: Update PATH shell: bash From 0b56c535a446b1e339c79273918e92d2faee8c05 Mon Sep 17 00:00:00 2001 From: Ariana Barzinpour Date: Fri, 27 Oct 2023 04:49:48 +0000 Subject: [PATCH 14/26] remove stray log --- apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py b/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py index bae0f584f..3118e2051 100755 --- a/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py +++ b/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py @@ -152,7 +152,6 @@ def cli( opts["RequestPayer"] = "requester" dc = Datacube() - logging.warning(f"product is: {product}") # if it's a uri, a product wasn't provided, and 'product' is actually another uri if product.startswith("s3://"): From ff35f80012da926d7691237c69a78443720e3b6a Mon Sep 17 00:00:00 2001 From: Ariana Barzinpour Date: Fri, 27 Oct 2023 05:50:02 +0000 Subject: [PATCH 15/26] pin python <3.12 to avoid botocore module issue --- tests/test-env.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-env.yml b/tests/test-env.yml index 77fa858e4..a340aa518 100644 --- a/tests/test-env.yml +++ b/tests/test-env.yml @@ -7,7 +7,7 @@ channels: - conda-forge dependencies: - - python>=3.9 + - python>=3.9,<3.12 # Datacube - datacube>=1.8.15 From 542d05ac0df98f599b482748cb87dff2c25cc0ae Mon Sep 17 00:00:00 2001 From: Ariana Barzinpour Date: Wed, 1 Nov 2023 01:40:53 +0000 Subject: [PATCH 16/26] raise click exception instead of abort, allow pseudo list of products --- apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py b/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py index 3118e2051..364b364f0 100755 --- a/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py +++ 
b/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py @@ -48,7 +48,7 @@ def doc_error(uri, doc): def dump_to_odc( document_stream, dc: Datacube, - product: list, + products: list, transform=None, update=False, update_if_exists=False, @@ -57,7 +57,7 @@ def dump_to_odc( publish_action=None, **kwargs, ) -> Tuple[int, int, int]: - doc2ds = Doc2Dataset(dc.index, products=product, **kwargs) + doc2ds = Doc2Dataset(dc.index, products=products, **kwargs) ds_added = 0 ds_failed = 0 @@ -90,7 +90,7 @@ def dump_to_odc( except SkippedException: ds_skipped += 1 if not found_docs: - raise click.Abort("Doc stream was empty") + raise click.ClickException("Doc stream was empty") return ds_added, ds_failed, ds_skipped @@ -155,7 +155,7 @@ def cli( # if it's a uri, a product wasn't provided, and 'product' is actually another uri if product.startswith("s3://"): - candidate_product = [] + candidate_products = [] if isinstance(uri, str): uri = [uri, product] else: @@ -163,13 +163,15 @@ def cli( uri.append(product) else: # Check datacube connection and products - candidate_product = product.split() + candidate_products = product.split() odc_products = dc.list_products().name.values odc_products = set(odc_products) - if not set(candidate_product).issubset(odc_products): + if not set(candidate_products).issubset(odc_products): + missing_products = list(set(candidate_products) - odc_products) print( - f"Error: Requested Product {product} is not present in the ODC Database", + f"Error: Requested Product/s {', '.join(missing_products)} {'is' if len(missing_products) == 1 else 'are'} " + "not present in the ODC Database", file=sys.stderr, ) sys.exit(1) @@ -201,7 +203,7 @@ def cli( added, failed, skipped = dump_to_odc( fetcher(document_stream), dc, - candidate_product, + candidate_products, skip_lineage=skip_lineage, fail_on_missing_lineage=fail_on_missing_lineage, verify_lineage=verify_lineage, From 320e7e77bc9826ec0eb5e2e4e2cd041e89194b0c Mon Sep 17 00:00:00 2001 From: Ariana Barzinpour Date: Wed, 1 Nov 2023 01:42:47 +0000 Subject: [PATCH 17/26] appease linter --- apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py b/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py index 364b364f0..7f7c0a7b8 100755 --- a/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py +++ b/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py @@ -170,7 +170,8 @@ def cli( if not set(candidate_products).issubset(odc_products): missing_products = list(set(candidate_products) - odc_products) print( - f"Error: Requested Product/s {', '.join(missing_products)} {'is' if len(missing_products) == 1 else 'are'} " + f"Error: Requested Product/s {', '.join(missing_products)} " + f"{'is' if len(missing_products) == 1 else 'are'} " "not present in the ODC Database", file=sys.stderr, ) From f3d143b92628517561a0a1b1e1f1b8813cb2e47e Mon Sep 17 00:00:00 2001 From: Paul Haesler Date: Fri, 10 Nov 2023 11:42:38 +1100 Subject: [PATCH 18/26] Fix Python versions (#587) * Pin dc_tools python to 3.9-3.10 * Pin all other packages to Python>=3.9 * Updated GHAs and test environment to Python3.9 * Fix conda env references in GHA. * Require moto[server]?? * Add vim swapfiles to gitignore. * Remove ancient pin on aiobotocore * Remove upper bound on Python version in dc_tools. 
--- .github/workflows/main.yml | 10 +++++----- .gitignore | 3 +++ README.md | 2 +- apps/cloud/setup.cfg | 2 +- apps/dc_tools/setup.cfg | 2 +- libs/algo/setup.cfg | 2 +- libs/cloud/setup.cfg | 2 +- libs/io/setup.cfg | 2 +- libs/ui/setup.cfg | 2 +- tests/{test-env-py38.yml => test-env-py39.yml} | 11 +++++------ 10 files changed, 20 insertions(+), 18 deletions(-) rename tests/{test-env-py38.yml => test-env-py39.yml} (87%) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f81e83f11..8b0c406c7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -28,7 +28,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v1 with: - python-version: 3.8 + python-version: 3.9 - uses: actions/cache@v3 id: wheels_cache @@ -76,7 +76,7 @@ jobs: path: | tests/env - key: ${{ runner.os }}-test-env-py38-${{ hashFiles('tests/test-env-py38.yml') }} + key: ${{ runner.os }}-test-env-py39-${{ hashFiles('tests/test-env-py39.yml') }} - uses: conda-incubator/setup-miniconda@v2 if: steps.conda_cache.outputs.cache-hit != 'true' @@ -103,7 +103,7 @@ jobs: shell: bash -l {0} if: steps.conda_cache.outputs.cache-hit != 'true' run: | - mamba env create -f tests/test-env-py38.yml -p tests/env + mamba env create -f tests/test-env-py39.yml -p tests/env - name: Check Python Env shell: bash -l {0} @@ -128,7 +128,7 @@ jobs: path: | tests/env - key: ${{ runner.os }}-test-env-py38-${{ hashFiles('tests/test-env-py38.yml') }} + key: ${{ runner.os }}-test-env-py39-${{ hashFiles('tests/test-env-py39.yml') }} - name: Update PATH shell: bash @@ -211,7 +211,7 @@ jobs: path: | tests/env - key: ${{ runner.os }}-test-env-py38-${{ hashFiles('tests/test-env-py38.yml') }} + key: ${{ runner.os }}-test-env-py39-${{ hashFiles('tests/test-env-py39.yml') }} - name: Update PATH shell: bash diff --git a/.gitignore b/.gitignore index cf46a4390..7a54edea4 100644 --- a/.gitignore +++ b/.gitignore @@ -59,6 +59,9 @@ coverage.xml # Sphinx documentation docs/_build/ +# VIM swap files +.*.sw? 
+ # PyBuilder target/ .idea/ diff --git a/README.md b/README.md index 840b59e5d..93edf6524 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ The following steps are used in the GitHub Actions workflow `main.yml` ```bash # build environment from file -mamba env create -f tests/test-env-py38.yml +mamba env create -f tests/test-env-py39.yml # this environment name is defined in tests/test-env-py38.yml file conda activate odc-tests-py38 diff --git a/apps/cloud/setup.cfg b/apps/cloud/setup.cfg index 3d2bbc25e..2fccd5935 100644 --- a/apps/cloud/setup.cfg +++ b/apps/cloud/setup.cfg @@ -16,7 +16,7 @@ url = https://github.com/opendatacube/odc-tools/ include_package_data = true zip_safe = false packages = find_namespace: -python_requires = >=3.6 +python_requires = >=3.9 tests_require = pytest install_requires = odc-cloud[ASYNC] diff --git a/apps/dc_tools/setup.cfg b/apps/dc_tools/setup.cfg index d00307864..b9d13d21a 100644 --- a/apps/dc_tools/setup.cfg +++ b/apps/dc_tools/setup.cfg @@ -16,7 +16,7 @@ url = https://github.com/opendatacube/odc-tools/ include_package_data = true zip_safe = false packages = find_namespace: -python_requires = >=3.7 +python_requires = >= 3.9 tests_require = pytest deepdiff diff --git a/libs/algo/setup.cfg b/libs/algo/setup.cfg index ab6f116f6..64012aec4 100644 --- a/libs/algo/setup.cfg +++ b/libs/algo/setup.cfg @@ -16,7 +16,7 @@ url = https://github.com/opendatacube/odc-tools/ include_package_data = true zip_safe = false packages = find_namespace: -python_requires = >=3.6 +python_requires = >=3.9 tests_require = pytest install_requires = affine diff --git a/libs/cloud/setup.cfg b/libs/cloud/setup.cfg index 7c7c97fdc..4f3582483 100644 --- a/libs/cloud/setup.cfg +++ b/libs/cloud/setup.cfg @@ -16,7 +16,7 @@ url = https://github.com/opendatacube/odc-tools/ include_package_data = true zip_safe = false packages = find_namespace: -python_requires = >=3.7 +python_requires = >=3.9 tests_require = pytest install_requires = botocore diff --git a/libs/io/setup.cfg b/libs/io/setup.cfg index 74822f35e..aed11f91c 100644 --- a/libs/io/setup.cfg +++ b/libs/io/setup.cfg @@ -16,7 +16,7 @@ url = https://github.com/opendatacube/odc-tools/ include_package_data = true zip_safe = false packages = find_namespace: -python_requires = >=3.6 +python_requires = >=3.9 tests_require = pytest install_requires = diff --git a/libs/ui/setup.cfg b/libs/ui/setup.cfg index 868acf736..625fa37e4 100644 --- a/libs/ui/setup.cfg +++ b/libs/ui/setup.cfg @@ -16,7 +16,7 @@ url = https://github.com/opendatacube/odc-tools/ include_package_data = true zip_safe = false packages = find_namespace: -python_requires = >=3.6 +python_requires = >=3.9 tests_require = pytest install_requires = datacube diff --git a/tests/test-env-py38.yml b/tests/test-env-py39.yml similarity index 87% rename from tests/test-env-py38.yml rename to tests/test-env-py39.yml index 4ebce4e59..606c7cc70 100644 --- a/tests/test-env-py38.yml +++ b/tests/test-env-py39.yml @@ -1,13 +1,13 @@ # Conda environment for running tests in odc-tools -# conda env create -f test-env-py38.yml -# conda activate odc-tests-py38 +# conda env create -f test-env-py39.yml +# conda activate odc-tests-py39 -name: odc-tests-py38 +name: odc-tests-py39 channels: - conda-forge dependencies: - - python=3.8 + - python=3.9 # Datacube - datacube>=1.8.15 @@ -38,7 +38,7 @@ dependencies: # odc.{aws,aio}: aiobotocore/boto3 # pin aiobotocore for easier resolution of dependencies - - aiobotocore==1.4.2 + - aiobotocore - boto3 # For tests @@ -49,7 +49,6 @@ dependencies: - moto - 
deepdiff - # for docs - sphinx - sphinx_rtd_theme From f25040f5de2216a98e4f8d3a33ab907e0572a8ea Mon Sep 17 00:00:00 2001 From: Paul Haesler Date: Fri, 10 Nov 2023 12:02:24 +1100 Subject: [PATCH 19/26] Remove odc-algo. (Now has it's own repo) --- .github/workflows/main.yml | 1 - README.md | 12 +- libs/algo/LICENSE | 201 -------- libs/algo/README.md | 16 - libs/algo/odc/algo/__init__.py | 110 ---- libs/algo/odc/algo/_broadcast.py | 91 ---- libs/algo/odc/algo/_dask.py | 614 ---------------------- libs/algo/odc/algo/_dask_stream.py | 90 ---- libs/algo/odc/algo/_geomedian.py | 450 ---------------- libs/algo/odc/algo/_grouper.py | 94 ---- libs/algo/odc/algo/_masking.py | 797 ----------------------------- libs/algo/odc/algo/_memsink.py | 430 ---------------- libs/algo/odc/algo/_numeric.py | 97 ---- libs/algo/odc/algo/_numexpr.py | 151 ------ libs/algo/odc/algo/_percentile.py | 139 ----- libs/algo/odc/algo/_rgba.py | 168 ------ libs/algo/odc/algo/_tiff.py | 572 --------------------- libs/algo/odc/algo/_tools.py | 34 -- libs/algo/odc/algo/_types.py | 9 - libs/algo/odc/algo/_version.py | 1 - libs/algo/odc/algo/_warp.py | 351 ------------- libs/algo/odc/algo/io.py | 401 --------------- libs/algo/odc/algo/pixel.py | 167 ------ libs/algo/pyproject.toml | 3 - libs/algo/setup.cfg | 42 -- libs/algo/setup.py | 3 - libs/algo/tests/test_dask.py | 190 ------- libs/algo/tests/test_grouper.py | 67 --- libs/algo/tests/test_io.py | 41 -- libs/algo/tests/test_masking.py | 274 ---------- libs/algo/tests/test_memsink.py | 146 ------ libs/algo/tests/test_numeric.py | 57 --- libs/algo/tests/test_percentile.py | 136 ----- libs/algo/tests/test_warp.py | 8 - tests/test-env-py39.yml | 10 - 35 files changed, 1 insertion(+), 5972 deletions(-) delete mode 100644 libs/algo/LICENSE delete mode 100644 libs/algo/README.md delete mode 100644 libs/algo/odc/algo/__init__.py delete mode 100644 libs/algo/odc/algo/_broadcast.py delete mode 100644 libs/algo/odc/algo/_dask.py delete mode 100644 libs/algo/odc/algo/_dask_stream.py delete mode 100644 libs/algo/odc/algo/_geomedian.py delete mode 100644 libs/algo/odc/algo/_grouper.py delete mode 100644 libs/algo/odc/algo/_masking.py delete mode 100644 libs/algo/odc/algo/_memsink.py delete mode 100644 libs/algo/odc/algo/_numeric.py delete mode 100644 libs/algo/odc/algo/_numexpr.py delete mode 100644 libs/algo/odc/algo/_percentile.py delete mode 100644 libs/algo/odc/algo/_rgba.py delete mode 100644 libs/algo/odc/algo/_tiff.py delete mode 100644 libs/algo/odc/algo/_tools.py delete mode 100644 libs/algo/odc/algo/_types.py delete mode 100644 libs/algo/odc/algo/_version.py delete mode 100644 libs/algo/odc/algo/_warp.py delete mode 100644 libs/algo/odc/algo/io.py delete mode 100644 libs/algo/odc/algo/pixel.py delete mode 100644 libs/algo/pyproject.toml delete mode 100644 libs/algo/setup.cfg delete mode 100644 libs/algo/setup.py delete mode 100644 libs/algo/tests/test_dask.py delete mode 100644 libs/algo/tests/test_grouper.py delete mode 100644 libs/algo/tests/test_io.py delete mode 100644 libs/algo/tests/test_masking.py delete mode 100644 libs/algo/tests/test_memsink.py delete mode 100644 libs/algo/tests/test_numeric.py delete mode 100644 libs/algo/tests/test_percentile.py delete mode 100644 libs/algo/tests/test_warp.py diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8b0c406c7..09dfeca26 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -265,7 +265,6 @@ jobs: strategy: matrix: pkg: - - odc-algo - odc-cloud - odc-io - odc-ui diff --git a/README.md 
b/README.md index 93edf6524..11184c5b2 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,6 @@ and [CLI tools](https://github.com/opendatacube/odc-tools/tree/develop/apps) for Full list of libraries, and install instructions: -- `odc.algo` algorithms (GeoMedian wrapper is here) - `odc.ui` tools for data visualization in notebook/lab - `odc.io` common IO utilities, used by apps mainly - `odc-cloud[ASYNC,AZURE,THREDDS]` cloud crawling support package @@ -38,7 +37,6 @@ pip install \ odc-ui \ odc-stac \ odc-stats \ - odc-algo \ odc-io \ odc-cloud[ASYNC] \ odc-dscache @@ -51,7 +49,7 @@ Some **odc-tools** are available via `conda` from the `conda-forge` channel. ``` -conda install -c conda-forge odc-apps-dc-tools odc-algo odc-io odc-cloud +conda install -c conda-forge odc-apps-dc-tools odc-io odc-cloud ``` @@ -181,13 +179,6 @@ dependencies: - python-lmdb - zstandard - # odc.algo - - dask-image - - numexpr - - scikit-image - - scipy - - toolz - # odc.ui - ipywidgets - ipyleaflet @@ -246,7 +237,6 @@ dependencies: # odc-tools libs - odc-stac - - odc-algo - odc-ui - odc-dscache - odc-stats diff --git a/libs/algo/LICENSE b/libs/algo/LICENSE deleted file mode 100644 index 261eeb9e9..000000000 --- a/libs/algo/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/libs/algo/README.md b/libs/algo/README.md deleted file mode 100644 index 2a2276b41..000000000 --- a/libs/algo/README.md +++ /dev/null @@ -1,16 +0,0 @@ -odc.algo -======== - -Algorithm utils of various kind. 
- -Installation ------------- - -``` -pip install odc-algo -``` - -Usage ------ - -TODO diff --git a/libs/algo/odc/algo/__init__.py b/libs/algo/odc/algo/__init__.py deleted file mode 100644 index 6b3e33e28..000000000 --- a/libs/algo/odc/algo/__init__.py +++ /dev/null @@ -1,110 +0,0 @@ -""" Various Algorithmic Helpers - -""" - -from ._broadcast import pool_broadcast -from ._dask import ( - chunked_persist, - chunked_persist_da, - chunked_persist_ds, - randomize, - reshape_yxbt, - wait_for_future, -) -from ._dask_stream import dask_compute_stream, seq_to_bags -from ._geomedian import ( - geomedian_with_mads, - int_geomedian, - int_geomedian_np, - reshape_for_geomedian, - xr_geomedian, -) -from ._masking import ( - binary_closing, - binary_dilation, - binary_erosion, - binary_opening, - choose_first_valid, - enum_to_bool, - erase_bad, - fmask_to_bool, - from_float, - from_float_np, - gap_fill, - keep_good_np, - keep_good_only, - mask_cleanup, - mask_cleanup_np, - to_f32, - to_f32_np, - to_float, - to_float_np, -) -from ._memsink import ( - da_mem_sink, - da_yxbt_sink, - da_yxt_sink, - store_to_mem, - yxbt_sink, - yxbt_sink_to_mem, - yxt_sink, -) -from ._numexpr import apply_numexpr, safe_div -from ._percentile import xr_quantile -from ._rgba import colorize, is_rgb, to_rgba, to_rgba_np -from ._tiff import save_cog -from ._version import __version__ -from ._warp import xr_reproject - -__all__ = ( - "apply_numexpr", - "safe_div", - "keep_good_np", - "keep_good_only", - "erase_bad", - "from_float", - "from_float_np", - "to_f32", - "to_f32_np", - "to_float", - "to_float_np", - "fmask_to_bool", - "enum_to_bool", - "mask_cleanup", - "mask_cleanup_np", - "binary_opening", - "binary_closing", - "binary_dilation", - "binary_erosion", - "gap_fill", - "choose_first_valid", - "xr_geomedian", - "int_geomedian", - "int_geomedian_np", - "reshape_for_geomedian", - "geomedian_with_mads", - "reshape_yxbt", - "wait_for_future", - "chunked_persist", - "chunked_persist_da", - "chunked_persist_ds", - "randomize", - "store_to_mem", - "yxbt_sink_to_mem", - "yxbt_sink", - "yxt_sink", - "da_yxt_sink", - "da_mem_sink", - "da_yxbt_sink", - "is_rgb", - "to_rgba", - "to_rgba_np", - "colorize", - "xr_reproject", - "save_cog", - "xr_quantile", - "pool_broadcast", - "dask_compute_stream", - "seq_to_bags", - "__version__", -) diff --git a/libs/algo/odc/algo/_broadcast.py b/libs/algo/odc/algo/_broadcast.py deleted file mode 100644 index 610cee745..000000000 --- a/libs/algo/odc/algo/_broadcast.py +++ /dev/null @@ -1,91 +0,0 @@ -""" Dask Distributed Tools - - - pool_broadcast -""" -from dask.distributed import Client, Queue -from random import randint -from typing import Any, Dict, List - - -def _bcast_action( - q1: Queue, q2: Queue, tk: int, action: Any, args: List[Any], kwargs: Dict[str, Any] -) -> Any: - """ - - :param q1: Will put to ``tk`` into this queue first - :param q2: Will get an item from this queue after completing action - :param tk: Token to identify this task - :param action: Callable - :param args: Ordered arguments to action - :param kwargs: Named arguments to action - - :returns: result of calling action(*args, **kwargs) - """ - q1.put(tk) # tell main thread we started - try: - x = action(*args, **kwargs) - finally: - # wait for all threads to start - # (q2 is expected to be empty until q1 is filled) - q2.get() - return x - - -def pool_broadcast( - client: Client, action: Any, *args: List[Any], **kwargs: Dict[str, Any] -): - """Call ``action(*args, **kwargs)`` on every worker thread. 
- - This function block until all tasks are complete, expectation is - that this is called at the very beginning on an empty pool, if called - on a busy pool this will block until all active tasks are complete. - - Broadcast is achieved by blocking every task until all tasks have started, - every worker does the following: - - 1. Let the primary task know this task has started - 2. Perform action - 3. Wait for all other tasks to start - 4. Finish - - Steps (1) and (3) are achieved using distributed Queues, step (1) is a - non-blocking ``put`` and step (3) is a blocking ``get``. - - :param client: Dask client object - :param action: Callable `action(*args, **kwargs)` - :param args: Ordered arguments to action - :param kwargs: Named arguments to action - - """ - postfix = "-{:02x}".format(randint(0, 1 << 64)) - total_worker_threads = sum(client.ncores().values()) - q1 = Queue("q1" + postfix, client=client, maxsize=total_worker_threads) - q2 = Queue("q2" + postfix, client=client, maxsize=total_worker_threads) - - ff = [ - client.submit( - _bcast_action, - q1, - q2, - i, - action, - args, - kwargs, - key="broadcast_action_{:04d}{}".format(i, postfix), - ) - for i in range(total_worker_threads) - ] - - tks = set() - for _ in range(total_worker_threads): - tks.add(q1.get()) # blocking - - assert len(tks) == total_worker_threads - - # at this point all workers have launched - # allow them to continue - for i in range(total_worker_threads): - q2.put(i) # should not block - - # block until all done and return result - return [f.result() for f in ff] diff --git a/libs/algo/odc/algo/_dask.py b/libs/algo/odc/algo/_dask.py deleted file mode 100644 index ce2732886..000000000 --- a/libs/algo/odc/algo/_dask.py +++ /dev/null @@ -1,614 +0,0 @@ -""" -Generic dask helpers -""" - -import dask -import dask.array as da -import functools -import numpy as np -import toolz -import xarray as xr -from bisect import bisect_left, bisect_right -from dask import is_dask_collection -import dask.distributed -from dask.distributed import wait as dask_wait -from dask.highlevelgraph import HighLevelGraph -from datetime import datetime -from random import randint -from toolz import partition_all -from typing import Any, Dict, Hashable, Iterator, List, Optional, Tuple, Union, cast - -from ._tools import ROI, roi_shape, slice_in_out - - -def chunked_persist(data, n_concurrent, client, verbose=False): - """ - Force limited concurrency when persisting a large collection. - - This is useful to control memory usage when operating close to capacity. - - Sometimes `client.persist(data)` will run out of memory, not because - fully-realized data is large, but because of intermediate data memory - requirements. This is particularly common when using local dask cluster - with only one worker. - - This function forces evaluation order of the dask graph to control peak - memory usage. - - Say you have a largish task graph of 10x10 top-level sub-tasks, you have - enough memory to process 5 sub-tasks concurrently, but Dask might decide - to schedule more than that and will cause worker restarts due to out of - memory errors. With this function you can force dask scheduler to - persist this collection in batches of 5 concurrent sub-tasks, keeping - the computation within the memory budget. 
- """ - delayed = data.to_delayed().ravel() - - persisted = [] - for chunk in partition_all(n_concurrent, delayed): - chunk = client.persist(chunk) - _ = dask_wait(chunk) - persisted.extend(chunk) - if verbose: - print(".", end="") - - # at this point it should be almost no-op - return client.persist(data) - - -def chunked_persist_da( - xx: xr.DataArray, n_concurrent, client, verbose=False -) -> xr.DataArray: - data = chunked_persist(xx.data, n_concurrent, client=client, verbose=verbose) - return xr.DataArray(data, dims=xx.dims, coords=xx.coords, attrs=xx.attrs) - - -def chunked_persist_ds(xx: xr.Dataset, client, verbose: bool = False) -> xr.Dataset: - names = list(xx.data_vars) - data = [xx[n].data for n in names] - delayed = [d.to_delayed().ravel() for d in data] - delayed = list(zip(*delayed)) - - persisted = [] - for chunk in delayed: - chunk = client.persist(chunk) - _ = dask_wait(chunk) - persisted.extend(chunk) - if verbose: - print(".", end="") - - # at this point it should be almost no-op - data = client.persist(data) - - # reconstruct xr.Dataset from persisted chunks - _vars = {} - for n, d in zip(names, data): - dv = xx[n] - _vars[n] = xr.DataArray(data=d, dims=dv.dims, coords=dv.coords, name=n) - - return xr.Dataset(_vars) - - -def randomize(prefix: str) -> str: - """ - Append random token to name - """ - return "{}-{:08x}".format(prefix, randint(0, 0xFFFFFFFF)) - - -@dask.delayed -def with_deps(value, *deps): - return value - - -def list_reshape(x: List[Any], shape: Tuple[int, ...]) -> List[Any]: - """ - similar to numpy version of x.reshape(shape), but only works on flat list on input. - """ - for n in shape[1:][::-1]: - x = list(map(list, toolz.partition(n, x))) - return x - - -def unpack_chunksize(chunk: int, N: int) -> Tuple[int, ...]: - """ - Compute chunk sizes - Example: 4, 11 -> (4, 4, 3) - """ - if chunk >= N or chunk < 0: - return (N,) - - nb = N // chunk - last_chunk = N - chunk * nb - if last_chunk == 0: - return tuple(chunk for _ in range(nb)) - - return tuple(chunk for _ in range(nb)) + (last_chunk,) - - -def unpack_chunks( - chunks: Tuple[int, ...], shape: Tuple[int, ...] -) -> Tuple[Tuple[int, ...], ...]: - """ - Expand chunks - """ - assert len(chunks) == len(shape) - return tuple(unpack_chunksize(ch, n) for ch, n in zip(chunks, shape)) - - -def _roi_from_chunks(chunks: Tuple[int, ...]) -> Iterator[slice]: - off = 0 - for v in chunks: - off_next = off + v - yield slice(off, off_next) - off = off_next - - -def _split_chunks( - chunks: Tuple[int, ...], max_chunk: int -) -> Iterator[Tuple[int, int, slice]]: - """ - For every input chunk split it into smaller chunks. - Return a list of tuples describing output chunks and their relation to input chunks. - - Output: [(dst_idx: int, src_idx: int, src_roi: slice] - - Note that every output chunk has only one chunk on input, - so chunking might be irregular on output. This is by design, to avoid - creating cross chunk dependencies. 
- """ - dst_idx = 0 - for src_idx, src_sz in enumerate(chunks): - off = 0 - while off < src_sz: - sz = src_sz - off - if max_chunk > 0: - sz = min(sz, max_chunk) - yield (dst_idx, src_idx, slice(off, off + sz)) - dst_idx += 1 - off += sz - - -def _get_chunks_asarray(xx: da.Array) -> np.ndarray: - """ - Returns 2 ndarrays of equivalent shapes - - - First one contains dask tasks: (name: str, idx0:int, idx1:int) - - Second one contains sizes of blocks (Tuple[int,...]) - """ - shape_in_chunks = xx.numblocks - name = xx.name - - chunks = np.ndarray(shape_in_chunks, dtype="object") - shapes = np.ndarray(shape_in_chunks, dtype="object") - for idx in np.ndindex(shape_in_chunks): - chunks[idx] = (name, *idx) - shapes[idx] = tuple(xx.chunks[k][i] for k, i in enumerate(idx)) - return chunks, shapes - - -def _get_chunks_for_all_bands(xx: xr.Dataset): - """ - Equivalent to _get_chunks_asarray(xx.to_array('band').data) - """ - blocks = [] - shapes = [] - - for dv in xx.data_vars.values(): - b, s = _get_chunks_asarray(dv.data) - blocks.append(b) - shapes.append(s) - - blocks = np.stack(blocks) - shapes = np.stack(shapes) - return blocks, shapes - - -def _get_all_chunks(xx: da.Array, flat: bool = True) -> List[Any]: - shape_in_chunks = xx.numblocks - name = xx.name - chunks = [(name, *idx) for idx in np.ndindex(shape_in_chunks)] - if flat: - return chunks - return list_reshape(chunks, shape_in_chunks) - - -def is_single_chunk_xy(x: da.Array): - """ - True if last 2 dimensions are 1x1 sized in blocks - """ - return x.numblocks[-2:] == (1, 1) - - -def empty_maker(fill_value, dtype, dsk, name="empty"): - cache = {} - - def mk_empty(shape: Tuple[int, ...]) -> str: - x = cache.get(shape, None) - if x is not None: - return x - - b_name = name + "_" + "x".join(str(i) for i in shape) - b_name = randomize(b_name) - cache[shape] = b_name - dsk[b_name] = (np.full, shape, fill_value, dtype) - return b_name - - return mk_empty - - -def _stack_2d_np(shape_in_blocks, *blocks, out=None, axis=0): - """ - Stack a bunch of blocks into one plane. - - Takes a flat sequence of blocks in row major order an rearranges them onto a plane. - - Example: - (2, 3) [a0, a1, a2, a3, a4, a5] - >> - [[a0, a1, a2], - [a3, a4, a5]] - - By default assume that y,x dimensions are first, i.e. 
a[y,x] or a[y, x, band], - but one can also stack blocks with extra dimensions by supplying axis= parameter, - Example: for blocks like this: a[t, y, x, band] use axis=1 - - :param shape_in_blocks: (ny, nx) number of blocks - :param blocks: Blocks in row major order - :param out: Allows re-use of memory, it must match dtype and output shape exactly - :param axis: Index of y axis, x axis is then axis+1 (default is axis=0) - """ - assert len(blocks) > 0 - assert len(shape_in_blocks) == 2 - assert shape_in_blocks[0] * shape_in_blocks[1] == len(blocks) - - dtype = blocks[0].dtype - bshape = blocks[0].shape - dims1 = bshape[:axis] - dims2 = bshape[axis + 2 :] - idx1 = tuple(slice(0, None) for _ in range(len(dims1))) - idx2 = tuple(slice(0, None) for _ in range(len(dims2))) - - h, w = shape_in_blocks - - chunk_y = [b.shape[axis + 0] for b in blocks[0 : h * w : w]] - chunk_x = [b.shape[axis + 1] for b in blocks[:w]] - offset = [np.cumsum(x) for x in ([0] + chunk_y, [0] + chunk_x)] - ny, nx = [x[-1] for x in offset] - - if out is None: - out = np.empty((*dims1, ny, nx, *dims2), dtype=dtype) - else: - pass # TODO: verify out shape is ok - - for block, idx in zip(blocks, np.ndindex(shape_in_blocks)): - ny, nx = block.shape[axis : axis + 2] - _y, _x = (offset[i][j] for i, j in zip([0, 1], idx)) - - idx = (*idx1, slice(_y, _y + ny), slice(_x, _x + nx), *idx2) - - out[idx] = block - - return out - - -def _extract_as_one_block(axis, crop, shape_in_blocks, *blocks): - out = _stack_2d_np(shape_in_blocks, *blocks, axis=axis) - if crop is None: - return out - return out[crop] - - -def _chunk_getter(xx: da.Array): - """ - _chunk_getter(xx)(np.s_[:3, 2:4]) -> ( - (xx.name, 0, 2), - (xx.name, 0, 3), - (xx.name, 1, 2), - ...) - """ - shape_in_chunks = xx.numblocks - name = xx.name - xx = np.asarray([{"v": tuple(idx)} for idx in np.ndindex(shape_in_chunks)]).reshape( - shape_in_chunks - ) - - def getter(roi): - return tuple((name, *x["v"]) for x in xx[roi].ravel()) - - return getter - - -def _rechunk_2x2(xx, name="2x2"): - """ - this is for testing only, ignore it, it's not robust - """ - assert xx.ndim == 2 - name = randomize(name) - ny, nx = (len(ch) // 2 for ch in xx.chunks[:2]) - - dsk = {} - chunks = _chunk_getter(xx) - - for r, c in np.ndindex((ny, nx)): - r2 = r * 2 - c2 = c * 2 - ch_idx = np.s_[r2 : r2 + 2, c2 : c2 + 2] - _xx = chunks(ch_idx) - dsk[(name, r, c)] = (_stack_2d_np, (2, 2), *_xx) - - chy = tuple(xx.chunks[0][i * 2] + xx.chunks[0][i * 2 + 1] for i in range(ny)) - chx = tuple(xx.chunks[1][i * 2] + xx.chunks[1][i * 2 + 1] for i in range(nx)) - - chunks = (chy, chx) - dsk = HighLevelGraph.from_collections(name, dsk, dependencies=(xx,)) - - return da.Array(dsk, name, chunks=chunks, dtype=xx.dtype, shape=xx.shape) - - -def _compute_chunk_range( - span: slice, chunks: Tuple[int, ...], summed: bool = False -) -> Tuple[slice, slice]: - """ - Compute slice in chunk space and slice after taking just those chunks - - :param span: example: `np.s_[:10]` - :param chunks: example: xx.chunks[0] - - """ - cs = chunks if summed else tuple(np.cumsum(chunks)) - n = cs[-1] - - _in, _out = slice_in_out(span, n) - - b_start = bisect_right(cs, _in) - b_end = bisect_left(cs, _out) + 1 - - offset = _in - (0 if b_start == 0 else cs[b_start - 1]) - sz = _out - _in - - return slice(b_start, b_end), slice(offset, offset + sz) - - -def compute_chunk_range( - roi: ROI, - chunks: Union[Tuple[int, ...], Tuple[Tuple[int, ...]]], - summed: bool = False, -) -> Tuple[ROI, ROI]: - """ - Convert ROI in pixels to ROI in blocks 
(broi) + ROI in pixels (crop) such that - - xx[roi] == stack_blocks(blocks(xx)[broi])[crop] - - Returns - ======= - broi, crop - """ - if isinstance(roi, slice): - chunks = cast(Tuple[int, ...], chunks) - return _compute_chunk_range(roi, chunks, summed) - - chunks = cast(Tuple[Tuple[int, ...]], chunks) - assert len(roi) == len(chunks) - broi = [] - crop = [] - - for span, _chunks in zip(roi, chunks): - bspan, pspan = _compute_chunk_range(span, _chunks) - broi.append(bspan) - crop.append(pspan) - - return tuple(broi), tuple(crop) - - -def crop_2d_dense( - xx: da.Array, yx_roi: Tuple[slice, slice], name: str = "crop_2d", axis: int = 0 -) -> da.Array: - """ - xx[.., yx_roi, ..] -> Dask array with 1 single chunk in y,x dimension - """ - assert len(yx_roi) == 2 - - yx_broi, yx_crop = compute_chunk_range(yx_roi, xx.chunks[axis : axis + 2]) - assert isinstance(yx_crop, tuple) - assert isinstance(yx_broi, tuple) - - xx_chunks = _chunk_getter(xx) - bshape = roi_shape(yx_broi) - - # tuple(*dims1, y, x, *dims2) -- complete shape in blocks - dims1 = tuple(map(len, xx.chunks[:axis])) - dims2 = tuple(map(len, xx.chunks[axis + 2 :])) - - # Adjust crop to include non-yx dimensions - crop = ( - tuple(slice(0, None) for _ in dims1) - + yx_crop - + tuple(slice(0, None) for _ in dims2) - ) - - name = randomize(name) - dsk = {} - for ii1 in np.ndindex(dims1): - roi_ii1 = tuple(slice(i, i + 1) for i in ii1) - for ii2 in np.ndindex(dims2): - roi_ii2 = tuple(slice(i, i + 1) for i in ii2) - broi = roi_ii1 + yx_broi + roi_ii2 - blocks = xx_chunks(broi) - assert len(blocks) == bshape[0] * bshape[1] - dsk[(name, *ii1, 0, 0, *ii2)] = ( - _extract_as_one_block, - axis, - crop, - bshape, - *blocks, - ) - - dsk = HighLevelGraph.from_collections(name, dsk, dependencies=(xx,)) - yx_shape = roi_shape(yx_crop) - yx_chunks = tuple((n,) for n in yx_shape) - chunks = xx.chunks[:axis] + yx_chunks + xx.chunks[axis + 2 :] - shape = (*xx.shape[:axis], *yx_shape, *xx.shape[axis + 2 :]) - - return da.Array(dsk, name, chunks=chunks, dtype=xx.dtype, shape=shape) - - -def _reshape_yxbt_impl(blocks, crop_yx=None, dtype=None): - """ - input axis order : (band, time,) (blocks: y, x) - output axis order: y, x, band, time - """ - - def squeeze_to_yx(x): - idx = tuple(0 for _ in x.shape[:-2]) - return x[idx] - - assert len(blocks) > 0 - nb = len(blocks) - nt = len(blocks[0]) - b = squeeze_to_yx(blocks[0][0]) - - if dtype is None: - dtype = b.dtype - - if crop_yx: - b = b[crop_yx] - ny, nx = b.shape - - dst = np.empty((ny, nx, nb, nt), dtype=dtype) - for it, ib in np.ndindex((nt, nb)): - b = squeeze_to_yx(blocks[ib][it]) - if crop_yx is not None: - b = b[crop_yx] - - assert b.shape == (ny, nx) - dst[:, :, ib, it] = b - - return dst - - -def reshape_yxbt( - xx: xr.Dataset, - name: str = "reshape_yxbt", - yx_chunks: Union[int, Tuple[int, int]] = -1, -) -> xr.DataArray: - """ - Reshape Dask-backed ``xr.Dataset[Time,Y,X]`` into - ``xr.DataArray[Y,X,Band,Time]``. On the output DataArray there is - exactly one chunk along both Time and Band dimensions. - - :param xx: Dataset with 3 dimensional bands, dimension order (time, y, x) - - :param name: Dask name of the output operation - - :param yx_chunks: If supplied subdivide YX chunks of input into smaller - sections, note that this can only make yx chunks smaller - not bigger. Every output chunk depends on one input chunk - only, so output chunks might not be regular, for example - if input chunk sizes are 10, and yx_chunks=3, you'll get - chunks sized 3,3,3,1,3,3,3,1... 
(example only, never use chunks - that small) - - .. note: - - Chunks along first dimension ought to be of size 1 exactly (default for - time dimension when using dc.load). - """ - if isinstance(yx_chunks, int): - yx_chunks = (yx_chunks, yx_chunks) - - if not is_dask_collection(xx): - raise ValueError("Currently this code works only on Dask inputs") - - if not all( - dv.data.numblocks[0] == dv.data.shape[0] for dv in xx.data_vars.values() - ): - raise ValueError("All input bands should have chunk=1 for the first dimension") - - name0 = name - name = randomize(name) - - blocks, _ = _get_chunks_for_all_bands(xx) - b0, *_ = xx.data_vars.values() - - attrs = dict(b0.attrs) - nb = len(xx.data_vars.values()) - nt, ny, nx = b0.shape - - deps = [dv.data for dv in xx.data_vars.values()] - shape = (ny, nx, nb, nt) - dtype = b0.dtype - dims = b0.dims[1:] + ("band", b0.dims[0]) - - maxy, maxx = yx_chunks - ychunks, xchunks = b0.data.chunks[1:3] - _yy = list(_split_chunks(ychunks, maxy)) - _xx = list(_split_chunks(xchunks, maxx)) - ychunks = tuple(roi.stop - roi.start for _, _, roi in _yy) - xchunks = tuple(roi.stop - roi.start for _, _, roi in _xx) - - chunks = [ychunks, xchunks, (nb,), (nt,)] - - dsk = {} - for iy, iy_src, y_roi in _yy: - for ix, ix_src, x_roi in _xx: - crop_yx = (y_roi, x_roi) - _blocks = blocks[:, :, iy_src, ix_src].tolist() - dsk[(name, iy, ix, 0, 0)] = ( - functools.partial(_reshape_yxbt_impl, crop_yx=crop_yx), - _blocks, - ) - - dsk = HighLevelGraph.from_collections(name, dsk, dependencies=deps) - data = da.Array(dsk, name, chunks=chunks, dtype=dtype, shape=shape) - - coords: Dict[Hashable, Any] = dict(xx.coords.items()) - coords["band"] = list(xx.data_vars) - - return xr.DataArray(data=data, dims=dims, coords=coords, name=name0, attrs=attrs) - - -def flatten_kv(xx): - """ - Turn dictionary into a flat list: [k0, v0, k1, v1, ...]. - - Useful for things like map_blocks when passing Dict[str, da.Array] for example. - """ - - def _kv(xx): - for k, v in xx.items(): - yield k - yield v - - return list(_kv(xx)) - - -def unflatten_kv(xx): - """ - Reverse operation of `flatten_kv` - """ - return dict(toolz.partition_all(2, xx)) - - -def wait_for_future( - future, poll_timeout: float = 1.0, t0: Optional[datetime] = None -) -> Iterator[Tuple[float, datetime]]: - """ - Generate a sequence of (time_passed, timestamp) tuples, stop when future becomes ready. 
- - :param future: Dask future - :param poll_timeout: Controls how often - :param t0: From what point to start counting (defaults to right now) - """ - if t0 is None: - t0 = datetime.utcnow() - - while not future.done(): - try: - dask_wait(future, timeout=poll_timeout, return_when="FIRST_COMPLETED") - return - except dask.distributed.TimeoutError: - pass - t_now = datetime.utcnow() - - yield (t_now - t0).total_seconds(), t_now diff --git a/libs/algo/odc/algo/_dask_stream.py b/libs/algo/odc/algo/_dask_stream.py deleted file mode 100644 index 9b4f963fb..000000000 --- a/libs/algo/odc/algo/_dask_stream.py +++ /dev/null @@ -1,90 +0,0 @@ -""" Dask Distributed Tools - - - dask_compute_stream -""" -import dask.bag -import queue -import threading -import toolz -from dask.distributed import Client -from random import randint -from typing import Any, Iterable - - -def _randomize(prefix): - return "{}-{:08x}".format(prefix, randint(0, 0xFFFFFFFF)) - - -def seq_to_bags(its: Iterable[Any], chunk_sz: int, name: str = "data"): - """Take a stream of data items and return a stream of dask.bag.Bag - each bag (except last) containing ``chunk_sz`` elements in 1 partition. - """ - for chunk in toolz.partition_all(chunk_sz, its): - prefix = _randomize(name) - dsk = {(prefix, 0): chunk} - yield dask.bag.Bag(dsk, prefix, 1) - - -def dask_compute_stream( - client: Client, - func: Any, - its: Iterable[Any], - lump: int = 10, - max_in_flight: int = 1000, - name: str = "compute", -) -> Iterable[Any]: - """Parallel map with back pressure. - - Equivalent to this: - - (func(x) for x in its) - - Except that ``func(x)`` runs concurrently on dask cluster. - - :param client: Connected dask client - :param func: Method that will be applied concurrently to data from ``its`` - :param its: Iterator of input values - :param lump: Group this many datasets into one task - :param max_in_flight: Maximum number of active tasks to submit - :param name: Dask name for computation - """ - - def lump_proc(dd): - if dd is None: - return None - return [func(d) for d in dd] - - max_in_flight = max(2, max_in_flight // lump) - wrk_q = queue.Queue(maxsize=max_in_flight) - - data_name = _randomize("data_" + name) - name = _randomize(name) - priority = 2**31 - - def feeder(its, lump, q, client): - for i, x in enumerate(toolz.partition_all(lump, its)): - key = name + str(i) - data_key = data_name + str(i) - task = client.get( - {key: (lump_proc, data_key), data_key: x}, - key, - priority=priority - i, - sync=False, - ) - q.put(task) # maybe blocking - - q.put(None) # EOS marker - - in_thread = threading.Thread(target=feeder, args=(its, lump, wrk_q, client)) - in_thread.start() - - while True: - yy = wrk_q.get() # maybe blocking - - if yy is None: - break - - yield from yy.result() - del yy - - in_thread.join() diff --git a/libs/algo/odc/algo/_geomedian.py b/libs/algo/odc/algo/_geomedian.py deleted file mode 100644 index 31fed3e7b..000000000 --- a/libs/algo/odc/algo/_geomedian.py +++ /dev/null @@ -1,450 +0,0 @@ -""" Helper methods for Geometric Median computation. 
-""" -import dask -import dask.array as da -import functools -import numpy as np -import xarray as xr -from typing import Optional, Tuple, Union - -from ._dask import randomize, reshape_yxbt -from ._masking import from_float_np, to_float_np -from ._memsink import yxbt_sink - -# pylint: disable=import-outside-toplevel - - -def reshape_for_geomedian(ds, axis="time"): - dims = set(v.dims for v in ds.data_vars.values()) - if len(dims) != 1: - raise ValueError("All bands should have same dimensions") - - dims = dims.pop() - if len(dims) != 3: - raise ValueError("Expect 3 dimensions on input") - - if axis not in dims: - raise ValueError(f"No such axis: {axis}") - - dims = tuple(d for d in dims if d != axis) + ("band", axis) - - nodata = set(getattr(v, "nodata", None) for v in ds.data_vars.values()) - if len(nodata) == 1: - nodata = nodata.pop() - else: - nodata = None - - # xx: {y, x}, band, time - xx = ds.to_array(dim="band").transpose(*dims) - - if nodata is not None: - xx.attrs.update(nodata=nodata) - - return xx - - -def xr_geomedian(ds, axis="time", where=None, **kw): - """ - - :param ds: xr.Dataset|xr.DataArray|numpy array - - Other parameters: - **kwargs -- passed on to pcm.gnmpcm - maxiters : int 1000 - eps : float 0.0001 - num_threads: int| None None - """ - from hdstats import nangeomedian_pcm - - def norm_input(ds, axis): - if isinstance(ds, xr.DataArray): - xx = ds - if len(xx.dims) != 4: - raise ValueError("Expect 4 dimensions on input: y,x,band,time") - if axis is not None and xx.dims[3] != axis: - raise ValueError( - f"Can only reduce last dimension, expect: y,x,band,{axis}" - ) - return None, xx, xx.data - elif isinstance(ds, xr.Dataset): - xx = reshape_for_geomedian(ds, axis) - return ds, xx, xx.data - else: # assume numpy or similar - xx_data = ds - if xx_data.ndim != 4: - raise ValueError("Expect 4 dimensions on input: y,x,band,time") - return None, None, xx_data - - kw.setdefault("nocheck", True) - kw.setdefault("num_threads", 1) - kw.setdefault("eps", 1e-6) - - ds, xx, xx_data = norm_input(ds, axis) - is_dask = dask.is_dask_collection(xx_data) - - if where is not None: - if is_dask: - raise NotImplementedError( - "Dask version doesn't support output masking currently" - ) - - if where.shape != xx_data.shape[:2]: - raise ValueError("Shape for `where` parameter doesn't match") - set_nan = ~where # pylint: disable=invalid-unary-operand-type - else: - set_nan = None - - if is_dask: - if xx_data.shape[-2:] != xx_data.chunksize[-2:]: - xx_data = xx_data.rechunk(xx_data.chunksize[:2] + (-1, -1)) - - data = da.map_blocks( - lambda x: nangeomedian_pcm(x, **kw), # pylint: disable=unnecessary-lambda - xx_data, - name=randomize("geomedian"), - dtype=xx_data.dtype, - drop_axis=3, - ) - else: - data = nangeomedian_pcm(xx_data, **kw) - - if set_nan is not None: - data[set_nan, :] = np.nan - - if xx is None: - return data - - dims = xx.dims[:-1] - cc = {k: xx.coords[k] for k in dims} - xx_out = xr.DataArray(data, dims=dims, coords=cc) - - if ds is None: - xx_out.attrs.update(xx.attrs) - return xx_out - - ds_out = xx_out.to_dataset(dim="band") - for b in ds.data_vars.keys(): - src, dst = ds[b], ds_out[b] - dst.attrs.update(src.attrs) - - return ds_out - - -def _slices(step, n): - if step < 0: - yield slice(0, n) - return - - for x in range(0, n, step): - yield slice(x, min(x + step, n)) - - -def int_geomedian_np(*bands, nodata=None, scale=1, offset=0, wk_rows=-1, **kw): - """On input each band is expected to be same shape and dtype with 3 dimensions: time, y, x - On output: band, y, x - """ 
- from hdstats import nangeomedian_pcm - - nt, ny, nx = bands[0].shape - dtype = bands[0].dtype - nb = len(bands) - gm_int = np.empty((nb, ny, nx), dtype=dtype) - - if wk_rows > ny or wk_rows <= 0: - wk_rows = ny - - _wk_f32 = np.empty((wk_rows, nx, nb, nt), dtype="float32") - - for _y in _slices(wk_rows, ny): - _ny = _y.stop - _y.start - bb_f32 = _wk_f32[:_ny, ...] - # extract part of the image with scaling - for b_idx, b in enumerate(bands): - for t_idx in range(nt): - bb_f32[:, :, b_idx, t_idx] = to_float_np( - b[t_idx, _y, :], - nodata=nodata, - scale=scale, - offset=offset, - dtype="float32", - ) - - # run partial computation - gm_f32 = nangeomedian_pcm(bb_f32, **kw) - - # extract results with scaling back - for b_idx in range(nb): - gm_int[b_idx, _y, :] = from_float_np( - gm_f32[:, :, b_idx], - dtype, - nodata=nodata, - scale=1 / scale, - offset=-offset / scale, - ) - - return gm_int - - -def int_geomedian(ds, scale=1, offset=0, wk_rows=-1, as_array=False, **kw): - """ds -- xr.Dataset (possibly dask) with dims: (time, y, x) for each band - - on output time dimension is removed - - :param ds: Dataset with int data variables - :param scale: Normalize data for running computation (output is scaled back to original values) - :param offset: ``(x*scale + offset)`` - :param wk_rows: reduce memory requirements by processing that many rows of a chunk at a time - :param as_array: If set to True return DataArray with band dimension instead of Dataset - :param kw: Passed on to hdstats (eps=1e-4, num_threads=1, maxiters=10_000, nocheck=True) - - """ - band_names = [dv.name for dv in ds.data_vars.values()] - xx, *_ = ds.data_vars.values() - nodata = getattr(xx, "nodata", None) - - is_dask = dask.is_dask_collection(xx) - if is_dask: - if xx.data.chunksize[0] != xx.shape[0]: - ds = ds.chunk(chunks={xx.dims[0]: -1}) - xx, *_ = ds.data_vars.values() - - nt, ny, nx = xx.shape - bands = [dv.data for dv in ds.data_vars.values()] - band = bands[0] - nb = len(bands) - dtype = band.dtype - - kw.setdefault("nocheck", True) - kw.setdefault("num_threads", 1) - kw.setdefault("eps", 1e-4) - kw.setdefault("maxiters", 10_000) - - if is_dask: - chunks = ((nb,), *xx.chunks[1:]) - - data = da.map_blocks( - int_geomedian_np, - *bands, - nodata=nodata, - scale=scale, - offset=offset, - wk_rows=wk_rows, - **kw, - name=randomize("geomedian"), - dtype=dtype, - chunks=chunks, - drop_axis=[0], # time is dropped - new_axis=[0], - ) # band is added on the left - else: - data = int_geomedian_np( - *bands, nodata=nodata, scale=scale, offset=offset, wk_rows=wk_rows, **kw - ) - - dims = ("band", *xx.dims[1:]) - cc = {k: xx.coords[k] for k in dims[1:]} - cc["band"] = band_names - - da_out = xr.DataArray(data, dims=dims, coords=cc) - - if as_array: - if nodata is not None: - da_out.attrs["nodata"] = nodata - return da_out - - ds_out = da_out.to_dataset(dim="band") - ds_out.attrs.update(ds.attrs) - for b in ds.data_vars.keys(): - src, dst = ds[b], ds_out[b] - dst.attrs.update(src.attrs) - - return ds_out - - -def _gm_mads_compute_f32( - yxbt, compute_mads=True, compute_count=True, nodata=None, scale=1, offset=0, **kw -): - """ - output axis order is: - - y, x, band - - When extra stats are compute they ar returned in the following order: - [*bands, smad, emad, bcmad, count] - - note that when supplying non-float input, it is scaled according to scale/offset/nodata parameters, - output is however returned in that scaled range. 
- """ - import hdstats - - if yxbt.dtype.kind != "f": - yxbt = to_float_np(yxbt, scale=scale, offset=offset, nodata=nodata) - - gm = hdstats.nangeomedian_pcm(yxbt, nocheck=True, **kw) - - stats_bands = [] - - if compute_mads: - mads = [hdstats.smad_pcm, hdstats.emad_pcm, hdstats.bcmad_pcm] - - for i, op in enumerate(mads): - stats_bands.append(op(yxbt, gm, num_threads=kw.get("num_threads", 1))) - - if compute_count: - nbads = np.isnan(yxbt).sum(axis=2, dtype="bool").sum(axis=2, dtype="uint16") - count = yxbt.dtype.type(yxbt.shape[-1]) - nbads - stats_bands.append(count) - - if len(stats_bands) == 0: - return gm - - stats_bands = [a[..., np.newaxis] for a in stats_bands] - - return np.concatenate([gm, *stats_bands], axis=2) - - -def geomedian_with_mads( - src: Union[xr.Dataset, xr.DataArray], - compute_mads: bool = True, - compute_count: bool = True, - out_chunks: Optional[Tuple[int, int, int]] = None, - reshape_strategy: str = "mem", - scale: float = 1.0, - offset: float = 0.0, - eps: Optional[float] = None, - maxiters: int = 1000, - num_threads: int = 1, - **kw, -) -> xr.Dataset: - """ - Compute Geomedian on Dask backed Dataset. - - NOTE: Default configuration of this code assumes that entire input can be - loaded in to RAM on the Dask worker. It also assumes that there is only one - worker in the cluster, or that entire task will get scheduled on one single - worker only. See ``reshape_strategy`` parameter. - - :param src: xr.Dataset or a single array in YXBT order, bands can be either - float or integer with `nodata` values to indicate gaps in data. - - :param compute_mads: Whether to compute smad,emad,bcmad statistics - - :param compute_count: Whether to compute count statistic (number of - contributing observations per output pixels) - - :param out_chunks: Advanced option, allows to rechunk output internally, - order is ``(ny, nx, nband)`` - - :param reshape_strategy: One of ``mem`` (default) or ``yxbt``. This is only - applicable when supplying Dataset object. It controls how Dataset is - reshaped into DataArray in the format expected by Geomedian code. If you - have enough RAM and use single-worker Dask cluster, then use ``mem``, it - should be the most efficient. If there is not enough RAM to load entire - input you can try ``yxbt`` mode, but you might still run out of RAM anyway. - If using multi-worker Dask cluster you have to use ``yxbt`` strategy. - - :param scale, offset: Only used when input contains integer values, actual - Geomedian will run on scaled values - ``scale*X+offset``. Only affects internal - computation, final result is scaled back to the - original value range. - - :param eps: Termination criteria passed on to geomedian algorithm - - :param maxiters: Maximum number of iterations done per output pixel - - :param num_threads: Configure internal concurrency of the Geomedian - computation. Default is 1 as we assume that Dask will - run a bunch of those concurrently. - - :param work_chunks: Default is ``(100, 100)``, only applicable when input - is Dataset. 
- """ - if not dask.is_dask_collection(src): - raise ValueError("This method only works on Dask inputs") - - if isinstance(src, xr.DataArray): - yxbt = src - else: - # TODO: better automatic defaults for work_chunks - ny, nx = kw.get("work_chunks", (100, 100)) - if reshape_strategy == "mem": - yxbt = yxbt_sink(src, (ny, nx, -1, -1)) - elif reshape_strategy == "yxbt": - yxbt = reshape_yxbt(src, yx_chunks=(ny, nx)) - else: - raise ValueError( - f"Reshape strategy '{reshape_strategy}' not understood use one of: mem or yxbt" - ) - - ny, nx, nb, nt = yxbt.shape - nodata = yxbt.attrs.get("nodata", None) - assert yxbt.chunks is not None - if yxbt.data.numblocks[2:4] != (1, 1): - raise ValueError("There should be one dask block along time and band dimension") - - n_extras = (3 if compute_mads else 0) + (1 if compute_count else 0) - chunks = (*yxbt.chunks[:2], (nb + n_extras,)) - - is_float = yxbt.dtype.kind == "f" - - if eps is None: - eps = 1e-4 if is_float else 0.1 * scale - - op = functools.partial( - _gm_mads_compute_f32, - compute_mads=compute_mads, - compute_count=compute_count, - nodata=nodata, - scale=scale, - offset=offset, - eps=eps, - maxiters=maxiters, - num_threads=num_threads, - ) - - _gm = da.map_blocks( - op, yxbt.data, dtype="float32", drop_axis=3, chunks=chunks, name="geomedian" - ) - if out_chunks is not None: - _gm = _gm.rechunk(out_chunks) - - gm_data = _gm[:, :, :nb] - if not is_float: - gm_data = da.map_blocks( - lambda x: from_float_np( - x, yxbt.dtype, nodata, scale=1 / scale, offset=-offset / scale - ), - gm_data, - dtype=yxbt.dtype, - ) - - dims = yxbt.dims[:3] - coords = {k: yxbt.coords[k] for k in dims} - result = xr.DataArray( - data=gm_data, dims=dims, coords=coords, attrs=yxbt.attrs - ).to_dataset("band") - - for dv in result.data_vars.values(): - dv.attrs.update(yxbt.attrs) - - next_stat = nb - if compute_mads: - smad = _gm[:, :, next_stat + 0] - emad = _gm[:, :, next_stat + 1] - bcmad = _gm[:, :, next_stat + 2] - next_stat += 3 - - if not is_float: - emad = emad * (1 / scale) - - result["smad"] = xr.DataArray(data=smad, dims=dims[:2], coords=result.coords) - result["emad"] = xr.DataArray(data=emad, dims=dims[:2], coords=result.coords) - result["bcmad"] = xr.DataArray(data=bcmad, dims=dims[:2], coords=result.coords) - - if compute_count: - count = _gm[:, :, next_stat].astype("uint16") - next_stat += 1 - result["count"] = xr.DataArray(data=count, dims=dims[:2], coords=result.coords) - - return result diff --git a/libs/algo/odc/algo/_grouper.py b/libs/algo/odc/algo/_grouper.py deleted file mode 100644 index 37974041b..000000000 --- a/libs/algo/odc/algo/_grouper.py +++ /dev/null @@ -1,94 +0,0 @@ -"""Methods for grouping Datasets spatialy and otherwise.""" -import numpy as np -import pandas as pd -import xarray as xr -from datetime import timedelta -from typing import Any, Dict, Hashable, Iterable, Iterator, List, Optional - -from datacube.model import Dataset -from datacube.utils.dates import normalise_dt -from datacube.utils.geometry import Geometry - - -def mid_longitude(geom: Geometry) -> float: - """Return longitude of the middle point of the geomtry.""" - ((lon,), _) = geom.centroid.to_crs("epsg:4326").xy - return lon - - -def solar_offset(geom: Geometry, precision: str = "h") -> timedelta: - """ - Given a geometry compute offset to add to UTC timestamp to get solar day right. - - This only work when geometry is "local enough". 
- :param precision: one of ``'h'`` or ``'s'``, defaults to hour precision - """ - lon = mid_longitude(geom) - - if precision == "h": - return timedelta(hours=int(lon * 24 / 360 + 0.5)) - - # 240 == (24*60*60)/360 (seconds of a day per degree of longitude) - return timedelta(seconds=int(lon * 240)) - - -def key2num( - objs: Iterable[Hashable], reverse_map: Optional[Dict[int, Any]] = None -) -> Iterator[int]: - """ - Given a sequence of hashable objects return sequence of numeric ids starting from 0. - - For example ``'A' 'B' 'A' 'A' 'C' -> 0 1 0 0 2`` - """ - o2id: Dict[Any, int] = {} - c = 0 - for obj in objs: - _c = o2id.setdefault(obj, c) - if _c == c: - c = c + 1 - if reverse_map is not None: - reverse_map[_c] = obj - yield _c - - -def group_by_nothing( - dss: List[Dataset], solar_day_offset: Optional[timedelta] = None -) -> xr.DataArray: - """ - No op grouping of datasets. - - Construct "sources" just like ``.group_dataset`` but with every slice - containing just one Dataset object wrapped in a tuple. - - Time -> (Dataset,) - """ - dss = sorted(dss, key=lambda ds: (normalise_dt(ds.center_time), ds.id)) # type: ignore - time = [normalise_dt(ds.center_time) for ds in dss] # type: ignore - solar_day = None - - if solar_day_offset is not None: - solar_day = np.asarray( - [(dt + solar_day_offset).date() for dt in time], dtype="datetime64[D]" - ) - - idx = np.arange(0, len(dss), dtype="uint32") - uuids = np.empty(len(dss), dtype="O") - data = np.empty(len(dss), dtype="O") - grid2crs: Dict[int, Any] = {} - grid = list(key2num((ds.crs for ds in dss), grid2crs)) - - for i, ds in enumerate(dss): - data[i] = (ds,) - uuids[i] = ds.id - - coords = [np.asarray(time, dtype="datetime64[ms]"), idx, uuids, grid] - names = ["time", "idx", "uuid", "grid"] - if solar_day is not None: - coords.append(solar_day) - names.append("solar_day") - - coord = pd.MultiIndex.from_arrays(coords, names=names) - - return xr.DataArray( - data=data, coords=dict(spec=coord), attrs={"grid2crs": grid2crs}, dims=("spec",) - ) diff --git a/libs/algo/odc/algo/_masking.py b/libs/algo/odc/algo/_masking.py deleted file mode 100644 index 120363905..000000000 --- a/libs/algo/odc/algo/_masking.py +++ /dev/null @@ -1,797 +0,0 @@ -""" -Mostly masking related. - -Also converting between float[with nans] and int[with nodata]. -""" - -import dask -import dask.array as da -import numexpr as ne -import numpy as np -import xarray as xr -from dask.highlevelgraph import HighLevelGraph -from functools import partial -from typing import Any, Dict, Iterable, Optional, Tuple, Union - -from ._dask import _get_chunks_asarray, randomize - - -def default_nodata(dtype): - """Default `nodata` for a given dtype - - nan for float{*} - - 0 for any other type - """ - if dtype.kind == "f": - return dtype.type(np.nan) - return dtype.type(0) - - -def keep_good_np(xx, where, nodata, out=None): - if out is None: - out = np.full_like(xx, nodata) - else: - assert out.shape == xx.shape - assert out.dtype == xx.dtype - assert out is not xx - out[:] = nodata - np.copyto(out, xx, where=where) - return out - - -def keep_good_only(x, where, inplace=False, nodata=None): - """Return a copy of x, but with some pixels replaced with `nodata`. - - This function can work on dask arrays, - in which case output will be a dask array as well. - - If x is a Dataset then operation will be applied to all data variables. 
- - :param x: xarray.DataArray with `nodata` property - :param where: xarray.DataArray True -- keep, False -- replace with `x.nodata` - :param inplace: Modify pixels in x directly, not valid for dask arrays. - - For every pixel of x[idx], output is: - - - nodata if where[idx] == False - - x[idx] if where[idx] == True - """ - if isinstance(x, xr.Dataset): - return x.apply( - lambda x: keep_good_only(x, where, inplace=inplace, nodata=nodata), - keep_attrs=True, - ) - - assert x.shape == where.shape - if nodata is None: - nodata = getattr(x, "nodata", 0) - - if inplace: - if dask.is_dask_collection(x): - raise ValueError("Can not perform inplace operation on a dask array") - - np.copyto(x.data, nodata, where=~where.data) - return x - - if dask.is_dask_collection(x): - data = da.map_blocks( - keep_good_np, - x.data, - where.data, - nodata, - name=randomize("keep_good"), - dtype=x.dtype, - ) - else: - data = keep_good_np(x.data, where.data, nodata) - - return xr.DataArray(data, dims=x.dims, coords=x.coords, attrs=x.attrs, name=x.name) - - -def erase_bad_np(xx, where, nodata, out=None): - if out is None: - out = np.copy(xx) - else: - assert out.shape == xx.shape - assert out.dtype == xx.dtype - assert out is not xx - out[:] = xx - np.copyto(out, nodata, where=where) - return out - - -def erase_bad(x, where, inplace=False, nodata=None): - """ - Return a copy of x, but with some pixels replaced with `nodata`. - - This function can work on dask arrays, - in which case output will be a dask array as well. - - If x is a Dataset then operation will be applied to all data variables. - - :param x: xarray.DataArray with `nodata` property - :param where: xarray.DataArray True -- replace with `x.nodata` - False -- keep as it were - :param inplace: Modify pixels in x directly, not valid for dask arrays. 
- - For every pixel of x[idx], output is: - - - nodata if where[idx] == True - - x[idx] if where[idx] == False - """ - if isinstance(x, xr.Dataset): - return x.apply(lambda x: erase_bad(x, where, inplace=inplace), keep_attrs=True) - - assert x.shape == where.shape - if nodata is None: - nodata = getattr(x, "nodata", 0) - - if inplace: - if dask.is_dask_collection(x): - raise ValueError("Can not perform inplace operation on a dask array") - - np.copyto(x.data, nodata, where=where.data) - return x - - if dask.is_dask_collection(x): - data = da.map_blocks( - erase_bad_np, - x.data, - where.data, - nodata, - name=randomize("erase_bad"), - dtype=x.dtype, - ) - else: - data = erase_bad_np(x.data, where.data, nodata) - - return xr.DataArray(data, dims=x.dims, coords=x.coords, attrs=x.attrs, name=x.name) - - -def from_float_np(x, dtype, nodata, scale=1, offset=0, where=None, out=None): - scale = np.float32(scale) - offset = np.float32(offset) - - if out is None: - out = np.empty_like(x, dtype=dtype) - else: - assert out.shape == x.shape - - params = dict(x=x, nodata=nodata, scale=scale, offset=offset) - - # `x == x` is equivalent to `~np.isnan(x)` - - if where is not None: - assert x.shape == where.shape - params["m"] = where - expr = "where((x == x)&m, x*scale + offset, nodata)" - else: - expr = "where(x == x, x*scale + offset, nodata)" - - ne.evaluate(expr, local_dict=params, out=out, casting="unsafe") - - return out - - -def to_float_np(x, nodata=None, scale=1, offset=0, dtype="float32", out=None): - float_type = np.dtype(dtype).type - - _nan = float_type(np.nan) - scale = float_type(scale) - offset = float_type(offset) - - params = dict(_nan=_nan, scale=scale, offset=offset, x=x, nodata=nodata) - if out is None: - out = np.empty_like(x, dtype=dtype) - else: - assert out.shape == x.shape - - if nodata is None: - return ne.evaluate( - "x*scale + offset", out=out, casting="unsafe", local_dict=params - ) - elif scale == 1 and offset == 0: - return ne.evaluate( - "where(x == nodata, _nan, x)", out=out, casting="unsafe", local_dict=params - ) - else: - return ne.evaluate( - "where(x == nodata, _nan, x*scale + offset)", - out=out, - casting="unsafe", - local_dict=params, - ) - - -def to_f32_np(x, nodata=None, scale=1, offset=0, out=None): - return to_float_np( - x, nodata=nodata, scale=scale, offset=offset, dtype="float32", out=out - ) - - -def to_float(x, scale=1, offset=0, dtype="float32"): - if isinstance(x, xr.Dataset): - return x.apply( - to_float, scale=scale, offset=offset, dtype=dtype, keep_attrs=True - ) - - attrs = x.attrs.copy() - nodata = attrs.pop("nodata", None) - - if dask.is_dask_collection(x.data): - data = da.map_blocks( - to_float_np, - x.data, - nodata, - scale, - offset, - dtype, - dtype=dtype, - name=randomize("to_float"), - ) - else: - data = to_float_np( - x.data, nodata=nodata, scale=scale, offset=offset, dtype=dtype - ) - - return xr.DataArray(data, dims=x.dims, coords=x.coords, name=x.name, attrs=attrs) - - -def to_f32(x, scale=1, offset=0): - return to_float(x, scale=scale, offset=offset, dtype="float32") - - -def from_float(x, dtype, nodata, scale=1, offset=0): - if isinstance(x, xr.Dataset): - return x.apply(from_float, keep_attrs=True, args=(dtype, nodata, scale, offset)) - - attrs = x.attrs.copy() - attrs["nodata"] = nodata - - if dask.is_dask_collection(x.data): - data = da.map_blocks( - from_float_np, - x.data, - dtype, - nodata, - scale=scale, - offset=offset, - dtype=dtype, - name=randomize("from_float"), - ) - else: - data = from_float_np(x.data, dtype, nodata, 
scale=scale, offset=offset) - - return xr.DataArray(data, dims=x.dims, coords=x.coords, name=x.name, attrs=attrs) - - -def _impl_to_bool(x, m): - return ((1 << x) & m) > 0 - - -def _impl_to_bool_inverted(x, m): - return ((1 << x) & m) == 0 - - -def _flags_invert(flags: Dict[str, Any]) -> Dict[str, Any]: - _out = dict(**flags) - _out["values"] = {n: int(v) for v, n in flags["values"].items()} - return _out - - -def _get_enum_values( - names: Iterable[str], flags_definition: Dict[str, Dict[str, Any]], flag: str = "" -) -> Tuple[int, ...]: - """ - Lookup enum values in flags definition library - - :param names: enum value to lookup e.g. ("cloud", "shadow") - :param flags_definition: Flags definition dictionary as used by Datacube - :param flag: Name of the enum (for example "fmask", auto-guessed if omitted) - """ - if flag != "": - flags_definition = {flag: flags_definition[flag]} - - names = list(names) - names_set = set(names) - unmatched = set() - for ff in flags_definition.values(): - values = _flags_invert(ff)["values"] - unmatched = names_set - set(values.keys()) - if len(unmatched) == 0: - return tuple(values[n] for n in names) - - if len(flags_definition) > 1: - raise ValueError("Can not find flags definitions that match query") - unmatched_human = ",".join(f'"{name}"' for name in unmatched) - raise ValueError(f"Not all enumeration names were found: {unmatched_human}") - - -def _mk_ne_isin_condition( - values: Tuple[int, ...], var_name: str = "a", invert: bool = False -) -> str: - """ - Produce numexpr expression equivalent to numpys `.isin` - - - ((a==v1)|(a==v2)|..|a==vn) when invert=False - - ((a!=v1)&(a!=v2)&..&a!=vn) when invert=True - """ - op1, op2 = ("!=", "&") if invert else ("==", "|") - parts = [f"({var_name}{op1}{val})" for val in values] - return f"({op2.join(parts)})" - - -def _enum_to_mask_numexpr( - mask: np.ndarray, - classes: Tuple[int, ...], - invert: bool = False, - value_true: int = 1, - value_false: int = 0, - dtype: Union[str, np.dtype] = "bool", -) -> np.ndarray: - cond = _mk_ne_isin_condition(classes, "m", invert=invert) - expr = f"where({cond}, {value_true}, {value_false})" - out = np.empty_like(mask, dtype=dtype) - - ne.evaluate(expr, local_dict=dict(m=mask), out=out, casting="unsafe") - - return out - - -# pylint: disable=import-outside-toplevel -def _disk(r: int, ndim: int = 2) -> np.ndarray: - from skimage.morphology import disk - - kernel = disk(r) - while kernel.ndim < ndim: - kernel = kernel[np.newaxis] - return kernel - - -def xr_apply_morph_op( - xx: xr.DataArray, operation: str, radius: int = 1, **kw -) -> xr.DataArray: - """ - Apply morphological operation to Dask based xarray Array - - :param kw: passed on to the underlying operation - border_value - - """ - import dask_image.ndmorph - - ops = { - "dilation": dask_image.ndmorph.binary_dilation, - "erosion": dask_image.ndmorph.binary_erosion, - "opening": dask_image.ndmorph.binary_opening, - "closing": dask_image.ndmorph.binary_closing, - } - assert dask.is_dask_collection(xx.data) - assert operation in ops - - kernel = _disk(radius, xx.ndim) - data = ops[operation](xx.data, kernel, **kw) - - return xr.DataArray(data=data, coords=xx.coords, dims=xx.dims, attrs=xx.attrs) - - -def binary_erosion(xx: xr.DataArray, radius: int = 1, **kw) -> xr.DataArray: - return xr_apply_morph_op(xx, "erosion", radius, **kw) - - -def binary_dilation(xx: xr.DataArray, radius: int = 1, **kw) -> xr.DataArray: - return xr_apply_morph_op(xx, "dilation", radius, **kw) - - -def binary_opening(xx: xr.DataArray, radius: int = 
1, **kw) -> xr.DataArray: - return xr_apply_morph_op(xx, "opening", radius, **kw) - - -def binary_closing(xx: xr.DataArray, radius: int = 1, **kw) -> xr.DataArray: - return xr_apply_morph_op(xx, "closing", radius, **kw) - - -def mask_cleanup_np( - mask: np.ndarray, - mask_filters: Iterable[Tuple[str, int]] = None, -) -> np.ndarray: - """ - Apply morphological operations on given binary mask. - - :param mask: Binary image to process - :param mask_filters: Iterable tuples of morphological operations to apply on mask - """ - import skimage.morphology as morph - - assert mask.dtype == "bool" - - ops = dict( - opening=morph.binary_opening, - closing=morph.binary_closing, - dilation=morph.binary_dilation, - erosion=morph.binary_erosion, - ) - - mask_filters = ( - [("opening", 2), ("dilation", 5)] if mask_filters is None else mask_filters - ) - for operation, radius in mask_filters: - op = ops.get(operation, None) - if op is None: - raise ValueError(f"Not supported morphological operation: {operation}") - if radius > 0: - mask = op(mask, _disk(radius, mask.ndim)) - return mask - - -# pylint: enable=import-outside-toplevel -def _compute_overlap_depth(r: Iterable[int], ndim: int) -> Tuple[int, ...]: - _r = max(r) - return (0,) * (ndim - 2) + (_r, _r) - - -def mask_cleanup( - mask: xr.DataArray, - mask_filters: Iterable[Tuple[str, int]] = None, - name: Optional[str] = None, -) -> xr.DataArray: - """ - Apply morphological operations on given binary mask. - - As we fuse those operations into single Dask task, it could be faster to run. - - Default mask_filters value is bit-equivalent to ``mask |> opening |> dilation``. - - :param mask: Binary image to process - :param mask_filters: iterable tuples of morphological operations - - ("", ) - to apply on mask, where - operation: string, can be one of this morphological operations - - closing = remove small holes in cloud - morphological closing - opening = shrinks away small areas of the mask - dilation = adds padding to the mask - erosion = shrinks bright regions and enlarges dark regions - radius: int - :param name: Used when building Dask graphs - """ - data = mask.data - - mask_filters = ( - [("opening", 2), ("dilation", 5)] if mask_filters is None else mask_filters - ) - - if dask.is_dask_collection(data): - rr = [radius for _, radius in mask_filters] - depth = _compute_overlap_depth(rr, data.ndim) - - if name is None: - name = "mask_cleanup" - for radius in rr: - name = name + f"_{radius}" - - data = data.map_overlap( - partial(mask_cleanup_np, mask_filters=mask_filters), - depth, - boundary="none", - name=randomize(name), - ) - else: - data = mask_cleanup_np(data, mask_filters=mask_filters) - - return xr.DataArray(data, attrs=mask.attrs, coords=mask.coords, dims=mask.dims) - - -def enum_to_bool( - mask: xr.DataArray, - categories: Iterable[Union[str, int]], - invert: bool = False, - flag_name: str = "", - value_true: int = 1, - value_false: int = 0, - dtype: Union[str, np.dtype] = "bool", - name: str = "enum_to_bool", -) -> xr.DataArray: - """ - This method works for fmask and other "enumerated" masks - - It is equivalent to `np.isin(mask, categories)` - - example: - xx = dc.load(.., measurements=['fmask', ...]) - no_cloud = enum_to_bool(xx.fmask, ('valid', 'snow', 'water')) - - xx.where(no_cloud).isel(time=0).nbar_red.plot.imshow() - - """ - categories_s = tuple(c for c in categories if isinstance(c, str)) - classes = tuple(c for c in categories if isinstance(c, int)) - - if len(categories_s) > 0: - flags = getattr(mask, "flags_definition", None) - 
if flags is None: - raise ValueError("Missing flags_definition attribute") - - classes = classes + _get_enum_values(categories_s, flags, flag=flag_name) - - op = partial( - _enum_to_mask_numexpr, - classes=classes, - invert=invert, - value_false=value_false, - value_true=value_true, - dtype=dtype, - ) - - if dask.is_dask_collection(mask.data): - data = da.map_blocks(op, mask.data, name=randomize(name), dtype=dtype) - else: - data = op(mask) - - attrs = dict(mask.attrs) - attrs.pop("flags_definition", None) - attrs.pop("nodata", None) - - bmask = xr.DataArray(data=data, dims=mask.dims, coords=mask.coords, attrs=attrs) - - return bmask - - -def fmask_to_bool( - mask: xr.DataArray, - categories: Iterable[str], - invert: bool = False, - flag_name: str = "", - **kw, -) -> xr.DataArray: - """ - This method works for fmask and other "enumerated" masks - - It is equivalent to `np.isin(mask, categories)` - - example: - xx = dc.load(.., measurements=['fmask', ...]) - no_cloud = fmask_to_bool(xx.fmask, ('valid', 'snow', 'water')) - - xx.where(no_cloud).isel(time=0).nbar_red.plot.imshow() - - """ - return enum_to_bool(mask, categories, invert=invert, flag_name=flag_name, **kw) - - -def _gap_fill_np(a, fallback, nodata): - params = dict(a=a, b=fallback, nodata=a.dtype.type(nodata)) - - out = np.empty_like(a) - - if np.isnan(nodata): - # a==a equivalent to `not isnan(a)` - expr = "where(a==a, a, b)" - else: - expr = "where(a!=nodata, a, b)" - - return ne.evaluate(expr, local_dict=params, out=out, casting="unsafe") - - -def gap_fill(x: xr.DataArray, fallback: xr.DataArray, nodata=None, attrs=None): - """Fill missing values in `x` with values from `fallback`. - - x,fallback are expected to be xarray.DataArray with identical shape and dtype. - - out[pix] = x[pix] if x[pix] != x.nodata else fallback[pix] - """ - - if nodata is None: - nodata = getattr(x, "nodata", None) - - if nodata is None: - nodata = default_nodata(x.dtype) - else: - nodata = x.dtype.type(nodata) - - if attrs is None: - attrs = x.attrs.copy() - - if dask.is_dask_collection(x): - data = da.map_blocks( - _gap_fill_np, - x.data, - fallback.data, - nodata, - name=randomize("gap_fill"), - dtype=x.dtype, - ) - else: - data = _gap_fill_np(x.data, fallback.data, nodata) - - return xr.DataArray(data, attrs=attrs, dims=x.dims, coords=x.coords, name=x.name) - - -def _first_valid_np( - *aa: np.ndarray, nodata: Union[float, int, None] = None -) -> np.ndarray: - out = aa[0].copy() - if nodata is None: - nodata = default_nodata(out.dtype) - - for a in aa[1:]: - out = _gap_fill_np(out, a, nodata) - - return out - - -def _fuse_min_np(*aa: np.ndarray) -> np.ndarray: - """ - Element wise min (propagates NaN values). - """ - out = aa[0].copy() - for a in aa[1:]: - out = np.minimum(out, a, out=out) - return out - - -def _fuse_max_np(*aa: np.ndarray) -> np.ndarray: - """ - Element wise max (propagates NaN values). - """ - out = aa[0].copy() - for a in aa[1:]: - out = np.maximum(out, a, out=out) - return out - - -def _fuse_and_np(*aa: np.ndarray) -> np.ndarray: - """ - Element wise bit and. - """ - assert len(aa) > 0 - out = aa[0].copy() - for a in aa[1:]: - out &= a - return out - - -def _fuse_or_np(*aa: np.ndarray) -> np.ndarray: - """ - Element wise bit or. - """ - assert len(aa) > 0 - out = aa[0].copy() - for a in aa[1:]: - out |= a - return out - - -def _da_fuse_with_custom_op(xx: da.Array, op, name="fuse") -> da.Array: - """ - Out[0, y, x] = op(In[0:1, y, x], In[1:2, y, x], In[2:3, y, x]...) 
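The per-pixel rule in `gap_fill` above (keep `x` where it is valid, otherwise take `fallback`) can be reproduced with numexpr on toy arrays; this is the same expression `_gap_fill_np` evaluates, shown outside the Dask/xarray wrapping:

```python
import numexpr as ne
import numpy as np

nodata = np.int16(-999)
a = np.array([[1, -999], [3, -999]], dtype="int16")   # primary data with gaps
b = np.array([[9,    7], [9,    7]], dtype="int16")   # fallback data

out = np.empty_like(a)
ne.evaluate("where(a != nodata, a, b)",
            local_dict={"a": a, "b": b, "nodata": nodata},
            out=out, casting="unsafe")
# out == [[1, 7], [3, 7]]
```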
- - """ - can_do_flat = all(ch == 1 for ch in xx.chunks[0]) - if not can_do_flat: - slices = [xx[i : i + 1] for i in range(xx.shape[0])] - return da.map_blocks(op, *slices, name=name) - - chunks, shapes = _get_chunks_asarray(xx) - dsk = {} - name = randomize(name) - for idx in np.ndindex(chunks.shape[1:]): - blocks = chunks[(slice(None), *idx)].ravel() - dsk[(name, 0, *idx)] = (op, *blocks) - - dsk = HighLevelGraph.from_collections(name, dsk, dependencies=[xx]) - shape = (1, *xx.shape[1:]) - chunks = ((1,), *xx.chunks[1:]) - - return da.Array(dsk, name, shape=shape, dtype=xx.dtype, chunks=chunks) - - -def _fuse_with_custom_op(x: xr.DataArray, op, name="fuse") -> xr.DataArray: - """ - Out[0, y, x] = op(In[0:1, y, x], In[1:2, y, x], In[2:3, y, x]...) - - Expects data in _,y,x order. Works on Dask inputs too. - """ - if x.shape[0] > 1: - if dask.is_dask_collection(x.data): - data = _da_fuse_with_custom_op(x.data, op, name=name) - else: - slices = [x.data[i : i + 1] for i in range(x.shape[0])] - data = op(*slices) - else: - data = x.data - - coords = dict(x.coords.items()) - for key, val in dict(x.coords[x.dims[0]][0:1].coords).items(): - coords[key] = val - - res = xr.DataArray(data, attrs=x.attrs, dims=x.dims, coords=coords, name=x.name) - - # must only set index on coords when - # a) it is defined on a dim - # b) it is not named after dim and - # c) it is not "indexed" - - indexes = [ - ind - for ind, val in coords.items() - if len(val.dims) > 0 and ind not in x.dims and ind not in res.indexes - ] - if indexes == []: - return res - else: - return res.set_xindex(indexes) - - -def _xr_fuse(xx, op, name): - if isinstance(xx, xr.Dataset): - return xx.map(partial(_xr_fuse, op=op, name=name)) - - return _fuse_with_custom_op(xx, op, name=name) - - -def _or_fuser(xx): - """ - meant to be called by `xx.groupby(..).map(_or_fuser)` - """ - return _xr_fuse(xx, _fuse_or_np, "fuse_or") - - -def _and_fuser(xx): - """ - meant to be called by `xx.groupby(..).map(_and_fuser)` - """ - return _xr_fuse(xx, _fuse_and_np, "fuse_and") - - -def _min_fuser(xx): - """ - meant to be called by `xx.groupby(..).map(_min_fuser)` - """ - return _xr_fuse(xx, _fuse_min_np, "fuse_min") - - -def _max_fuser(xx): - """ - meant to be called by `xx.groupby(..).map(_max_fuser)` - """ - return _xr_fuse(xx, _fuse_max_np, "fuse_max") - - -def choose_first_valid(x: xr.DataArray, nodata=None) -> xr.DataArray: - """ - ``Out[0, y, x] = In[i, y, x]`` where ``i`` is index of the first slice - with valid data in it for a pixel at ``y,x`` coordinate. - - Expects data in ``_, y, x`` order. Works on Dask inputs too. 
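`choose_first_valid` picks, for every pixel, the first slice along the leading axis that holds valid data. The same selection on a small NumPy stack, without the fuser machinery (illustrative only):

```python
import numpy as np

nodata = -1
stack = np.array([[[-1,  5], [ 3, -1]],   # t=0
                  [[ 8, -1], [-1,  7]]],  # t=1
                 dtype="int16")

valid = stack != nodata
first = valid.argmax(axis=0)                      # index of first valid slice per pixel
out = np.take_along_axis(stack, first[None], axis=0)[0]
out[~valid.any(axis=0)] = nodata                  # all-gap pixels stay nodata
# out == [[8, 5], [3, 7]]
```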
- """ - if nodata is None: - nodata = x.attrs.get("nodata", None) - - return _fuse_with_custom_op( - x, partial(_first_valid_np, nodata=nodata), name="choose_first_valid" - ) - - -def _nodata_fuser(xx, **kw): - """ - meant to be called by `xx.groupby(..).map(_nodata_fuser)` - """ - if isinstance(xx, xr.Dataset): - return xx.map(choose_first_valid, **kw) - if xx.shape[0] <= 1: - return xx - return choose_first_valid(xx, **kw) - - -def _fuse_mean_np(*aa, nodata): - assert len(aa) > 0 - - out = aa[0].astype(np.float32) - count = (aa[0] != nodata).astype(np.float32) - for a in aa[1:]: - out += a.astype(np.float32) - count += a != nodata - - out -= (len(aa) - count) * nodata - with np.errstate(divide="ignore", invalid="ignore"): - out = np.round(out / count).astype(aa[0].dtype) - out[count == 0] = nodata - return out diff --git a/libs/algo/odc/algo/_memsink.py b/libs/algo/odc/algo/_memsink.py deleted file mode 100644 index 1287d0ed3..000000000 --- a/libs/algo/odc/algo/_memsink.py +++ /dev/null @@ -1,430 +0,0 @@ -import dask -import dask.array as da -import numpy as np -import uuid -import xarray as xr -from dask.base import tokenize -from dask.delayed import Delayed -from dask.highlevelgraph import HighLevelGraph -from distributed import Client -from typing import Any, Dict, Hashable, Optional, Tuple, Union - -from ._dask import _roi_from_chunks, unpack_chunks - -ShapeLike = Union[int, Tuple[int, ...]] -DtypeLike = Union[str, np.dtype] -ROI = Union[slice, Tuple[slice, ...]] -MaybeROI = Optional[ROI] -CacheKey = Union["Token", str] - -_cache: Dict[str, np.ndarray] = {} - - -class Token: - __slots__ = ["_k"] - - def __init__(self, k: str): - # print(f"Token.init(<{k}>)@0x{id(self):08X}") - self._k = k - - def __str__(self) -> str: - return self._k - - def __bool__(self): - return len(self._k) > 0 - - def release(self): - if self: - Cache.pop(self) - self._k = "" - - def __del__(self): - # print(f"Token.del(<{self._k}>)@0x{id(self):08X}") - self.release() - - def __getstate__(self): - print(f"Token.__getstate__() <{self._k}>@0x{id(self):08X}") - raise ValueError("Token should not be pickled") - - def __setstate__(self, k): - print(f"Token.__setstate__(<{k}>)@0x{id(self):08X}") - raise ValueError("Token should not be pickled") - - -class Cache: - @staticmethod - def new(shape: ShapeLike, dtype: DtypeLike) -> Token: - return Cache.put(np.ndarray(shape, dtype=dtype)) - - @staticmethod - def dask_new(shape: ShapeLike, dtype: DtypeLike, name: str = "") -> Delayed: - if name == "": - name = f"mem_array_{str(dtype)}" - - name = name + "-" + tokenize(name, shape, dtype) - dsk = {name: (Cache.new, shape, dtype)} - return Delayed(name, dsk) - - @staticmethod - def put(x: np.ndarray) -> Token: - k = uuid.uuid4().hex - _cache[k] = x - return Token(k) - - @staticmethod - def get(k: CacheKey) -> Optional[np.ndarray]: - return _cache.get(str(k), None) - - @staticmethod - def pop(k: CacheKey) -> Optional[np.ndarray]: - return _cache.pop(str(k), None) - - -class CachedArray: - def __init__(self, token_or_key: CacheKey): - self._tk = token_or_key - - @property - def data(self) -> np.ndarray: - xx = Cache.get(self._tk) - if xx is None: - raise ValueError("Source array is missing from cache") - return xx - - @property - def shape(self) -> Tuple[int, ...]: - return self.data.shape - - @property - def dtype(self): - return self.data.dtype - - @property - def ndim(self): - return self.data.ndim - - def __getitem__(self, key: ROI) -> np.ndarray: - return self.data[key] - - def __setitem__(self, key, item): - 
self.data[key] = item - - @staticmethod - def new(shape: ShapeLike, dtype: DtypeLike) -> "CachedArray": - return CachedArray(Cache.new(shape, dtype)) - - @staticmethod - def wrap(x: np.ndarray) -> "CachedArray": - return CachedArray(Cache.put(x)) - - def release(self) -> Optional[np.ndarray]: - return Cache.pop(self._tk) - - -class _YXBTSink: - def __init__( - self, - token_or_key: CacheKey, - band: Union[int, Tuple[slice, slice, slice, slice]], - ): - if isinstance(band, int): - band = np.s_[:, :, band, :] - - self._tk = token_or_key - self._roi = band - - @property - def data(self): - xx = Cache.get(self._tk) - if xx is None: - return None - return xx[self._roi] - - def __setitem__(self, key, item): - assert len(key) == 3 - assert item.ndim == 3 - - it, iy, ix = key - self.data[iy, ix, it] = item.transpose([1, 2, 0]) - - -class _YXTSink: - def __init__( - self, - token_or_key: CacheKey, - ): - self._tk = token_or_key - - @property - def data(self): - return Cache.get(self._tk) - - def __setitem__(self, key, item): - assert len(key) == 3 - assert item.ndim == 3 - - it, iy, ix = key - self.data[iy, ix, it] = item.transpose([1, 2, 0]) - - -def store_to_mem( - xx: da.Array, client: Client, out: Optional[np.ndarray] = None -) -> np.ndarray: - assert client.scheduler.address.startswith("inproc://") - token = None - if out is None: - sink = dask.delayed(CachedArray.new)(xx.shape, xx.dtype) - else: - assert out.shape == xx.shape - token = Cache.put(out) - sink = dask.delayed(CachedArray)(str(token)) - - try: - fut = da.store(xx, sink, lock=False, compute=False) - fut, _sink = client.compute([fut, sink]) - fut.result() - return _sink.result().data - finally: - if token is not None: - token.release() - - -def yxbt_sink_to_mem(bands: Tuple[da.Array, ...], client: Client) -> np.ndarray: - assert client.scheduler.address.startswith("inproc://") - - b = bands[0] - dtype = b.dtype - nt, ny, nx = b.shape - nb = len(bands) - token = Cache.new((ny, nx, nb, nt), dtype) - sinks = [_YXBTSink(str(token), idx) for idx in range(nb)] - try: - fut = da.store(bands, sinks, lock=False, compute=False) - fut = client.compute(fut) - fut.result() - return Cache.get(token) - finally: - token.release() - - -def _chunk_extractor(cache_key: CacheKey, roi: ROI, *deps) -> np.ndarray: - src = Cache.get(cache_key) - assert src is not None - return src[roi] - - -def _da_from_mem( - token: Delayed, - shape: ShapeLike, - dtype: DtypeLike, - chunks: Tuple[int, ...], - name: str = "from_mem", -) -> da.Array: - """ - Construct dask view of some yet to be computed in RAM store. - - :param token: Should evaluate to either Token or string key in to the Cache, - which is expected to contain ``numpy`` array of supplied - ``shape`` and ``dtype`` - - :param shape: Expected shape of the future array - - :param dtype: Expected dtype of the future array - - :param chunks: Tuple of integers describing chunk partitioning for output array - - :param name: Dask name - - Gotchas - ======= - - - Output array can not be moved from one worker to another. - - Works with in-process Client - - Works with single worker cluster - - Can work if scheduler is told to schedule this on a single worker - - - Cache life cycle management can be tough. If token evaluates to a - ``Token`` object then automatic cache cleanup should happen when output - array is destroyed. If it is just a string, then it's up to caller to - ensure that there is cleanup and no use after free. 
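A rough mental model for `_da_from_mem` and the sink helpers below: fully evaluate a Dask array into worker RAM, then expose that single in-memory array as a new Dask array with a different chunking regime. The naive version of the idea, which pays the extra full copy these helpers are designed to avoid, looks like this:

```python
import dask.array as da

xx = da.random.random((4, 100, 100), chunks=(1, 50, 50))

in_ram = xx.compute()                           # one NumPy array held in RAM
yy = da.from_array(in_ram, chunks=(4, 25, 25))  # re-exposed with new chunks (views, no copy)
```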
- - Returns - ======= - Dask Array - """ - if not isinstance(shape, tuple): - shape = (shape,) - - assert dask.is_dask_collection(token) - assert len(shape) == len(chunks) - - _chunks = unpack_chunks(chunks, shape) - _rois = [tuple(_roi_from_chunks(ch)) for ch in _chunks] - - def _roi(idx): - return tuple(_rois[i][k] for i, k in enumerate(idx)) - - shape_in_chunks = tuple(len(ch) for ch in _chunks) - - dsk = {} - name = name + "-" + tokenize(token, shape, dtype, chunks) - dsk[name] = [] - - for idx in np.ndindex(shape_in_chunks): - dsk[(name, *idx)] = (_chunk_extractor, token.key, _roi(idx)) - dsk[name].append((name, *idx)) - - dsk = HighLevelGraph.from_collections(name, dsk, dependencies=[token]) - - return da.Array(dsk, name, shape=shape, dtype=dtype, chunks=_chunks) - - -def da_mem_sink(xx: da.Array, chunks: Tuple[int, ...], name="memsink") -> da.Array: - """ - It's a kind of fancy rechunk for special needs. - - Assumptions - - Single worker only - - ``xx`` can fit in RAM of the worker - - Note that every output chunk depends on ALL of input chunks. - - On some Dask worker: - - Fully evaluate ``xx`` and serialize to RAM - - Present in RAM view of the result with a different chunking regime - - A common use case would be to load a large collection (>50% of RAM) that - needs to be processed by some non-Dask code as a whole. A simple - ``do_stuff(xx.compute())`` would not work as duplicating RAM is not an - option in that scenario. Normal rechunk might also run out of RAM and - introduces large memory copy overhead as all input chunks need to be cached - then re-assembled into a different chunking structure. - """ - tk = tokenize(xx) - - token = Cache.dask_new(xx.shape, xx.dtype, f"{name}_alloc") - - # Store everything to MEM and only then evaluate to Token - sink = dask.delayed(CachedArray)(token) - fut = da.store(xx, sink, lock=False, compute=False) - sink_name = f"{name}_collect-{tk}" - dsk = dict(fut.dask) - dsk[sink_name] = (lambda *x: x[0], token.key, *fut.dask[fut.key]) - dsk = HighLevelGraph.from_collections(sink_name, dsk, dependencies=[sink]) - token_done = Delayed(sink_name, dsk) - - return _da_from_mem( - token_done, shape=xx.shape, dtype=xx.dtype, chunks=chunks, name=name - ) - - -def da_yxt_sink(band: da.Array, chunks: Tuple[int, int, int], name="yxt") -> da.Array: - """ - band is in - output is - - eval(band) |> transpose(YXT) |> Store(RAM) |> DaskArray(RAM, chunks) - """ - tk = tokenize(band, "da_yxt_sink", chunks, name) - - dtype = band.dtype - nt, ny, nx = band.shape - shape = (ny, nx, nt) - - token = Cache.dask_new(shape, dtype, f"{name}_alloc") - - sink = dask.delayed(_YXTSink)(token) - fut = da.store([band], [sink], lock=False, compute=False) - sink_name = f"{name}_collect-{tk}" - dsk = dict(fut.dask) - dsk[sink_name] = (lambda *x: x[0], token.key, *fut.dask[fut.key]) - dsk = HighLevelGraph.from_collections(sink_name, dsk, dependencies=[sink]) - token_done = Delayed(sink_name, dsk) - - return _da_from_mem(token_done, shape=shape, dtype=dtype, chunks=chunks, name=name) - - -def da_yxbt_sink( - bands: Tuple[da.Array, ...], chunks: Tuple[int, ...], name="yxbt" -) -> da.Array: - """ - each band is in - output is - - eval(bands) |> transpose(YXBT) |> Store(RAM) |> DaskArray(RAM, chunks) - """ - tk = tokenize(*bands, chunks, name) - - b = bands[0] - dtype = b.dtype - nt, ny, nx = b.shape - nb = len(bands) - shape = (ny, nx, nb, nt) - - token = Cache.dask_new(shape, dtype, f"{name}_alloc") - - sinks = [dask.delayed(_YXBTSink)(token, idx) for idx in range(nb)] - fut = 
da.store(bands, sinks, lock=False, compute=False) - sink_name = f"{name}_collect-{tk}" - dsk = dict(fut.dask) - dsk[sink_name] = (lambda *x: x[0], token.key, *fut.dask[fut.key]) - dsk = HighLevelGraph.from_collections(sink_name, dsk, dependencies=sinks) - token_done = Delayed(sink_name, dsk) - - return _da_from_mem(token_done, shape=shape, dtype=dtype, chunks=chunks, name=name) - - -def yxbt_sink(ds: xr.Dataset, chunks: Tuple[int, int, int, int]) -> xr.DataArray: - """ - Given a Dask dataset with several bands and ``T,Y,X`` axis order on input, - turn that into a Dask DataArray with axis order being ``Y, X, Band, T``. - - The way this function work is - - Evaluate all input data before making any output chunk available for further processing - - For each input block store it into appropriate location in RAM. - - Expose in RAM store as Dask Array with requested chunking regime - - This is used for Geomedian computation mostly, for GM chunks need to be ``(ny, nx, -1,-1)``. - - :param ds: Dataset with Dask based arrays ``T,Y,X`` axis order - :param chunks: Chunk size for output array, example: ``(100, 100, -1, -1)`` - - Gotchas - ======= - - - Output array can not be moved from one worker to another. - - Works with in-process Client - - Works with single worker cluster - - Can work if scheduler is told to schedule this on a single worker - - - Returns - ======= - xarray DataArray backed by Dask array. - """ - b0, *_ = ds.data_vars.values() - data = da_yxbt_sink(tuple(dv.data for dv in ds.data_vars.values()), chunks) - attrs = dict(b0.attrs) - dims = b0.dims[1:] + ("band", b0.dims[0]) - - coords: Dict[Hashable, Any] = dict(ds.coords.items()) - coords["band"] = list(ds.data_vars) - - return xr.DataArray(data=data, dims=dims, coords=coords, attrs=attrs) - - -def yxt_sink(band: xr.DataArray, chunks: Tuple[int, int, int]) -> xr.DataArray: - """ - Load ``T,Y,X` dataset into RAM with transpose to ``Y,X,T``, then present - that as Dask array with specified chunking. - - :param band: - Dask backed :class:`xr.DataArray` data in ``T,Y,X`` order - :param chunks: - Desired output chunk size in output order ``Y,X,T`` - :return: - Dask backed :class:`xr.DataArray` with requested chunks and ``Y,X,T`` axis order - """ - data = da_yxt_sink(band.data, chunks=chunks) - dims = band.dims[1:] + (band.dims[0],) - return xr.DataArray(data=data, dims=dims, coords=band.coords, attrs=band.attrs) diff --git a/libs/algo/odc/algo/_numeric.py b/libs/algo/odc/algo/_numeric.py deleted file mode 100644 index 63f5dfbaa..000000000 --- a/libs/algo/odc/algo/_numeric.py +++ /dev/null @@ -1,97 +0,0 @@ -""" -Misc numeric tooling -""" -import numpy as np -from typing import Optional, Tuple - -from ._types import NumpyIndex, NumpyIndex1 - - -def half_up(n: int) -> int: - """ - Divide by 2 and round up when input is odd. 
- - even -> n//2 - odd -> n//2 + 1 - """ - return (n + 1) // 2 - - -def np_slice_to_idx(idx: NumpyIndex1, n: int) -> Tuple[int, ...]: - """ - Convert slice into a tuple of 0-based indexes - """ - ii = np.arange(n, dtype=np.int32)[idx] - if isinstance(ii, np.int32): - return (int(ii),) - return tuple(ii) - - -def roundup2(x: int) -> int: - """ - @returns smallest integer Y that satisfies: (Y%2 == 0) and (Y >= x) - """ - return (x + 1) & (~0x1) - - -def roundup16(x: int) -> int: - """ - @returns smallest integer Y that satisfies: (Y%16 == 0) and (Y >= x) - """ - return (x + 15) & (~0xF) - - -def roi_shrink2(idx: NumpyIndex, axis: int = 0) -> NumpyIndex: - """ - Shrink 2d array slice - - :param idx: Slice into full sized image. - :param axis: Index of the Y axis, assumed to be 0 if not supplied. For example for (B, Y, X) supply axis=1. - """ - - def maybe_half(x: Optional[int]) -> Optional[int]: - if x is None: - return None - return half_up(x) - - def _shrink2(idx: NumpyIndex1) -> NumpyIndex1: - if isinstance(idx, int): - return half_up(idx) - if isinstance(idx, slice): - return slice( - maybe_half(idx.start), maybe_half(idx.stop), maybe_half(idx.step) - ) - raise ValueError("idx must be int or slice") - - return ( - idx[:axis] + tuple(_shrink2(i) for i in idx[axis : axis + 2]) + idx[axis + 2 :] - ) - - -def shape_shrink2(shape: Tuple[int, ...], axis: int = 0) -> Tuple[int, ...]: - """ - Given a shape compute half sized image shape - - :param shape: Input image shape, default order is (Y, X, [Band]) - :param axis: Index of the Y axis, assumed to be 0 if not supplied. For example for (B, Y, X) supply axis=1. - """ - n1, n2 = map(half_up, shape[axis : axis + 2]) - return shape[:axis] + (n1, n2) + shape[axis + 2 :] - - -def work_dtype(dtype: np.dtype) -> np.dtype: - """ - For integer types return {u}int32 for {u}int{8,16} and {u}int64 for others. - For non-integer types returns input dtype. - """ - if dtype.kind == "u": - if dtype.itemsize < 4: - return np.dtype("uint32") - else: - return np.dtype("uint64") - elif dtype.kind == "i": - if dtype.itemsize < 4: - return np.dtype("int32") - else: - return np.dtype("int64") - return dtype diff --git a/libs/algo/odc/algo/_numexpr.py b/libs/algo/odc/algo/_numexpr.py deleted file mode 100644 index fc53cdec0..000000000 --- a/libs/algo/odc/algo/_numexpr.py +++ /dev/null @@ -1,151 +0,0 @@ -import dask -import dask.array as da -import functools -import numexpr as ne -import numpy as np -import xarray as xr -from typing import Any, Dict, Optional - -from ._dask import flatten_kv, randomize, unflatten_kv - - -def apply_numexpr_np( - expr: str, - data: Optional[Dict[str, Any]] = None, - dtype=None, - out: Optional[np.ndarray] = None, - casting="safe", - order="K", - **params, -) -> np.ndarray: - """ - Apply numexpr to numpy arrays - """ - - if data is None: - data = params - else: - data.update(params) - - if out is None and dtype is not None: - # This needs to be np.ndarray - arrays = [x for x in data.values() if isinstance(x, np.ndarray)] - if len(arrays) == 0: - raise ValueError("Could not find any arrays on input") - - sample_input = arrays[0] - out = np.empty_like(sample_input, dtype=dtype) - - return ne.evaluate(expr, local_dict=data, out=out, casting=casting, order=order) - - -def apply_numexpr( - expr: str, - xx: xr.Dataset, - dtype=None, - name="result", - casting="safe", - order="K", - **params, -): - """ - Apply numexpr to variables within a Dataset. 
- - numexpr library offers a limited subset of types and operations supported - by numpy, but is much faster and memory efficient, particularly for complex - expressions. See numexpr documentation for a more detailed explanation of - performance advantages of using this library over numpy operations, - summary: single pass over input memory, no temporary arrays, cache - locality. - - :param expr: Numexpr compatible string to evaluate - :param xx: Dataset object that contains arrays to be used in the ``expr`` (can be Dask) - :param dtype: specify output dtype - :param name: Used to name computation when input is Dask - :param casting: Passed to ``numexpr.evaluate`` - :param order: Passed to ``numexpr.evaluate`` - :param params: Any other constants you use in the expression - :raturns: xr.DataArray containing result of the equation (Dask is input is Dask) - - Example: - - .. code-block:: python - - # Given a Dataset with bands `red` and `nir` - xx = dc.load(..., measurements=["red", "nir"], dask_chunks={}) - - # Compute NDVI (ignore nodata for simplicity of the example) - ndvi = apply_numexpr("(_1f*nir - red)/(_1f*nir + red)", - xx, - dtype='float32', # Output is float32 - _1f=np.float32(1) # Define constant `_1f` being a float32(1), - # used for casting to float32 - ) - """ - - bands = {} - sample_band = None - - for band, x in xx.data_vars.items(): - band = str(band) - - if band in params: - raise ValueError(f"Variable: `{band}` is aliased by a parameter") - if band in expr: - bands[band] = x.data - - if sample_band is None: - sample_band = x - - if sample_band is None: - raise ValueError("Found no bands on input") - - op = functools.partial( - apply_numexpr_np, expr, dtype=dtype, casting=casting, order=order, **params - ) - - if dask.is_dask_collection(xx): - # Passing through dictionary of Dask Arrays didn't work, so we have - # adaptor that accepts var args in the form of [k0,v0, k1,v1, ...] and then reconstructs dict - data = da.map_blocks( - lambda op, *bands: op(unflatten_kv(bands)), - op, - *flatten_kv(bands), - name=randomize(name), - dtype=dtype, - ) - else: - data = op(bands) - - return xr.DataArray( - data=data, - attrs=sample_band.attrs, - dims=sample_band.dims, - coords=sample_band.coords, - name=name, - ) - - -def safe_div(x1: xr.DataArray, x2: xr.DataArray, dtype="float32") -> xr.DataArray: - """ - Compute ``x1.astype(dtype)/x2.astype(dtype)`` taking care of cases where x2==0. - - For every element compute the following: - - :: - - x2 is 0 => NaN - else => float(x1)/float(x2) - - TODO: currently doesn't treat nodata values in any special way. 
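The expression that `safe_div` evaluates, shown directly with numexpr on toy arrays (division by zero becomes NaN rather than raising or producing inf):

```python
import numexpr as ne
import numpy as np

x1 = np.array([10, 20, 30], dtype="int16")
x2 = np.array([ 2,  0,  5], dtype="int16")

out = ne.evaluate("where(x2 == 0, nan, (_1f * x1) / x2)",
                  local_dict={"x1": x1, "x2": x2,
                              "nan": np.float32("nan"),
                              "_1f": np.float32(1)})
# out approx. [5.0, nan, 6.0]
```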
- """ - dtype = np.dtype(dtype) - - # TODO: support nodata on input - return apply_numexpr( - "where(x2 == 0, nan, (_1f * x1) / x2)", - xr.Dataset(dict(x1=x1, x2=x2)), - dtype=dtype, - nan=dtype.type("nan"), - _1f=dtype.type(1), - ) diff --git a/libs/algo/odc/algo/_percentile.py b/libs/algo/odc/algo/_percentile.py deleted file mode 100644 index 51e6cc7b1..000000000 --- a/libs/algo/odc/algo/_percentile.py +++ /dev/null @@ -1,139 +0,0 @@ -import dask -import dask.array as da -import numpy as np -import xarray as xr -from dask.base import tokenize -from functools import partial -from typing import Sequence - -from ._masking import keep_good_np - - -def np_percentile(xx, percentile, nodata): - if np.isnan(nodata): - high = True - mask = ~np.isnan(xx) - else: - high = nodata >= xx.max() - mask = xx != nodata - - valid_counts = mask.sum(axis=0) - - xx = np.sort(xx, axis=0) - - indices = np.round(percentile * (valid_counts - 1)) - if not high: - indices += xx.shape[0] - valid_counts - indices[valid_counts == 0] = 0 - - indices = indices.astype(np.int64).flatten() - step = xx.size // xx.shape[0] - indices = step * indices + np.arange(len(indices)) - - xx = xx.take(indices).reshape(xx.shape[1:]) - - return keep_good_np(xx, (valid_counts >= 3), nodata) - - -def xr_quantile_bands( - src: xr.Dataset, - quantiles: Sequence, - nodata, -) -> xr.Dataset: - """ - Calculates the quantiles of the input data along the time dimension. - - This approach is approximately 700x faster than the `numpy` and `xarray` nanpercentile functions. - - :param src: xr.Dataset, bands can be either - float or integer with `nodata` values to indicate gaps in data. - `nodata` must be the largest or smallest values in the dataset or NaN. - - :param quantiles: A sequence of quantiles in the [0.0, 1.0] range - - :param nodata: The `nodata` value - """ - # pylint: disable=undefined-loop-variable - - data_vars = {} - for band, xx in src.data_vars.items(): - xx_data = xx.data - - if dask.is_dask_collection(xx_data): - if len(xx.chunks[0]) > 1: - xx_data = xx_data.rechunk({0: -1}) - - tk = tokenize(xx_data, quantiles, nodata) - for quantile in quantiles: - name = f"{band}_pc_{int(100 * quantile)}" - if dask.is_dask_collection(xx_data): - yy = da.map_blocks( - partial(np_percentile, percentile=quantile, nodata=nodata), - xx_data, - drop_axis=0, - meta=np.array([], dtype=xx.dtype), - name=f"{name}-{tk}", - ) - else: - yy = np_percentile(xx_data, percentile=quantile, nodata=nodata) - data_vars[name] = xr.DataArray(yy, dims=xx.dims[1:], attrs=xx.attrs) - - coords = dict((dim, src.coords[dim]) for dim in xx.dims[1:]) - return xr.Dataset(data_vars=data_vars, coords=coords, attrs=src.attrs) - - -def xr_quantile( - src: xr.Dataset, - quantiles: Sequence, - nodata, -) -> xr.Dataset: - """ - Calculates the percentiles of the input data along the time dimension. - - This approach is approximately 700x faster than the `numpy` and `xarray` nanpercentile functions. - - :param src: xr.Dataset, bands can be either - float or integer with `nodata` values to indicate gaps in data. - `nodata` must be the largest or smallest values in the dataset or NaN. 
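A small usage sketch for `xr_quantile_bands` above, on a float dataset where gaps are NaN; the import path is assumed to be the `odc.algo` package these modules belong to, and the toy data is purely illustrative:

```python
import numpy as np
import xarray as xr
from odc.algo import xr_quantile_bands  # assumed re-export from this package

data = np.random.rand(6, 10, 10).astype("float32")
data[0, :2, :2] = np.nan   # a few gaps
ds = xr.Dataset({"red": (("time", "y", "x"), data)},
                coords={"time": np.arange(6)})

pc = xr_quantile_bands(ds, quantiles=[0.1, 0.5, 0.9], nodata=np.nan)
# pc contains red_pc_10, red_pc_50 and red_pc_90, each with (y, x) dims
```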
- - :param percentiles: A sequence of quantiles in the [0.0, 1.0] range - - :param nodata: The `nodata` value - """ - - data_vars = {} - for band, xx in src.data_vars.items(): - xx_data = xx.data - out_dims = ("quantile",) + xx.dims[1:] - - if dask.is_dask_collection(xx_data): - if len(xx.chunks[0]) > 1: - xx_data = xx_data.rechunk({0: -1}) - - tk = tokenize(xx_data, quantiles, nodata) - data = [] - for quantile in quantiles: - name = f"{band}_pc_{int(100 * quantile)}" - if dask.is_dask_collection(xx_data): - yy = da.map_blocks( - partial(np_percentile, percentile=quantile, nodata=nodata), - xx_data, - drop_axis=0, - meta=np.array([], dtype=xx.dtype), - name=f"{name}-{tk}", - ) - else: - yy = np_percentile(xx_data, percentile=quantile, nodata=nodata) - data.append(yy) - - if dask.is_dask_collection(yy): - data_vars[band] = (out_dims, da.stack(data, axis=0)) - else: - data_vars[band] = (out_dims, np.stack(data, axis=0)) - - # pylint: disable=undefined-loop-variable - coords = dict( - (dim, src.coords[dim]) for dim in xx.dims[1:] - ) # pylint: disable=undefined-loop-variable - coords["quantile"] = np.array(quantiles) - return xr.Dataset(data_vars=data_vars, coords=coords, attrs=src.attrs) diff --git a/libs/algo/odc/algo/_rgba.py b/libs/algo/odc/algo/_rgba.py deleted file mode 100644 index 24206ed3f..000000000 --- a/libs/algo/odc/algo/_rgba.py +++ /dev/null @@ -1,168 +0,0 @@ -""" Helpers for dealing with RGB(A) images. -""" -import dask -import dask.array as da -import numpy as np -import xarray as xr -from typing import List, Optional, Tuple, Union - -from ._dask import randomize - - -def is_rgb(x: xr.DataArray): - if x.dtype != "uint8": - return False - if x.ndim < 3: - return False - if x.shape[-1] not in (3, 4): - return False - return True - - -def guess_rgb_names(bands: List[str]) -> Tuple[str, str, str]: - out = [] - for c in ("red", "green", "blue"): - candidates = [name for name in bands if c in name] - n = len(candidates) - if n == 0: - raise ValueError('Found no candidate for color "{}"'.format(c)) - - if n > 1: - raise ValueError('Found too many candidates for color "{}"'.format(c)) - - out.append(candidates[0]) - r, g, b = out # pylint:disable=unbalanced-tuple-unpacking - return r, g, b - - -def auto_guess_clamp(ds: xr.Dataset): - # TODO: deal with nodata > 0 case - return 0, max(x.data.max() for x in ds.data_vars.values()) - - -def to_u8(x: np.ndarray, x_min: float, x_max: float) -> np.ndarray: - x = np.clip(x, x_min, x_max) - - if x.dtype.kind == "f": - x = (x - x_min) * (255.0 / (x_max - x_min)) - else: - x = (x - x_min).astype("uint32") * 255 // (x_max - x_min) - return x.astype("uint8") - - -def to_rgba_np( - r: np.ndarray, - g: np.ndarray, - b: np.ndarray, - nodata: Optional[float], - clamp: Tuple[float, float], -) -> np.ndarray: - rgba = np.zeros((*r.shape, 4), dtype="uint8") - - if r.dtype.kind == "f": - valid = ~np.isnan(r) - if nodata is not None: - valid = valid * (r != nodata) - elif nodata is not None: - valid = r != nodata - else: - valid = np.ones(r.shape, dtype=bool) - - rgba[..., 3] = valid.astype("uint8") * (0xFF) - for idx, band in enumerate([r, g, b]): - rgba[..., idx] = to_u8(band, *clamp) - - return rgba - - -def to_rgba( - ds: xr.Dataset, - clamp: Optional[Union[float, Tuple[float, float]]] = None, - bands: Optional[Tuple[str, str, str]] = None, -) -> xr.DataArray: - """Given `xr.Dataset` with bands `red,green,blue` construct `xr.Datarray` - containing uint8 rgba image. 
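The core intensity scaling behind `to_u8` for float input: clip to `[x_min, x_max]`, then map linearly onto 0..255 before casting to uint8. On a toy array:

```python
import numpy as np

x = np.array([-0.1, 0.0, 0.5, 1.2], dtype="float32")
x_min, x_max = 0.0, 1.0

x = np.clip(x, x_min, x_max)
u8 = ((x - x_min) * (255.0 / (x_max - x_min))).astype("uint8")
# u8 == [0, 0, 127, 255]
```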
- - :param ds: xarray Dataset - :param clamp: [min_intensity, max_intensity] | max_intensity == [0, max_intensity] - Can also supply None for non-dask array, in which case clamp is set to [0, max(r,g,b)] - :param bands: Which bands to use, order should be red,green,blue - """ - if bands is None: - bands = guess_rgb_names(list(ds.data_vars)) - - is_dask = dask.is_dask_collection(ds) - - if clamp is None: - if is_dask: - raise ValueError("Must specify clamp for dask inputs") - - clamp = auto_guess_clamp(ds[list(bands)]) - elif not isinstance(clamp, tuple): - clamp = (0, clamp) - - red_band = ds[bands[0]] - nodata = getattr(red_band, "nodata", None) - dims = red_band.dims + ("band",) - geobox = red_band.geobox - crs = str(geobox.crs) if geobox is not None else None - - r, g, b = (ds[name].data for name in bands) - if is_dask: - data = da.map_blocks( - to_rgba_np, - r, - g, - b, - nodata, - clamp, - dtype=np.uint8, - new_axis=[r.ndim], - name=randomize("ro_rgba"), - chunks=(*red_band.chunks, 4), - ) - else: - data = to_rgba_np(r, g, b, nodata, clamp) - - coords = dict(red_band.coords.items()) - coords.update(band=["r", "g", "b", "a"]) - - attrs = {} - if crs is not None: - attrs["crs"] = crs - - rgba = xr.DataArray(data, coords=coords, dims=dims, attrs=attrs) - return rgba - - -def colorize(x: xr.DataArray, cmap: np.ndarray, attrs=None) -> xr.DataArray: - """ - Map categorical values from x to RGBA according to cmap lookup table - - :param x: Input xarray data array (can be Dask) - :param cmap: Lookup table cmap[x] -> RGB(A) - :param attrs: xarray attributes table, if not supplied input attributes are copied across - """ - assert cmap.ndim == 2 - assert cmap.shape[1] in (3, 4) - - if attrs is None: - attrs = x.attrs - - dims = x.dims + ("band",) - coords = dict(x.coords.items()) - coords["band"] = ["r", "g", "b", "a"] - - if dask.is_dask_collection(x.data): - data = da.map_blocks( - lambda x: cmap[x], - x.data, - dtype=cmap.dtype, - new_axis=[x.data.ndim], - chunks=x.chunks + (cmap.shape[1],), - name=randomize("colorize"), - ) - else: - data = cmap[x.data] - - return xr.DataArray(data=data, dims=dims, coords=coords, attrs=attrs) diff --git a/libs/algo/odc/algo/_tiff.py b/libs/algo/odc/algo/_tiff.py deleted file mode 100644 index 0cd1455af..000000000 --- a/libs/algo/odc/algo/_tiff.py +++ /dev/null @@ -1,572 +0,0 @@ -import dask -import dask.array as da -import numpy as np -import rasterio -import threading -import warnings -import xarray as xr -from affine import Affine -from dask.base import tokenize -from dask.delayed import Delayed -from dataclasses import dataclass -from pathlib import Path -from rasterio import MemoryFile -from rasterio.shutil import copy as rio_copy -from rasterio.windows import Window -from typing import Any, Dict, Optional, Tuple, Union -from uuid import uuid4 - -from ._numeric import half_up, np_slice_to_idx, roi_shrink2, roundup16 -from ._types import NodataType, NumpyIndex -from ._warp import _shrink2 - -# pylint: disable=import-outside-toplevel,invalid-name -_UNSET = ":unset:-427d8b3f1944" - - -def _adjust_blocksize(block: int, dim: int) -> int: - if block > dim: - return roundup16(dim) - return roundup16(block) - - -@dataclass -class GeoRasterInfo: - width: int - height: int - count: int - dtype: str - crs: str - transform: Affine - nodata: Optional[NodataType] = None - axis: int = 0 - - def gdal_opts(self): - out = dict(**self.__dict__) - if self.nodata is None: - out.pop("nodata") - out.pop("axis") - return out - - def raster_size(self) -> int: - """ - Compute 
raster size in bytes - """ - return np.dtype(self.dtype).itemsize * self.width * self.height * self.count - - @staticmethod - def from_xarray(xx: xr.DataArray) -> "GeoRasterInfo": - axis = 0 - geobox = getattr(xx, "geobox", None) - if geobox is None: - raise ValueError("Missing .geobox on input array") - - height, width = geobox.shape - if xx.ndim == 2: - count = 1 - elif xx.ndim == 3: - if xx.shape[:2] == (height, width): - count = xx.shape[2] - elif xx.shape[1:] == (height, width): - count = xx.shape[0] - axis = 1 - else: - raise ValueError("Geobox shape does not match array size") - - nodata = getattr(xx, "nodata", None) - - return GeoRasterInfo( - width, - height, - count, - xx.dtype.name, - str(geobox.crs), - geobox.transform, - nodata, - axis=axis, - ) - - def shrink2(self) -> "GeoRasterInfo": - return GeoRasterInfo( - width=half_up(self.width), - height=half_up(self.height), - count=self.count, - dtype=self.dtype, - crs=self.crs, - transform=self.transform * Affine.scale(2, 2), - nodata=self.nodata, - ) - - -class TIFFSink: - def __init__( - self, - info: GeoRasterInfo, - dst: Union[str, MemoryFile], - blocksize: Optional[int] = None, - bigtiff: Union[str, bool] = "auto", - lock: bool = True, - **extra_rio_opts, - ): - if blocksize is None: - blocksize = 512 - - if bigtiff == "auto": - # do bigtiff if raw raster is larger than 4GB - bigtiff = info.raster_size() > (1 << 32) - - opts = dict( - driver="GTiff", - bigtiff=bigtiff, - tiled=True, - blockxsize=_adjust_blocksize(blocksize, info.width), - blockysize=_adjust_blocksize(blocksize, info.height), - compress="DEFLATE", - zlevel=6, - predictor=2, - num_threads="ALL_CPUS", - ) - opts.update(info.gdal_opts()) - opts.update(extra_rio_opts) - - mem: Optional[MemoryFile] = None - self._mem_mine: Optional[MemoryFile] = None - - if isinstance(dst, str): - if dst == ":mem:": - mem = MemoryFile() - out = mem.open(**opts) - self._mem_mine = mem - else: - out = rasterio.open(dst, mode="w", **opts) - else: - mem = dst - out = dst.open(**opts) - - self._mem = mem - self._info = info - self._out = out - self._lock = threading.Lock() if lock else None - - def __str__(self) -> str: - ii = self._info - return f"TIFFSink: {ii.width}x{ii.height}..{ii.count}..{ii.dtype}" - - def __repr__(self) -> str: - return self.__str__() - - @property - def name(self) -> str: - return self._out.name - - @property - def info(self) -> GeoRasterInfo: - return self._info - - def close(self): - self._out.close() - - def __del__(self): - self.close() - - if self._mem_mine: - self._mem_mine.close() - self._mem_mine = None - - def __setitem__(self, key: NumpyIndex, item: np.ndarray): - ndim = len(key) - info = self._info - assert ndim in (2, 3) - - yx_key = key[info.axis : info.axis + 2] - if ndim == 2: - assert info.axis == 0 - assert item.ndim == 2 - bands: Union[int, Tuple[int, ...]] = 1 - block = item - elif ndim == 3: - if info.axis == 0: - # Y, X, B - bands = np_slice_to_idx(key[2], info.count) - if item.ndim == 2: - block = np.expand_dims(item, axis=0) - else: - # rio expects band to be the first dimension - block = item.transpose([2, 0, 1]) - else: - # B, Y, X - bands = np_slice_to_idx(key[0], info.count) - if item.ndim == 2: - block = np.expand_dims(item, axis=0) - else: - block = item - - # rio wants 1 based indexing - bands = tuple(i + 1 for i in bands) - else: - raise ValueError("Only accept 2 and 3 dimensional data") - - win = Window.from_slices(*yx_key, height=info.height, width=info.width) - if self._lock: - with self._lock: - self._out.write(block, 
indexes=bands, window=win) - else: - self._out.write(block, indexes=bands, window=win) - - -class COGSink: - def __init__( - self, - info: GeoRasterInfo, - dst: str, - blocksize: Optional[int] = None, - ovr_blocksize: Optional[int] = None, - bigtiff: Union[bool, str] = "auto", - lock: bool = True, - temp_folder: Optional[str] = None, - overview_resampling: str = "average", - rio_opts_first_pass: Optional[Dict[str, Any]] = None, - use_final_blocksizes: bool = False, - **extra_rio_opts, - ): - if blocksize is None: - blocksize = 512 - - if ovr_blocksize is None: - ovr_blocksize = blocksize - - if bigtiff == "auto": - # do bigtiff if raw raster is larger than 4GB - bigtiff = info.raster_size() > (1 << 32) - - opts = dict( - driver="GTiff", - bigtiff=bigtiff, - tiled=True, - blockxsize=_adjust_blocksize(blocksize, info.width), - blockysize=_adjust_blocksize(blocksize, info.height), - compress="DEFLATE", - zlevel=6, - predictor=2, - num_threads="ALL_CPUS", - ) - opts.update(extra_rio_opts) - - if rio_opts_first_pass is None: - rio_opts_first_pass = dict( - compress="zstd", - zstd_level=1, - predictor=1, - num_threads="ALL_CPUS", - sparse_ok=True, - interleave=opts.get("interleave", "pixel"), - ) - - layers = [] - temp = str(uuid4()) - t_dir = "" - if temp_folder: - t_name = temp - else: - t_dir, t_name = temp[:8], temp[9:] - - ext = ".tif" - ii = info - bsz = 2048 - for idx in range(7 + 1): - if temp_folder: - _dst = str(Path(temp_folder) / f"{t_name}{ext}") - else: - _dst = MemoryFile(dirname=t_dir, filename=t_name + ext) - - if use_final_blocksizes: - _bsz = blocksize if idx == 0 else ovr_blocksize - else: - _bsz = bsz - - sink = TIFFSink( - ii, - _dst, - lock=lock, - blocksize=_bsz, - bigtiff=bigtiff, - **rio_opts_first_pass, - ) - layers.append(sink) - - # If last overview was smaller than 1 block along any dimension don't - # go further - if min(ii.width, ii.height) < ovr_blocksize: - break - - ii = ii.shrink2() - ext = ext + ".ovr" - if bsz > 64: - bsz = bsz // 2 - - self._layers = layers - self._mem = MemoryFile() if dst == ":mem:" else None - self._dst = dst - self._rio_opts = opts - self._ovr_blocksize = ovr_blocksize - self._resampling = overview_resampling - self._info = info - - def _shrink2(self, xx, roi): - axis = self._info.axis - out_roi = roi_shrink2(roi, axis=axis) - out = _shrink2( - xx, resampling=self._resampling, nodata=self._info.nodata, axis=axis - ) - - return out_roi, out - - def __setitem__(self, key: NumpyIndex, item: np.ndarray): - dst, *ovrs = self._layers - dst[key] = item - for dst in ovrs: - key, item = self._shrink2(item, key) - dst[key] = item - - def close(self, idx=-1): - if idx < 0: - for dst in self._layers: - dst.close() - elif idx < len(self._layers): - self._layers[idx].close() - - def _copy_cog(self, extract=False, strict=False) -> Optional[bytes]: - with rasterio.Env( - GDAL_TIFF_OVR_BLOCKSIZE=self._ovr_blocksize, - GDAL_DISABLE_READDIR_ON_OPEN=False, - NUM_THREADS="ALL_CPUS", - GDAL_NUM_THREADS="ALL_CPUS", - ): - src = self._layers[0].name - if self._mem is not None: - rio_copy( - src, - self._mem.name, - copy_src_overviews=True, - strict=strict, - **self._rio_opts, - ) - if extract: - # NOTE: this creates a copy of compressed bytes - return bytes(self._mem.getbuffer()) - else: - rio_copy( - src, - self._dst, - copy_src_overviews=True, - strict=strict, - **self._rio_opts, - ) - return None - - def finalise(self, extract=False, strict=False) -> Optional[bytes]: - self.close() # Write out any remainders if needed - return 
self._copy_cog(extract=extract, strict=strict) - - def mem(self): - return self._mem - - def dump_to_s3(self, url, creds=None, **kw): - import boto3 - from boto3.s3.transfer import TransferConfig - from odc.aws import s3_url_parse - - assert self._mem is not None - - GB = 1 << 30 - transfer_config = TransferConfig(multipart_threshold=5 * GB) - bucket, key = s3_url_parse(url) - creds_opts = ( - {} - if creds is None - else dict( - aws_access_key_id=creds.access_key, - aws_secret_access_key=creds.secret_key, - aws_session_token=creds.token, - ) - ) - s3 = boto3.client("s3", **creds_opts) - - return s3.upload_fileobj( - self._mem, bucket, key, ExtraArgs=kw, Config=transfer_config - ) - - @staticmethod - def dask_finalise( - sink: Delayed, *deps, extract=False, strict=False, return_value=_UNSET - ) -> Delayed: - """ - - When extract=True --> returns bytes (doubles memory requirements!!!) - When extract=False -> returns return_value if supplied, or sink after completing everything - """ - tk = tokenize(sink, extract, strict) - delayed_close = dask.delayed(lambda sink, idx, *deps: sink.close(idx)) - parts = [ - delayed_close(sink, idx, *deps, dask_key_name=(f"cog_close-{tk}", idx)) - for idx in range(8) - ] - - def _copy_cog(sink, extract, strict, return_value, *parts): - bb = sink._copy_cog(extract=extract, strict=strict) - if return_value == _UNSET: - return bb if extract else sink - else: - return return_value - - return dask.delayed(_copy_cog)( - sink, extract, strict, return_value, *parts, dask_key_name=f"cog_copy-{tk}" - ) - - -def save_cog( - xx: xr.DataArray, - dst: str, - blocksize: Optional[int] = None, - ovr_blocksize: Optional[int] = None, - bigtiff: Union[bool, str] = "auto", - temp_folder: Optional[str] = None, - overview_resampling: str = "average", - rio_opts_first_pass: Optional[Dict[str, Any]] = None, - use_final_blocksizes: bool = False, - ACL: Optional[str] = None, - creds: Optional[Any] = None, - **extra_rio_opts, -): - """ - Save Dask array to COG incrementally (without instantiating whole image at once). - - COG generation is a two stage process. First we create a bunch of TIFF - images, one for each overview levels, these are compressed with fast ZSTD - compression (lossless and with quick rather than good compression - settings). Overviews are generated block by block so we do not keep them - in-memory uncompressed. To avoid block boundary artefacts, input blocks are - set to be 2048x2048 (2**11, edge blocks can have any size). Use that size - at construction time for optimal performance. - - :param xx: Geo registered Array, data could be arranged in Y,X or Y,X,B or B,Y,X order. - To avoid re-chunking use block sizes of 2048x2048. - :param dst: ":mem:" or file path or s3 url - :param blocksize: Block size of the final COG (512 pixels) - :param ovr_blocksize: Block size of the overview images (default same as main image) - :param bigtiff: True|False|"auto" Default is to use bigtiff for inputs greater than 4Gb uncompressed - :param temp_folder: By default first pass images are written to RAM, with this option they - can be written to disk instead - :param overview_resampling: Resampling to use for overview generation: nearest|average|bilinear|... - - :param rio_opts_first_pass: Change defaults for first pass compression - :param use_final_blocksizes: By default first pass blocksizes are fixed at - 2048x2048, 1024x1024, 512x512,...64x64, this - way blocks across different overview levels - have one to one mapping. 
With this option one - can use final image block sizes for the first - pass instead. - :param ACL: Used when dst is S3 - - :param creds: Credentials to use for writing to S3. If not supplied will - attempt to obtain them locally and pass on to the worker. If - local credentials are absent then it will print a warning and - will attempt to credentialize on the worker instead. - """ - assert dask.is_dask_collection(xx) - tk = tokenize( - xx, - dst, - blocksize, - ovr_blocksize, - bigtiff, - temp_folder, - overview_resampling, - extra_rio_opts, - ) - - info = GeoRasterInfo.from_xarray(xx) - - # Rechunk to 2048x2048 in YX, if needed - axis = info.axis - data = xx.data - chunks = data.chunksize - yx_chunks = chunks[axis : axis + 2] - - if yx_chunks != (2048, 2048): - data = data.rechunk(chunks[:axis] + (2048, 2048) + chunks[axis + 2 :]) - - s3_url: Optional[str] = None - extract = False - if dst == ":mem:": - extract = True - elif dst.startswith("s3:"): - from odc.aws import get_creds_with_retry, mk_boto_session - - if creds is None: - _creds = get_creds_with_retry(mk_boto_session()) - - if _creds is None: - warnings.warn( - "Found no credentials locally assuming workers can credentialize" - ) - else: - creds = _creds.get_frozen_credentials() - - s3_url, dst = dst, ":mem:" - else: - # Assume file path - # TODO: check if overwrite? - # TODO: create folder structure? - pass - - # set up sink - sink = dask.delayed(COGSink)( - info, - dst, - blocksize=blocksize, - ovr_blocksize=ovr_blocksize, - bigtiff=bigtiff, - temp_folder=temp_folder, - overview_resampling=overview_resampling, - rio_opts_first_pass=rio_opts_first_pass, - use_final_blocksizes=use_final_blocksizes, - **extra_rio_opts, - ) - - rr = da.store(data, sink, lock=False, compute=False) - - # wait for all stores to complete - # - # NOTE: here we edit dask graph returned from `da.store`, essentially - # replacing top level result with a lambda that returns original COGSink - # once all parallel stores are done. One could just depend on `rr` itself - # in theory, but in practice Dask optimizer removes it from the graph. So - # if you had something like: - # - # dask.delayed(lambda sink, rr: sink)(sink, rr).compute() - # - # it would throw an exception from deep inside Dask, - # `.compute(optimize=False)` does work though. 
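A usage sketch for `save_cog` as defined above. The destination URL and the input `xx` (a Dask-backed, geo-registered DataArray, ideally with 2048-pixel chunks) are assumptions for illustration, as is the `odc.algo` re-export of the function:

```python
from odc.algo import save_cog  # assumed re-export from this package

# `xx` is assumed to be a Dask-backed, geo-registered DataArray,
# e.g. dc.load(..., dask_chunks={"x": 2048, "y": 2048}).red
task = save_cog(xx, "s3://example-bucket/output.tif",  # hypothetical destination
                blocksize=512, overview_resampling="average")
task.compute()  # runs the store, the COG copy and, for s3:// destinations, the upload
```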
- - dsk = dict(rr.dask) - deps = dsk.pop(rr.key) - name = "cog_finish-" + tk - dsk[name] = ((lambda sink, *deps: sink), sink.key, *deps) - cog_finish = Delayed(name, dsk) - - if s3_url is not None: - s3_opts = dict(ContentType="image/tiff") - if ACL is not None: - s3_opts["ACL"] = ACL - if creds is not None: - s3_opts["creds"] = creds - - cog_finish = COGSink.dask_finalise(cog_finish, extract=False) - return dask.delayed(lambda sink, url, opts: sink.dump_to_s3(url, **opts))( - cog_finish, s3_url, s3_opts, dask_key_name=f"dump_to_s3-{tk}" - ) - elif extract: - return COGSink.dask_finalise(cog_finish, extract=True) - else: - return COGSink.dask_finalise(cog_finish, extract=False, return_value=dst) diff --git a/libs/algo/odc/algo/_tools.py b/libs/algo/odc/algo/_tools.py deleted file mode 100644 index b7504101f..000000000 --- a/libs/algo/odc/algo/_tools.py +++ /dev/null @@ -1,34 +0,0 @@ -""" -Various utilities -""" -from typing import Optional, Tuple, Union - -ROI = Union[slice, Tuple[slice, ...]] - - -def slice_in_out(s: slice, n: int) -> Tuple[int, int]: - def fill_if_none(x: Optional[int], val_if_none: int) -> int: - return val_if_none if x is None else x - - start = fill_if_none(s.start, 0) - stop = fill_if_none(s.stop, n) - start, stop = [x if x >= 0 else n + x for x in (start, stop)] - return (start, stop) - - -def roi_shape( - roi: ROI, shape: Optional[Union[int, Tuple[int, ...]]] = None -) -> Tuple[int, ...]: - if isinstance(shape, int): - shape = (shape,) - - if isinstance(roi, slice): - roi = (roi,) - - if shape is None: - # Assume slices are normalised - return tuple(s.stop - (s.start or 0) for s in roi) - - return tuple( - _out - _in for _in, _out in (slice_in_out(s, n) for s, n in zip(roi, shape)) - ) diff --git a/libs/algo/odc/algo/_types.py b/libs/algo/odc/algo/_types.py deleted file mode 100644 index a618f8350..000000000 --- a/libs/algo/odc/algo/_types.py +++ /dev/null @@ -1,9 +0,0 @@ -import numpy as np -from typing import Tuple, Union - -NumpyIndex1 = Union[int, slice] -NumpyIndex2 = Tuple[NumpyIndex1, NumpyIndex1] -NumpyIndex = Tuple[NumpyIndex1, ...] 
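The slice normalisation performed by `slice_in_out` above resolves open and negative bounds against the array length; restated as a tiny standalone helper (hypothetical `_slice_in_out` name, same logic):

```python
def _slice_in_out(s: slice, n: int):
    start = 0 if s.start is None else s.start
    stop = n if s.stop is None else s.stop
    return tuple(x if x >= 0 else n + x for x in (start, stop))

assert _slice_in_out(slice(None, -2), 10) == (0, 8)
assert _slice_in_out(slice(3, None), 10) == (3, 10)
```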
-NodataType = Union[int, float] -ShapeLike = Union[int, Tuple[int, ...]] -DtypeLike = Union[str, np.dtype] diff --git a/libs/algo/odc/algo/_version.py b/libs/algo/odc/algo/_version.py deleted file mode 100644 index d31c31eae..000000000 --- a/libs/algo/odc/algo/_version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "0.2.3" diff --git a/libs/algo/odc/algo/_warp.py b/libs/algo/odc/algo/_warp.py deleted file mode 100644 index 53a1a558f..000000000 --- a/libs/algo/odc/algo/_warp.py +++ /dev/null @@ -1,351 +0,0 @@ -""" -Dask aware reproject implementation -""" -import dask.array as da -import dask.utils as du -import numpy as np -import xarray as xr -from affine import Affine -from dask import is_dask_collection -from dask.highlevelgraph import HighLevelGraph -from typing import Any, Dict, Optional, Tuple, Union - -from datacube.utils import spatial_dims -from datacube.utils.geometry import ( - GeoBox, - compute_reproject_roi, - rio_reproject, - warp_affine, -) -from datacube.utils.geometry.gbox import GeoboxTiles -from ._dask import crop_2d_dense, empty_maker, randomize, unpack_chunks -from ._numeric import shape_shrink2 -from ._types import NodataType - - -def _reproject_block_impl( - src: np.ndarray, - src_geobox: GeoBox, - dst_geobox: GeoBox, - resampling: str = "nearest", - src_nodata: Optional[NodataType] = None, - dst_nodata: Optional[NodataType] = None, - axis: int = 0, - **kwargs, -) -> np.ndarray: - dst_shape = src.shape[:axis] + dst_geobox.shape + src.shape[axis + 2 :] - dst = np.empty(dst_shape, dtype=src.dtype) - - if dst.ndim == 2 or (dst.ndim == 3 and axis == 1): - rio_reproject( - src, - dst, - src_geobox, - dst_geobox, - resampling, - src_nodata, - dst_nodata, - **kwargs, - ) - else: - for prefix in np.ndindex(src.shape[:axis]): - rio_reproject( - src[prefix], - dst[prefix], - src_geobox, - dst_geobox, - resampling, - src_nodata, - dst_nodata, - **kwargs, - ) - return dst - - -def _reproject_block_bool_impl( - src: np.ndarray, - src_geobox: GeoBox, - dst_geobox: GeoBox, - resampling: str = "nearest", - src_nodata: Optional[NodataType] = None, - dst_nodata: Optional[NodataType] = None, - axis: int = 0, - **kwargs, -) -> np.ndarray: - assert src.dtype == "bool" - assert src_nodata is None - assert dst_nodata is None - src = src.astype("uint8") << 7 # False:0, True:128 - dst = _reproject_block_impl( - src, - src_geobox, - dst_geobox, - resampling=resampling, - axis=axis, - **kwargs, - ) - return dst > 64 - - -# pylint: disable=too-many-locals -def dask_reproject( - src: da.Array, - src_geobox: GeoBox, - dst_geobox: GeoBox, - resampling: str = "nearest", - chunks: Optional[Tuple[int, int]] = None, - src_nodata: Optional[NodataType] = None, - dst_nodata: Optional[NodataType] = None, - axis: int = 0, - name: str = "reproject", - **kwargs, -) -> da.Array: - """ - Reproject to GeoBox as dask operation - - :param src : Input src[(time,) y,x (, band)] - :param src_geobox: GeoBox of the source array - :param dst_geobox: GeoBox of the destination - :param resampling: Resampling strategy as a string: - nearest, bilinear, average, mode ... 
- :param chunks : In Y,X dimensions only, default is to use same input chunk size - :param axis : Index of Y axis (default is 0) - :param src_nodata: nodata marker for source image - :param dst_nodata: nodata marker for dst image - :param name : Dask graph name, "reproject" is the default - :param kwargs: Options given to GDAL as in rasterio.warp - """ - if chunks is None: - chunks = src.chunksize[axis : axis + 2] - - if dst_nodata is None: - dst_nodata = src_nodata - - assert src.shape[axis : axis + 2] == src_geobox.shape - yx_shape = dst_geobox.shape - yx_chunks = unpack_chunks(chunks, yx_shape) - - dst_chunks = src.chunks[:axis] + yx_chunks + src.chunks[axis + 2 :] - dst_shape = src.shape[:axis] + yx_shape + src.shape[axis + 2 :] - - # tuple(*dims1, y, x, *dims2) -- complete shape in blocks - dims1 = tuple(map(len, dst_chunks[:axis])) - dims2 = tuple(map(len, dst_chunks[axis + 2 :])) - assert not dims2 - deps = [src] - - tile_shape = (yx_chunks[0][0], yx_chunks[1][0]) - gbt = GeoboxTiles(dst_geobox, tile_shape) - xy_chunks_with_data = list(gbt.tiles(src_geobox.extent)) - - name = randomize(name) - dsk: Dict[Any, Any] = {} - - block_impl = ( - _reproject_block_bool_impl if src.dtype == "bool" else _reproject_block_impl - ) - - for idx in xy_chunks_with_data: - _dst_geobox = gbt[idx] - rr = compute_reproject_roi(src_geobox, _dst_geobox) - _src = crop_2d_dense(src, rr.roi_src, axis=axis) - _src_geobox = src_geobox[rr.roi_src] - - deps.append(_src) - - for ii1 in np.ndindex(dims1): - # TODO: band dims - dsk[(name, *ii1, *idx)] = ( - du.apply, - block_impl, - [ - (_src.name, *ii1, 0, 0), - _src_geobox, - _dst_geobox, - resampling, - src_nodata, - dst_nodata, - axis, - ], - kwargs, - ) - - fill_value = 0 if dst_nodata is None else dst_nodata - shape_in_blocks = tuple(map(len, dst_chunks)) - - mk_empty = empty_maker(fill_value, src.dtype, dsk) - - for idx in np.ndindex(shape_in_blocks): - # TODO: other dims - k = (name, *idx) - if k not in dsk: - bshape = tuple(ch[i] for ch, i in zip(dst_chunks, idx)) - dsk[k] = mk_empty(bshape) - - dsk = HighLevelGraph.from_collections(name, dsk, dependencies=deps) - - return da.Array(dsk, name, chunks=dst_chunks, dtype=src.dtype, shape=dst_shape) - - -def xr_reproject_array( - src: xr.DataArray, - geobox: GeoBox, - resampling: str = "nearest", - chunks: Optional[Tuple[int, int]] = None, - dst_nodata: Optional[NodataType] = None, - **kwargs, -) -> xr.DataArray: - """ - Reproject DataArray to a given GeoBox - - :param src : Input src[(time,) y,x (, band)] - :param geobox : GeoBox of the destination - :param resampling: Resampling strategy as a string: - nearest, bilinear, average, mode ... 
- :param chunks : In Y,X dimensions only, default is to use input chunk size - :param dst_nodata: nodata marker for dst image (default is to use src.nodata) - :param kwargs: Options given to GDAL as in rasterio.warp - """ - src_nodata = getattr(src, "nodata", None) - if dst_nodata is None: - dst_nodata = src_nodata - - src_geobox = src.geobox - assert src_geobox is not None - - yx_dims = spatial_dims(src) - axis = tuple(src.dims).index(yx_dims[0]) - - src_dims = tuple(src.dims) - dst_dims = src_dims[:axis] + geobox.dims + src_dims[axis + 2 :] - - coords = geobox.xr_coords(with_crs=True) - - # copy non-spatial coords from src to dst - src_non_spatial_dims = src_dims[:axis] + src_dims[axis + 2 :] - for dim in src_non_spatial_dims: - if dim not in coords: - coords[dim] = src.coords[dim] - - attrs = {} - if dst_nodata is not None: - attrs["nodata"] = dst_nodata - - if is_dask_collection(src): - data = dask_reproject( - src.data, - src_geobox, - geobox, - resampling=resampling, - chunks=chunks, - src_nodata=src_nodata, - dst_nodata=dst_nodata, - axis=axis, - **kwargs, - ) - else: - data = _reproject_block_impl( - src.data, - src_geobox, - geobox, - resampling=resampling, - src_nodata=src_nodata, - dst_nodata=dst_nodata, - axis=axis, - **kwargs, - ) - - return xr.DataArray(data, name=src.name, coords=coords, dims=dst_dims, attrs=attrs) - - -# pylint: enable=too-many-locals -def xr_reproject( - src: Union[xr.DataArray, xr.Dataset], - geobox: GeoBox, - resampling: str = "nearest", - chunks: Optional[Tuple[int, int]] = None, - dst_nodata: Optional[NodataType] = None, - **kwargs, -) -> Union[xr.DataArray, xr.Dataset]: - """ - Reproject DataArray to a given GeoBox - - :param src : Input src[(time,) y, x] - :param geobox : GeoBox of the destination - :param resampling: Resampling strategy as a string: - nearest, bilinear, average, mode ... 
- :param chunks : In Y,X dimensions only, default is to use input chunk size - (ignored if input is not a dask array) - :param dst_nodata: nodata marker for dst image (default is to use src.nodata) - :param kwargs: Options given to GDAL as in rasterio.warp - """ - - if isinstance(src, xr.DataArray): - return xr_reproject_array( - src, - geobox, - resampling=resampling, - chunks=chunks, - dst_nodata=dst_nodata, - **kwargs, - ) - - bands = { - name: xr_reproject_array( - band, geobox, resampling=resampling, chunks=chunks, **kwargs - ) - for name, band in src.data_vars.items() - } - - return xr.Dataset(data_vars=bands) - - -def _shrink2( - xx: np.ndarray, - resampling: str = "nearest", - nodata: Optional[NodataType] = None, - axis: int = 0, -): - """ - :param xx: Image to shrink - :param resampling: Resampling strategy to use - :param nodata: nodata value for missing value fill - :param axis: Y-axis index, to distinguish Y,X,B (default) vs B,Y,X (axis=1) - """ - out_shape = shape_shrink2(xx.shape, axis=axis) - - if xx.ndim == 2 or (xx.ndim == 3 and axis == 1): - # [Y, X] or [B, Y, X] - out = np.empty(out_shape, dtype=xx.dtype) - warp_affine( - xx, - out, - Affine.scale(2), - resampling=resampling, - src_nodata=nodata, - dst_nodata=nodata, - ) - elif xx.ndim == 3 and axis == 0: - # [Y, X, B] -> [Y', X', B] - assert xx.ndim == 3 - - # Need to turn into B,Y,X order - xx = xx.transpose((2, 0, 1)) - out = np.empty(out_shape[2:] + out_shape[:2], dtype=xx.dtype) - warp_affine( - xx, - out, - Affine.scale(2), - resampling=resampling, - src_nodata=nodata, - dst_nodata=nodata, - ) - - # back to Y',X',B - out = out.transpose((1, 2, 0)) - - assert out_shape == out.shape - else: - raise ValueError("Only support Y,X | Y,X,B | B,Y,X inputs") - - return out diff --git a/libs/algo/odc/algo/io.py b/libs/algo/odc/algo/io.py deleted file mode 100644 index a4db36829..000000000 --- a/libs/algo/odc/algo/io.py +++ /dev/null @@ -1,401 +0,0 @@ -"""Native load and masking.""" - -import json -import xarray as xr -import pandas as pd -from pyproj import aoi, transformer -from typing import ( - Callable, - Dict, - Iterable, - List, - Optional, - Sequence, - Tuple, - Union, - cast, -) - -from datacube import Datacube -from datacube.model import Dataset -from datacube.testutils.io import native_geobox -from datacube.utils.geometry import GeoBox, gbox -from ._grouper import group_by_nothing, solar_offset -from ._masking import _max_fuser, _nodata_fuser, _or_fuser, enum_to_bool, mask_cleanup -from ._warp import xr_reproject - - -def compute_native_load_geobox( - dst_geobox: GeoBox, ds: Dataset, band: str, buffer: Optional[float] = None -) -> GeoBox: - """ - Compute area of interest for a given Dataset given query. - - Take native projection and resolution from ``ds, band`` pair and compute - region in that projection that fully encloses footprint of the - ``dst_geobox`` with some padding. Construct GeoBox that encloses that - region fully with resolution/pixel alignment copied from supplied band. 
- - :param dst_geobox: - :param ds: Sample dataset (only resolution and projection is used, not footprint) - :param band: Reference band to use - (resolution of output GeoBox will match resolution of this band) - :param buffer: Buffer in units of CRS of ``ds`` (meters usually), - default is 10 pixels worth - """ - native: GeoBox = native_geobox(ds, basis=band) - if buffer is None: - buffer = 10 * cast(float, max(map(abs, native.resolution))) # type: ignore - - assert native.crs is not None - return GeoBox.from_geopolygon( - dst_geobox.extent.to_crs(native.crs).buffer(buffer), - crs=native.crs, - resolution=native.resolution, - align=native.alignment, - ) - - -def choose_transform_path( - src_crs: str, - dst_crs: str, - transform_code: Optional[str] = None, - area_of_interest: Optional[Sequence[float]] = None, -) -> str: - # leave gdal to choose the best option if nothing is specified - if transform_code is None and area_of_interest is None: - return {} - - if area_of_interest is not None: - assert len(area_of_interest) == 4 - area_of_interest = aoi.AreaOfInterest(*area_of_interest) - - transformer_group = transformer.TransformerGroup( - src_crs, dst_crs, area_of_interest=area_of_interest - ) - if transform_code is None: - return {"COORDINATE_OPERATION": transformer_group.transformers[0].to_proj4()} - else: - for t in transformer_group.transformers: - for step in json.loads(t.to_json()).get("steps", []): - if step.get("type", "") == "Transformation": - authority_code = step.get("id", {}) - if transform_code.split(":")[0].upper() in authority_code.get( - "authority", "" - ) and transform_code.split(":")[1] == str( - authority_code.get("code", "") - ): - return {"COORDINATE_OPERATION": t.to_proj4()} - # raise error if nothing is available - raise ValueError(f"Not able to find transform path by {transform_code}") - - -def _split_by_grid(xx: xr.DataArray) -> List[xr.DataArray]: - def extract(ii): - yy = xx[ii] - crs = xx.grid2crs[xx.grid.data[0]] - yy.attrs.update(crs=crs) - yy.attrs.pop("grid2crs", None) - return yy - - return [extract(ii) for ii in xx.groupby(xx.grid).groups.values()] - - -# pylint: disable=too-many-arguments, too-many-locals -def _load_with_native_transform_1( - sources: xr.DataArray, - bands: Tuple[str, ...], - geobox: GeoBox, - native_transform: Callable[[xr.Dataset], xr.Dataset], - basis: Optional[str] = None, - groupby: Optional[str] = None, - fuser: Optional[Callable[[xr.Dataset], xr.Dataset]] = None, - resampling: str = "nearest", - chunks: Optional[Dict[str, int]] = None, - load_chunks: Optional[Dict[str, int]] = None, - pad: Optional[int] = None, - **kwargs, -) -> xr.Dataset: - if basis is None: - basis = bands[0] - - if load_chunks is None: - load_chunks = chunks - - (ds,) = sources.data[0] - load_geobox = compute_native_load_geobox(geobox, ds, basis) - if pad is not None: - load_geobox = gbox.pad(load_geobox, pad) - - mm = ds.type.lookup_measurements(bands) - xx = Datacube.load_data(sources, load_geobox, mm, dask_chunks=load_chunks) - xx = native_transform(xx) - - if groupby is not None: - if fuser is None: - fuser = _nodata_fuser # type: ignore - - for dim in xx.dims: - if isinstance(xx.get_index(dim), pd.MultiIndex): - xx = xx.reset_index(dim) - - if groupby not in xx.indexes.keys(): - xx = xx.set_xindex(groupby) - xx = xx.groupby(groupby).map(fuser) - - _chunks = None - if chunks is not None: - _chunks = tuple(chunks.get(ax, -1) for ax in ("y", "x")) - - return xr_reproject( - xx, geobox, chunks=_chunks, resampling=resampling, **kwargs - ) # type: ignore - - 
-def load_with_native_transform( - dss: Sequence[Dataset], - bands: Sequence[str], - geobox: GeoBox, - native_transform: Callable[[xr.Dataset], xr.Dataset], - basis: Optional[str] = None, - groupby: Optional[str] = None, - fuser: Optional[Callable[[xr.Dataset], xr.Dataset]] = None, - resampling: str = "nearest", - chunks: Optional[Dict[str, int]] = None, - load_chunks: Optional[Dict[str, int]] = None, - pad: Optional[int] = None, - **kw, -) -> xr.Dataset: - """ - Load a bunch of datasets with native pixel transform. - - :param dss: A list of datasets to load - :param bands: Which measurements to load - :param geobox: GeoBox of the final output - :param native_transform: ``xr.Dataset -> xr.Dataset`` transform, - should support Dask inputs/outputs - :param basis: Name of the band to use as a reference for what is "native projection" - :param groupby: One of 'solar_day'|'time'|'idx'|None - :param fuser: Optional ``xr.Dataset -> xr.Dataset`` transform - :param resampling: Any resampling mode supported by GDAL as a string: - nearest, bilinear, average, mode, cubic, etc... - :param chunks: If set use Dask, must be in dictionary form - ``{'x': 4000, 'y': 4000}`` - - :param load_chunks: Defaults to ``chunks`` but can be different if supplied - (different chunking for native read vs reproject) - - :param pad: Optional padding in native pixels, if set will load extra - pixels beyond of what is needed to reproject to final - destination. This is useful when you plan to apply convolution - filter or morphological operators on input data. - - :param kw: Used to support old names ``dask_chunks`` and ``group_by`` - also kwargs for reproject ``tranform_code`` in the form of - "authority:code", e.g., "epsg:9688", and ``area_of_interest``, - e.g., [-180, -90, 180, 90] - - 1. Partition datasets by native Projection - 2. For every group do - - Load data - - Apply native_transform - - [Optional] fuse rasters that happened on the same day/time - - Reproject to final geobox - 3. Stack output of (2) - 4. 
[Optional] fuse rasters that happened on the same day/time - """ - if fuser is None: - fuser = _nodata_fuser - - if groupby is None: - groupby = kw.get("group_by", "idx") - - if chunks is None: - chunks = kw.get("dask_chunks", None) - - sources = group_by_nothing(list(dss), solar_offset(geobox.extent)) - _xx = [] - # fail if the intended transform not available - # to avoid any unexpected results - for srcs in _split_by_grid(sources): - extra_args = choose_transform_path( - srcs.crs, - geobox.crs, - kw.get("transform_code"), - kw.get("area_of_interest"), - ) - - _xx += [ - _load_with_native_transform_1( - srcs, - tuple(bands), - geobox, - native_transform, - basis=basis, - resampling=resampling, - groupby=groupby, - fuser=fuser, - chunks=chunks, - load_chunks=load_chunks, - pad=pad, - **extra_args, - ) - ] - - if len(_xx) == 1: - xx = _xx[0] - else: - xx = xr.concat(_xx, sources.dims[0]) # type: ignore - if groupby != "idx": - for dim in xx.dims: - if isinstance(xx.get_index(dim), pd.MultiIndex): - xx = xx.reset_index(dim) - if groupby not in xx.indexes.keys(): - xx = xx.set_xindex(groupby) - xx = xx.groupby(groupby).map(fuser) - - # TODO: probably want to replace spec MultiIndex with just `time` component - - return xx - - -def load_enum_mask( - dss: List[Dataset], - band: str, - geobox: GeoBox, - categories: Iterable[Union[str, int]], - invert: bool = False, - resampling: str = "nearest", - groupby: Optional[str] = None, - chunks: Optional[Dict[str, int]] = None, - **kw, -) -> xr.DataArray: - """ - Load enumerated mask (like fmask). - - 1. Load each mask time slice separately in native projection of the file - 2. Convert enum to Boolean (F:0, T:255) - 3. Optionally (groupby='solar_day') group observations on the same day - using OR for pixel fusing: T,F->T - 4. Reproject to destination GeoBox (any resampling mode is ok) - 5. Optionally group observations on the same day using OR for pixel fusing T,F->T - 6. Finally convert to real Bool - """ - - def native_op(ds): - return ds.map( - enum_to_bool, - categories=categories, - invert=invert, - dtype="uint8", - value_true=255, - ) - - xx = load_with_native_transform( - dss, - (band,), - geobox, - native_op, - basis=band, - resampling=resampling, - groupby=groupby, - chunks=chunks, - fuser=_max_fuser, - **kw, - ) - return xx[band] > 127 - - -def load_enum_filtered( - dss: Sequence[Dataset], - band: str, - geobox: GeoBox, - categories: Iterable[Union[str, int]], - filters: Optional[Iterable[Tuple[str, int]]] = None, - groupby: Optional[str] = None, - resampling: str = "nearest", - chunks: Optional[Dict[str, int]] = None, - **kw, -) -> xr.DataArray: - """ - Load enumerated mask (like fmask/SCL) with native pixel filtering. - - The idea is to load "cloud" classes while adding some padding, then erase - pixels that were classified as cloud in any of the observations on a given - day. - - This method converts enum-mask to a boolean image in the native projection - of the data and then reprojects boolean image to the final - projections/resolution. This allows one to use any resampling strategy, - like ``average`` or ``cubic`` and not be limited to a few resampling - strategies that support operations on categorical data. 
- - :param dss: A list of datasets to load - :param band: Which measurement band to load - :param geobox: GeoBox of the final output - :param categories: Enum values or names - - :param filters: iterable tuples of morphological operations in the order - you want them to perform, e.g., [("opening", 2), ("dilation", 5)] - :param groupby: One of 'solar_day'|'time'|'idx'|None - :param resampling: Any resampling mode supported by GDAL as a string: - nearest, bilinear, average, mode, cubic, etc... - :param chunks: If set use Dask, must be in dictionary form - ``{'x': 4000, 'y': 4000}`` - :param kw: Passed on to ``load_with_native_transform`` - - - 1. Load each mask time slice separately in native projection of the file - 2. Convert enum to Boolean - 3. Optionally (groupby='solar_day') group observations on the same day - using OR for pixel fusing: T,F->T - 4. Optionally apply ``mask_cleanup`` in native projection (after fusing) - 4. Reproject to destination GeoBox (any resampling mode is ok) - 5. Optionally group observations on the same day using OR for pixel fusing T,F->T - """ - - def native_op(xx: xr.Dataset) -> xr.Dataset: - _xx = enum_to_bool(xx[band], categories) - return xr.Dataset( - {band: _xx}, - attrs={"native": True}, # <- native flag needed for fuser - ) - - def fuser(xx: xr.Dataset) -> xr.Dataset: - """ - Fuse with OR. - - Fuse with OR, and when fusing in native pixel domain apply mask_cleanup if - requested - """ - is_native = xx.attrs.get("native", False) - xx = xx.map(_or_fuser) - xx.attrs.pop("native", None) - - if is_native and filters is not None: - _xx = xx[band] - assert isinstance(_xx, xr.DataArray) - xx[band] = mask_cleanup(_xx, mask_filters=filters) - - return xx - - # unless set by user to some value use largest filter radius for pad value - pad: Optional[int] = kw.pop("pad", None) - if pad is None: - if filters is not None: - pad = max(list(zip(*filters))[1]) # type: ignore - - xx = load_with_native_transform( - dss, - (band,), - geobox, - native_op, - fuser=fuser, - groupby=groupby, - resampling=resampling, - chunks=chunks, - pad=pad, - **kw, - )[band] - assert isinstance(xx, xr.DataArray) - return xx diff --git a/libs/algo/odc/algo/pixel.py b/libs/algo/odc/algo/pixel.py deleted file mode 100644 index 645ce2c9b..000000000 --- a/libs/algo/odc/algo/pixel.py +++ /dev/null @@ -1,167 +0,0 @@ -"""Helper methods for accessing single pixel from a rasterio file object. - -""" -import rasterio -import rasterio.crs -import rasterio.warp -from typing import Iterable, List, Optional, Tuple, Union - -RowCol = Tuple[int, int] -XY = Tuple[float, float] -LonLat = Tuple[float, float] -SomeCoord = Union[RowCol, XY, LonLat] -PixelValue = Union[float, int] - - -NOTSET = object() - - -def make_pixel_extractor( - mode="pixel", - band=1, - src_nodata_fallback=None, - src_nodata_override=None, - dst_nodata=NOTSET, -): - """Returns function that can extract single pixel from opened rasterio file. 
- - Signature of the returned function is: - `src, coordinate_tuple, [band] -> pixel` - - Where coordinate_tuple is interpreted according to `mode` - - - mode - How to interpret coordinate: - - pixel: (row, col) - - native: (x, y) in file native coordinate space - - lonlat: (lon, lat) (specifically EPSG:4326) - - band - Default band to read, can be over-written on read - - dst_nodata - when set use that instead of defaulting to src nodata value, - can be set to `None` to remap to `None` - - src_nodata_fallback - nodata value to use if src file is missing nodata value - - src_nodata_override - when set use that instead of what's in the file, - useful when nodata metadata is incorrect in the file - but correct value is available out of band. - - """ - default_band = band - - if dst_nodata is NOTSET: - - def _dst_nodata(src_nodata): - return src_nodata - - else: - - def _dst_nodata(src_nodata): - return dst_nodata - - def remap_pix(pix, src_nodata, dst_nodata): - # TODO: special case src_nodata is nan case - return dst_nodata if pix == src_nodata else pix - - def extract_pixel(src, coord, band=default_band): - ri, ci = coord - - src_nodata = _resolve_nodata( - src, band, fallback=src_nodata_fallback, override=src_nodata_override - ) - - dst_nodata = _dst_nodata(src_nodata) - - if 0 <= ri < src.height and 0 <= ci < src.width: - window = ((ri, ri + 1), (ci, ci + 1)) - - pix = src.read(band, window=window) - # TODO: support band being a list of bands - return remap_pix(pix[0][0], src_nodata, dst_nodata) - else: - return dst_nodata - - def extract_native(src, coord, band=default_band): - return extract_pixel(src, src.index(*coord), band=band) - - def extract_lonlat(src, coord, band=default_band): - lon, lat = coord - x, y = rasterio.warp.transform( - rasterio.crs.CRS.from_epsg(4326), src.crs, [lon], [lat] - ) - xy = (x[0], y[0]) - return extract_native(src, xy, band=band) - - extractors = dict(pixel=extract_pixel, native=extract_native, lonlat=extract_lonlat) - - extractor = extractors.get(mode) - if extractor is None: - raise ValueError("Only support mode=") - - return extractor - - -def _resolve_nodata(src, band, fallback=None, override=None): - """Figure out what value to use for nodata given a band and fallback/override - settings - - :param src: Rasterio file - """ - if override is not None: - return override - - band0 = band if isinstance(band, int) else band[0] - nodata = src.nodatavals[band0 - 1] - - if nodata is None: - return fallback - - return nodata - - -def _mode_value( - pixel: Optional[RowCol] = None, - xy: Optional[XY] = None, - lonlat: Optional[LonLat] = None, -) -> Union[Tuple[str, SomeCoord], Tuple[None, None]]: - if pixel is not None: - return "pixel", pixel - - if xy is not None: - return "native", xy - - if lonlat is not None: - return "lonlat", lonlat - - return (None, None) - - -def read_pixels( - urls: Iterable[str], - pixel: Optional[RowCol] = None, - xy: Optional[XY] = None, - lonlat: Optional[LonLat] = None, - band: int = 1, - **kwargs, -) -> List[PixelValue]: - """Read a single pixel at the same location from a bunch of different files. 
- - Location can be specified in 3 different ways: - - pixel (row: int, column: int) -- in pixel coords - xy (X: float, Y: float) -- in Projected coordinates of the native CRS of the image - lonlat (lon: float, lat: float) -- in EPSG:4326 - """ - mode, coord = _mode_value(pixel=pixel, xy=xy, lonlat=lonlat) - if mode is None: - raise ValueError("Have to supply one of: pixel, xy, or lonlat.") - - extractor = make_pixel_extractor(mode=mode, band=band, **kwargs) - - def read_from_url(url): - url = rasterio.parse_path(url) - with rasterio.DatasetReader(url, sharing=False) as src: - return extractor(src, coord=coord) - - return [read_from_url(url) for url in urls] diff --git a/libs/algo/pyproject.toml b/libs/algo/pyproject.toml deleted file mode 100644 index 3607e0fbd..000000000 --- a/libs/algo/pyproject.toml +++ /dev/null @@ -1,3 +0,0 @@ -[build-system] -requires = ["setuptools>=51.0.0", "wheel"] -build-backend = "setuptools.build_meta" diff --git a/libs/algo/setup.cfg b/libs/algo/setup.cfg deleted file mode 100644 index 64012aec4..000000000 --- a/libs/algo/setup.cfg +++ /dev/null @@ -1,42 +0,0 @@ -[metadata] -name = odc-algo -description = Miscellaneous Algorithmic helper methods -version = attr: odc.algo._version.__version__ -author = Open Data Cube -author_email = -maintainer = Open Data Cube -maintainer_email = -long_description_content_type = text/markdown -long_description = file: README.md -platforms = any -license = Apache License 2.0 -url = https://github.com/opendatacube/odc-tools/ - -[options] -include_package_data = true -zip_safe = false -packages = find_namespace: -python_requires = >=3.9 -tests_require = pytest -install_requires = - affine - dask - dask_image - datacube>=1.8.5 - distributed - numexpr - numpy - rasterio>=1.3.2 - scikit-image - toolz - xarray - -[options.extras_require] -hdstats = hdstats>=0.1.7.post5 -s3 = - boto3 - odc-cloud - -[options.packages.find] -include = - odc* diff --git a/libs/algo/setup.py b/libs/algo/setup.py deleted file mode 100644 index 606849326..000000000 --- a/libs/algo/setup.py +++ /dev/null @@ -1,3 +0,0 @@ -from setuptools import setup - -setup() diff --git a/libs/algo/tests/test_dask.py b/libs/algo/tests/test_dask.py deleted file mode 100644 index 690bb2bbc..000000000 --- a/libs/algo/tests/test_dask.py +++ /dev/null @@ -1,190 +0,0 @@ -import dask.array as da -import numpy as np -import pytest -import toolz -from dask import delayed -from dask.distributed import Client -from odc.algo._dask import ( - _rechunk_2x2, - _stack_2d_np, - compute_chunk_range, - crop_2d_dense, - unpack_chunksize, - wait_for_future, -) - - -@delayed -def slow_compute(delay, value=None, fail=False): - import time - - if delay > 0: - time.sleep(delay) - if fail: - raise ValueError("Failing as requested") - return value - - -def test_wait_for_future(): - client = Client( - processes=False, n_workers=1, threads_per_worker=1, dashboard_address=None - ) - fut = client.compute(slow_compute(1)) - rr = list(wait_for_future(fut, 0.1)) - assert fut.done() - assert len(rr) > 1 - - # Check that exception doesn't leak out - fut = client.compute(slow_compute(1, fail=True)) - rr = list(wait_for_future(fut, 0.1)) - assert fut.done() - assert fut.status == "error" - assert len(rr) > 1 - print(fut) - - -def test_1(): - xx = da.random.uniform(0, 10, size=(16, 6), chunks=(4, 3)).astype("uint8") - yy = _rechunk_2x2(xx) - assert xx.dtype == yy.dtype - assert xx.shape == yy.shape - assert (xx.compute() == yy.compute()).all() - - -@pytest.mark.parametrize( - "chunk, n, expect", - [ - (4, 7, 
(4, 3)), - (3, 9, (3, 3, 3)), - (8, 8, (8,)), - (1, 3, (1, 1, 1)), - (10, 3, (3,)), - ], -) -def test_unpack_chunks(chunk, n, expect): - assert unpack_chunksize(chunk, n) == expect - - -@pytest.mark.parametrize( - "shape, block_shape", - [ - [(2, 3), (2, 2)], - [(3, 2), (1, 2)], - [(1, 2), (2, 3)], - [(1, 1), (2, 3)], - [(2, 3), (2, 3, 3)], - [(2, 3), (3, 2, 4)], - [(2, 3), (3, 2, 4, 1)], - ], -) -def test_stack2d_np(shape, block_shape, verbose=False): - aa = np.zeros((block_shape), dtype="int8") - - h, w = shape - seq = [aa + i for i in range(w * h)] - - expect = np.vstack([np.hstack(row) for row in toolz.partition_all(w, seq)]) - - cc = _stack_2d_np(shape, *seq) - - assert (cc == expect).all() - - if verbose: - print() - if cc.ndim == 2: - print(cc) - elif cc.ndim == 3: - print(cc[:, :, 0], f"x{cc.shape[2:]}") - else: - print(f"x{cc.shape}") - - -def test_stack2d_np_ndim(verbose=False): - shape = (4, 3) - h, w = shape - - aa = np.zeros((10, 2, 3, 3), dtype="int8") - seq = [aa + i for i in range(w * h)] - - cc = _stack_2d_np(shape, *seq, axis=1) - assert cc.shape == (10, 8, 9, 3) - if verbose: - print() - print(cc[0, :, :, 0]) - - -@pytest.mark.parametrize( - "span, chunks, summed, bspan, pspan", - [ - (np.s_[:], (4, 4), False, slice(0, 2), slice(0, 8)), - (np.s_[0:], (4, 4), False, slice(0, 2), slice(0, 8)), - (np.s_[0:-1], (4, 4), False, slice(0, 2), slice(0, 7)), - (np.s_[-1:], (4, 4), False, slice(1, 2), slice(3, 4)), - (np.s_[-4:], (4, 4), False, slice(1, 2), slice(0, 4)), - (np.s_[0:8], (4, 4), False, slice(0, 2), slice(0, 8)), - (np.s_[1:], (4, 4), False, slice(0, 2), slice(1, 8)), - (np.s_[1:4], (4, 4), False, slice(0, 1), slice(1, 4)), - (np.s_[:], (2, 4, 6, 11, 13), True, slice(0, 5), slice(0, 13)), - (np.s_[2:7], (2, 4, 6, 11, 13), True, slice(1, 4), slice(0, 5)), - (np.s_[3:7], (2, 4, 6, 11, 13), True, slice(1, 4), slice(1, 5)), - (np.s_[3:], (2, 4, 6, 11, 13), True, slice(1, 5), slice(1, 13 - 3 + 1)), - ], -) -def test_chunk_range(span, chunks, summed, bspan, pspan): - _bspan, _pspan = compute_chunk_range(span, chunks, summed) - assert _bspan == bspan - assert _pspan == pspan - - -@pytest.mark.parametrize( - "yx_roi", - [np.s_[:, :], np.s_[:1, :1], np.s_[3:, 1:], np.s_[3:-3, 1:-5], np.s_[3:-3, -5:]], -) -def test_crop_2d_dense(yx_roi): - # Y,X - xx = da.random.uniform(0, 10, size=(16, 6), chunks=(4, 3)).astype("uint8") - - yy = crop_2d_dense(xx, yx_roi) - assert xx.dtype == yy.dtype - assert yy.shape == xx[yx_roi].shape - assert yy.shape == yy.chunksize - - assert (xx[yx_roi].compute() == yy.compute()).all() - - # Y, X, Band - xx = da.random.uniform(0, 10, size=(16, 6, 4), chunks=(4, 3, 4)).astype("uint8") - - yy = crop_2d_dense(xx, yx_roi) - _roi = (*yx_roi, np.s_[:]) - assert xx.dtype == yy.dtype - assert yy.shape == xx[_roi].shape - assert yy.shape[:2] == yy.chunksize[:2] - - assert (xx[_roi].compute() == yy.compute()).all() - - # Time, Y, X - xx = da.random.uniform(0, 10, size=(5, 16, 6), chunks=(1, 4, 3)).astype("uint8") - - yy = crop_2d_dense(xx, yx_roi, axis=1) - _roi = (np.s_[:], *yx_roi) - assert xx.dtype == yy.dtype - assert yy.shape == xx[_roi].shape - assert yy.shape[1:3] == yy.chunksize[1:3] - assert yy.chunksize[0] == xx.chunksize[0] - - assert (xx[_roi].compute() == yy.compute()).all() - - # Time, Y, X, Band - xx = da.random.uniform(0, 10, size=(5, 16, 6, 3), chunks=(1, 4, 3, 3)).astype( - "uint8" - ) - - yy = crop_2d_dense(xx, yx_roi, axis=1) - _roi = (np.s_[:], *yx_roi, np.s_[:]) - assert xx.dtype == yy.dtype - assert yy.shape == xx[_roi].shape - assert 
yy.shape[1:3] == yy.chunksize[1:3] - assert yy.chunksize[0] == xx.chunksize[0] - assert yy.chunksize[-1] == xx.chunksize[-1] - - assert (xx[_roi].compute() == yy.compute()).all() diff --git a/libs/algo/tests/test_grouper.py b/libs/algo/tests/test_grouper.py deleted file mode 100644 index 4a598508e..000000000 --- a/libs/algo/tests/test_grouper.py +++ /dev/null @@ -1,67 +0,0 @@ -import pytest -from odc.algo._grouper import group_by_nothing, key2num, mid_longitude, solar_offset - -from datacube.testutils import mk_sample_dataset -from datacube.utils import geometry as geom - - -@pytest.mark.parametrize("lon,lat", [(0, 10), (100, -10), (-120, 30)]) -def test_mid_lon(lon, lat): - r = 0.1 - rect = geom.box(lon - r, lat - r, lon + r, lat + r, "epsg:4326") - assert rect.centroid.coords[0] == pytest.approx((lon, lat)) - - assert mid_longitude(rect) == pytest.approx(lon) - assert mid_longitude(rect.to_crs("epsg:3857")) == pytest.approx(lon) - - offset = solar_offset(rect, "h") - assert offset.seconds % (60 * 60) == 0 - - offset_sec = solar_offset(rect, "s") - assert abs((offset - offset_sec).seconds) <= 60 * 60 - - -@pytest.mark.parametrize( - "input,expect", - [ - ("ABAAC", [0, 1, 0, 0, 2]), - ("B", [0]), - ([1, 1, 1], [0, 0, 0]), - ("ABCC", [0, 1, 2, 2]), - ], -) -def test_key2num(input, expect): - rr = list(key2num(input)) - assert rr == expect - - reverse = {} - rr = list(key2num(input, reverse)) - assert rr == expect - assert set(reverse.keys()) == set(range(len(set(input)))) - assert set(reverse.values()) == set(input) - # first entry always gets an index of 0 - assert reverse[0] == input[0] - - -@pytest.fixture -def sample_geobox(): - yield geom.GeoBox.from_geopolygon( - geom.box(-10, -20, 11, 22, "epsg:4326"), resolution=(-1, 1) - ) - - -@pytest.fixture -def sample_ds(sample_geobox): - yield mk_sample_dataset([dict(name="red")], geobox=sample_geobox) - - -def test_grouper(sample_ds): - xx = group_by_nothing([sample_ds]) - assert xx.values[0] == (sample_ds,) - assert xx.uuid.values[0] == sample_ds.id - - xx = group_by_nothing([sample_ds, sample_ds], solar_offset(sample_ds.extent)) - assert xx.values[0] == (sample_ds,) - assert xx.values[0] == (sample_ds,) - assert xx.uuid.values[1] == sample_ds.id - assert xx.uuid.values[1] == sample_ds.id diff --git a/libs/algo/tests/test_io.py b/libs/algo/tests/test_io.py deleted file mode 100644 index 18df758b4..000000000 --- a/libs/algo/tests/test_io.py +++ /dev/null @@ -1,41 +0,0 @@ -import pytest -from odc.algo.io import choose_transform_path - - -@pytest.mark.xfail(reason="Fragile test code, I think between different GDAL versions") -@pytest.mark.parametrize("transform_code", [None, "EPSG:9688", "EPSG:1150"]) -@pytest.mark.parametrize("area_of_interest", [None, [-180, -90, 180, 90]]) -def test_choose_transform_path(transform_code, area_of_interest): - src_crs = "EPSG:32649" - dst_crs = "EPSG:3577" - proj_str = { - "9688": "+proj=pipeline +step +inv +proj=utm +zone=49 +ellps=WGS84 " - "+step +proj=push +v_3 " - "+step +proj=cart +ellps=WGS84 " - "+step +inv +proj=helmert +x=0.06155 +y=-0.01087 +z=-0.04019 " - "+rx=-0.0394924 +ry=-0.0327221 +rz=-0.0328979 +s=-0.009994 " - "+convention=coordinate_frame " - "+step +inv +proj=cart +ellps=GRS80 " - "+step +proj=pop +v_3 " - "+step +proj=aea +lat_0=0 +lon_0=132 +lat_1=-18 +lat_2=-36 " - "+x_0=0 +y_0=0 +ellps=GRS80", - "1150": "+proj=pipeline +step +inv +proj=utm +zone=49 +ellps=WGS84 " - "+step +proj=aea +lat_0=0 +lon_0=132 +lat_1=-18 +lat_2=-36 " - "+x_0=0 +y_0=0 +ellps=GRS80", - } - if transform_code is 
None and area_of_interest is None: - assert ( - choose_transform_path(src_crs, dst_crs, transform_code, area_of_interest) - == {} - ) - elif area_of_interest is None: - with pytest.raises(ValueError): - choose_transform_path(src_crs, dst_crs, transform_code, area_of_interest) - elif transform_code is None: - assert choose_transform_path( - src_crs, dst_crs, transform_code, area_of_interest - ) == {"COORDINATE_OPERATION": proj_str.get("9688")} - else: - assert choose_transform_path( - src_crs, dst_crs, transform_code, area_of_interest - ) == {"COORDINATE_OPERATION": proj_str.get(transform_code.split(":")[1], "")} diff --git a/libs/algo/tests/test_masking.py b/libs/algo/tests/test_masking.py deleted file mode 100644 index 72e7743dd..000000000 --- a/libs/algo/tests/test_masking.py +++ /dev/null @@ -1,274 +0,0 @@ -import dask -import dask.array as da -import numpy as np -import pytest -import xarray as xr -from odc.algo._masking import ( - _enum_to_mask_numexpr, - _fuse_mean_np, - _gap_fill_np, - _get_enum_values, - enum_to_bool, - fmask_to_bool, - gap_fill, - mask_cleanup_np, -) - - -def test_gap_fill(): - a = np.zeros((5,), dtype="uint8") - b = np.empty_like(a) - b[:] = 33 - - a[0] = 11 - ab = _gap_fill_np(a, b, 0) - assert ab.dtype == a.dtype - assert ab.tolist() == [11, 33, 33, 33, 33] - - xa = xr.DataArray( - a, - name="test_a", - dims=("t",), - attrs={"p1": 1, "nodata": 0}, - coords=dict(t=np.arange(a.shape[0])), - ) - xb = xa + 0 - xb.data[:] = b - xab = gap_fill(xa, xb) - assert xab.name == xa.name - assert xab.attrs == xa.attrs - assert xab.data.tolist() == [11, 33, 33, 33, 33] - - xa.attrs["nodata"] = 11 - assert gap_fill(xa, xb).data.tolist() == [33, 0, 0, 0, 0] - - a = np.zeros((5,), dtype="float32") - a[1:] = np.nan - b = np.empty_like(a) - b[:] = 33 - ab = _gap_fill_np(a, b, np.nan) - - assert ab.dtype == a.dtype - assert ab.tolist() == [0, 33, 33, 33, 33] - - xa = xr.DataArray( - a, - name="test_a", - dims=("t",), - attrs={"p1": 1}, - coords=dict(t=np.arange(a.shape[0])), - ) - xb = xa + 0 - xb.data[:] = b - xab = gap_fill(xa, xb) - assert xab.name == xa.name - assert xab.attrs == xa.attrs - assert xab.data.tolist() == [0, 33, 33, 33, 33] - - xa = xr.DataArray( - da.from_array(a), - name="test_a", - dims=("t",), - attrs={"p1": 1}, - coords=dict(t=np.arange(a.shape[0])), - ) - - xb = xr.DataArray( - da.from_array(b), - name="test_a", - dims=("t",), - attrs={"p1": 1}, - coords=dict(t=np.arange(b.shape[0])), - ) - - assert dask.is_dask_collection(xa) - assert dask.is_dask_collection(xb) - xab = gap_fill(xa, xb) - - assert dask.is_dask_collection(xab) - assert xab.name == xa.name - assert xab.attrs == xa.attrs - assert xab.compute().values.tolist() == [0, 33, 33, 33, 33] - - -def test_fmask_to_bool(): - def _fake_flags(prefix="cat_", n=65): - return dict( - bits=list(range(8)), values={str(i): f"{prefix}{i}" for i in range(0, n)} - ) - - flags_definition = dict(fmask=_fake_flags()) - - fmask = xr.DataArray( - np.arange(0, 65, dtype="uint8"), attrs=dict(flags_definition=flags_definition) - ) - - mm = fmask_to_bool(fmask, ("cat_1", "cat_3")) - (ii,) = np.where(mm) - assert tuple(ii) == (1, 3) - - # upcast to uint16 internally - mm = fmask_to_bool(fmask, ("cat_0", "cat_15")) - (ii,) = np.where(mm) - assert tuple(ii) == (0, 15) - - # upcast to uint32 internally - mm = fmask_to_bool(fmask, ("cat_1", "cat_3", "cat_31")) - (ii,) = np.where(mm) - assert tuple(ii) == (1, 3, 31) - - # upcast to uint64 internally - mm = fmask_to_bool(fmask, ("cat_0", "cat_32", "cat_37", "cat_63")) - (ii,) = 
np.where(mm) - assert tuple(ii) == (0, 32, 37, 63) - - with pytest.raises(ValueError): - fmask_to_bool(fmask, ("cat_64")) - - mm = fmask_to_bool(fmask.chunk(3), ("cat_0",)).compute() - (ii,) = np.where(mm) - assert tuple(ii) == (0,) - - mm = fmask_to_bool(fmask.chunk(3), ("cat_31", "cat_63")).compute() - (ii,) = np.where(mm) - assert tuple(ii) == (31, 63) - - # check _get_enum_values - flags_definition = dict(cat=_fake_flags("cat_"), dog=_fake_flags("dog_")) - assert _get_enum_values(("cat_0",), flags_definition) == (0,) - assert _get_enum_values(("cat_0", "cat_12"), flags_definition) == (0, 12) - assert _get_enum_values(("dog_0", "dog_13"), flags_definition) == (0, 13) - assert _get_enum_values(("dog_0", "dog_13"), flags_definition, flag="dog") == ( - 0, - 13, - ) - - with pytest.raises(ValueError) as e: - _get_enum_values(("cat_10", "_nope"), flags_definition) - assert "Can not find flags definitions" in str(e) - - with pytest.raises(ValueError) as e: - _get_enum_values(("cat_10", "bah", "dog_0"), flags_definition, flag="dog") - assert "cat_10" in str(e) - - -def test_enum_to_mask(): - nmax = 129 - - def _fake_flags(prefix="cat_", n=nmax + 1): - return dict( - bits=list(range(8)), values={str(i): f"{prefix}{i}" for i in range(0, n)} - ) - - flags_definition = dict(fmask=_fake_flags()) - - fmask_no_flags = xr.DataArray(np.arange(0, nmax + 1, dtype="uint16")) - fmask = xr.DataArray( - np.arange(0, nmax + 1, dtype="uint16"), - attrs=dict(flags_definition=flags_definition), - ) - - mm = enum_to_bool(fmask, ("cat_1", "cat_3", nmax, 33)) - (ii,) = np.where(mm) - assert tuple(ii) == (1, 3, 33, nmax) - - mm = enum_to_bool(fmask, (0, 3, 17)) - (ii,) = np.where(mm) - assert tuple(ii) == (0, 3, 17) - - mm = enum_to_bool(fmask_no_flags, (0, 3, 17)) - (ii,) = np.where(mm) - assert tuple(ii) == (0, 3, 17) - assert mm.dtype == "bool" - - mm = enum_to_bool(fmask_no_flags, (0, 3, 8, 17), dtype="uint8", value_true=255) - (ii,) = np.where(mm == 255) - assert tuple(ii) == (0, 3, 8, 17) - assert mm.dtype == "uint8" - - mm = enum_to_bool( - fmask_no_flags, (0, 3, 8, 17), dtype="uint8", value_true=255, invert=True - ) - (ii,) = np.where(mm != 255) - assert tuple(ii) == (0, 3, 8, 17) - assert mm.dtype == "uint8" - - -def test_enum_to_mask_numexpr(): - elements = (1, 4, 23) - mm = np.asarray([1, 2, 3, 4, 5, 23], dtype="uint8") - - np.testing.assert_array_equal( - _enum_to_mask_numexpr(mm, elements), np.isin(mm, elements) - ) - np.testing.assert_array_equal( - _enum_to_mask_numexpr(mm, elements, invert=True), - np.isin(mm, elements, invert=True), - ) - - bb8 = _enum_to_mask_numexpr(mm, elements, dtype="uint8", value_true=255) - assert bb8.dtype == "uint8" - - np.testing.assert_array_equal( - _enum_to_mask_numexpr(mm, elements, dtype="uint8", value_true=255) == 255, - np.isin(mm, elements), - ) - - -def test_fuse_mean_np(): - data = np.array( - [ - [[255, 255], [255, 50]], - [[30, 40], [255, 80]], - [[25, 52], [255, 98]], - ] - ).astype(np.uint8) - - slices = [data[i : i + 1] for i in range(data.shape[0])] - out = _fuse_mean_np(*slices, nodata=255) - assert (out == np.array([[28, 46], [255, 76]])).all() - - -def test_mask_cleanup_np(): - mask = np.ndarray( - shape=(2, 2), dtype=bool, buffer=np.array([[True, False], [False, True]]) - ) - - mask_filter_with_opening_dilation = [("opening", 1), ("dilation", 1)] - result = mask_cleanup_np(mask, mask_filter_with_opening_dilation) - expected_result = np.array( - [[False, False], [False, False]], - ) - assert (result == expected_result).all() - - mask_filter_opening = 
[("opening", 1), ("dilation", 0)] - result = mask_cleanup_np(mask, mask_filter_opening) - expected_result = np.array( - [[False, False], [False, False]], - ) - assert (result == expected_result).all() - - mask_filter_with_dilation = [("opening", 0), ("dilation", 1)] - result = mask_cleanup_np(mask, mask_filter_with_dilation) - expected_result = np.array( - [[True, True], [True, True]], - ) - assert (result == expected_result).all() - - mask_filter_with_closing = [("closing", 1), ("opening", 1), ("dilation", 1)] - result = mask_cleanup_np(mask, mask_filter_with_closing) - expected_result = np.array( - [[True, True], [True, True]], - ) - assert (result == expected_result).all() - - mask_filter_with_all_zero = [("closing", 0), ("opening", 0), ("dilation", 0)] - result = mask_cleanup_np(mask, mask_filter_with_all_zero) - expected_result = np.array( - [[True, False], [False, True]], - ) - assert (result == expected_result).all() - - invalid_mask_filter = [("oppening", 1), ("dilation", 1)] - with pytest.raises(Exception): - mask_cleanup_np(mask, invalid_mask_filter) diff --git a/libs/algo/tests/test_memsink.py b/libs/algo/tests/test_memsink.py deleted file mode 100644 index f4085d52a..000000000 --- a/libs/algo/tests/test_memsink.py +++ /dev/null @@ -1,146 +0,0 @@ -import dask -import dask.array as da -import numpy as np -import xarray as xr -from odc.algo._memsink import ( - Cache, - CachedArray, - Token, - _da_from_mem, - da_mem_sink, - da_yxbt_sink, - da_yxt_sink, - yxt_sink, -) - - -def test_cache(): - k = Cache.new((5,), "uint8") - assert isinstance(k, Token) - xx = Cache.get(k) - assert xx.shape == (5,) - assert xx.dtype == "uint8" - assert Cache.get(k) is xx - assert Cache.get("some bad key") is None - assert Cache.pop(k) is xx - assert Cache.get(k) is None - - -def test_cached_array(): - ds = CachedArray.new((100, 200), "uint16") - xx = ds.data - assert xx.shape == (100, 200) - assert xx.dtype == "uint16" - assert ds.data is xx - - ds[:] = 0x1020 - assert (xx == 0x1020).all() - - ds2 = ds[:10, :20] - assert ds2.data.shape == (10, 20) - ds2[:, :] = 133 - assert (ds.data[:10, :20] == ds2.data).all() - assert (ds.data[:10, :20] == 133).all() - - ds.release() - - -def test_da_from_mem(): - shape = (100, 200) - chunks = (10, 101) - xx = (np.random.uniform(size=shape) * 1000).astype("uint16") - - k = Cache.put(xx) - yy = _da_from_mem( - dask.delayed(str(k)), xx.shape, xx.dtype, chunks=chunks, name="yy" - ) - assert yy.name.startswith("yy-") - assert yy.shape == xx.shape - assert yy.dtype == xx.dtype - assert yy.chunks[1] == (101, 99) - - assert (yy.compute() == xx).all() - - assert (yy[:3, :5].compute() == xx[:3, :5]).all() - - -def test_cache_dask_new(): - tk = Cache.dask_new((10, 10), "float32", "jj") - assert dask.is_dask_collection(tk) - assert tk.key.startswith("jj-") - - -def test_da_to_mem(): - xx = da.random.uniform(size=(10, 20), chunks=(5, 4)) - yy = da_mem_sink(xx, chunks=(-1, -1), name="yy") - - assert dask.is_dask_collection(yy) - assert xx.shape == yy.shape - assert xx.dtype == yy.dtype - - _yy = yy.compute() - _xx = xx.compute() - assert (_xx == _yy).all() - - -def test_yxbt_sink(): - NT, NY, NX = 3, 10, 20 - NB = 2 - aa = da.random.uniform(size=(NT, NY, NX), chunks=(1, 5, 4)) - bb = da.random.uniform(size=(NT, NY, NX), chunks=(1, 5, 4)) - - yxbt = da_yxbt_sink((aa, bb), (5, 5, -1, -1)) - assert yxbt.chunksize == (5, 5, NB, NT) - assert yxbt.shape == (NY, NX, NB, NT) - assert yxbt.dtype == aa.dtype - - _yxbt = yxbt.compute() - _aa = aa.compute() - _bb = bb.compute() - for t_idx 
in range(NT): - assert (_yxbt[:, :, 0, t_idx] == _aa[t_idx]).all() - assert (_yxbt[:, :, 1, t_idx] == _bb[t_idx]).all() - - -def test_da_yxt_sink(): - NT, NY, NX = 3, 10, 20 - aa = da.random.uniform(size=(NT, NY, NX), chunks=(1, 5, 4)) - - yxt = da_yxt_sink(aa, (5, 5, -1)) - assert yxt.chunksize == (5, 5, NT) - assert yxt.shape == (NY, NX, NT) - assert yxt.dtype == aa.dtype - - _yxt = yxt.compute() - _aa = aa.compute() - for t_idx in range(NT): - assert (_yxt[:, :, t_idx] == _aa[t_idx]).all() - - -def test_yxt_sink(): - NT, NY, NX = 3, 10, 20 - - data = da.random.uniform(size=(NT, NY, NX), chunks=(1, 5, 4)) - aa = xr.DataArray( - data=data, - dims=("time", "y", "x"), - coords={ - "time": np.arange(0, NT), - "x": np.arange(0, NX), - "y": np.arange(0, NY), - }, - ) - - yxt = yxt_sink(aa, (5, 5, -1)) - assert yxt.data.chunksize == (5, 5, NT) - assert yxt.shape == (NY, NX, NT) - assert yxt.dtype == aa.dtype - - assert (yxt.x == aa.x).all() - assert (yxt.y == aa.y).all() - assert (yxt.time == aa.time).all() - - _yxt = yxt.compute() - _aa = aa.compute() - for t_idx in range(NT): - assert (_yxt[:, :, t_idx] == _aa[t_idx]).all() diff --git a/libs/algo/tests/test_numeric.py b/libs/algo/tests/test_numeric.py deleted file mode 100644 index 5e9069f85..000000000 --- a/libs/algo/tests/test_numeric.py +++ /dev/null @@ -1,57 +0,0 @@ -import numpy as np -import pytest -from odc.algo._numeric import ( - half_up, - np_slice_to_idx, - roi_shrink2, - roundup16, - shape_shrink2, -) - - -def test_utils(): - assert half_up(4) == 2 - assert half_up(5) == 3 - for i in (1, 127, 24, 8889, 101010): - assert half_up(i * 2) == i - assert half_up(i * 2 + 1) == i + 1 - - assert roundup16(16) == 16 - assert roundup16(17) == 32 - assert roundup16(31) == 32 - assert roundup16(32) == 32 - - assert np_slice_to_idx(np.s_[:], 3) == (0, 1, 2) - assert np_slice_to_idx(np.s_[:3], 10) == (0, 1, 2) - assert np_slice_to_idx(np.s_[-1:], 10) == (9,) - assert np_slice_to_idx(np.s_[0], 10) == (0,) - assert np_slice_to_idx(np.s_[5], 10) == (5,) - - -@pytest.mark.parametrize( - "shape,axis,expect", - [ - ((4, 2), 0, (2, 1)), - ((5, 2), 0, (3, 1)), - ((6, 2), 0, (3, 1)), - ((7, 9, 3), 0, (4, 5, 3)), - ((7, 9, 3), 1, (7, 5, 2)), - ((10, 16, 8, 3), 1, (10, 8, 4, 3)), - ], -) -def test_shape_shrink2(shape, axis, expect): - assert shape_shrink2(shape, axis=axis) == expect - - -@pytest.mark.parametrize( - "roi,axis,expect", - [ - (np.s_[10:20, 20:30, :], 0, np.s_[5:10, 10:15, :]), - (np.s_[:, :], 0, np.s_[:, :]), - (np.s_[10:20, :30, :], 0, np.s_[5:10, :15, :]), - (np.s_[:, 10:20, 20:30, :], 1, np.s_[:, 5:10, 10:15, :]), - (np.s_[:, 10:21, 20:31, :], 1, np.s_[:, 5:11, 10:16, :]), - ], -) -def test_roi_shrink2(roi, axis, expect): - assert roi_shrink2(roi, axis=axis) == expect diff --git a/libs/algo/tests/test_percentile.py b/libs/algo/tests/test_percentile.py deleted file mode 100644 index 322c82cb8..000000000 --- a/libs/algo/tests/test_percentile.py +++ /dev/null @@ -1,136 +0,0 @@ -import dask.array as da -import numpy as np -import pytest -import xarray as xr -from odc.algo._percentile import np_percentile, xr_quantile, xr_quantile_bands - - -def test_np_percentile(): - arr = np.array( - [[0, 1, 4, 6, 8, 10, 15, 22, 25, 27], [3, 5, 6, 8, 9, 11, 15, 28, 31, 50]] - ) - - np.random.shuffle(arr[0, :]) - np.random.shuffle(arr[1, :]) - arr = arr.transpose() - - assert (np_percentile(arr, 0.5, 255) == np.array([8, 9])).all() - assert (np_percentile(arr, 0.7, 255) == np.array([15, 15])).all() - assert (np_percentile(arr, 1.0, 255) == np.array([27, 
50])).all() - assert (np_percentile(arr, 0.0, 255) == np.array([0, 3])).all() - - -@pytest.mark.parametrize("nodata", [255, 200, np.nan, -1]) -def test_np_percentile_some_bad_data(nodata): - arr = np.array( - [ - [0, 1, 4, 6, 8, nodata, nodata, nodata, nodata, nodata], - [3, 5, 6, 8, 9, 11, 15, 28, 31, 50], - ] - ) - - np.random.shuffle(arr[0, :]) - np.random.shuffle(arr[1, :]) - arr = arr.transpose() - - assert (np_percentile(arr, 0.5, nodata) == np.array([4, 9])).all() - assert (np_percentile(arr, 0.7, nodata) == np.array([6, 15])).all() - assert (np_percentile(arr, 1.0, nodata) == np.array([8, 50])).all() - assert (np_percentile(arr, 0.0, nodata) == np.array([0, 3])).all() - - -@pytest.mark.parametrize("nodata", [255, 200, np.nan]) -def test_np_percentile_bad_data(nodata): - arr = np.array( - [ - [0, 1, nodata, nodata, nodata, nodata, nodata, nodata, nodata, nodata], - [3, 5, 6, 8, 9, 11, 15, 28, 31, 50], - ] - ) - - np.random.shuffle(arr[0, :]) - np.random.shuffle(arr[1, :]) - arr = arr.transpose() - - np.testing.assert_equal(np_percentile(arr, 0.5, nodata), np.array([nodata, 9])) - np.testing.assert_equal(np_percentile(arr, 0.7, nodata), np.array([nodata, 15])) - np.testing.assert_equal(np_percentile(arr, 1.0, nodata), np.array([nodata, 50])) - np.testing.assert_equal(np_percentile(arr, 0.0, nodata), np.array([nodata, 3])) - - -@pytest.mark.parametrize("nodata", [255, 200, np.nan, -1]) -@pytest.mark.parametrize("use_dask", [False, True]) -def test_xr_quantile_bands(nodata, use_dask): - band_1 = np.random.randint(0, 100, size=(10, 100, 200)).astype(type(nodata)) - band_2 = np.random.randint(0, 100, size=(10, 100, 200)).astype(type(nodata)) - - band_1[np.random.random(size=band_1.shape) > 0.5] = nodata - band_2[np.random.random(size=band_1.shape) > 0.5] = nodata - - true_results = dict() - true_results["band_1_pc_20"] = np_percentile(band_1, 0.2, nodata) - true_results["band_2_pc_20"] = np_percentile(band_2, 0.2, nodata) - true_results["band_1_pc_60"] = np_percentile(band_1, 0.6, nodata) - true_results["band_2_pc_60"] = np_percentile(band_2, 0.6, nodata) - - if use_dask: - band_1 = da.from_array(band_1, chunks=(2, 20, 20)) - band_2 = da.from_array(band_2, chunks=(2, 20, 20)) - - attrs = {"test": "attrs"} - coords = { - "x": np.linspace(10, 20, band_1.shape[2]), - "y": np.linspace(0, 5, band_1.shape[1]), - "t": np.linspace(0, 5, band_1.shape[0]), - } - - data_vars = { - "band_1": (("t", "y", "x"), band_1), - "band_2": (("t", "y", "x"), band_2), - } - - dataset = xr.Dataset(data_vars=data_vars, coords=coords, attrs=attrs) - output = xr_quantile_bands(dataset, [0.2, 0.6], nodata).compute() - - for band in output.keys(): - np.testing.assert_equal(output[band], true_results[band]) - - -@pytest.mark.parametrize("nodata", [255, 200, np.nan, -1]) -@pytest.mark.parametrize("use_dask", [False, True]) -def test_xr_quantile(nodata, use_dask): - band_1 = np.random.randint(0, 100, size=(10, 100, 200)).astype(type(nodata)) - band_2 = np.random.randint(0, 100, size=(10, 100, 200)).astype(type(nodata)) - - band_1[np.random.random(size=band_1.shape) > 0.5] = nodata - band_2[np.random.random(size=band_1.shape) > 0.5] = nodata - - true_results = dict() - true_results["band_1"] = np.stack( - [np_percentile(band_1, 0.2, nodata), np_percentile(band_1, 0.6, nodata)], axis=0 - ) - true_results["band_2"] = np.stack( - [np_percentile(band_2, 0.2, nodata), np_percentile(band_2, 0.6, nodata)], axis=0 - ) - - if use_dask: - band_1 = da.from_array(band_1, chunks=(2, 20, 20)) - band_2 = da.from_array(band_2, 
chunks=(2, 20, 20)) - - attrs = {"test": "attrs"} - coords = { - "x": np.linspace(10, 20, band_1.shape[2]), - "y": np.linspace(0, 5, band_1.shape[1]), - "t": np.linspace(0, 5, band_1.shape[0]), - } - - data_vars = { - "band_1": xr.DataArray(band_1, dims=("t", "y", "x"), attrs={"test_attr": 1}), - "band_2": xr.DataArray(band_2, dims=("t", "y", "x"), attrs={"test_attr": 2}), - } - - dataset = xr.Dataset(data_vars=data_vars, coords=coords, attrs=attrs) - output = xr_quantile(dataset, [0.2, 0.6], nodata).compute() - - for band in output.keys(): - np.testing.assert_equal(output[band], true_results[band]) diff --git a/libs/algo/tests/test_warp.py b/libs/algo/tests/test_warp.py deleted file mode 100644 index 456e5a84b..000000000 --- a/libs/algo/tests/test_warp.py +++ /dev/null @@ -1,8 +0,0 @@ -import numpy as np -from odc.algo._warp import _shrink2 - - -def test_shrink2_smoke_test(): - assert _shrink2(np.zeros((15, 17, 3), dtype="uint8")).shape == (8, 9, 3) - assert _shrink2(np.zeros((15, 17), dtype="uint8")).shape == (8, 9) - assert _shrink2(np.zeros((2, 15, 17), dtype="uint8"), axis=1).shape == (2, 8, 9) diff --git a/tests/test-env-py39.yml b/tests/test-env-py39.yml index 606c7cc70..3dbcc0ad5 100644 --- a/tests/test-env-py39.yml +++ b/tests/test-env-py39.yml @@ -13,13 +13,6 @@ dependencies: - datacube>=1.8.15 - sqlalchemy<2.0.0 - # odc.algo - - dask-image - - numexpr - - scikit-image - - scipy - - toolz - # odc.ui - ipywidgets>=8.0 - ipyleaflet @@ -61,8 +54,5 @@ dependencies: - thredds-crawler - rio-stac - # odc.algo optional dependency - - hdstats - # odc.ui - jupyter-ui-poll>=0.2.0a From a8d774f42c98366d88bc551f78cff8d08d17bbef Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 13 Nov 2023 17:04:16 +0000 Subject: [PATCH 20/26] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/adrienverge/yamllint: v1.32.0 → v1.33.0](https://github.com/adrienverge/yamllint/compare/v1.32.0...v1.33.0) - [github.com/psf/black: 23.10.0 → 23.11.0](https://github.com/psf/black/compare/23.10.0...23.11.0) --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 301e3996d..e560ee032 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ --- repos: - repo: https://github.com/adrienverge/yamllint - rev: v1.32.0 + rev: v1.33.0 hooks: - id: yamllint args: ['-c', '.yamllint'] @@ -26,7 +26,7 @@ repos: # name: isort (python) # args: [ "--profile", "black", "--filter-files" ] - repo: https://github.com/psf/black - rev: 23.10.0 + rev: 23.11.0 hooks: - id: black - repo: https://github.com/pre-commit/mirrors-pylint From 573d4fd0a40cb2ca5fa481f14552664169036b9b Mon Sep 17 00:00:00 2001 From: Paul Haesler Date: Wed, 15 Nov 2023 12:02:06 +1100 Subject: [PATCH 21/26] Give all packages a _version.py file for consistency, and bump all version numbers. 
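The pattern applied across the packages below keeps each version string in a
dedicated `_version.py` and re-exports it from the package `__init__.py`.
A minimal sketch of the intended layout (paths abbreviated for illustration;
the relative-import form and the `# noqa: F401` marker are settled in the
follow-up commits):

```python
# odc/<pkg>/_version.py -- single source of truth for the package version
__version__ = "0.2.3"

# odc/<pkg>/__init__.py -- re-export so odc.<pkg>.__version__ keeps working
from ._version import __version__  # noqa: F401
```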
--- apps/cloud/odc/apps/cloud/__init__.py | 2 +- apps/cloud/odc/apps/cloud/_version.py | 1 + apps/dc_tools/odc/apps/dc_tools/_version.py | 2 +- libs/cloud/odc/cloud/__init__.py | 2 +- libs/cloud/odc/cloud/_version.py | 1 + libs/io/odc/io/__init__.py | 2 +- libs/io/odc/io/_version.py | 1 + libs/ui/odc/ui/_version.py | 2 +- 8 files changed, 8 insertions(+), 5 deletions(-) create mode 100644 apps/cloud/odc/apps/cloud/_version.py create mode 100644 libs/cloud/odc/cloud/_version.py create mode 100644 libs/io/odc/io/_version.py diff --git a/apps/cloud/odc/apps/cloud/__init__.py b/apps/cloud/odc/apps/cloud/__init__.py index b5fdc7530..6a5649080 100644 --- a/apps/cloud/odc/apps/cloud/__init__.py +++ b/apps/cloud/odc/apps/cloud/__init__.py @@ -1 +1 @@ -__version__ = "0.2.2" +from _version import __version__ diff --git a/apps/cloud/odc/apps/cloud/_version.py b/apps/cloud/odc/apps/cloud/_version.py new file mode 100644 index 000000000..d31c31eae --- /dev/null +++ b/apps/cloud/odc/apps/cloud/_version.py @@ -0,0 +1 @@ +__version__ = "0.2.3" diff --git a/apps/dc_tools/odc/apps/dc_tools/_version.py b/apps/dc_tools/odc/apps/dc_tools/_version.py index f3291e93b..ddc77a880 100644 --- a/apps/dc_tools/odc/apps/dc_tools/_version.py +++ b/apps/dc_tools/odc/apps/dc_tools/_version.py @@ -1 +1 @@ -__version__ = "0.2.14" +__version__ = "0.2.15" diff --git a/libs/cloud/odc/cloud/__init__.py b/libs/cloud/odc/cloud/__init__.py index d31c31eae..6a5649080 100644 --- a/libs/cloud/odc/cloud/__init__.py +++ b/libs/cloud/odc/cloud/__init__.py @@ -1 +1 @@ -__version__ = "0.2.3" +from _version import __version__ diff --git a/libs/cloud/odc/cloud/_version.py b/libs/cloud/odc/cloud/_version.py new file mode 100644 index 000000000..788da1fb3 --- /dev/null +++ b/libs/cloud/odc/cloud/_version.py @@ -0,0 +1 @@ +__version__ = "0.2.4" diff --git a/libs/io/odc/io/__init__.py b/libs/io/odc/io/__init__.py index d998e9c97..6bc906ef1 100644 --- a/libs/io/odc/io/__init__.py +++ b/libs/io/odc/io/__init__.py @@ -4,7 +4,7 @@ from .text import parse_mtl, parse_yaml, read_stdin_lines, slurp, slurp_lines from .timer import RateEstimator -__version__ = "0.2.1" +from _version import __version__ __all__ = ( "parse_yaml", diff --git a/libs/io/odc/io/_version.py b/libs/io/odc/io/_version.py new file mode 100644 index 000000000..b5fdc7530 --- /dev/null +++ b/libs/io/odc/io/_version.py @@ -0,0 +1 @@ +__version__ = "0.2.2" diff --git a/libs/ui/odc/ui/_version.py b/libs/ui/odc/ui/_version.py index 5213695f5..3ced3581b 100644 --- a/libs/ui/odc/ui/_version.py +++ b/libs/ui/odc/ui/_version.py @@ -1 +1 @@ -__version__ = "0.2.0a3" +__version__ = "0.2.1" From 4ac901df7a73409bd6b7ac2a5f36792d83bd1407 Mon Sep 17 00:00:00 2001 From: Paul Haesler Date: Wed, 15 Nov 2023 12:07:19 +1100 Subject: [PATCH 22/26] Placate flake8. I'm doing this purposefully, sir. 
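flake8's F401 check flags names that are imported but never referenced; the
`__version__` import below exists purely as a re-export, so it is annotated
rather than removed. The intent, as a one-line sketch (the relative import
path and the `# noqa` spacing are corrected in the next two commits):

```python
from ._version import __version__  # noqa: F401  (intentional re-export, not an unused import)
```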
--- apps/cloud/odc/apps/cloud/__init__.py | 2 +- libs/cloud/odc/cloud/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/cloud/odc/apps/cloud/__init__.py b/apps/cloud/odc/apps/cloud/__init__.py index 6a5649080..76f2b73d0 100644 --- a/apps/cloud/odc/apps/cloud/__init__.py +++ b/apps/cloud/odc/apps/cloud/__init__.py @@ -1 +1 @@ -from _version import __version__ +from _version import __version__ #noqa: F401 diff --git a/libs/cloud/odc/cloud/__init__.py b/libs/cloud/odc/cloud/__init__.py index 6a5649080..76f2b73d0 100644 --- a/libs/cloud/odc/cloud/__init__.py +++ b/libs/cloud/odc/cloud/__init__.py @@ -1 +1 @@ -from _version import __version__ +from _version import __version__ #noqa: F401 From 291cd535921eb0b1f4ff2853d47e41f3f082fa29 Mon Sep 17 00:00:00 2001 From: Paul Haesler Date: Wed, 15 Nov 2023 12:10:34 +1100 Subject: [PATCH 23/26] Import from ._version, not _version. --- apps/cloud/odc/apps/cloud/__init__.py | 2 +- libs/cloud/odc/cloud/__init__.py | 2 +- libs/io/odc/io/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/cloud/odc/apps/cloud/__init__.py b/apps/cloud/odc/apps/cloud/__init__.py index 76f2b73d0..0bcedc1fc 100644 --- a/apps/cloud/odc/apps/cloud/__init__.py +++ b/apps/cloud/odc/apps/cloud/__init__.py @@ -1 +1 @@ -from _version import __version__ #noqa: F401 +from ._version import __version__ #noqa: F401 diff --git a/libs/cloud/odc/cloud/__init__.py b/libs/cloud/odc/cloud/__init__.py index 76f2b73d0..0bcedc1fc 100644 --- a/libs/cloud/odc/cloud/__init__.py +++ b/libs/cloud/odc/cloud/__init__.py @@ -1 +1 @@ -from _version import __version__ #noqa: F401 +from ._version import __version__ #noqa: F401 diff --git a/libs/io/odc/io/__init__.py b/libs/io/odc/io/__init__.py index 6bc906ef1..7841a735d 100644 --- a/libs/io/odc/io/__init__.py +++ b/libs/io/odc/io/__init__.py @@ -4,7 +4,7 @@ from .text import parse_mtl, parse_yaml, read_stdin_lines, slurp, slurp_lines from .timer import RateEstimator -from _version import __version__ +from ._version import __version__ __all__ = ( "parse_yaml", From 2d20db751f1e425e051820b075bf16860f594aca Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 Nov 2023 01:12:02 +0000 Subject: [PATCH 24/26] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- apps/cloud/odc/apps/cloud/__init__.py | 2 +- libs/cloud/odc/cloud/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/cloud/odc/apps/cloud/__init__.py b/apps/cloud/odc/apps/cloud/__init__.py index 0bcedc1fc..3a8d6d5cf 100644 --- a/apps/cloud/odc/apps/cloud/__init__.py +++ b/apps/cloud/odc/apps/cloud/__init__.py @@ -1 +1 @@ -from ._version import __version__ #noqa: F401 +from ._version import __version__ # noqa: F401 diff --git a/libs/cloud/odc/cloud/__init__.py b/libs/cloud/odc/cloud/__init__.py index 0bcedc1fc..3a8d6d5cf 100644 --- a/libs/cloud/odc/cloud/__init__.py +++ b/libs/cloud/odc/cloud/__init__.py @@ -1 +1 @@ -from ._version import __version__ #noqa: F401 +from ._version import __version__ # noqa: F401 From 81fb0d2b989beba23f41851b6b13e8d77be37390 Mon Sep 17 00:00:00 2001 From: Ariana Barzinpour Date: Wed, 15 Nov 2023 23:56:33 +0000 Subject: [PATCH 25/26] resolve duplicate test envs --- tests/test-env-py39.yml | 58 ----------------------------------------- tests/test-env.yml | 15 ++--------- 2 files changed, 2 insertions(+), 71 deletions(-) delete mode 100644 
tests/test-env-py39.yml diff --git a/tests/test-env-py39.yml b/tests/test-env-py39.yml deleted file mode 100644 index 3dbcc0ad5..000000000 --- a/tests/test-env-py39.yml +++ /dev/null @@ -1,58 +0,0 @@ -# Conda environment for running tests in odc-tools -# conda env create -f test-env-py39.yml -# conda activate odc-tests-py39 - -name: odc-tests-py39 -channels: - - conda-forge - -dependencies: - - python=3.9 - - # Datacube - - datacube>=1.8.15 - - sqlalchemy<2.0.0 - - # odc.ui - - ipywidgets>=8.0 - - ipyleaflet - - tqdm - - # odc-apps-dc-tools - - pystac>=1.2.0 - - pystac-client>=0.4.0 - - azure-storage-blob - - fsspec - - lxml # needed for thredds-crawler - - urlpath - - datadog - - eodatasets3 - - importlib_resources>=6.0 - - # odc.{aws,aio}: aiobotocore/boto3 - # pin aiobotocore for easier resolution of dependencies - - aiobotocore - - boto3 - - # For tests - - pytest - - pytest-httpserver - - pytest-cov - - pytest-timeout - - moto - - deepdiff - - # for docs - - sphinx - - sphinx_rtd_theme - - sphinx-autodoc-typehints - - nbsphinx - - - pip=20 - - pip: - # odc.apps.dc-tools - - thredds-crawler - - rio-stac - - # odc.ui - - jupyter-ui-poll>=0.2.0a diff --git a/tests/test-env.yml b/tests/test-env.yml index a340aa518..40da16153 100644 --- a/tests/test-env.yml +++ b/tests/test-env.yml @@ -7,19 +7,12 @@ channels: - conda-forge dependencies: - - python>=3.9,<3.12 + - python=3.9 # Datacube - datacube>=1.8.15 - sqlalchemy<2.0.0 - # odc.algo - - dask-image - - numexpr - - scikit-image - - scipy - - toolz - # odc.ui - ipywidgets>=8.0 - ipyleaflet @@ -38,7 +31,7 @@ dependencies: # odc.{aws,aio}: aiobotocore/boto3 # pin aiobotocore for easier resolution of dependencies - - aiobotocore==1.4.2 + - aiobotocore - boto3 # For tests @@ -49,7 +42,6 @@ dependencies: - moto - deepdiff - # for docs - sphinx - sphinx_rtd_theme @@ -62,8 +54,5 @@ dependencies: - thredds-crawler - rio-stac - # odc.algo optional dependency - - hdstats - # odc.ui - jupyter-ui-poll>=0.2.0a From 7ba48f061008065b26d0b382126cf85bbd060c0f Mon Sep 17 00:00:00 2001 From: Ariana Barzinpour Date: Thu, 16 Nov 2023 02:15:51 +0000 Subject: [PATCH 26/26] fix docstring, rename uri to uris and remove str type --- apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py b/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py index 7f7c0a7b8..eb6d37d49 100755 --- a/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py +++ b/apps/dc_tools/odc/apps/dc_tools/s3_to_dc.py @@ -110,7 +110,7 @@ def dump_to_odc( @request_payer @archive_less_mature @publish_action -@click.argument("uri", type=str, nargs=-1) +@click.argument("uris", nargs=-1) @click.argument("product", type=str, nargs=1, required=False) def cli( skip_lineage, @@ -127,7 +127,7 @@ def cli( request_payer, archive_less_mature, publish_action, - uri, + uris, product, ): """ @@ -137,7 +137,8 @@ def cli( If more than one uri is given, all will be treated as absolute URLs. Product is optional; if one is provided, it must match all datasets. - Only one product can be provided. + Can provide a single product name or a space separated list of multiple products + (formatted as a single string). 
""" transform = None @@ -156,11 +157,7 @@ def cli( # if it's a uri, a product wasn't provided, and 'product' is actually another uri if product.startswith("s3://"): candidate_products = [] - if isinstance(uri, str): - uri = [uri, product] - else: - uri = list(uri) - uri.append(product) + uris += (product,) else: # Check datacube connection and products candidate_products = product.split() @@ -180,9 +177,9 @@ def cli( is_glob = True # we assume the uri to be an absolute URL if it contains no wildcards # or if there are multiple uri values provided - if (len(uri) > 1) or ("*" not in uri[0]): + if (len(uris) > 1) or ("*" not in uris[0]): is_glob = False - for url in uri: + for url in uris: if "*" in url: logging.warning( "A list of uris is assumed to include only absolute URLs. " @@ -195,11 +192,11 @@ def cli( if is_glob: document_stream = ( url.url - for url in s3_find_glob(uri[0], skip_check=skip_check, s3=fetcher, **opts) + for url in s3_find_glob(uris[0], skip_check=skip_check, s3=fetcher, **opts) ) else: # if working with absolute URLs, no need for all the globbing logic - document_stream = uri + document_stream = uris added, failed, skipped = dump_to_odc( fetcher(document_stream),