Skip to content

Commit

Permalink
Synchronize docs between DataCube, VectorCube, ...
Browse files Browse the repository at this point in the history
use tests to enforce keeping docs in sync

related to #402/#720/#725
  • Loading branch information
soxofaan committed Feb 11, 2025
1 parent 9d96665 commit 62f2221
Show file tree
Hide file tree
Showing 6 changed files with 199 additions and 36 deletions.
39 changes: 38 additions & 1 deletion openeo/internal/documentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

import collections
import inspect
import re
import textwrap
from functools import partial
from typing import Callable, Optional, Tuple, TypeVar
from typing import Callable, Dict, Optional, Tuple, TypeVar, Union

# TODO: give this a proper public API?
_process_registry = collections.defaultdict(list)
Expand Down Expand Up @@ -58,3 +59,39 @@ def decorate(f: Callable) -> Callable:
return f

return decorate


def _get_doc(obj: Union[str, Callable]) -> str:
"""
Get docstring of a method or function.
"""
if isinstance(obj, str):
doc = obj
else:
doc = obj.__doc__
return textwrap.dedent(doc)


def extract_params(doc: Union[str, Callable]) -> Dict[str, str]:
    """
    Extract parameters (``:param name:`` format) from a docstring.

    :param doc: docstring (as string), or a function/method to take the docstring from.
    :return: mapping of parameter name to (stripped) parameter description.
    """
    text = _get_doc(doc)
    # A parameter entry starts with `:param name:` at the beginning of a line;
    # subsequent indented lines are continuations of the same description.
    pattern = re.compile(r"^:param\s+(?P<param>\w+)\s*:(?P<doc>.*(\n +.*)*)", re.MULTILINE)
    params = {}
    for match in pattern.finditer(text):
        params[match.group("param")] = match.group("doc").strip()
    return params


def assert_same_param_docs(doc_a: Union[str, Callable], doc_b: Union[str, Callable], only_intersection: bool = False):
    """
    Compare parameters (``:param name:`` format) extracted from two docstrings
    and assert that they are documented the same way.

    :param doc_a: first docstring (as string, or function/method to take it from).
    :param doc_b: second docstring (as string, or function/method to take it from).
    :param only_intersection: when set, only compare parameters that are documented in both docstrings.
    """
    # TODO: option to also check order?
    params_a, params_b = extract_params(doc_a), extract_params(doc_b)

    if only_intersection:
        # Restrict comparison to the parameter names both docstrings share.
        shared = params_a.keys() & params_b.keys()
        params_a = {name: desc for name, desc in params_a.items() if name in shared}
        params_b = {name: desc for name, desc in params_b.items() if name in shared}

    assert params_a == params_b
3 changes: 2 additions & 1 deletion openeo/rest/datacube.py
Original file line number Diff line number Diff line change
Expand Up @@ -2549,6 +2549,7 @@ def execute_batch(
One of "error" (highest severity), "warning", "info", and "debug" (lowest severity).
:param max_poll_interval: maximum number of seconds to sleep between job status polls
:param connection_retry_interval: how long to wait when status poll failed due to connection issue
:param print: print/logging function to show progress/status
.. versionchanged:: 0.32.0
Added ``auto_add_save_result`` option
Expand Down Expand Up @@ -2641,7 +2642,7 @@ def create_job(
:param log_level: Optional minimum severity level for log entries that the back-end should keep track of.
One of "error" (highest severity), "warning", "info", and "debug" (lowest severity).
:return: Created job.
:return: Handle for the job created at the backend.
.. versionchanged:: 0.32.0
Added ``auto_add_save_result`` option
Expand Down
49 changes: 35 additions & 14 deletions openeo/rest/mlmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,28 @@ def execute_batch(
log_level: Optional[str] = None,
) -> BatchJob:
"""
Evaluate the process graph by creating a batch job, and retrieving the results when it is finished.
This method is mostly recommended if the batch job is expected to run in a reasonable amount of time.
For very long-running jobs, you probably do not want to keep the client running.
:param job_options:
:param outputfile: The path of a file to which a result can be written
:param out_format: (optional) Format of the job result.
:param format_options: String Parameters for the job result format
Execute the underlying process graph at the backend in batch job mode:
- create the job (like :py:meth:`create_job`)
- start the job (like :py:meth:`BatchJob.start() <openeo.rest.job.BatchJob.start>`)
- track the job's progress with an active polling loop
(like :py:meth:`BatchJob.run_synchronous() <openeo.rest.job.BatchJob.run_synchronous>`)
- optionally (if ``outputfile`` is specified) download the job's results
when the job finished successfully
.. note::
Because of the active polling loop,
which blocks any further progress of your script or application,
this :py:meth:`execute_batch` method is mainly recommended
for batch jobs that are expected to complete
in a time that is reasonable for your use case.
:param outputfile: Optional, output path to download to.
:param title: job title.
:param description: job description.
:param plan: The billing plan to process and charge the job with
:param budget: Maximum budget to be spent on executing the job.
Note that some backends do not honor this limit.
:param additional: additional (top-level) properties to set in the request body
:param job_options: dictionary of job options to pass to the backend
(under top-level property "job_options")
Expand All @@ -96,6 +109,7 @@ def execute_batch(
One of "error" (highest severity), "warning", "info", and "debug" (lowest severity).
:param max_poll_interval: maximum number of seconds to sleep between job status polls
:param connection_retry_interval: how long to wait when status poll failed due to connection issue
:param print: print/logging function to show progress/status
.. versionchanged:: 0.36.0
Added argument ``additional``.
Expand Down Expand Up @@ -136,17 +150,24 @@ def create_job(
log_level: Optional[str] = None,
) -> BatchJob:
"""
Sends a job to the backend and returns a ClientJob instance.
Send the underlying process graph to the backend
to create an openEO batch job
and return a corresponding :py:class:`~openeo.rest.job.BatchJob` instance.
:param title: job title
:param description: job description
:param plan: The billing plan to process and charge the job with
Note that this method only *creates* the openEO batch job at the backend,
but it does not *start* it.
Use :py:meth:`execute_batch` instead to let the openEO Python client
take care of the full job life cycle: create, start and track its progress until completion.
:param title: job title.
:param description: job description.
:param plan: The billing plan to process and charge the job with.
:param budget: Maximum budget to be spent on executing the job.
Note that some backends do not honor this limit.
:param additional: additional (top-level) properties to set in the request body
:param job_options: dictionary of job options to pass to the backend
(under top-level property "job_options")
:param format_options: String Parameters for the job result format
:param log_level: Optional minimum severity level for log entries that the back-end should keep track of.
One of "error" (highest severity), "warning", "info", and "debug" (lowest severity).
:return: Created job.
Expand Down
71 changes: 51 additions & 20 deletions openeo/rest/vectorcube.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
from openeo.rest.job import BatchJob
from openeo.rest.mlmodel import MlModel
from openeo.rest.result import SaveResult
from openeo.rest.stac_resource import StacResource
from openeo.util import InvalidBBoxException, dict_no_none, guess_format, to_bbox_dict

if typing.TYPE_CHECKING:
Expand Down Expand Up @@ -218,25 +217,33 @@ def download(
*,
validate: Optional[bool] = None,
auto_add_save_result: bool = True,
additional: Optional[dict] = None,
job_options: Optional[dict] = None,
) -> Union[None, bytes]:
"""
Execute synchronously and download the vector cube.
The result will be stored to the output path, when specified.
If no output path (or ``None``) is given, the raw download content will be returned as ``bytes`` object.
:param outputfile: (optional) output file to store the result to
:param format: (optional) output format to use.
:param options: (optional) additional output format options.
:param outputfile: Optional, output path to download to.
:param format: Optional, an output format supported by the backend.
:param options: Optional, file format options
:param validate: Optional toggle to enable/prevent validation of the process graphs before execution
(overruling the connection's ``auto_validate`` setting).
:param auto_add_save_result: Automatically add a ``save_result`` node to the process graph if there is none yet.
:param additional: additional (top-level) properties to set in the request body
:param job_options: dictionary of job options to pass to the backend
(under top-level property "job_options")
.. versionchanged:: 0.21.0
When not specified explicitly, output format is guessed from output file extension.
.. versionchanged:: 0.32.0
Added ``auto_add_save_result`` option
.. versionchanged:: 0.39.0
Added arguments ``additional`` and ``job_options``.
"""
# TODO #278 centralize download/create_job/execute_job logic in DataCube, VectorCube, MlModel, ...
if auto_add_save_result:
Expand All @@ -250,7 +257,9 @@ def download(
)
else:
res = self
return self._connection.download(res.flat_graph(), outputfile=outputfile, validate=validate)
return self._connection.download(
res.flat_graph(), outputfile=outputfile, validate=validate, additional=additional, job_options=job_options
)

def execute_batch(
self,
Expand All @@ -274,17 +283,32 @@ def execute_batch(
**format_options,
) -> BatchJob:
"""
Evaluate the process graph by creating a batch job, and retrieving the results when it is finished.
This method is mostly recommended if the batch job is expected to run in a reasonable amount of time.
For very long running jobs, you probably do not want to keep the client running.
Execute the underlying process graph at the backend in batch job mode:
- create the job (like :py:meth:`create_job`)
- start the job (like :py:meth:`BatchJob.start() <openeo.rest.job.BatchJob.start>`)
- track the job's progress with an active polling loop
(like :py:meth:`BatchJob.run_synchronous() <openeo.rest.job.BatchJob.run_synchronous>`)
- optionally (if ``outputfile`` is specified) download the job's results
when the job finished successfully
.. note::
Because of the active polling loop,
which blocks any further progress of your script or application,
this :py:meth:`execute_batch` method is mainly recommended
for batch jobs that are expected to complete
in a time that is reasonable for your use case.
:param outputfile: Optional, output path to download to.
:param out_format: (optional) File format to use for the job result.
:param title: job title.
:param description: job description.
:param plan: The billing plan to process and charge the job with
:param budget: Maximum budget to be spent on executing the job.
Note that some backends do not honor this limit.
:param additional: additional (top-level) properties to set in the request body
:param job_options: dictionary of job options to pass to the backend
(under top-level property "job_options")
:param outputfile: The path of a file to which a result can be written
:param out_format: (optional) output format to use.
:param format_options: (optional) additional output format options
:param validate: Optional toggle to enable/prevent validation of the process graphs before execution
(overruling the connection's ``auto_validate`` setting).
:param auto_add_save_result: Automatically add a ``save_result`` node to the process graph if there is none yet.
Expand All @@ -293,6 +317,7 @@ def execute_batch(
One of "error" (highest severity), "warning", "info", and "debug" (lowest severity).
:param max_poll_interval: maximum number of seconds to sleep between job status polls
:param connection_retry_interval: how long to wait when status poll failed due to connection issue
:param print: print/logging function to show progress/status
.. versionchanged:: 0.21.0
When not specified explicitly, output format is guessed from output file extension.
Expand Down Expand Up @@ -359,18 +384,24 @@ def create_job(
**format_options,
) -> BatchJob:
"""
Sends a job to the backend and returns a ClientJob instance.
:param out_format: String Format of the job result.
:param title: job title
:param description: job description
:param plan: The billing plan to process and charge the job with
Send the underlying process graph to the backend
to create an openEO batch job
and return a corresponding :py:class:`~openeo.rest.job.BatchJob` instance.
Note that this method only *creates* the openEO batch job at the backend,
but it does not *start* it.
Use :py:meth:`execute_batch` instead to let the openEO Python client
take care of the full job life cycle: create, start and track its progress until completion.
:param out_format: output file format.
:param title: job title.
:param description: job description.
:param plan: The billing plan to process and charge the job with.
:param budget: Maximum budget to be spent on executing the job.
Note that some backends do not honor this limit.
:param additional: additional (top-level) properties to set in the request body
:param job_options: dictionary of job options to pass to the backend
(under top-level property "job_options")
:param format_options: String Parameters for the job result format
:param validate: Optional toggle to enable/prevent validation of the process graphs before execution
(overruling the connection's ``auto_validate`` setting).
:param auto_add_save_result: Automatically add a ``save_result`` node to the process graph if there is none yet.
Expand Down
5 changes: 5 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@

import pytest

pytest.register_assert_rewrite(
"openeo.internal.documentation",
)


from openeo.testing import TestDataLoader
from openeo.util import ensure_dir

Expand Down
68 changes: 68 additions & 0 deletions tests/internal/test_documentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import pytest

from openeo import DataCube, VectorCube
from openeo.internal.documentation import assert_same_param_docs, extract_params
from openeo.rest.mlmodel import MlModel
from openeo.rest.stac_resource import StacResource


def test_extract_params():
    """Basic `extract_params` usage: single-line and multi-line `:param` entries."""
    # Note: the multi-line description of `b_b` keeps its relative (4 space) indentation
    # after the docstring is dedented inside `extract_params`.
    assert (
        extract_params(
            """
            The description
            and more
            :param a: description of a
            :param b_b : multi-line description
                of b
            That's it!
            """
        )
        == {
            "a": "description of a",
            "b_b": "multi-line description\n    of b",
        }
    )


def test_compare_param_docs_datacube():
    """Shared parameters of DataCube.download and the job-based execution methods must be documented identically."""
    for job_method in [DataCube.create_job, DataCube.execute_batch]:
        assert_same_param_docs(DataCube.download, job_method, only_intersection=True)


@pytest.mark.parametrize(
    ["method_a", "method_b"],
    [
        (DataCube.download, VectorCube.download),
        (DataCube.create_job, VectorCube.create_job),
        (DataCube.execute_batch, VectorCube.execute_batch),
    ],
)
def test_compare_docs_datacube_vectorcube(method_a, method_b):
    """Corresponding DataCube/VectorCube methods must document all their parameters identically."""
    # `only_intersection=False`: the full parameter docs must match, not just shared names.
    assert_same_param_docs(method_a, method_b, only_intersection=False)


@pytest.mark.parametrize(
    ["method_a", "method_b"],
    [
        (DataCube.create_job, MlModel.create_job),
        (DataCube.execute_batch, MlModel.execute_batch),
    ],
)
def test_compare_docs_datacube_mlmodel(method_a, method_b):
    """Parameters shared between DataCube and MlModel job methods must be documented identically."""
    # `only_intersection=True`: MlModel methods don't expose the full DataCube parameter set.
    assert_same_param_docs(method_a, method_b, only_intersection=True)


@pytest.mark.parametrize(
    ["method_a", "method_b"],
    [
        (DataCube.download, StacResource.download),
        (DataCube.create_job, StacResource.create_job),
        (DataCube.execute_batch, StacResource.execute_batch),
    ],
)
def test_compare_docs_datacube_stac_resource(method_a, method_b):
    """Parameters shared between DataCube and StacResource methods must be documented identically."""
    # `only_intersection=True`: StacResource methods don't expose the full DataCube parameter set.
    assert_same_param_docs(method_a, method_b, only_intersection=True)

0 comments on commit 62f2221

Please sign in to comment.