Commit 15bd351

Loading Benchmarks (#4477)

* Synthetic FF PP NetCDF and loading benchmarks.

1 parent d1d1e00 commit 15bd351

6 files changed: +543 additions, -51 deletions

.github/workflows/benchmark.yml

Lines changed: 6 additions & 8 deletions

@@ -16,7 +16,9 @@ jobs:
       IRIS_TEST_DATA_PATH: benchmarks/iris-test-data
       IRIS_TEST_DATA_VERSION: "2.5"
       # Lets us manually bump the cache to rebuild
+      ENV_CACHE_BUILD: "0"
       TEST_DATA_CACHE_BUILD: "2"
+      PY_VER: 3.8

     steps:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
@@ -32,19 +34,15 @@ jobs:
         run: |
           pip install nox

-      - name: Cache .nox and .asv/env directories
+      - name: Cache environment directories
        id: cache-env-dir
        uses: actions/cache@v2
        with:
          path: |
            .nox
            benchmarks/.asv/env
-         # Make sure GHA never gets an exact cache match by using the unique
-         # github.sha. This means it will always store this run as a new
-         # cache (Nox may have made relevant changes during run). Cache
-         # restoration still succeeds via the partial restore-key match.
-         key: ${{ runner.os }}-${{ github.sha }}
-         restore-keys: ${{ runner.os }}
+           $CONDA/pkgs
+         key: ${{ runner.os }}-${{ hashFiles('requirements/') }}-${{ env.ENV_CACHE_BUILD }}

      - name: Cache test data directory
        id: cache-test-data
@@ -62,7 +60,7 @@ jobs:
          unzip -q iris-test-data.zip
          mkdir --parents ${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_LOC_PATH}
          mv iris-test-data-${IRIS_TEST_DATA_VERSION} ${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_PATH}
-
+
      - name: Set test data var
        run: |
          echo "OVERRIDE_TEST_DATA_REPOSITORY=${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_PATH}/test_data" >> $GITHUB_ENV

benchmarks/benchmarks/__init__.py

Lines changed: 0 additions & 41 deletions

@@ -5,45 +5,4 @@
 # licensing details.
 """Common code for benchmarks."""

-import os
-from pathlib import Path
-
-# Environment variable names
-_ASVDIR_VARNAME = "ASV_DIR"  # As set in nightly script "asv_nightly/asv.sh"
-_DATADIR_VARNAME = "BENCHMARK_DATA"  # For local runs
-
 ARTIFICIAL_DIM_SIZE = int(10e3)  # For all artificial cubes, coords etc.
-
-# Work out where the benchmark data dir is.
-asv_dir = os.environ.get("ASV_DIR", None)
-if asv_dir:
-    # For an overnight run, this comes from the 'ASV_DIR' setting.
-    benchmark_data_dir = Path(asv_dir) / "data"
-else:
-    # For a local run, you set 'BENCHMARK_DATA'.
-    benchmark_data_dir = os.environ.get(_DATADIR_VARNAME, None)
-    if benchmark_data_dir is not None:
-        benchmark_data_dir = Path(benchmark_data_dir)
-
-
-def testdata_path(*path_names):
-    """
-    Return the path of a benchmark test data file.
-
-    These are based from a test-data location dir, which is either
-    ${}/data (for overnight tests), or ${} for local testing.
-
-    If neither of these were set, an error is raised.
-
-    """.format(
-        _ASVDIR_VARNAME, _DATADIR_VARNAME
-    )
-    if benchmark_data_dir is None:
-        msg = (
-            "Benchmark data dir is not defined : "
-            'Either "${}" or "${}" must be set.'
-        )
-        raise (ValueError(msg.format(_ASVDIR_VARNAME, _DATADIR_VARNAME)))
-    path = benchmark_data_dir.joinpath(*path_names)
-    path = str(path)  # Because Iris doesn't understand Path objects yet.
-    return path
benchmarks/benchmarks/generate_data/__init__.py (new file)

Lines changed: 94 additions & 0 deletions

@@ -0,0 +1,94 @@
# Copyright Iris contributors
#
# This file is part of Iris and is released under the LGPL license.
# See COPYING and COPYING.LESSER in the root of the repository for full
# licensing details.
"""
Scripts for generating supporting data for benchmarking.

Data generated using Iris should use :func:`run_function_elsewhere`, which
means that data is generated using a fixed version of Iris and a fixed
environment, rather than those that get changed when the benchmarking run
checks out a new commit.

Downstream use of data generated 'elsewhere' requires saving; usually in a
NetCDF file. Could also use pickling but there is a potential risk if the
benchmark sequence runs over two different Python versions.

"""
from inspect import getsource
from os import environ
from pathlib import Path
from subprocess import CalledProcessError, check_output, run
from textwrap import dedent

#: Python executable used by :func:`run_function_elsewhere`, set via env
#: variable of same name. Must be path of Python within an environment that
#: includes Iris (including dependencies and test modules) and Mule.
try:
    DATA_GEN_PYTHON = environ["DATA_GEN_PYTHON"]
    _ = check_output([DATA_GEN_PYTHON, "-c", "a = True"])
except KeyError:
    error = "Env variable DATA_GEN_PYTHON not defined."
    raise KeyError(error)
except (CalledProcessError, FileNotFoundError, PermissionError):
    error = (
        "Env variable DATA_GEN_PYTHON not a runnable python executable path."
    )
    raise ValueError(error)

# The default location of data files used in benchmarks. Used by CI.
default_data_dir = (Path(__file__).parents[2] / ".data").resolve()
# Optionally override the default data location with environment variable.
BENCHMARK_DATA = Path(environ.get("BENCHMARK_DATA", default_data_dir))
if BENCHMARK_DATA == default_data_dir:
    BENCHMARK_DATA.mkdir(exist_ok=True)
elif not BENCHMARK_DATA.is_dir():
    message = f"Not a directory: {BENCHMARK_DATA} ."
    raise ValueError(message)

# Manual flag to allow the rebuilding of synthetic data.
# False forces a benchmark run to re-make all the data files.
REUSE_DATA = True


def run_function_elsewhere(func_to_run, *args, **kwargs):
    """
    Run a given function using the :const:`DATA_GEN_PYTHON` executable.

    This structure allows the function to be written natively.

    Parameters
    ----------
    func_to_run : FunctionType
        The function object to be run.
        NOTE: the function must be completely self-contained, i.e. perform all
        its own imports (within the target :const:`DATA_GEN_PYTHON`
        environment).
    *args : tuple, optional
        Function call arguments. Must all be expressible as simple literals,
        i.e. the ``repr`` must be a valid literal expression.
    **kwargs: dict, optional
        Function call keyword arguments. All values must be expressible as
        simple literals (see ``*args``).

    Returns
    -------
    str
        The ``stdout`` from the run.

    """
    func_string = dedent(getsource(func_to_run))
    func_string = func_string.replace("@staticmethod\n", "")
    func_call_term_strings = [repr(arg) for arg in args]
    func_call_term_strings += [
        f"{name}={repr(val)}" for name, val in kwargs.items()
    ]
    func_call_string = (
        f"{func_to_run.__name__}(" + ",".join(func_call_term_strings) + ")"
    )
    python_string = "\n".join([func_string, func_call_string])
    result = run(
        [DATA_GEN_PYTHON, "-c", python_string], capture_output=True, check=True
    )
    return result.stdout
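
For orientation, a minimal sketch (not part of the commit) of how run_function_elsewhere is intended to be called. The _make_dummy_cube function and the save path are hypothetical; the constraints come from the docstring above: the function must perform all its own imports, and every argument must round-trip through repr.

def _make_dummy_cube(n: int, save_path: str) -> None:
    # Self-contained: all imports happen inside the function, since only
    # its source text is shipped to the DATA_GEN_PYTHON interpreter.
    import numpy as np
    from iris import save
    from iris.cube import Cube

    cube = Cube(np.zeros((n, n), dtype=np.float32), var_name="dummy")
    save(cube, save_path)

# Arguments must be simple literals - their repr() is pasted into the
# generated "python -c" command line:
#     run_function_elsewhere(_make_dummy_cube, 100, save_path="/tmp/dummy.nc")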
benchmarks/benchmarks/generate_data/um_files.py (new file)

Lines changed: 215 additions & 0 deletions

@@ -0,0 +1,215 @@
# Copyright Iris contributors
#
# This file is part of Iris and is released under the LGPL license.
# See COPYING and COPYING.LESSER in the root of the repository for full
# licensing details.
"""
Generate FF, PP and NetCDF files based on a minimal synthetic FF file.

NOTE: uses the Mule package, so depends on an environment with Mule installed.
"""


def _create_um_files(
    len_x: int, len_y: int, len_z: int, len_t: int, compress, save_paths: dict
) -> None:
    """
    Generate an FF object of given shape and compression, save to FF/PP/NetCDF.

    This is run externally
    (:func:`benchmarks.generate_data.run_function_elsewhere`), so all imports
    are self-contained and input parameters are simple types.
    """
    from copy import deepcopy
    from datetime import datetime
    from tempfile import NamedTemporaryFile

    from mo_pack import compress_wgdos as mo_pack_compress
    from mule import ArrayDataProvider, Field3, FieldsFile
    from mule.pp import fields_to_pp_file
    import numpy as np

    from iris import load_cube
    from iris import save as save_cube

    def packing_patch(*compress_args, **compress_kwargs) -> bytes:
        """
        Force conversion from returned :class:`memoryview` to :class:`bytes`.

        Downstream uses of :func:`mo_pack.compress_wgdos` were written
        for the ``Python2`` behaviour, where the returned buffer had a
        different ``__len__`` value to the current :class:`memoryview`.
        Unable to fix directly in Mule, so monkey patching for now.
        """
        return mo_pack_compress(*compress_args, **compress_kwargs).tobytes()

    import mo_pack

    mo_pack.compress_wgdos = packing_patch

    ########

    template = {
        "fixed_length_header": {"dataset_type": 3, "grid_staggering": 3},
        "integer_constants": {
            "num_p_levels": len_z,
            "num_cols": len_x,
            "num_rows": len_y,
        },
        "real_constants": {},
        "level_dependent_constants": {"dims": (len_z + 1, None)},
    }
    new_ff = FieldsFile.from_template(deepcopy(template))

    data_array = np.arange(len_x * len_y).reshape(len_x, len_y)
    array_provider = ArrayDataProvider(data_array)

    def add_field(level_: int, time_step_: int) -> None:
        """
        Add a minimal field to the new :class:`~mule.FieldsFile`.

        Includes the minimum information to allow Mule saving and Iris
        loading, as well as incrementation for vertical levels and time
        steps to allow generation of z and t dimensions.
        """
        new_field = Field3.empty()
        # To correspond to the header-release 3 class used.
        new_field.lbrel = 3
        # Mule uses the first element of the lookup to test for
        # unpopulated fields (and skips them), so the first element should
        # be set to something. The year will do.
        new_field.raw[1] = datetime.now().year

        # Horizontal.
        new_field.lbcode = 1
        new_field.lbnpt = len_x
        new_field.lbrow = len_y
        new_field.bdx = new_ff.real_constants.col_spacing
        new_field.bdy = new_ff.real_constants.row_spacing
        new_field.bzx = new_ff.real_constants.start_lon - 0.5 * new_field.bdx
        new_field.bzy = new_ff.real_constants.start_lat - 0.5 * new_field.bdy

        # Hemisphere.
        new_field.lbhem = 32
        # Processing.
        new_field.lbproc = 0

        # Vertical.
        # Hybrid height values by simulating sequences similar to those in a
        # theta file.
        new_field.lbvc = 65
        if level_ == 0:
            new_field.lblev = 9999
        else:
            new_field.lblev = level_

        level_1 = level_ + 1
        six_rec = 20 / 3
        three_rec = six_rec / 2

        new_field.blev = level_1 ** 2 * six_rec - six_rec
        new_field.brsvd1 = (
            level_1 ** 2 * six_rec + (six_rec * level_1) - three_rec
        )

        brsvd2_simulated = np.linspace(0.995, 0, len_z)
        shift = min(len_z, 2)
        bhrlev_simulated = np.concatenate(
            [np.ones(shift), brsvd2_simulated[:-shift]]
        )
        new_field.brsvd2 = brsvd2_simulated[level_]
        new_field.bhrlev = bhrlev_simulated[level_]

        # Time.
        new_field.lbtim = 11

        new_field.lbyr = time_step_
        for attr_name in ["lbmon", "lbdat", "lbhr", "lbmin", "lbsec"]:
            setattr(new_field, attr_name, 0)

        new_field.lbyrd = time_step_ + 1
        for attr_name in ["lbmond", "lbdatd", "lbhrd", "lbmind", "lbsecd"]:
            setattr(new_field, attr_name, 0)

        # Data and packing.
        new_field.lbuser1 = 1
        new_field.lbpack = int(compress)
        new_field.bacc = 0
        new_field.bmdi = -1
        new_field.lbext = 0
        new_field.set_data_provider(array_provider)

        new_ff.fields.append(new_field)

    for time_step in range(len_t):
        for level in range(len_z):
            add_field(level, time_step + 1)

    ff_path = save_paths.get("FF", None)
    pp_path = save_paths.get("PP", None)
    nc_path = save_paths.get("NetCDF", None)

    if ff_path:
        new_ff.to_file(ff_path)
    if pp_path:
        fields_to_pp_file(str(pp_path), new_ff.fields)
    if nc_path:
        temp_ff_path = None
        # Need an Iris Cube from the FF content.
        if ff_path:
            # Use the existing file.
            ff_cube = load_cube(ff_path)
        else:
            # Make a temporary file.
            temp_ff_path = NamedTemporaryFile()
            new_ff.to_file(temp_ff_path.name)
            ff_cube = load_cube(temp_ff_path.name)

        save_cube(ff_cube, nc_path, zlib=compress)
        if temp_ff_path:
            temp_ff_path.close()


FILE_EXTENSIONS = {"FF": "", "PP": ".pp", "NetCDF": ".nc"}


def create_um_files(
    len_x: int,
    len_y: int,
    len_z: int,
    len_t: int,
    compress: bool,
    file_types: list,
) -> dict:
    """
    Generate FF-based FF / PP / NetCDF files with specified shape and compression.

    All files representing a given shape are saved in a dedicated directory. A
    dictionary of the saved paths is returned.

    If the required files exist, they are re-used, unless
    :const:`benchmarks.REUSE_DATA` is ``False``.
    """
    # Self contained imports to avoid linting confusion with _create_um_files().
    from . import BENCHMARK_DATA, REUSE_DATA, run_function_elsewhere

    save_name_sections = ["UM", len_x, len_y, len_z, len_t]
    save_name = "_".join(str(section) for section in save_name_sections)
    save_dir = BENCHMARK_DATA / save_name
    if not save_dir.is_dir():
        save_dir.mkdir(parents=True)

    save_paths = {}
    files_exist = True
    for file_type in file_types:
        file_ext = FILE_EXTENSIONS[file_type]
        save_path = (save_dir / f"{compress}").with_suffix(file_ext)
        files_exist = files_exist and save_path.is_file()
        save_paths[file_type] = str(save_path)

    if not REUSE_DATA or not files_exist:
        _ = run_function_elsewhere(
            _create_um_files, len_x, len_y, len_z, len_t, compress, save_paths
        )

    return save_paths
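
To show how the pieces fit together, a hypothetical ASV-style benchmark that consumes create_um_files. The loading benchmarks actually added by this commit are not shown in this page extract; the class name and parameter values below are illustrative only, and the import assumes the module layout inferred above.

from iris import load_cube

from .generate_data.um_files import create_um_files


class HypotheticalFFLoading:
    # ASV runs each time_* method once per parameter value.
    params = [50, 100]
    param_names = ["len_xy"]

    def setup(self, len_xy):
        # Data generation happens in the fixed DATA_GEN_PYTHON environment;
        # existing files are re-used unless REUSE_DATA is False.
        paths = create_um_files(len_xy, len_xy, 2, 2, False, ["FF"])
        self.file_path = paths["FF"]

    def time_load(self, len_xy):
        _ = load_cube(self.file_path)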
