Commit 15bd351

Loading Benchmarks (#4477)

* Synthetic FF PP NetCDF and loading benchmarks.

1 parent d1d1e00 commit 15bd351

6 files changed: +543 additions, -51 deletions

.github/workflows/benchmark.yml

Lines changed: 6 additions & 8 deletions

@@ -16,7 +16,9 @@ jobs:
       IRIS_TEST_DATA_PATH: benchmarks/iris-test-data
       IRIS_TEST_DATA_VERSION: "2.5"
       # Lets us manually bump the cache to rebuild
+      ENV_CACHE_BUILD: "0"
       TEST_DATA_CACHE_BUILD: "2"
+      PY_VER: 3.8

     steps:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
@@ -32,19 +34,15 @@ jobs:
         run: |
           pip install nox

-      - name: Cache .nox and .asv/env directories
+      - name: Cache environment directories
        id: cache-env-dir
        uses: actions/cache@v2
        with:
          path: |
            .nox
            benchmarks/.asv/env
-         # Make sure GHA never gets an exact cache match by using the unique
-         # github.sha. This means it will always store this run as a new
-         # cache (Nox may have made relevant changes during run). Cache
-         # restoration still succeeds via the partial restore-key match.
-         key: ${{ runner.os }}-${{ github.sha }}
-         restore-keys: ${{ runner.os }}
+           $CONDA/pkgs
+         key: ${{ runner.os }}-${{ hashFiles('requirements/') }}-${{ env.ENV_CACHE_BUILD }}

      - name: Cache test data directory
        id: cache-test-data
@@ -62,7 +60,7 @@ jobs:
          unzip -q iris-test-data.zip
          mkdir --parents ${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_LOC_PATH}
          mv iris-test-data-${IRIS_TEST_DATA_VERSION} ${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_PATH}
-
+
      - name: Set test data var
        run: |
          echo "OVERRIDE_TEST_DATA_REPOSITORY=${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_PATH}/test_data" >> $GITHUB_ENV

benchmarks/benchmarks/__init__.py

Lines changed: 0 additions & 41 deletions

@@ -5,45 +5,4 @@
 # licensing details.
 """Common code for benchmarks."""

-import os
-from pathlib import Path
-
-# Environment variable names
-_ASVDIR_VARNAME = "ASV_DIR"  # As set in nightly script "asv_nightly/asv.sh"
-_DATADIR_VARNAME = "BENCHMARK_DATA"  # For local runs
-
 ARTIFICIAL_DIM_SIZE = int(10e3)  # For all artificial cubes, coords etc.
-
-# Work out where the benchmark data dir is.
-asv_dir = os.environ.get("ASV_DIR", None)
-if asv_dir:
-    # For an overnight run, this comes from the 'ASV_DIR' setting.
-    benchmark_data_dir = Path(asv_dir) / "data"
-else:
-    # For a local run, you set 'BENCHMARK_DATA'.
-    benchmark_data_dir = os.environ.get(_DATADIR_VARNAME, None)
-    if benchmark_data_dir is not None:
-        benchmark_data_dir = Path(benchmark_data_dir)
-
-
-def testdata_path(*path_names):
-    """
-    Return the path of a benchmark test data file.
-
-    These are based from a test-data location dir, which is either
-    ${}/data (for overnight tests), or ${} for local testing.
-
-    If neither of these were set, an error is raised.
-
-    """.format(
-        _ASVDIR_VARNAME, _DATADIR_VARNAME
-    )
-    if benchmark_data_dir is None:
-        msg = (
-            "Benchmark data dir is not defined : "
-            'Either "${}" or "${}" must be set.'
-        )
-        raise (ValueError(msg.format(_ASVDIR_VARNAME, _DATADIR_VARNAME)))
-    path = benchmark_data_dir.joinpath(*path_names)
-    path = str(path)  # Because Iris doesn't understand Path objects yet.
-    return path
benchmarks/benchmarks/generate_data/__init__.py (new file)

Lines changed: 94 additions & 0 deletions

@@ -0,0 +1,94 @@
# Copyright Iris contributors
#
# This file is part of Iris and is released under the LGPL license.
# See COPYING and COPYING.LESSER in the root of the repository for full
# licensing details.
"""
Scripts for generating supporting data for benchmarking.

Data generated using Iris should use :func:`run_function_elsewhere`, which
means that data is generated using a fixed version of Iris and a fixed
environment, rather than those that get changed when the benchmarking run
checks out a new commit.

Downstream use of data generated 'elsewhere' requires saving; usually in a
NetCDF file. Could also use pickling but there is a potential risk if the
benchmark sequence runs over two different Python versions.

"""
from inspect import getsource
from os import environ
from pathlib import Path
from subprocess import CalledProcessError, check_output, run
from textwrap import dedent

#: Python executable used by :func:`run_function_elsewhere`, set via env
#: variable of same name. Must be path of Python within an environment that
#: includes Iris (including dependencies and test modules) and Mule.
try:
    DATA_GEN_PYTHON = environ["DATA_GEN_PYTHON"]
    _ = check_output([DATA_GEN_PYTHON, "-c", "a = True"])
except KeyError:
    error = "Env variable DATA_GEN_PYTHON not defined."
    raise KeyError(error)
except (CalledProcessError, FileNotFoundError, PermissionError):
    error = (
        "Env variable DATA_GEN_PYTHON not a runnable python executable path."
    )
    raise ValueError(error)

# The default location of data files used in benchmarks. Used by CI.
default_data_dir = (Path(__file__).parents[2] / ".data").resolve()
# Optionally override the default data location with environment variable.
BENCHMARK_DATA = Path(environ.get("BENCHMARK_DATA", default_data_dir))
if BENCHMARK_DATA == default_data_dir:
    BENCHMARK_DATA.mkdir(exist_ok=True)
elif not BENCHMARK_DATA.is_dir():
    message = f"Not a directory: {BENCHMARK_DATA} ."
    raise ValueError(message)

# Manual flag to allow the rebuilding of synthetic data.
# False forces a benchmark run to re-make all the data files.
REUSE_DATA = True


def run_function_elsewhere(func_to_run, *args, **kwargs):
    """
    Run a given function using the :const:`DATA_GEN_PYTHON` executable.

    This structure allows the function to be written natively.

    Parameters
    ----------
    func_to_run : FunctionType
        The function object to be run.
        NOTE: the function must be completely self-contained, i.e. perform all
        its own imports (within the target :const:`DATA_GEN_PYTHON`
        environment).
    *args : tuple, optional
        Function call arguments. Must all be expressible as simple literals,
        i.e. the ``repr`` must be a valid literal expression.
    **kwargs: dict, optional
        Function call keyword arguments. All values must be expressible as
        simple literals (see ``*args``).

    Returns
    -------
    str
        The ``stdout`` from the run.

    """
    func_string = dedent(getsource(func_to_run))
    func_string = func_string.replace("@staticmethod\n", "")
    func_call_term_strings = [repr(arg) for arg in args]
    func_call_term_strings += [
        f"{name}={repr(val)}" for name, val in kwargs.items()
    ]
    func_call_string = (
        f"{func_to_run.__name__}(" + ",".join(func_call_term_strings) + ")"
    )
    python_string = "\n".join([func_string, func_call_string])
    result = run(
        [DATA_GEN_PYTHON, "-c", python_string], capture_output=True, check=True
    )
    return result.stdout
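
For orientation, a minimal sketch (not part of the commit) of how run_function_elsewhere is intended to be called. The _make_dummy_cube function and the save path are hypothetical; the constraints come from the docstring above: the function must perform all its own imports, and every argument must round-trip through repr.

def _make_dummy_cube(n: int, save_path: str) -> None:
    # Self-contained: all imports happen inside the function, since only
    # its source text is shipped to the DATA_GEN_PYTHON interpreter.
    import numpy as np
    from iris import save
    from iris.cube import Cube

    cube = Cube(np.zeros((n, n), dtype=np.float32), var_name="dummy")
    save(cube, save_path)

# Arguments must be simple literals - their repr() is pasted into the
# generated "python -c" command line:
#     run_function_elsewhere(_make_dummy_cube, 100, save_path="/tmp/dummy.nc")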
benchmarks/benchmarks/generate_data/um_files.py (new file)

Lines changed: 215 additions & 0 deletions

@@ -0,0 +1,215 @@
# Copyright Iris contributors
#
# This file is part of Iris and is released under the LGPL license.
# See COPYING and COPYING.LESSER in the root of the repository for full
# licensing details.
"""
Generate FF, PP and NetCDF files based on a minimal synthetic FF file.

NOTE: uses the Mule package, so depends on an environment with Mule installed.
"""


def _create_um_files(
    len_x: int, len_y: int, len_z: int, len_t: int, compress, save_paths: dict
) -> None:
    """
    Generate an FF object of given shape and compression, save to FF/PP/NetCDF.

    This is run externally
    (:func:`benchmarks.generate_data.run_function_elsewhere`), so all imports
    are self-contained and input parameters are simple types.
    """
    from copy import deepcopy
    from datetime import datetime
    from tempfile import NamedTemporaryFile

    from mo_pack import compress_wgdos as mo_pack_compress
    from mule import ArrayDataProvider, Field3, FieldsFile
    from mule.pp import fields_to_pp_file
    import numpy as np

    from iris import load_cube
    from iris import save as save_cube

    def packing_patch(*compress_args, **compress_kwargs) -> bytes:
        """
        Force conversion from returned :class:`memoryview` to :class:`bytes`.

        Downstream uses of :func:`mo_pack.compress_wgdos` were written
        for the ``Python2`` behaviour, where the returned buffer had a
        different ``__len__`` value to the current :class:`memoryview`.
        Unable to fix directly in Mule, so monkey patching for now.
        """
        return mo_pack_compress(*compress_args, **compress_kwargs).tobytes()

    import mo_pack

    mo_pack.compress_wgdos = packing_patch

    ########

    template = {
        "fixed_length_header": {"dataset_type": 3, "grid_staggering": 3},
        "integer_constants": {
            "num_p_levels": len_z,
            "num_cols": len_x,
            "num_rows": len_y,
        },
        "real_constants": {},
        "level_dependent_constants": {"dims": (len_z + 1, None)},
    }
    new_ff = FieldsFile.from_template(deepcopy(template))

    data_array = np.arange(len_x * len_y).reshape(len_x, len_y)
    array_provider = ArrayDataProvider(data_array)

    def add_field(level_: int, time_step_: int) -> None:
        """
        Add a minimal field to the new :class:`~mule.FieldsFile`.

        Includes the minimum information to allow Mule saving and Iris
        loading, as well as incrementation for vertical levels and time
        steps to allow generation of z and t dimensions.
        """
        new_field = Field3.empty()
        # To correspond to the header-release 3 class used.
        new_field.lbrel = 3
        # Mule uses the first element of the lookup to test for
        # unpopulated fields (and skips them), so the first element should
        # be set to something. The year will do.
        new_field.raw[1] = datetime.now().year

        # Horizontal.
        new_field.lbcode = 1
        new_field.lbnpt = len_x
        new_field.lbrow = len_y
        new_field.bdx = new_ff.real_constants.col_spacing
        new_field.bdy = new_ff.real_constants.row_spacing
        new_field.bzx = new_ff.real_constants.start_lon - 0.5 * new_field.bdx
        new_field.bzy = new_ff.real_constants.start_lat - 0.5 * new_field.bdy

        # Hemisphere.
        new_field.lbhem = 32
        # Processing.
        new_field.lbproc = 0

        # Vertical.
        # Hybrid height values by simulating sequences similar to those in a
        # theta file.
        new_field.lbvc = 65
        if level_ == 0:
            new_field.lblev = 9999
        else:
            new_field.lblev = level_

        level_1 = level_ + 1
        six_rec = 20 / 3
        three_rec = six_rec / 2

        new_field.blev = level_1 ** 2 * six_rec - six_rec
        new_field.brsvd1 = (
            level_1 ** 2 * six_rec + (six_rec * level_1) - three_rec
        )

        brsvd2_simulated = np.linspace(0.995, 0, len_z)
        shift = min(len_z, 2)
        bhrlev_simulated = np.concatenate(
            [np.ones(shift), brsvd2_simulated[:-shift]]
        )
        new_field.brsvd2 = brsvd2_simulated[level_]
        new_field.bhrlev = bhrlev_simulated[level_]

        # Time.
        new_field.lbtim = 11

        new_field.lbyr = time_step_
        for attr_name in ["lbmon", "lbdat", "lbhr", "lbmin", "lbsec"]:
            setattr(new_field, attr_name, 0)

        new_field.lbyrd = time_step_ + 1
        for attr_name in ["lbmond", "lbdatd", "lbhrd", "lbmind", "lbsecd"]:
            setattr(new_field, attr_name, 0)

        # Data and packing.
        new_field.lbuser1 = 1
        new_field.lbpack = int(compress)
        new_field.bacc = 0
        new_field.bmdi = -1
        new_field.lbext = 0
        new_field.set_data_provider(array_provider)

        new_ff.fields.append(new_field)

    for time_step in range(len_t):
        for level in range(len_z):
            add_field(level, time_step + 1)

    ff_path = save_paths.get("FF", None)
    pp_path = save_paths.get("PP", None)
    nc_path = save_paths.get("NetCDF", None)

    if ff_path:
        new_ff.to_file(ff_path)
    if pp_path:
        fields_to_pp_file(str(pp_path), new_ff.fields)
    if nc_path:
        temp_ff_path = None
        # Need an Iris Cube from the FF content.
        if ff_path:
            # Use the existing file.
            ff_cube = load_cube(ff_path)
        else:
            # Make a temporary file.
            temp_ff_path = NamedTemporaryFile()
            new_ff.to_file(temp_ff_path.name)
            ff_cube = load_cube(temp_ff_path.name)

        save_cube(ff_cube, nc_path, zlib=compress)
        if temp_ff_path:
            temp_ff_path.close()


FILE_EXTENSIONS = {"FF": "", "PP": ".pp", "NetCDF": ".nc"}


def create_um_files(
    len_x: int,
    len_y: int,
    len_z: int,
    len_t: int,
    compress: bool,
    file_types: list,
) -> dict:
    """
    Generate FF-based FF / PP / NetCDF files with specified shape and compression.

    All files representing a given shape are saved in a dedicated directory. A
    dictionary of the saved paths is returned.

    If the required files exist, they are re-used, unless
    :const:`benchmarks.REUSE_DATA` is ``False``.
    """
    # Self contained imports to avoid linting confusion with _create_um_files().
    from . import BENCHMARK_DATA, REUSE_DATA, run_function_elsewhere

    save_name_sections = ["UM", len_x, len_y, len_z, len_t]
    save_name = "_".join(str(section) for section in save_name_sections)
    save_dir = BENCHMARK_DATA / save_name
    if not save_dir.is_dir():
        save_dir.mkdir(parents=True)

    save_paths = {}
    files_exist = True
    for file_type in file_types:
        file_ext = FILE_EXTENSIONS[file_type]
        save_path = (save_dir / f"{compress}").with_suffix(file_ext)
        files_exist = files_exist and save_path.is_file()
        save_paths[file_type] = str(save_path)

    if not REUSE_DATA or not files_exist:
        _ = run_function_elsewhere(
            _create_um_files, len_x, len_y, len_z, len_t, compress, save_paths
        )

    return save_paths
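
To show how the pieces fit together, a hypothetical ASV-style benchmark that consumes create_um_files. The loading benchmarks actually added by this commit are not shown in this page extract; the class name and parameter values below are illustrative only, and the import assumes the module layout inferred above.

from iris import load_cube

from .generate_data.um_files import create_um_files


class HypotheticalFFLoading:
    # ASV runs each time_* method once per parameter value.
    params = [50, 100]
    param_names = ["len_xy"]

    def setup(self, len_xy):
        # Data generation happens in the fixed DATA_GEN_PYTHON environment;
        # existing files are re-used unless REUSE_DATA is False.
        paths = create_um_files(len_xy, len_xy, 2, 2, False, ["FF"])
        self.file_path = paths["FF"]

    def time_load(self, len_xy):
        _ = load_cube(self.file_path)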
