Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix summary reading being too slow in case of many summary keys to match against #7410

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 48 additions & 8 deletions src/ert/config/_read_summary.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
from __future__ import annotations

import fnmatch
import os
import os.path
import re
from datetime import datetime, timedelta
from enum import Enum, auto
from fnmatch import fnmatch
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, TypeVar, Union
from typing import (
Any,
Callable,
Dict,
List,
Optional,
Sequence,
Tuple,
TypeVar,
Union,
)

import numpy as np
import numpy.typing as npt
Expand Down Expand Up @@ -72,7 +82,7 @@ def from_keyword(cls, summary_keyword: str) -> _SummaryType:
return cls.REGION

if any(
re.match(pattern, summary_keyword)
re.fullmatch(pattern, summary_keyword)
for pattern in [r"R.FT.*", r"R..FT.*", r"R.FR.*", r"R..FR.*", r"R.F"]
):
return cls.INTER_REGION
Expand Down Expand Up @@ -249,6 +259,38 @@ def _check_vals(
return vals


def _fetch_keys_to_matcher(fetch_keys: Sequence[str]) -> Callable[[str], bool]:
"""
Transform the list of keys (with * used as repeated wildcard) into
a matcher.

>>> match = _fetch_keys_to_matcher([""])
>>> match("FOPR")
False

>>> match = _fetch_keys_to_matcher(["*"])
>>> match("FOPR"), match("FO*")
(True, True)


>>> match = _fetch_keys_to_matcher(["F*PR"])
>>> match("WOPR"), match("FOPR"), match("FGPR"), match("SOIL")
(False, True, True, False)

>>> match = _fetch_keys_to_matcher(["WGOR:*"])
>>> match("FOPR"), match("WGOR:OP1"), match("WGOR:OP2"), match("WGOR")
(False, True, True, False)

>>> match = _fetch_keys_to_matcher(["FOPR", "FGPR"])
>>> match("FOPR"), match("FGPR"), match("WGOR:OP2"), match("WGOR")
(True, True, False, False)
"""
if not fetch_keys:
return lambda _: False
regex = re.compile("|".join(fnmatch.translate(key) for key in fetch_keys))
return lambda s: regex.fullmatch(s) is not None


def _read_spec(
spec: str, fetch_keys: Sequence[str]
) -> Tuple[int, datetime, DateUnit, List[str], npt.NDArray[np.int64]]:
Expand Down Expand Up @@ -349,6 +391,8 @@ def _read_spec(
index_mapping: Dict[str, int] = {}
date_index = None

should_load_key = _fetch_keys_to_matcher(fetch_keys)

def optional_get(arr: Optional[npt.NDArray[Any]], idx: int) -> Any:
if arr is None:
return None
Expand All @@ -373,7 +417,7 @@ def optional_get(arr: Optional[npt.NDArray[Any]], idx: int) -> Any:
lk = optional_get(numlz, i)

key = make_summary_key(keyword, num, name, nx, ny, lgr_name, li, lj, lk)
if key is not None and _should_load_summary_key(key, fetch_keys):
if key is not None and should_load_key(key):
if key in index_mapping:
# only keep the index of the last occurrence of a key
# this is done for backwards compatibility
Expand Down Expand Up @@ -470,7 +514,3 @@ def read_params() -> None:
read_params()
read_params()
return np.array(values, dtype=np.float32).T, dates


def _should_load_summary_key(data_key: Any, user_set_keys: Sequence[str]) -> bool:
return any(fnmatch(data_key, key) for key in user_set_keys)
17 changes: 17 additions & 0 deletions tests/unit_tests/config/test_read_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,3 +486,20 @@ def test_that_ambiguous_case_restart_raises_an_informative_error(
match="Ambiguous reference to unified summary",
):
read_summary(str(tmp_path / "test"), ["*"])


@given(summaries())
def test_that_length_of_fetch_keys_does_not_reduce_performance(
    tmp_path_factory, summary
):
    """With a compiled regex this takes seconds to run, and with
    a naive implementation it will take almost an hour.
    """
    smspec, unsmry = summary
    # Persist the hypothesis-generated summary pair so read_summary can
    # load it as a TEST.SMSPEC / TEST.UNSMRY case on disk.
    case_dir = tmp_path_factory.mktemp("summary")
    smspec.to_file(case_dir / "TEST.SMSPEC")
    unsmry.to_file(case_dir / "TEST.UNSMRY")
    # Many distinct literal keys: large enough that per-key fnmatch
    # matching would dominate the runtime (the regression this guards).
    requested_keys = [str(i) for i in range(100000)]
    _, keys, time_map, _ = read_summary(str(case_dir / "TEST"), requested_keys)
    # Only requested keys may be returned, and the time map must cover
    # every report step present in the generated UNSMRY file.
    assert all(key in requested_keys for key in keys)
    assert len(time_map) == len(unsmry.steps)
Loading