Skip to content

Commit

Permalink
Use a compiled regex for matching in summary
Browse files Browse the repository at this point in the history
Looping over per-key fnmatch calls was simply too slow.
  • Loading branch information
eivindjahren committed Mar 8, 2024
1 parent 2f02032 commit afcd909
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 8 deletions.
55 changes: 47 additions & 8 deletions src/ert/config/_read_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,17 @@
import re
from datetime import datetime, timedelta
from enum import Enum, auto
from fnmatch import fnmatch
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, TypeVar, Union
from typing import (
Any,
Callable,
Dict,
List,
Optional,
Sequence,
Tuple,
TypeVar,
Union,
)

import numpy as np
import numpy.typing as npt
Expand Down Expand Up @@ -72,7 +81,7 @@ def from_keyword(cls, summary_keyword: str) -> _SummaryType:
return cls.REGION

if any(
re.match(pattern, summary_keyword)
re.fullmatch(pattern, summary_keyword)
for pattern in [r"R.FT.*", r"R..FT.*", r"R.FR.*", r"R..FR.*", r"R.F"]
):
return cls.INTER_REGION
Expand Down Expand Up @@ -249,6 +258,38 @@ def _check_vals(
return vals


def _fetch_keys_to_matcher(fetch_keys: Sequence[str]) -> Callable[[str], bool]:
"""
Transform the list of keys (with * used as repeated wildcard) into
a matcher.
>>> match = _fetch_keys_to_matcher([""])
>>> match("FOPR")
False
>>> match = _fetch_keys_to_matcher(["*"])
>>> match("FOPR"), match("FO*")
(True, True)
>>> match = _fetch_keys_to_matcher(["F*PR"])
>>> match("WOPR"), match("FOPR"), match("FGPR"), match("SOIL")
(False, True, True, False)
>>> match = _fetch_keys_to_matcher(["WGOR:*"])
>>> match("FOPR"), match("WGOR:OP1"), match("WGOR:OP2"), match("WGOR")
(False, True, True, False)
"""
regex = re.compile(
"("
+ "|".join(
".*".join(re.escape(part) for part in key.split("*")) for key in fetch_keys
)
+ ")"
)
return lambda s: regex.fullmatch(s) is not None


def _read_spec(
spec: str, fetch_keys: Sequence[str]
) -> Tuple[int, datetime, DateUnit, List[str], npt.NDArray[np.int64]]:
Expand Down Expand Up @@ -349,6 +390,8 @@ def _read_spec(
index_mapping: Dict[str, int] = {}
date_index = None

should_load_key = _fetch_keys_to_matcher(fetch_keys)

def optional_get(arr: Optional[npt.NDArray[Any]], idx: int) -> Any:
if arr is None:
return None
Expand All @@ -373,7 +416,7 @@ def optional_get(arr: Optional[npt.NDArray[Any]], idx: int) -> Any:
lk = optional_get(numlz, i)

key = make_summary_key(keyword, num, name, nx, ny, lgr_name, li, lj, lk)
if key is not None and _should_load_summary_key(key, fetch_keys):
if key is not None and should_load_key(key):
if key in index_mapping:
# only keep the index of the last occurrence of a key
# this is done for backwards compatibility
Expand Down Expand Up @@ -470,7 +513,3 @@ def read_params() -> None:
read_params()
read_params()
return np.array(values, dtype=np.float32).T, dates


def _should_load_summary_key(data_key: Any, user_set_keys: Sequence[str]) -> bool:
return any(fnmatch(data_key, key) for key in user_set_keys)
17 changes: 17 additions & 0 deletions tests/unit_tests/config/test_read_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,3 +486,20 @@ def test_that_ambiguous_case_restart_raises_an_informative_error(
match="Ambiguous reference to unified summary",
):
read_summary(str(tmp_path / "test"), ["*"])


@given(summaries())
def test_that_length_of_fetch_keys_does_not_reduce_performance(
    tmp_path_factory, summary
):
    """With a compiled regex this takes seconds to run, and with
    a naive implementation it will take almost an hour.
    """
    case_dir = tmp_path_factory.mktemp("summary")
    spec_file, data_file = summary
    spec_file.to_file(case_dir / "TEST.SMSPEC")
    data_file.to_file(case_dir / "TEST.UNSMRY")
    # A very large user key list is the interesting input here.
    fetch_keys = list(map(str, range(100000)))
    (_, keys, time_map, _) = read_summary(str(case_dir / "TEST"), fetch_keys)
    assert all(key in fetch_keys for key in keys)
    assert len(time_map) == len(data_file.steps)

0 comments on commit afcd909

Please sign in to comment.