Skip to content

Commit

Permalink
CLN: Refactor filestem construction (#623)
Browse files Browse the repository at this point in the history
  • Loading branch information
tnatt authored Apr 22, 2024
1 parent 69dcd09 commit bec7647
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 61 deletions.
79 changes: 44 additions & 35 deletions src/fmu/dataio/providers/_filedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from __future__ import annotations

import re
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
Expand Down Expand Up @@ -110,47 +111,55 @@ def _add_filename_to_path(self, path: Path) -> Path:
return (path / stem).with_suffix(path.suffix + self.objdata.extension)

def _get_filestem(self) -> str:
"""Construct the file"""
"""
Construct the filestem string as a combination of various
attributes; parent, name, tagname and time information.
A '--' is used to separate the non-empty components, and a
filestem containing all components will look like this:
filestem = 'parent--name--tagname--time1_time0'
"""

if not self.name:
raise ValueError("The 'name' entry is missing for constructing a file name")
if not self.objdata.time0 and self.objdata.time1:
raise ValueError("Not legal: 'time0' is missing while 'time1' is present")

stem = self.name.lower()
if self.dataio.tagname:
stem += "--" + self.dataio.tagname.lower()
if self.dataio.parent:
stem = self.dataio.parent.lower() + "--" + stem

if self.objdata.time0 and not self.objdata.time1:
stem += "--" + (str(self.objdata.time0)[0:10]).replace("-", "")

elif self.objdata.time0 and self.objdata.time1:
monitor = (str(self.objdata.time1)[0:10]).replace("-", "")
base = (str(self.objdata.time0)[0:10]).replace("-", "")
if monitor == base:
warn(
"The monitor date and base date are equal", UserWarning
) # TODO: consider add clocktimes in such cases?
if self.dataio.filename_timedata_reverse: # class variable
stem += "--" + base + "_" + monitor
else:
stem += "--" + monitor + "_" + base

# remove unwanted characters
stem = stem.replace(".", "_").replace(" ", "_")

# avoid multiple double underscores
while "__" in stem:
stem = stem.replace("__", "_")

# treat norwegian special letters
# BUG(?): What about German letters like "Ü"?
stem = stem.replace("æ", "ae")
stem = stem.replace("ø", "oe")
stem = stem.replace("å", "aa")
return stem.lower()
filestem_order = (
self.dataio.parent,
self.name,
self.dataio.tagname,
self._get_timepart_for_filename(),
)
# join non-empty parts with '--'
filestem = "--".join((p for p in filestem_order if p))
filestem = self._sanitize_the_filestem(filestem)
return filestem.lower()

def _get_timepart_for_filename(self) -> str:
    """
    Build the time component of the filestem.

    Returns an empty string when ``objdata.time0`` is None, the single
    ``YYYYMMDD`` stamp of ``time0`` when ``time1`` is not set, and
    otherwise both stamps joined by an underscore. The order is
    ``time1_time0`` unless ``dataio.filename_timedata_reverse`` is set,
    in which case it is ``time0_time1``.
    """
    stamp_format = "%Y%m%d"

    first = self.objdata.time0
    if first is None:
        return ""
    first_stamp = first.strftime(stamp_format)

    second = self.objdata.time1
    if not second:
        return first_stamp
    second_stamp = second.strftime(stamp_format)

    if self.dataio.filename_timedata_reverse:
        return f"{first_stamp}_{second_stamp}"
    return f"{second_stamp}_{first_stamp}"

@staticmethod
def _sanitize_the_filestem(filestem: str) -> str:
    """
    Clean up the filestem.

    Dots and spaces are turned into underscores, Norwegian special
    letters are transliterated to ASCII digraphs, and any run of two or
    more underscores is collapsed to a single underscore.

    Both lower- and uppercase variants of the special letters are
    handled, so the result is free of them regardless of whether the
    caller lowercases the stem before or after sanitizing.
    """
    # Single-pass character mapping; none of the replacement strings
    # contain a character that is itself a key, so the outcome equals
    # that of applying the substitutions one after another.
    char_map = str.maketrans(
        {
            ".": "_",
            " ": "_",
            "æ": "ae",
            "ø": "oe",
            "å": "aa",
            "Æ": "AE",
            "Ø": "OE",
            "Å": "AA",
        }
    )
    return re.sub(r"__+", "_", filestem.translate(char_map))

def _get_forcefolder_if_absolute(self) -> Path | None:
if self.dataio.forcefolder.startswith("/"):
Expand Down
55 changes: 29 additions & 26 deletions tests/test_units/test_filedataprovider_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import os
from copy import deepcopy
from datetime import datetime
from pathlib import Path

import pytest
Expand All @@ -21,72 +22,72 @@
"name",
"tag",
"parent",
"2020-01-01",
"2022-01-02",
datetime.strptime("2020-01-01", "%Y-%m-%d"),
datetime.strptime("2022-01-02", "%Y-%m-%d"),
"parent--name--tag--20220102_20200101",
),
(
"name",
"",
"",
"2020-01-01",
"2022-01-02",
datetime.strptime("2020-01-01", "%Y-%m-%d"),
datetime.strptime("2022-01-02", "%Y-%m-%d"),
"name--20220102_20200101",
),
(
"name",
"",
"",
"2022-01-02",
"",
datetime.strptime("2022-01-02", "%Y-%m-%d"),
None,
"name--20220102",
),
(
"name",
"",
"",
"",
"",
None,
None,
"name",
),
(
"name",
"",
"",
20210101,
20220102,
datetime.strptime("2021-01-01", "%Y-%m-%d"),
datetime.strptime("2022-01-02", "%Y-%m-%d"),
"name--20220102_20210101",
),
(
"name with spaces",
"",
"",
"",
"",
None,
None,
"name_with_spaces",
),
(
"name with double space",
"",
"",
"",
"",
None,
None,
"name_with_double_space",
),
(
"name. some fm",
"",
"",
"",
"",
None,
None,
"name_some_fm",
),
(
"name with many .. . spaces",
"",
"",
"",
"",
None,
None,
"name_with_many_spaces",
),
],
Expand Down Expand Up @@ -128,16 +129,16 @@ def test_get_filestem(
"",
"tag",
"parent",
"2020-01-01",
"2022-01-02",
datetime.strptime("2020-01-01", "%Y-%m-%d"),
datetime.strptime("2022-01-02", "%Y-%m-%d"),
"'name' entry is missing",
),
(
"name",
"tag",
"parent",
"",
"2020-01-01",
None,
datetime.strptime("2020-01-01", "%Y-%m-%d"),
"'time1' is missing while",
),
],
Expand Down Expand Up @@ -218,18 +219,20 @@ def test_filedata_provider(regsurf, tmp_path):
objdata.name = "name"
objdata.efolder = "efolder"
objdata.extension = ".ext"
objdata.time0 = "t1"
objdata.time1 = "t2"
t1 = "19000101"
t2 = "20240101"
objdata.time0 = datetime.strptime(t1, "%Y%m%d")
objdata.time1 = datetime.strptime(t2, "%Y%m%d")

fdata = FileDataProvider(cfg, objdata)
filemeta = fdata.get_metadata()

assert isinstance(filemeta, meta.File)
assert (
str(filemeta.relative_path)
== "share/results/efolder/parent--name--tag--t2_t1.ext"
== f"share/results/efolder/parent--name--tag--{t2}_{t1}.ext"
)
absdata = tmp_path / "share/results/efolder/parent--name--tag--t2_t1.ext"
absdata = tmp_path / f"share/results/efolder/parent--name--tag--{t2}_{t1}.ext"
assert filemeta.absolute_path == absdata


Expand Down

0 comments on commit bec7647

Please sign in to comment.