Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLN: Refactor filestem construction #623

Merged
merged 1 commit into from
Apr 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 44 additions & 35 deletions src/fmu/dataio/providers/_filedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from __future__ import annotations

import re
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
Expand Down Expand Up @@ -110,47 +111,55 @@ def _add_filename_to_path(self, path: Path) -> Path:
return (path / stem).with_suffix(path.suffix + self.objdata.extension)

def _get_filestem(self) -> str:
"""Construct the file"""
"""
Construct the filestem string as a combination of various
attributes; parent, name, tagname and time information.
A '--' is used to separate the non-empty components, and a
filestem containing all components will look like this:
filestem = 'parent--name--tagname--time1_time0'
"""

if not self.name:
raise ValueError("The 'name' entry is missing for constructing a file name")
if not self.objdata.time0 and self.objdata.time1:
raise ValueError("Not legal: 'time0' is missing while 'time1' is present")

stem = self.name.lower()
if self.dataio.tagname:
stem += "--" + self.dataio.tagname.lower()
if self.dataio.parent:
stem = self.dataio.parent.lower() + "--" + stem

if self.objdata.time0 and not self.objdata.time1:
stem += "--" + (str(self.objdata.time0)[0:10]).replace("-", "")

elif self.objdata.time0 and self.objdata.time1:
monitor = (str(self.objdata.time1)[0:10]).replace("-", "")
base = (str(self.objdata.time0)[0:10]).replace("-", "")
if monitor == base:
warn(
"The monitor date and base date are equal", UserWarning
) # TODO: consider add clocktimes in such cases?
if self.dataio.filename_timedata_reverse: # class variable
stem += "--" + base + "_" + monitor
else:
stem += "--" + monitor + "_" + base

# remove unwanted characters
stem = stem.replace(".", "_").replace(" ", "_")

# avoid multiple double underscores
while "__" in stem:
stem = stem.replace("__", "_")

# treat norwegian special letters
# BUG(?): What about German letters like "Ü"?
stem = stem.replace("æ", "ae")
stem = stem.replace("ø", "oe")
stem = stem.replace("å", "aa")
return stem.lower()
filestem_order = (
self.dataio.parent,
self.name,
self.dataio.tagname,
self._get_timepart_for_filename(),
)
# join non-empty parts with '--'
filestem = "--".join((p for p in filestem_order if p))
filestem = self._sanitize_the_filestem(filestem)
return filestem.lower()

def _get_timepart_for_filename(self) -> str:
    """
    Build the time component of the filestem from time0 and time1.

    Each timestamp is rendered with ``strftime("%Y%m%d")``, so the values
    must offer a ``strftime`` method (presumably ``datetime`` objects --
    confirm against the object data provider).

    Returns:
        - ``""`` when ``time0`` is missing (``None`` or otherwise falsy),
        - ``"<time0>"`` when only ``time0`` is set,
        - ``"<time1>_<time0>"`` when both are set, or
          ``"<time0>_<time1>"`` if ``dataio.filename_timedata_reverse``
          is truthy.
    """
    # Use a truthiness check (not ``is None``) so that the notion of a
    # "missing" time0 agrees with the time1 check below and with the
    # ``not self.objdata.time0`` validation done when building the filestem.
    if not self.objdata.time0:
        return ""
    t0 = self.objdata.time0.strftime("%Y%m%d")

    if not self.objdata.time1:
        return t0
    t1 = self.objdata.time1.strftime("%Y%m%d")

    # Default order is time1 first, then time0.
    ordered = (t0, t1) if self.dataio.filename_timedata_reverse else (t1, t0)
    return "_".join(ordered)

@staticmethod
def _sanitize_the_filestem(filestem: str) -> str:
    """
    Clean up the filestem.

    - '.' and ' ' are replaced with '_'
    - Norwegian special letters are transliterated, in both cases:
      æ -> ae, ø -> oe, å -> aa and Æ -> AE, Ø -> OE, Å -> AA
    - runs of two or more underscores are collapsed to a single '_'

    The uppercase letters must be handled here because this function does
    not lower-case its input; a caller that lower-cases the result
    afterwards would otherwise turn an untouched 'Å' into 'å' and leave
    it in the file name.

    Args:
        filestem: The raw filestem string.

    Returns:
        The sanitized filestem, with the case of all other characters
        left unchanged.
    """
    # Single pass; no replacement text contains a character that is itself
    # a key, so this is equivalent to chaining str.replace calls.
    replacements = str.maketrans(
        {
            ".": "_",
            " ": "_",
            "æ": "ae",
            "ø": "oe",
            "å": "aa",
            "Æ": "AE",
            "Ø": "OE",
            "Å": "AA",
        }
    )
    return re.sub(r"__+", "_", filestem.translate(replacements))

def _get_forcefolder_if_absolute(self) -> Path | None:
if self.dataio.forcefolder.startswith("/"):
Expand Down
55 changes: 29 additions & 26 deletions tests/test_units/test_filedataprovider_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import os
from copy import deepcopy
from datetime import datetime
from pathlib import Path

import pytest
Expand All @@ -21,72 +22,72 @@
"name",
"tag",
"parent",
"2020-01-01",
"2022-01-02",
datetime.strptime("2020-01-01", "%Y-%m-%d"),
datetime.strptime("2022-01-02", "%Y-%m-%d"),
"parent--name--tag--20220102_20200101",
),
(
"name",
"",
"",
"2020-01-01",
"2022-01-02",
datetime.strptime("2020-01-01", "%Y-%m-%d"),
datetime.strptime("2022-01-02", "%Y-%m-%d"),
"name--20220102_20200101",
),
(
"name",
"",
"",
"2022-01-02",
"",
datetime.strptime("2022-01-02", "%Y-%m-%d"),
None,
"name--20220102",
),
(
"name",
"",
"",
"",
"",
None,
None,
"name",
),
(
"name",
"",
"",
20210101,
20220102,
datetime.strptime("2021-01-01", "%Y-%m-%d"),
datetime.strptime("2022-01-02", "%Y-%m-%d"),
"name--20220102_20210101",
),
(
"name with spaces",
"",
"",
"",
"",
None,
None,
"name_with_spaces",
),
(
"name with double space",
"",
"",
"",
"",
None,
None,
"name_with_double_space",
),
(
"name. some fm",
"",
"",
"",
"",
None,
None,
"name_some_fm",
),
(
"name with many .. . spaces",
"",
"",
"",
"",
None,
None,
"name_with_many_spaces",
),
],
Expand Down Expand Up @@ -128,16 +129,16 @@ def test_get_filestem(
"",
"tag",
"parent",
"2020-01-01",
"2022-01-02",
datetime.strptime("2020-01-01", "%Y-%m-%d"),
datetime.strptime("2022-01-02", "%Y-%m-%d"),
"'name' entry is missing",
),
(
"name",
"tag",
"parent",
"",
"2020-01-01",
None,
datetime.strptime("2020-01-01", "%Y-%m-%d"),
"'time1' is missing while",
),
],
Expand Down Expand Up @@ -218,18 +219,20 @@ def test_filedata_provider(regsurf, tmp_path):
objdata.name = "name"
objdata.efolder = "efolder"
objdata.extension = ".ext"
objdata.time0 = "t1"
objdata.time1 = "t2"
t1 = "19000101"
t2 = "20240101"
objdata.time0 = datetime.strptime(t1, "%Y%m%d")
objdata.time1 = datetime.strptime(t2, "%Y%m%d")

fdata = FileDataProvider(cfg, objdata)
filemeta = fdata.get_metadata()

assert isinstance(filemeta, meta.File)
assert (
str(filemeta.relative_path)
== "share/results/efolder/parent--name--tag--t2_t1.ext"
== f"share/results/efolder/parent--name--tag--{t2}_{t1}.ext"
)
absdata = tmp_path / "share/results/efolder/parent--name--tag--t2_t1.ext"
absdata = tmp_path / f"share/results/efolder/parent--name--tag--{t2}_{t1}.ext"
assert filemeta.absolute_path == absdata


Expand Down