Skip to content

Commit

Permalink
Add a regression test and docs for new path loader
Browse files Browse the repository at this point in the history
[ci skip]
  • Loading branch information
Cadair committed Jan 22, 2025
1 parent 041ee09 commit ce6e6fa
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 6 deletions.
33 changes: 27 additions & 6 deletions dkist/dataset/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,21 +138,42 @@ def _load_from_path(path: Path):

def _load_from_directory(directory):
"""
Construct a `~dkist.dataset.Dataset` from a directory containing one
asdf file and a collection of FITS files.
Construct a `~dkist.dataset.Dataset` from a directory containing one (or
more) ASDF files and a collection of FITS files.
ASDF files have the generic pattern:
``{instrument}_L1_{start_time:%Y%m%dT%H%M%S}_{dataset_id}[_{suffix}].asdf``
where the ``_{suffix}`` on the end may be absent or one of a few different
suffixes which have been used at different times. When searching a
directory for one or more ASDF file to load we should attempt to only load
one per dataset ID by selecting files in suffix order.
The order of suffixes are (from newest used to oldest):
- ``_metadata``
- ``_user_tools``
- None
The algorithm used to find ASDF files to load in a directory is therefore:
- Glob the directory for all ASDF files
- Group all results by the filename up to and including the dataset id in the filename
- Ignore any ASDF files with an old suffix if a new suffix is present
- Throw a warning to the user if any ASDF files with older suffixes are found
"""
base_path = Path(directory).expanduser()
asdf_files = tuple(base_path.glob("*.asdf"))

if not asdf_files:
raise ValueError(f"No asdf file found in directory {base_path}.")

if len(asdf_files) > 1:
return _load_from_iterable(asdf_files)
if len(asdf_files) == 1:
return _load_from_asdf(asdf_files[0])

asdf_file = asdf_files[0]
return _load_from_iterable(asdf_files)

return _load_from_asdf(asdf_file)


def _load_from_asdf(filepath):
Expand Down
52 changes: 52 additions & 0 deletions dkist/dataset/tests/test_load_dataset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import shutil
import numbers

import pytest
from parfive import Results
Expand Down Expand Up @@ -114,3 +115,54 @@ def test_not_dkist_asdf(tmp_path):

with pytest.raises(TypeError, match="not a valid DKIST"):
load_dataset(tmp_path / "test.asdf")


def generate_asdf_folder(tmp_path, asdf_path, filenames):
    """
    Populate *tmp_path* with one copy of *asdf_path* per name in *filenames*.

    Returns the populated directory so it can be passed straight to
    ``load_dataset``.
    """
    for target_name in filenames:
        destination = tmp_path / target_name
        shutil.copy(asdf_path, destination)

    return tmp_path


@pytest.mark.parametrize(("filenames", "indices"), [
    # param[0] is the tuple of filenames to place in the directory
    # param[1] is the index (int) or indices (tuple) in that list that
    # should actually be loaded by the suffix-selection logic
    (("VBI_L1_20231016T184519_AJQWW.asdf",), 0),
    (("VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 0),
    (("VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 0),
    (("VBI_L1_20231016T184519_AJQWW_unknown.asdf",), 0),
    (("VBI_L1_20231016T184519_AJQWW.asdf",
      "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 1),
    (("VBI_L1_20231016T184519_AJQWW.asdf",
      "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
      "VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 2),
    (("VBI_L1_20231016T184519_AJQWW.asdf",
      "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
      "VBI_L1_20231016T184519_AJQWW_metadata.asdf",
      "VBI_L1_20231016T184519_AJQWW_unknown.asdf"), (2, 3)),
])
def test_select_asdf(tmp_path, asdf_folder_path_fixtures_below, filenames, indices, mocker):
    """
    Regression test: loading a directory picks the newest-suffix ASDF file
    per dataset ID, and falls back to loading multiple files when more than
    one distinct candidate remains.
    """
    asdf_folder = generate_asdf_folder(tmp_path, asdf_path, filenames)

    asdf_file_paths = tuple(asdf_folder / fname for fname in filenames)

    # First we check that we load the correct amount of datasets and that the
    # loading completes correctly
    datasets = load_dataset(asdf_folder)

    # An int index means exactly one file should be selected, so a single
    # Dataset is returned; a tuple means one dataset per selected file.
    if isinstance(indices, numbers.Integral):
        assert isinstance(datasets, Dataset)
    else:
        assert len(datasets) == len(indices)

    # Now we check that the correct files are chosen
    load_from_asdf = mocker.patch("dkist.dataset.loader._load_from_asdf")
    load_from_iterable = mocker.patch("dkist.dataset.loader._load_from_iterable")

    datasets = load_dataset(asdf_folder)
    if isinstance(indices, numbers.Integral):
        load_from_asdf.assert_called_once_with(asdf_file_paths[indices])
    else:
        load_from_iterable.assert_called_once_with(tuple(asdf_file_paths[i] for i in indices))

0 comments on commit ce6e6fa

Please sign in to comment.