
Commit ce81964

Merge branch 'fix-publish-tests' of github.com:SolarDrew/dkist into fix-publish-tests

SolarDrew committed Sep 4, 2024
2 parents e2ca11e + 67c71ec commit ce81964

Showing 15 changed files with 199 additions and 75 deletions.
2 changes: 1 addition & 1 deletion .cruft.json
@@ -1,6 +1,6 @@
{
"template": "https://github.com/sunpy/package-template",
"commit": "112d7d4adf0fa168bbb9ddb1886ad4f1e595b8be",
"commit": "f7458b35be5824d419efd6ce8c135ff67a00d1d5",
"checkout": null,
"context": {
"cookiecutter": {
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -1,7 +1,7 @@
repos:
# This should be before any formatting hooks like isort
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: "v0.5.2"
rev: "v0.6.2"
hooks:
- id: ruff
args: ["--fix"]
2 changes: 1 addition & 1 deletion .readthedocs.yaml
@@ -1,7 +1,7 @@
version: 2

build:
-  os: ubuntu-22.04
+  os: ubuntu-lts-latest
tools:
python: "mambaforge-latest"
jobs:
2 changes: 1 addition & 1 deletion .rtd-environment.yml
@@ -2,6 +2,6 @@ name: dkist
channels:
- conda-forge
dependencies:
-  - python=3.10
+  - python=3.12
- pip
- graphviz!=2.42.*,!=2.43.*
4 changes: 4 additions & 0 deletions changelog/402.feature.rst
@@ -0,0 +1,4 @@
Add various features for easier inspection of `TiledDataset`:
- `__repr__` method to output basic dataset info;
- `tiles_shape` property to access data array shape for each individual tile;
- `slice_tiles` property to apply the same slice to all tiles.
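
A hypothetical usage sketch of these additions (the file name and tile shapes are illustrative):

    >>> import dkist
    >>> ds = dkist.load_dataset("VBI_L1_XXXXX.asdf")  # doctest: +SKIP
    >>> ds                       # new __repr__: object info plus summary  # doctest: +SKIP
    >>> ds.tiles_shape           # e.g. [[(4096, 4096), (4096, 4096)]]  # doctest: +SKIP
    >>> ds.slice_tiles[0, :100]  # same slice applied to every tile  # doctest: +SKIP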
1 change: 1 addition & 0 deletions changelog/422.trivial.rst
@@ -0,0 +1 @@
Replace usages of ``copy_arrays`` with ``memmap`` for ``asdf>=3.1.0``.
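
A minimal sketch of the substitution this entry describes (``memmap`` is the logical inverse of ``copy_arrays``; the file name is illustrative):

    import asdf
    # asdf < 3.1.0 -- old keyword: copying arrays means no memory mapping
    af = asdf.open("file.asdf", copy_arrays=True)
    # asdf >= 3.1.0 -- renamed keyword with inverted sense, same behaviour
    af = asdf.open("file.asdf", memmap=False)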
1 change: 1 addition & 0 deletions changelog/431.trivial.rst
@@ -0,0 +1 @@
Update Dataset representation for better readability.
9 changes: 9 additions & 0 deletions dkist/conftest.py
@@ -282,6 +282,15 @@ def simple_tiled_dataset(dataset):
return TiledDataset(dataset_array, dataset.meta["inventory"])


@pytest.fixture
def large_tiled_dataset(tmp_path_factory):
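    """Decompress the bundled large VBI test ASDF into a temporary directory and load it."""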
vbidir = tmp_path_factory.mktemp("data")
with gzip.open(Path(rootdir) / "large_vbi.asdf.gz", mode="rb") as gfo:
with open(vbidir / "test_vbi.asdf", mode="wb") as afo:
afo.write(gfo.read())
return load_dataset(vbidir / "test_vbi.asdf")


@pytest.fixture
def small_visp_dataset():
"""
Binary file added dkist/data/test/large_vbi.asdf.gz
Binary file not shown.
59 changes: 34 additions & 25 deletions dkist/dataset/loader.py
@@ -14,6 +14,12 @@
from asdf import ValidationError


def asdf_open_memory_mapping_kwarg(memmap: bool) -> dict:
if asdf.__version__ > "3.1.0":
return {"memmap": memmap}
return {"copy_arrays": not memmap}


@singledispatch
def load_dataset(target):
"""
@@ -39,45 +45,48 @@ def load_dataset(target):
Examples
--------
>>> import dkist
>>> dkist.load_dataset("/path/to/VISP_L1_ABCDE.asdf") # doctest: +SKIP
>>> dkist.load_dataset("/path/to/ABCDE/") # doctest: +SKIP
>>> dkist.load_dataset(Path("/path/to/ABCDE")) # doctest: +SKIP
-    >>> from sunpy.net import Fido, attrs as a
-    >>> import dkist.net
-    >>> search_results = Fido.search(a.dkist.Dataset("AGLKO"))  # doctest: +REMOTE_DATA
-    >>> files = Fido.fetch(search_results)  # doctest: +REMOTE_DATA
-    >>> dkist.load_dataset(files)  # doctest: +REMOTE_DATA
-    <dkist.dataset.dataset.Dataset object at ...>
-    This Dataset has 4 pixel and 5 world dimensions
+    >>> from dkist.data.sample import VISP_BKPLX  # doctest: +REMOTE_DATA
+    >>> print(dkist.load_dataset(VISP_BKPLX))  # doctest: +REMOTE_DATA
+    This VISP Dataset BKPLX consists of 1700 frames.
+    Files are stored in ...VISP_BKPLX
     <BLANKLINE>
-    dask.array<reshape, shape=(4, 1000, 976, 2555), dtype=float64, chunksize=(1, 1, 976, 2555), chunktype=numpy.ndarray>
+    This Dataset has 4 pixel and 5 world dimensions.
     <BLANKLINE>
-    Pixel Dim  Axis Name                Data size  Bounds
+    The data are represented by a <class 'dask.array.core.Array'> object:
+    dask.array<reshape, shape=(4, 425, 980, 2554), dtype=float64, chunksize=(1, 1, 980, 2554), chunktype=numpy.ndarray>
+    <BLANKLINE>
+    Array Dim  Axis Name                Data size  Bounds
             0  polarization state               4  None
-            1  raster scan step number       1000  None
-            2  dispersion axis                976  None
-            3  spatial along slit            2555  None
+            1  raster scan step number        425  None
+            2  dispersion axis                980  None
+            3  spatial along slit            2554  None
     <BLANKLINE>
     World Dim  Axis Name                  Physical Type                   Units
-            0  stokes                     phys.polarization.stokes        unknown
-            1  time                       time                            s
+            4  stokes                     phys.polarization.stokes        unknown
+            3  time                       time                            s
             2  helioprojective latitude   custom:pos.helioprojective.lat  arcsec
-            3  wavelength                 em.wl                           nm
-            4  helioprojective longitude  custom:pos.helioprojective.lon  arcsec
+            1  wavelength                 em.wl                           nm
+            0  helioprojective longitude  custom:pos.helioprojective.lon  arcsec
     <BLANKLINE>
     Correlation between pixel and world axes:
     <BLANKLINE>
-                 Pixel Dim
-    World Dim    0    1    2    3
-            0  yes   no   no   no
-            1   no  yes   no   no
-            2   no  yes   no  yes
-            3   no   no  yes   no
-            4   no  yes   no  yes
+                              |                      PIXEL DIMENSIONS
+                              |   spatial    |  dispersion  | raster scan  | polarization
+             WORLD DIMENSIONS |  along slit  |     axis     | step number  |    state
+    ------------------------- | ------------ | ------------ | ------------ | ------------
+    helioprojective longitude |      x       |              |      x       |
+                   wavelength |              |      x       |              |
+     helioprojective latitude |      x       |              |      x       |
+                         time |              |              |      x       |
+                       stokes |              |              |              |      x
"""
known_types = _known_types_docs().keys()
raise TypeError(f"Input type {type(target).__name__} not recognised. It must be one of {', '.join(known_types)}.")
@@ -156,7 +165,7 @@ def _load_from_asdf(filepath):
try:
with importlib_resources.as_file(importlib_resources.files("dkist.io") / "level_1_dataset_schema.yaml") as schema_path:
with asdf.open(filepath, custom_schema=schema_path.as_posix(),
-                       lazy_load=False, copy_arrays=True) as ff:
+                       lazy_load=False, **asdf_open_memory_mapping_kwarg(memmap=False)) as ff:
ds = ff.tree["dataset"]
if isinstance(ds, TiledDataset):
for sub in ds.flat:
Expand Down
16 changes: 16 additions & 0 deletions dkist/dataset/tests/test_tiled_dataset.py
@@ -28,6 +28,14 @@ def test_tiled_dataset_slice(simple_tiled_dataset, aslice):
assert np.all(simple_tiled_dataset[aslice] == simple_tiled_dataset._data[aslice])


@pytest.mark.parametrize("aslice", [np.s_[0, :100, 100:200]])
def test_tiled_dataset_slice_tiles(large_tiled_dataset, aslice):
sliced = large_tiled_dataset.slice_tiles[aslice]
for i, tile in enumerate(sliced.flat):
# Accessing tile.shape here raises an AttributeError (cause unknown), so check tile.data.shape instead
assert tile.data.shape == (100, 100)


def test_tiled_dataset_headers(simple_tiled_dataset, dataset):
assert len(simple_tiled_dataset.combined_headers) == len(dataset.meta["headers"]) * 4
assert simple_tiled_dataset.combined_headers.colnames == dataset.meta["headers"].colnames
@@ -75,3 +83,11 @@ def test_tileddataset_plot(share_zscale):
fig = plt.figure(figsize=(600, 800))
ds.plot(0, share_zscale=share_zscale)
return plt.gcf()

def test_repr(simple_tiled_dataset):
r = repr(simple_tiled_dataset)
assert str(simple_tiled_dataset[0, 0].data) in r


def test_tiles_shape(simple_tiled_dataset):
assert simple_tiled_dataset.tiles_shape == [[tile.data.shape for tile in row] for row in simple_tiled_dataset]
38 changes: 38 additions & 0 deletions dkist/dataset/tiled_dataset.py
@@ -5,6 +5,7 @@
but not representable in a single NDCube derived object as the array data are
not contiguous in the spatial dimensions (due to overlaps and offsets).
"""
from textwrap import dedent
from collections.abc import Collection

import matplotlib.pyplot as plt
@@ -13,10 +14,26 @@
from astropy.table import vstack

from .dataset import Dataset
from .utils import dataset_info_str

__all__ = ["TiledDataset"]


class TiledDatasetSlicer:
"""
Basic class to provide the slicing
"""
def __init__(self, data, inventory):
self.data = data
self.inventory = inventory

def __getitem__(self, slice_):
new_data = []
for tile in self.data.flat:
new_data.append(tile[slice_])
return TiledDataset(np.array(new_data).reshape(self.data.shape), self.inventory)


class TiledDataset(Collection):
"""
Holds a grid of `.Dataset` objects.
@@ -125,6 +142,13 @@ def shape(self):
"""
return self._data.shape

@property
def tiles_shape(self):
"""
The shape of each individual tile in the TiledDataset.
"""
return [[tile.data.shape for tile in row] for row in self]

def plot(self, slice_index: int, share_zscale=False, **kwargs):
vmin, vmax = np.inf, 0
fig = plt.figure()
@@ -151,4 +175,18 @@ def plot(self, slice_index: int, share_zscale=False, **kwargs):
fig.suptitle(f"{self.inventory['instrumentName']} Dataset ({self.inventory['datasetId']}) at time {timestamp} (slice={slice_index})", y=0.95)
return fig

@property
def slice_tiles(self):
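        """
        Tile-wise slicing: ``ts.slice_tiles[aslice]`` applies ``aslice`` to every
        tile and returns a new `TiledDataset` of the sliced tiles.
        """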
return TiledDatasetSlicer(self._data, self.inventory)

# TODO: def regrid()

def __repr__(self):
"""
Overload the NDData repr because it does not play nice with the dask delayed io.
"""
prefix = object.__repr__(self)
return dedent(f"{prefix}\n{self.__str__()}")

def __str__(self):
return dataset_info_str(self)