[feature] add allow_multiple and return_dataseries
Antonin Blot committed Jul 17, 2023
1 parent 75246fb commit a02e97b
Showing 3 changed files with 71 additions and 15 deletions.
CHANGELOG.md: 2 additions & 1 deletion

@@ -6,7 +6,8 @@

- Add `get_data_root` function to get `raw` or `processed` root for a project
- `get_children` can filter children by attributes before returning results
- refactor `get_datasets` to be non recursive and add filtering options
- refactor `get_datasets` to be non recursive and add filtering options. Also add
multiple options to filter datasets and format output
- add `get_datasets_recursively` to get all datasets below a given entity

### Bugfixes
flexiznam/main.py: 29 additions & 12 deletions

@@ -1134,20 +1134,28 @@ def get_datasets(
dataset_type=None,
project_id=None,
flexilims_session=None,
return_paths=True,
filter_datasets=None,
allow_multiple=True,
return_paths=False,
return_dataseries=False,
):
"""
Args:
origin_id (str): hexadecimal ID of the origin session.
recording_type (str): type of the recording to filter by. If `None`,
will return datasets for all recordings.
origin_id (str): hexadecimal ID of the origin session. Not required if
origin_name is provided.
origin_name (str): text name of the origin session. Not required if origin_id
is provided.
dataset_type (str): type of the dataset to filter by. If `None`,
will return all datasets.
project_id (str): text name of the project. Not required if
`flexilims_session` is provided.
flexilims_session (:py:class:`flexilims.Flexilims`): Flexilims session object
filter_datasets (dict): dictionary of key-value pairs to filter datasets by.
allow_multiple (bool): if True, allow multiple datasets to be returned,
otherwise ensure that only one dataset exists online and return it.
return_paths (bool): if True, return a list of paths. If False, return the
dataset objects or dataseries.
return_dataseries (bool): if True, return the dataseries instead of the
dataset objects.
_output (list): internal argument used for recursion.
@@ -1165,6 +1173,7 @@
filter_datasets = {}
if dataset_type is not None:
filter_datasets.update({"dataset_type": dataset_type})

datasets = get_children(
parent_id=origin_id,
parent_name=origin_name,
@@ -1173,14 +1182,22 @@
filter=filter_datasets,
)

datasets = [
flexiznam.Dataset.from_dataseries(
dataseries=ds, flexilims_session=flexilims_session
)
for _, ds in datasets.iterrows()
]
if return_paths:
datasets = [ds.path_full for ds in datasets]
if not return_dataseries:
datasets = [
flexiznam.Dataset.from_dataseries(
dataseries=ds, flexilims_session=flexilims_session
)
for _, ds in datasets.iterrows()
]
if return_paths:
datasets = [ds.path_full for ds in datasets]

if not allow_multiple:
assert len(datasets) <= 1, f"Found {len(datasets)} datasets. Expected at most 1."
if len(datasets) == 1:
datasets = datasets[0] if not return_dataseries else datasets.iloc[0]
else:
datasets = None
return datasets


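To show how the new arguments combine, here is a minimal usage sketch based on the updated docstring and on the tests below; the project name, session name and the acq_uid value are placeholders, not values defined by this commit.

import flexiznam as flz

# Placeholder names for illustration only; use a real project and session.
PROJECT = "my_project"
SESSION = "mouse_001_S20230101"

# Default behaviour: a list of Dataset objects attached to the session,
# optionally narrowed by dataset_type and/or arbitrary attribute values.
datasets = flz.get_datasets(
    origin_name=SESSION,
    project_id=PROJECT,
    filter_datasets=dict(acq_uid="overview_zoom1_00001"),
)

# return_paths=True returns the full paths instead of Dataset objects.
paths = flz.get_datasets(
    origin_name=SESSION,
    project_id=PROJECT,
    return_paths=True,
)

# allow_multiple=False asserts that at most one dataset matches and returns
# it directly (or None when nothing matches) rather than a list.
single = flz.get_datasets(
    origin_name=SESSION,
    project_id=PROJECT,
    filter_datasets=dict(acq_uid="overview_zoom1_00001"),
    allow_multiple=False,
)

# return_dataseries=True skips building Dataset objects and returns the raw
# flexilims entries as a pandas DataFrame (a Series with allow_multiple=False).
raw = flz.get_datasets(
    origin_name=SESSION,
    project_id=PROJECT,
    return_dataseries=True,
)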
tests/test_components/test_main.py: 40 additions & 2 deletions

@@ -9,7 +9,7 @@
from tests.tests_resources.data_for_testing import MOUSE_ID, SESSION

# Test functions from main.py
from flexiznam.schema import Dataset, HarpData
from flexiznam.schema import Dataset, HarpData, ScanimageData

# this needs to change every time I reset flexilims

@@ -136,6 +136,7 @@ def test_get_datasets(flm_sess):
ds = flz.get_datasets(
origin_name=SESSION,
flexilims_session=flm_sess,
return_paths=True,
)
assert len(ds) == 3
assert all([isinstance(d, pathlib.PosixPath) for d in ds])
@@ -153,11 +154,42 @@
filter_datasets=dict(acq_uid="overview_zoom1_00001"),
)
assert len(ds) == 1
ds = flz.get_datasets(
origin_name=SESSION,
flexilims_session=flm_sess,
return_paths=False,
filter_datasets=dict(acq_uid="overview_zoom1_00001"),
allow_multiple=False,
)
assert isinstance(ds, ScanimageData)
ds = flz.get_datasets(
origin_name=SESSION,
flexilims_session=flm_sess,
return_paths=True,
filter_datasets=dict(acq_uid="overview_zoom1_00001"),
allow_multiple=False,
)
assert isinstance(ds, pathlib.PosixPath)
ds = flz.get_datasets(
origin_name=SESSION,
flexilims_session=flm_sess,
return_dataseries=True,
filter_datasets=dict(acq_uid="overview_zoom1_00001"),
allow_multiple=True,
)
assert isinstance(ds, pd.DataFrame)
ds = flz.get_datasets(
origin_name=SESSION,
flexilims_session=flm_sess,
return_dataseries=True,
filter_datasets=dict(acq_uid="overview_zoom1_00001"),
allow_multiple=False,
)
assert isinstance(ds, pd.Series)

rec = flz.get_children(
parent_name=SESSION, flexilims_session=flm_sess, children_datatype="recording"
).iloc[0]

ds_all = flz.get_datasets(
origin_id=rec.id,
flexilims_session=flm_sess,
@@ -186,6 +218,12 @@ def test_get_datasets(flm_sess):
return_paths=True,
)
assert ds == ds2
with pytest.raises(AssertionError):
flz.get_datasets(
origin_id=rec.id,
project_id=flm_sess.project_id,
allow_multiple=False,
)


def test_get_datasets_recursively(flm_sess):
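The tests above also fetch recordings with get_children, and the changelog mentions that get_children can filter children by attributes; below is a hedged sketch of what that filtering might look like, reusing the call pattern from this diff. The "protocol" attribute and its value are hypothetical, and flm_sess is assumed to be an open flexilims session like the fixture used in the tests.

import flexiznam as flz

def get_retinotopy_recordings(flm_sess, session_name):
    # Keep only recordings whose attributes match the filter dict; the
    # "protocol" attribute and its value are hypothetical placeholders.
    recordings = flz.get_children(
        parent_name=session_name,
        children_datatype="recording",
        flexilims_session=flm_sess,
        filter=dict(protocol="retinotopy"),
    )
    return recordings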
