From e66c0d951462c0e2464e2711a21cd5fdff6ec629 Mon Sep 17 00:00:00 2001 From: Antonin Blot Date: Tue, 11 Jul 2023 13:33:43 +0100 Subject: [PATCH 01/18] Update version and exclude test package --- CHANGELOG.md | 4 ++++ setup.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 09a632b..7caab39 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Change log +## v0.3.8 + +### Main changes + ## v0.3.7 ### Main changes diff --git a/setup.py b/setup.py index df05b7d..afd0c32 100755 --- a/setup.py +++ b/setup.py @@ -2,13 +2,13 @@ setup( name="flexiznam", - version="v0.3.7", + version="v0.3.8", url="https://github.com/znamlab/flexznam", license="MIT", author="Antonin Blot", author_email="antonin.blot@gmail.com", description="Znamlab tool to interact with flexilims", - packages=find_packages(), + packages=find_packages(exclude=["tests"]), include_package_data=True, install_requires=[ "Click", From 9ded96ec025821e8361b4c07b1dd5cd62cb24877 Mon Sep 17 00:00:00 2001 From: Antonin Blot Date: Mon, 17 Jul 2023 14:06:24 +0100 Subject: [PATCH 02/18] [feature] Add `get_data_root` function to get `raw` or `processed` root for a project --- CHANGELOG.md | 2 ++ flexiznam/main.py | 33 ++++++++++++++++++++++++++++++ tests/test_components/test_main.py | 21 +++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7caab39..8b94e86 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ ## v0.3.7 +- Add `get_data_root` function to get `raw` or `processed` root for a project + ### Main changes - Separate `Dataset.from_dataseries` and `Dataset.from_flexilims` to avoid confusion diff --git a/flexiznam/main.py b/flexiznam/main.py index a663f4b..2d6b7d6 100755 --- a/flexiznam/main.py +++ b/flexiznam/main.py @@ -22,6 +22,38 @@ def _format_project(project_id, prm): return project_id +def get_data_root(which, project=None, flexilims_session=None): + """Get raw or processed path for a project + + Args: + which (str): either "raw" or "processed" + project (str, optional): name or id of the project. Optional if + flexilims_session is provided + flexilims_session (:py:class:`flexilims.Flexilims`, optional): a flexilims + session with project set. Optional if project is provided. 
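+
+    Returns:
+        pathlib.Path: the root folder for `raw` or `processed` data
+
+    Example (illustrative; actual roots depend on the local flexiznam config)::
+
+        raw_root = get_data_root("raw", project="test")
+        processed_root = get_data_root("processed", project="test")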
+ """ + if which not in ["raw", "processed"]: + raise ValueError("which must be either 'raw' or 'processed'") + + if project is None: + assert ( + flexilims_session is not None, + "`flexilims_session` must be provided if `project` is None", + ) + project = flexilims_session.project_id + + if project not in PARAMETERS["project_ids"]: + project = lookup_project(project, prm=None) + assert project is not None, f"Invalid project {project}" + + if project in PARAMETERS["project_paths"]: + return Path(PARAMETERS["project_paths"][project][which]) + + if which == "raw": + return Path(PARAMETERS["data_root"]["raw"]) + return Path(PARAMETERS["data_root"]["processed"]) + + def lookup_project(project_id, prm=None): """ Look up project name by hexadecimal id @@ -975,6 +1007,7 @@ def get_children( return pd.DataFrame(results) if children_datatype is not None: results = [r for r in results if r["type"] == children_datatype] + results = pd.DataFrame(results) results.set_index("name", drop=False, inplace=True) return results diff --git a/tests/test_components/test_main.py b/tests/test_components/test_main.py index 8544755..73ec8dd 100644 --- a/tests/test_components/test_main.py +++ b/tests/test_components/test_main.py @@ -1,4 +1,5 @@ import datetime +from pathlib import Path import pandas as pd import pytest import flexiznam.main as flz @@ -12,6 +13,26 @@ # this needs to change every time I reset flexlilims +def test_get_path(): + p = flz.get_data_root(which="raw", project="test", flexilims_session=None) + assert p == Path(PARAMETERS["data_root"]["raw"]) + p = flz.get_data_root(which="processed", project="test", flexilims_session=None) + assert p == Path(PARAMETERS["data_root"]["processed"]) + sess = flz.get_flexilims_session( + project_id=PARAMETERS["project_ids"]["test"], reuse_token=False + ) + p = flz.get_data_root(which="processed", project=None, flexilims_session=sess) + assert p == Path(PARAMETERS["data_root"]["processed"]) + p = flz.get_data_root(which="raw", project="example", flexilims_session=None) + assert p == Path("/camp/project/example_project/raw") + with pytest.raises(AttributeError): + flz.get_data_root(which="processed", project=None, flexilims_session=None) + with pytest.raises(ValueError): + flz.get_data_root(which="crap", project="test", flexilims_session=None) + with pytest.raises(AssertionError): + p = flz.get_data_root(which="raw", project="random", flexilims_session=None) + + def test_get_flexilims_session(): sess = flz.get_flexilims_session( project_id=PARAMETERS["project_ids"]["test"], reuse_token=False From c414376d5bd5671e2886397904b9abb1b8a48444 Mon Sep 17 00:00:00 2001 From: Antonin Blot Date: Mon, 17 Jul 2023 14:22:36 +0100 Subject: [PATCH 03/18] [bugfix] harp dataset match csv even with short name --- CHANGELOG.md | 10 ++++++++-- flexiznam/schema/harp_data.py | 20 +++++++++----------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b94e86..8e6487c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,10 +4,16 @@ ### Main changes -## v0.3.7 - - Add `get_data_root` function to get `raw` or `processed` root for a project +### Minor + +- `harp_dataset.from_folder` will now match csv even if there is nothing before or after + `harpmessage` in the file name (i.e. 
the file is `harpmessage.bin`, and all csvs in + the folder will be matched) + +## v0.3.7 + ### Main changes - Separate `Dataset.from_dataseries` and `Dataset.from_flexilims` to avoid confusion diff --git a/flexiznam/schema/harp_data.py b/flexiznam/schema/harp_data.py index 3613cd1..70a9f4f 100644 --- a/flexiznam/schema/harp_data.py +++ b/flexiznam/schema/harp_data.py @@ -55,17 +55,15 @@ def from_folder( "file name." % bin_file ) continue - if any(m.groups()): # do not match csv if the bin is just "harpmessage" - pattern = "(.*)".join(m.groups()) + ".csv" - matches = [re.match(pattern, f) for f in csv_files] - associated_csv = { - m.groups()[0].strip("_"): f for f, m in zip(csv_files, matches) if m - } - if matched_files.intersection(associated_csv.values()): - raise IOError("A csv file matched with multiple binary files.") - matched_files.update(associated_csv.values()) - else: - associated_csv = {} + + pattern = "(.*)".join(m.groups()) + ".csv" + matches = [re.match(pattern, f) for f in csv_files] + associated_csv = { + m.groups()[0].strip("_"): f for f, m in zip(csv_files, matches) if m + } + if matched_files.intersection(associated_csv.values()): + raise IOError("A csv file matched with multiple binary files.") + matched_files.update(associated_csv.values()) bin_path = pathlib.Path(folder) / bin_file created = datetime.datetime.fromtimestamp(bin_path.stat().st_mtime) From 18450ef716a22f2d772785e05a06d586a02538ee Mon Sep 17 00:00:00 2001 From: Antonin Blot Date: Mon, 17 Jul 2023 14:31:00 +0100 Subject: [PATCH 04/18] [feature] `get_children` can filter by attributes --- CHANGELOG.md | 1 + flexiznam/main.py | 5 +++++ tests/test_components/test_main.py | 7 +++++++ 3 files changed, 13 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e6487c..0305fa0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Main changes - Add `get_data_root` function to get `raw` or `processed` root for a project +- `get_children` can filter children by attributes before returning results ### Minor diff --git a/flexiznam/main.py b/flexiznam/main.py index 2d6b7d6..cf45ecf 100755 --- a/flexiznam/main.py +++ b/flexiznam/main.py @@ -978,6 +978,7 @@ def get_children( children_datatype=None, project_id=None, flexilims_session=None, + filter=None, ): """ Get all entries belonging to a particular parent entity @@ -989,6 +990,7 @@ def get_children( types if None) project_id (str): text name of the project flexilims_session (:py:class:`flexilims.Flexilims`): Flexylims session object + filter (dict, None): filter to apply to the extra_attributes of the children Returns: DataFrame: containing all the relevant child entitites @@ -1007,6 +1009,9 @@ def get_children( return pd.DataFrame(results) if children_datatype is not None: results = [r for r in results if r["type"] == children_datatype] + if filter is not None: + for key, value in filter.items(): + results = [r for r in results if r.get(key, None) == value] results = pd.DataFrame(results) results.set_index("name", drop=False, inplace=True) diff --git a/tests/test_components/test_main.py b/tests/test_components/test_main.py index 73ec8dd..f40d4b3 100644 --- a/tests/test_components/test_main.py +++ b/tests/test_components/test_main.py @@ -208,6 +208,13 @@ def test_get_children(flm_sess): ) assert (res_part.type != "recording").sum() == 0 assert res_all.shape[1] > res_part.shape[1] + single_res = flz.get_children( + parent_name=SESSION, + flexilims_session=flm_sess, + children_datatype="dataset", + filter=dict(notes="Motion correction reference"), + ) 
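+    # the demo project is expected to contain exactly one dataset with this note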
+    assert single_res.shape[0] == 1
 
 
 def test_add_entity(flm_sess):

From a2efd1bc4ea18007b93c184c451980524f8a933 Mon Sep 17 00:00:00 2001
From: Antonin Blot
Date: Mon, 17 Jul 2023 15:46:12 +0100
Subject: [PATCH 05/18] [bugfix] if get_children `filter` filters everything

---
 CHANGELOG.md      | 5 +++++
 flexiznam/main.py | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0305fa0..9dd1258 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,11 @@
 - Add `get_data_root` function to get `raw` or `processed` root for a project
 - `get_children` can filter children by attributes before returning results
 
+### Bugfixes
+
+- return empty dataframe if `filter` in `get_children` filters out everything (instead
+  of crashing)
+
 ### Minor
 
 - `harp_dataset.from_folder` will now match csv even if there is nothing before or after
diff --git a/flexiznam/main.py b/flexiznam/main.py
index cf45ecf..4db9099 100755
--- a/flexiznam/main.py
+++ b/flexiznam/main.py
@@ -1014,7 +1014,8 @@ def get_children(
             results = [r for r in results if r.get(key, None) == value]
     results = pd.DataFrame(results)
 
-    results.set_index("name", drop=False, inplace=True)
+    if len(results):
+        results.set_index("name", drop=False, inplace=True)
     return results
 

From e37cb43bab653f2415645057fc7679e9ac655b48 Mon Sep 17 00:00:00 2001
From: Antonin Blot
Date: Mon, 17 Jul 2023 17:44:34 +0100
Subject: [PATCH 06/18] [feature] split get_datasets and recursive version

---
 CHANGELOG.md                       |   2 +
 flexiznam/main.py                  | 178 +++++++++++++++++++++--------
 tests/test_components/test_main.py | 115 ++++++++++++++++---
 3 files changed, 232 insertions(+), 63 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9dd1258..86f233e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,8 @@
 
 - Add `get_data_root` function to get `raw` or `processed` root for a project
 - `get_children` can filter children by attributes before returning results
+- refactor `get_datasets` to be non-recursive and add filtering options
+- add `get_datasets_recursively` to get all datasets below a given entity
 
 ### Bugfixes
 
diff --git a/flexiznam/main.py b/flexiznam/main.py
index 4db9099..3aa48a0 100755
--- a/flexiznam/main.py
+++ b/flexiznam/main.py
@@ -1019,20 +1019,125 @@ def get_children(
     return results
 
 
-def get_datasets(
-    origin_id,
-    recording_type=None,
+def get_child_dataset(flz_session, parent_name, dataset_type):
+    """
+    Get the last dataset of a given type for a given parent entity.
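+
+    If no dataset of the requested type exists, a ValueError is raised; if several
+    exist, a message is printed and the last one is returned.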
+ + Args: + flz_session (flexilims_session): flexilims session + parent_name (str): name of the parent entity + dataset_type (str): type of the dataset + + Returns: + Dataset: the last dataset of the given type for the given parent entity + + """ + all_children = get_children( + parent_name=parent_name, + children_datatype="dataset", + flexilims_session=flz_session, + ) + selected_datasets = all_children[all_children["dataset_type"] == dataset_type] + if len(selected_datasets) == 0: + raise ValueError(f"No {dataset_type} dataset found for session {parent_name}") + elif len(selected_datasets) > 1: + print( + f"{len(selected_datasets)} {dataset_type} datasets found for session {parent_name}" + ) + print("Will return the last one...") + return flexiznam.Dataset.from_dataseries( + selected_datasets.iloc[-1], flexilims_session=flz_session + ) + + +def get_datasets_recursively( + origin_id=None, + origin_name=None, + origin_series=None, dataset_type=None, + filter_datasets=None, + parent_type=None, + filter_parents=None, + return_paths=False, project_id=None, flexilims_session=None, - return_paths=True, + _output=None, ): - """ - Recurse into recordings and get paths to child datasets of a given type. + """Get datasets recursively from a parent entity For example, this is useful if you want to retrieve paths to all *scanimage* datasets associated with a given session. + Returns: + dict: Dictionary with direct parent id as keys and lists of associated + datasets, or dataset paths as values + + """ + if origin_series is None: + if origin_id is None: + origin_id = get_id(origin_name, flexilims_session=flexilims_session) + origin_series = get_entity(id=origin_id, flexilims_session=flexilims_session) + else: + origin_id = origin_series["id"] + origin_is_valid = True + + # initialize output if first call + if _output is None: + _output = {} + + # Before adding the datasets of this level, check if the parent is valid + if (parent_type is not None) and (origin_series["type"] != parent_type): + origin_is_valid = False + if filter_parents is not None: + for key, value in filter_parents.items(): + if origin_series.get(key, None) != value: + origin_is_valid = False + + if origin_is_valid: + ds = get_datasets( + origin_id=origin_id, + dataset_type=dataset_type, + project_id=project_id, + flexilims_session=flexilims_session, + return_paths=return_paths, + filter_datasets=filter_datasets, + ) + # add only if there are datasets + if len(ds): + _output[origin_id] = ds + + # now recurse on children + children = get_children( + parent_id=origin_id, + parent_name=origin_name, + flexilims_session=flexilims_session, + ) + for _, child in children.iterrows(): + if child.type == "dataset": + continue + get_datasets_recursively( + origin_series=child, + dataset_type=dataset_type, + project_id=project_id, + flexilims_session=flexilims_session, + return_paths=return_paths, + filter_datasets=filter_datasets, + filter_parents=filter_parents, + _output=_output, + ) + return _output + + +def get_datasets( + origin_id=None, + origin_name=None, + dataset_type=None, + project_id=None, + flexilims_session=None, + return_paths=True, + filter_datasets=None, +): + """ Args: origin_id (str): hexadecimal ID of the origin session. recording_type (str): type of the recording to filter by. If `None`, @@ -1044,9 +1149,8 @@ def get_datasets( flexilims_session (:py:class:`flexilims.Flexilims`): Flexylims session object return_paths (bool): if True, return a list of paths. If False, return the dataset objects. 
+ _output (list): internal argument used for recursion. - Returns: - dict: Dictionary with recording names as keys containing lists of associated dataset paths. """ assert (project_id is not None) or (flexilims_session is not None) @@ -1054,46 +1158,30 @@ def get_datasets( flexilims_session = get_flexilims_session(project_id) else: project_id = lookup_project(flexilims_session.project_id, PARAMETERS) - recordings = get_entities( - datatype="recording", - origin_id=origin_id, - query_key="recording_type", - query_value=recording_type, + + if origin_id is None: + assert origin_name is not None, "Must provide either origin_id or origin_name" + if filter_datasets is None: + filter_datasets = {} + if dataset_type is not None: + filter_datasets.update({"dataset_type": dataset_type}) + datasets = get_children( + parent_id=origin_id, + parent_name=origin_name, + children_datatype="dataset", flexilims_session=flexilims_session, + filter=filter_datasets, ) - datapath_dict = {} - if len(recordings) < 1: - return datapath_dict - for recording_id in recordings["id"]: - datasets = get_entities( - datatype="dataset", - origin_id=recording_id, - query_key="dataset_type", - query_value=dataset_type, - flexilims_session=flexilims_session, + + datasets = [ + flexiznam.Dataset.from_dataseries( + dataseries=ds, flexilims_session=flexilims_session ) - if return_paths: - datapaths = [] - for dataset_path, is_raw in zip(datasets["path"], datasets["is_raw"]): - prefix = ( - PARAMETERS["data_root"]["raw"] - if is_raw == "yes" - else PARAMETERS["data_root"]["processed"] - ) - this_path = Path(prefix) / dataset_path - if this_path.exists(): - datapaths.append(str(this_path)) - else: - raise IOError("Dataset {} not found".format(this_path)) - datapath_dict[recording_id] = datapaths - else: - datapath_dict[recording_id] = [ - flexiznam.Dataset.from_dataseries( - dataseries=ds, flexilims_session=flexilims_session - ) - for _, ds in datasets.iterrows() - ] - return datapath_dict + for _, ds in datasets.iterrows() + ] + if return_paths: + datasets = [ds.path_full for ds in datasets] + return datasets def generate_name(datatype, name, flexilims_session=None, project_id=None): diff --git a/tests/test_components/test_main.py b/tests/test_components/test_main.py index f40d4b3..ad2314a 100644 --- a/tests/test_components/test_main.py +++ b/tests/test_components/test_main.py @@ -1,4 +1,5 @@ import datetime +import pathlib from pathlib import Path import pandas as pd import pytest @@ -8,7 +9,7 @@ from tests.tests_resources.data_for_testing import MOUSE_ID, SESSION # Test functions from main.py -from flexiznam.schema import Dataset +from flexiznam.schema import Dataset, HarpData # this needs to change every time I reset flexlilims @@ -127,28 +128,106 @@ def test_get_mouse_id(flm_sess): def test_get_datasets(flm_sess): - sess = flz.get_entity( - name="mouse_physio_2p_S20211102", datatype="session", flexilims_session=flm_sess + ds = flz.get_datasets( + origin_id=MOUSE_ID, + flexilims_session=flm_sess, + ) + assert len(ds) == 0 + ds = flz.get_datasets( + origin_name=SESSION, + flexilims_session=flm_sess, + ) + assert len(ds) == 3 + assert all([isinstance(d, pathlib.PosixPath) for d in ds]) + ds = flz.get_datasets( + origin_name=SESSION, + flexilims_session=flm_sess, + return_paths=False, ) + assert len(ds) == 3 + assert all([hasattr(d, "path_full") for d in ds]) ds = flz.get_datasets( - flexilims_session=flm_sess, origin_id=sess.id, return_paths=True + origin_name=SESSION, + flexilims_session=flm_sess, + return_paths=False, + 
filter_datasets=dict(acq_uid="overview_zoom1_00001"),
+    )
+    assert len(ds) == 1
+
+    rec = flz.get_children(
+        parent_name=SESSION, flexilims_session=flm_sess, children_datatype="recording"
+    ).iloc[0]
+    ds_all = flz.get_datasets(
+        origin_id=rec.id,
+        flexilims_session=flm_sess,
+        return_paths=False,
+    )
+    ds_cam = flz.get_datasets(
+        origin_id=rec.id,
+        dataset_type="camera",
+        flexilims_session=flm_sess,
+        return_paths=False,
+    )
+    assert len(ds_all) > len(ds_cam)
+    ds = flz.get_datasets(
+        origin_id=rec.id,
+        dataset_type="camera",
+        filter_datasets=dict(timestamp_file="face_camera_timestamps.csv"),
+        flexilims_session=flm_sess,
+        return_paths=True,
+    )
+    assert len(ds) == 1
+    ds2 = flz.get_datasets(
+        origin_id=rec.id,
+        dataset_type="camera",
+        filter_datasets=dict(timestamp_file="face_camera_timestamps.csv"),
+        project_id=flm_sess.project_id,
+        return_paths=True,
+    )
+    assert ds == ds2
+
+
+def test_get_datasets_recursively(flm_sess):
+    ds_dict = flz.get_datasets_recursively(
+        flexilims_session=flm_sess, origin_name=SESSION, return_paths=True
+    )
+    assert len(ds_dict) == 3
+    ds_dict = flz.get_datasets_recursively(
+        flexilims_session=flm_sess,
+        origin_name=SESSION,
+        return_paths=False,
+        dataset_type="harp",
+    )
+    assert len(ds_dict) == 2
+    ds = []
+    for d in ds_dict.values():
+        ds.extend(d)
+    assert all([isinstance(d, HarpData) for d in ds])
+    ds_dict = flz.get_datasets_recursively(
+        flexilims_session=flm_sess,
+        origin_name=SESSION,
+        return_paths=False,
+        dataset_type="harp",
+        filter_datasets=dict(binary_file="harpmessage.bin"),
+    )
+    assert len(ds_dict) == 1
+    assert ds_dict.values().__iter__().__next__()[0].dataset_name == "harpmessage"
+
+    ds_dict = flz.get_datasets_recursively(
+        flexilims_session=flm_sess,
+        origin_name=SESSION,
+        return_paths=False,
+        filter_parents={"timestamp": "165821"},
+    )
+    assert len(ds_dict) == 1
+    ds_dict = flz.get_datasets_recursively(
+        flexilims_session=flm_sess,
+        origin_name=SESSION,
+        return_paths=False,
+        parent_type="recording",
+    )
+    assert len(ds_dict) == 2
 
 
 def test_add_mouse(flm_sess):

From 591aa238e623899d2461a5830f6c2a201e4589a7 Mon Sep 17 00:00:00 2001
From: Antonin Blot
Date: Mon, 17 Jul 2023 17:45:01 +0100
Subject: [PATCH 07/18] [minor] add `project_paths` to default config

---
 flexiznam/config/default_config.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/flexiznam/config/default_config.py b/flexiznam/config/default_config.py
index 06b5e62..5e0d4ae 100644
--- a/flexiznam/config/default_config.py
+++ b/flexiznam/config/default_config.py
@@ -14,6 +14,12 @@
         "test": "606df1ac08df4d77c72c9aa4",
         "AAVRKO_retina_hva": "60a7757a8901a2357f29080a",
     },
+    project_paths={
+        "example": dict(
+            raw="/camp/project/example_project/raw",
+            processed="/camp/project/example_project/processed",
+        )
+    },
     # a default username can be specified
     flexilims_username="yourusername",
     # Use for data dataset detection

From d644f0836a6076f1f1a64cdbdb9a601c842c6453 Mon Sep 17 00:00:00 2001
From: Antonin Blot
Date: Mon, 17 Jul 2023 17:51:24 +0100
Subject: [PATCH 08/18] [paths] adapt datasets to use
`get_data_root` --- flexiznam/schema/datasets.py | 9 +++---- .../tests_schema/test_datasets.py | 26 +++++++++++++------ 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py index 11e181e..09f54af 100644 --- a/flexiznam/schema/datasets.py +++ b/flexiznam/schema/datasets.py @@ -711,12 +711,11 @@ def is_raw(self, value): @property def path_root(self): """Get CAMP root path that should apply to this dataset""" - if self.is_raw: - return Path(flz.config.PARAMETERS["data_root"]["raw"]) - elif self.is_raw is None: + if self.is_raw is None: raise AttributeError("`is_raw` must be set to find path.") - else: - return Path(flz.config.PARAMETERS["data_root"]["processed"]) + return flz.get_data_root( + which="raw" if self.is_raw else "processed", project=self.project + ) @property def path_full(self): diff --git a/tests/test_components/tests_schema/test_datasets.py b/tests/test_components/tests_schema/test_datasets.py index 8c58fc5..95fd6d7 100644 --- a/tests/test_components/tests_schema/test_datasets.py +++ b/tests/test_components/tests_schema/test_datasets.py @@ -1,6 +1,7 @@ import pytest import pathlib import pandas as pd +import flexiznam from flexiznam.schema import Dataset, microscopy_data from flexiznam.config import PARAMETERS from flexiznam.errors import DatasetError, FlexilimsError, NameNotUniqueError @@ -266,12 +267,12 @@ def test_from_origin(flm_sess): flexilims_session=flm_sess, ) ds0_bis = Dataset.from_origin( - origin_type="recording", - origin_name=origin_name, - dataset_type="suite2p_rois", - conflicts="overwrite", - flexilims_session=flm_sess, - ) + origin_type="recording", + origin_name=origin_name, + dataset_type="suite2p_rois", + conflicts="overwrite", + flexilims_session=flm_sess, + ) assert ds0.id == ds0_bis.id ds1 = Dataset.from_origin( @@ -307,8 +308,7 @@ def test_from_origin(flm_sess): # clean up for ds in (ds0, ds1): flm_sess.delete(ds.id) - - + def test_update_flexilims(flm_sess): """This test requires the database to be up-to-date for the physio mouse""" @@ -353,6 +353,16 @@ def test_dataset_paths(flm_sess): assert str(ds.path_full) == str( pathlib.Path(PARAMETERS["data_root"]["raw"] / ds.path) ) + ds = Dataset( + path="test_project", + is_raw="no", + dataset_type="camera", + extra_attributes={}, + created="", + project="example", + ) + assert ds.path_root != pathlib.Path(PARAMETERS["data_root"]["processed"]) + assert ds.path_root == flexiznam.get_data_root(which="processed", project="example") def test_project_project_id(flm_sess): From 75246fb10548f41e1efb38c3e7d8a13fe9dae4f6 Mon Sep 17 00:00:00 2001 From: Antonin Blot Date: Mon, 17 Jul 2023 17:52:54 +0100 Subject: [PATCH 09/18] [tests] fix test to change in demo data --- tests/test_components/tests_schema/test_harp.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_components/tests_schema/test_harp.py b/tests/test_components/tests_schema/test_harp.py index ed13fa4..e31679f 100644 --- a/tests/test_components/tests_schema/test_harp.py +++ b/tests/test_components/tests_schema/test_harp.py @@ -1,14 +1,13 @@ import pytest from flexiznam.schema.harp_data import HarpData from tests.tests_resources.data_for_testing import DATA_ROOT -from pathlib import Path def test_harp(): folder_genealogy = ["mouse_physio_2p", "S20211102", "R165821_SpheresPermTube"] data_dir = DATA_ROOT.joinpath(*folder_genealogy) ds = HarpData.from_folder(data_dir, verbose=False) - assert len(ds) == 2 + assert len(ds) == 1 ds_name = 
"PZAD9.4d_S20211102_R165821_SpheresPermTube_harpmessage" d = ds[ds_name] assert d.full_name == folder_genealogy[-1] + "_" + ds_name From a02e97b7113cb7901931ba0965fe59cd07d5fb02 Mon Sep 17 00:00:00 2001 From: Antonin Blot Date: Mon, 17 Jul 2023 18:46:31 +0100 Subject: [PATCH 10/18] [feature] add allow_multiple and return_dataseries --- CHANGELOG.md | 3 ++- flexiznam/main.py | 41 ++++++++++++++++++++--------- tests/test_components/test_main.py | 42 ++++++++++++++++++++++++++++-- 3 files changed, 71 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 86f233e..cf4abd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,8 @@ - Add `get_data_root` function to get `raw` or `processed` root for a project - `get_children` can filter children by attributes before returning results -- refactor `get_datasets` to be non recursive and add filtering options +- refactor `get_datasets` to be non recursive and add filtering options. Also add + multiple options to filter datasets and format output - add `get_datasets_recursively` to get all datasets below a given entity ### Bugfixes diff --git a/flexiznam/main.py b/flexiznam/main.py index 3aa48a0..814706c 100755 --- a/flexiznam/main.py +++ b/flexiznam/main.py @@ -1134,20 +1134,28 @@ def get_datasets( dataset_type=None, project_id=None, flexilims_session=None, - return_paths=True, filter_datasets=None, + allow_multiple=True, + return_paths=False, + return_dataseries=False, ): """ Args: - origin_id (str): hexadecimal ID of the origin session. - recording_type (str): type of the recording to filter by. If `None`, - will return datasets for all recordings. + origin_id (str): hexadecimal ID of the origin session. Not required if + origin_name is provided. + origin_name (str): text name of the origin session. Not required if origin_id + is provided. dataset_type (str): type of the dataseet to filter by. If `None`, will return all datasets. project_id (str): text name of the project. Not required if `flexilims_session` is provided. flexilims_session (:py:class:`flexilims.Flexilims`): Flexylims session object + filter_datasets (dict): dictionary of key-value pairs to filter datasets by. + allow_multiple (bool): if True, allow multiple datasets to be returned, + otherwise ensure that only one dataset exists online and return it. return_paths (bool): if True, return a list of paths. If False, return the + dataset objects or dataseries. + return_dataseries (bool): if True, return the dataseries instead of the dataset objects. _output (list): internal argument used for recursion. @@ -1165,6 +1173,7 @@ def get_datasets( filter_datasets = {} if dataset_type is not None: filter_datasets.update({"dataset_type": dataset_type}) + datasets = get_children( parent_id=origin_id, parent_name=origin_name, @@ -1173,14 +1182,22 @@ def get_datasets( filter=filter_datasets, ) - datasets = [ - flexiznam.Dataset.from_dataseries( - dataseries=ds, flexilims_session=flexilims_session - ) - for _, ds in datasets.iterrows() - ] - if return_paths: - datasets = [ds.path_full for ds in datasets] + if not return_dataseries: + datasets = [ + flexiznam.Dataset.from_dataseries( + dataseries=ds, flexilims_session=flexilims_session + ) + for _, ds in datasets.iterrows() + ] + if return_paths: + datasets = [ds.path_full for ds in datasets] + + if not allow_multiple: + assert len(datasets) <= 1, f"Fount {len(datasets)} datasets. Expected 1." 
+ if len(datasets) == 1: + datasets = datasets[0] if not return_dataseries else datasets.iloc[0] + else: + datasets = None return datasets diff --git a/tests/test_components/test_main.py b/tests/test_components/test_main.py index ad2314a..c6aa785 100644 --- a/tests/test_components/test_main.py +++ b/tests/test_components/test_main.py @@ -9,7 +9,7 @@ from tests.tests_resources.data_for_testing import MOUSE_ID, SESSION # Test functions from main.py -from flexiznam.schema import Dataset, HarpData +from flexiznam.schema import Dataset, HarpData, ScanimageData # this needs to change every time I reset flexlilims @@ -136,6 +136,7 @@ def test_get_datasets(flm_sess): ds = flz.get_datasets( origin_name=SESSION, flexilims_session=flm_sess, + return_paths=True, ) assert len(ds) == 3 assert all([isinstance(d, pathlib.PosixPath) for d in ds]) @@ -153,11 +154,42 @@ def test_get_datasets(flm_sess): filter_datasets=dict(acq_uid="overview_zoom1_00001"), ) assert len(ds) == 1 + ds = flz.get_datasets( + origin_name=SESSION, + flexilims_session=flm_sess, + return_paths=False, + filter_datasets=dict(acq_uid="overview_zoom1_00001"), + allow_multiple=False, + ) + assert isinstance(ds, ScanimageData) + ds = flz.get_datasets( + origin_name=SESSION, + flexilims_session=flm_sess, + return_paths=True, + filter_datasets=dict(acq_uid="overview_zoom1_00001"), + allow_multiple=False, + ) + assert isinstance(ds, pathlib.PosixPath) + ds = flz.get_datasets( + origin_name=SESSION, + flexilims_session=flm_sess, + return_dataseries=True, + filter_datasets=dict(acq_uid="overview_zoom1_00001"), + allow_multiple=True, + ) + assert isinstance(ds, pd.DataFrame) + ds = flz.get_datasets( + origin_name=SESSION, + flexilims_session=flm_sess, + return_dataseries=True, + filter_datasets=dict(acq_uid="overview_zoom1_00001"), + allow_multiple=False, + ) + assert isinstance(ds, pd.Series) rec = flz.get_children( parent_name=SESSION, flexilims_session=flm_sess, children_datatype="recording" ).iloc[0] - ds_all = flz.get_datasets( origin_id=rec.id, flexilims_session=flm_sess, @@ -186,6 +218,12 @@ def test_get_datasets(flm_sess): return_paths=True, ) assert ds == ds2 + with pytest.raises(AssertionError): + flz.get_datasets( + origin_id=rec.id, + project_id=flm_sess.project_id, + allow_multiple=False, + ) def test_get_datasets_recursively(flm_sess): From af690d6a28eeed144227add40c4433d515862589 Mon Sep 17 00:00:00 2001 From: Antonin Blot Date: Tue, 18 Jul 2023 10:05:55 +0100 Subject: [PATCH 11/18] [minor] docstrings and tests update --- flexiznam/main.py | 8 +++----- flexiznam/schema/sequencing_data.py | 2 +- tests/test-results/pytest_in_tests.xml | 2 +- .../test_components/tests_schema/test_sequencing_data.py | 5 ++--- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/flexiznam/main.py b/flexiznam/main.py index 814706c..e8edb41 100755 --- a/flexiznam/main.py +++ b/flexiznam/main.py @@ -1153,10 +1153,8 @@ def get_datasets( filter_datasets (dict): dictionary of key-value pairs to filter datasets by. allow_multiple (bool): if True, allow multiple datasets to be returned, otherwise ensure that only one dataset exists online and return it. - return_paths (bool): if True, return a list of paths. If False, return the - dataset objects or dataseries. - return_dataseries (bool): if True, return the dataseries instead of the - dataset objects. + return_paths (bool): if True, return a list of paths + return_dataseries (bool): if True, a dataframe or a dataseries _output (list): internal argument used for recursion. 
@@ -1173,7 +1171,7 @@ def get_datasets( filter_datasets = {} if dataset_type is not None: filter_datasets.update({"dataset_type": dataset_type}) - + datasets = get_children( parent_id=origin_id, parent_name=origin_name, diff --git a/flexiznam/schema/sequencing_data.py b/flexiznam/schema/sequencing_data.py index d226b68..4f4ab0f 100644 --- a/flexiznam/schema/sequencing_data.py +++ b/flexiznam/schema/sequencing_data.py @@ -48,7 +48,7 @@ def from_folder( is_raw (bool): does this folder contain raw data? verbose (bool=True): print info about what is found flexilims_session (flm.Session): session to interact with flexilims - project (str): project ID or name + project (str): project name Returns: dict of datasets (fzm.schema.sequencing_data.SequencingData) diff --git a/tests/test-results/pytest_in_tests.xml b/tests/test-results/pytest_in_tests.xml index d6f2e8e..e3838f1 100644 --- a/tests/test-results/pytest_in_tests.xml +++ b/tests/test-results/pytest_in_tests.xml @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/tests/test_components/tests_schema/test_sequencing_data.py b/tests/test_components/tests_schema/test_sequencing_data.py index b4983c9..e066926 100644 --- a/tests/test_components/tests_schema/test_sequencing_data.py +++ b/tests/test_components/tests_schema/test_sequencing_data.py @@ -1,6 +1,6 @@ import pytest from flexiznam.schema.sequencing_data import SequencingData -from tests.tests_resources.data_for_testing import DATA_ROOT +from tests.tests_resources.data_for_testing import DATA_ROOT, PROJECT_ID # Test creation of all dataset types. # @@ -13,10 +13,9 @@ def test_from_folder(): raw_folder = DATA_ROOT / "mouse_mapseq_lcm" / "Raw_sequencing" - ds = SequencingData.from_folder(raw_folder, verbose=False) + ds = SequencingData.from_folder(raw_folder, verbose=False, project="demo_project") assert len(ds) == 2 d = ds["TUR675_S896_l769_R2_001"] assert d.full_name == "Raw_sequencing_TUR675_S896_l769_R2_001" assert d.dataset_name == "TUR675_S896_l769_R2_001" assert d.is_valid() - From f53a89d5c477d4e4273c3bbd8889dd4547a1dbbc Mon Sep 17 00:00:00 2001 From: Antonin Blot Date: Tue, 18 Jul 2023 10:21:54 +0100 Subject: [PATCH 12/18] `update_flexilims` correctly uploads tuples parameters --- CHANGELOG.md | 1 + flexiznam/schema/datasets.py | 2 +- tests/test_components/tests_schema/test_datasets.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cf4abd4..173bfee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ - return empty dataframe if `filter` in `get_children` filters out everything (instead of crashing) +- `update_flexilims` correctly uploads tuples parameters ### Minor diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py index 09f54af..3aa129d 100644 --- a/flexiznam/schema/datasets.py +++ b/flexiznam/schema/datasets.py @@ -397,7 +397,7 @@ def update_flexilims(self, mode="safe"): if isinstance(attributes[attribute], np.bool_): attributes[attribute] = bool(attributes[attribute]) if isinstance(attributes[attribute], tuple): - attributes[attribute] = list(attribute) + attributes[attribute] = list(attributes[attribute]) flz.utils.clean_recursively(attributes) if status == "different": diff --git a/tests/test_components/tests_schema/test_datasets.py b/tests/test_components/tests_schema/test_datasets.py index 95fd6d7..56358d8 100644 --- a/tests/test_components/tests_schema/test_datasets.py +++ b/tests/test_components/tests_schema/test_datasets.py @@ -341,6 +341,19 @@ 
def test_update_flexilims(flm_sess): ds.origin_id = None ds.update_flexilims(mode="overwrite") assert err.value.args[0] == "Cannot set origin_id to null" + params = dict( + harp_bin="some/random/path", + di_names=("frame_triggers", "lick_detection", "di2_encoder_initial_state"), + verbose=False, + ) + ds.extra_attributes = params + ds.origin_id = MOUSE_ID + ds.update_flexilims(mode="overwrite") + reloaded_ds = Dataset.from_flexilims( + project, name=ds_name, flexilims_session=flm_sess + ) + assert reloaded_ds.extra_attributes["di_names"] == list(params["di_names"]) + assert reloaded_ds.extra_attributes["verbose"] == params["verbose"] def test_dataset_paths(flm_sess): From 8137c9f0bf8c5cf9ce789d8156f8859838a64d6b Mon Sep 17 00:00:00 2001 From: Antonin Blot Date: Tue, 18 Jul 2023 10:44:36 +0100 Subject: [PATCH 13/18] [minor] assertion was always true --- flexiznam/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flexiznam/main.py b/flexiznam/main.py index e8edb41..b5e6544 100755 --- a/flexiznam/main.py +++ b/flexiznam/main.py @@ -37,9 +37,9 @@ def get_data_root(which, project=None, flexilims_session=None): if project is None: assert ( - flexilims_session is not None, - "`flexilims_session` must be provided if `project` is None", - ) + flexilims_session is not None + ), "`flexilims_session` must be provided if `project` is None" + project = flexilims_session.project_id if project not in PARAMETERS["project_ids"]: From 3d86365928eda78ee9dfac570a9aec67309a7996 Mon Sep 17 00:00:00 2001 From: Antonin Blot Date: Tue, 18 Jul 2023 13:42:09 +0100 Subject: [PATCH 14/18] [bugfix] json compatibility for np.float/np.int --- CHANGELOG.md | 1 + flexiznam/schema/datasets.py | 12 +------- flexiznam/utils.py | 12 ++++++-- .../tests_schema/test_datasets.py | 28 +++++++++++++++++++ 4 files changed, 40 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 173bfee..cc28e69 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ - return empty dataframe if `filter` in `get_children` filters out everything (instead of crashing) - `update_flexilims` correctly uploads tuples parameters +- `update_flexilims` correctly uploads floats and np.float/np.int parameters ### Minor diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py index 3aa129d..c154f96 100644 --- a/flexiznam/schema/datasets.py +++ b/flexiznam/schema/datasets.py @@ -389,16 +389,6 @@ def update_flexilims(self, mode="safe"): status = self.flexilims_status() attributes = self.extra_attributes.copy() - # the following lines are necessary because pandas converts python types to numpy - # types, which JSON does not understand and because JSON doesn't like tuples - for attribute in attributes: - if isinstance(attributes[attribute], np.integer): - attributes[attribute] = int(attributes[attribute]) - if isinstance(attributes[attribute], np.bool_): - attributes[attribute] = bool(attributes[attribute]) - if isinstance(attributes[attribute], tuple): - attributes[attribute] = list(attributes[attribute]) - flz.utils.clean_recursively(attributes) if status == "different": if mode == "safe": @@ -417,7 +407,7 @@ def update_flexilims(self, mode="safe"): if self.origin_id is None: if self.get_flexilims_entry().get("origin_id", None) is not None: raise FlexilimsError("Cannot set origin_id to null") - + utils.clean_recursively(attributes) resp = flz.update_entity( datatype="dataset", id=self.id, diff --git a/flexiznam/utils.py b/flexiznam/utils.py index 0effaaa..1e13483 100644 --- 
a/flexiznam/utils.py +++ b/flexiznam/utils.py @@ -150,6 +150,8 @@ def clean_recursively( # we don't have a dictionary ds_classes = set(Dataset.SUBCLASSES.values()) ds_classes.add(Dataset) + floats = (float, np.float32, np.float64) + ints = (int, np.int32, np.int64) if ( (element is None) or isinstance(element, str) @@ -165,8 +167,14 @@ def clean_recursively( element = element.tolist() elif isinstance(element, pathlib.Path): element = str(PurePosixPath(element)) - elif isinstance(element, float) and (not np.isfinite(element)): - element = str(element) + elif isinstance(element, floats): + if not np.isfinite(element): + # nan and inf must be uploaded as string + element = str(element) + else: + element = float(element) + elif isinstance(element, ints): + element = int(element) elif isinstance(element, pd.Series or pd.DataFrame): raise IOError("Cannot make a pandas object json compatible") else: diff --git a/tests/test_components/tests_schema/test_datasets.py b/tests/test_components/tests_schema/test_datasets.py index 56358d8..2bc65b1 100644 --- a/tests/test_components/tests_schema/test_datasets.py +++ b/tests/test_components/tests_schema/test_datasets.py @@ -1,6 +1,7 @@ import pytest import pathlib import pandas as pd +import numpy as np import flexiznam from flexiznam.schema import Dataset, microscopy_data from flexiznam.config import PARAMETERS @@ -315,6 +316,33 @@ def test_update_flexilims(flm_sess): project = "demo_project" ds_name = "mouse_physio_2p_S20211102_R165821_SpheresPermTube_wf_camera" ds = Dataset.from_flexilims(project, name=ds_name, flexilims_session=flm_sess) + + # test weirded datatypes + ds.extra_attributes = dict( + nf64=np.float64(1), + nf32=np.float32(1), + ni64=np.int64(1), + ni32=np.int32(1), + pathobj=pathlib.Path("some/path"), + list=[1, 2, 3], + tuple=(1, 2, 3), + f=float(1.0), + i=int(34), + ) + ds.update_flexilims(mode="overwrite") + reloaded_ds = Dataset.from_flexilims( + project, name=ds_name, flexilims_session=flm_sess + ) + assert reloaded_ds.extra_attributes["nf64"] == 1.0 + assert reloaded_ds.extra_attributes["nf32"] == 1.0 + assert reloaded_ds.extra_attributes["ni64"] == 1 + assert reloaded_ds.extra_attributes["ni32"] == 1 + assert reloaded_ds.extra_attributes["pathobj"] == str(pathlib.Path("some/path")) + assert reloaded_ds.extra_attributes["list"] == [1, 2, 3] + assert reloaded_ds.extra_attributes["tuple"] == list((1, 2, 3)) + assert reloaded_ds.extra_attributes["f"] == 1.0 + assert reloaded_ds.extra_attributes["i"] == 34 + original_path = ds.path ds.path = "new/test/path" with pytest.raises(FlexilimsError) as err: From 2110424cda71324010f11b5bc54e811cf7f11f4d Mon Sep 17 00:00:00 2001 From: Antonin Blot Date: Tue, 18 Jul 2023 13:50:54 +0100 Subject: [PATCH 15/18] [test] update test Tests should not change entity uploaded in 2p tests --- .../tests_schema/test_datasets.py | 56 ++++++++++--------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/tests/test_components/tests_schema/test_datasets.py b/tests/test_components/tests_schema/test_datasets.py index 2bc65b1..15e1420 100644 --- a/tests/test_components/tests_schema/test_datasets.py +++ b/tests/test_components/tests_schema/test_datasets.py @@ -317,32 +317,6 @@ def test_update_flexilims(flm_sess): ds_name = "mouse_physio_2p_S20211102_R165821_SpheresPermTube_wf_camera" ds = Dataset.from_flexilims(project, name=ds_name, flexilims_session=flm_sess) - # test weirded datatypes - ds.extra_attributes = dict( - nf64=np.float64(1), - nf32=np.float32(1), - ni64=np.int64(1), - ni32=np.int32(1), - 
pathobj=pathlib.Path("some/path"),
-        list=[1, 2, 3],
-        tuple=(1, 2, 3),
-        f=float(1.0),
-        i=int(34),
-    )
-    ds.update_flexilims(mode="overwrite")
-    reloaded_ds = Dataset.from_flexilims(
-        project, name=ds_name, flexilims_session=flm_sess
-    )
-    assert reloaded_ds.extra_attributes["nf64"] == 1.0
-    assert reloaded_ds.extra_attributes["nf32"] == 1.0
-    assert reloaded_ds.extra_attributes["ni64"] == 1
-    assert reloaded_ds.extra_attributes["ni32"] == 1
-    assert reloaded_ds.extra_attributes["pathobj"] == str(pathlib.Path("some/path"))
-    assert reloaded_ds.extra_attributes["list"] == [1, 2, 3]
-    assert reloaded_ds.extra_attributes["tuple"] == list((1, 2, 3))
-    assert reloaded_ds.extra_attributes["f"] == 1.0
-    assert reloaded_ds.extra_attributes["i"] == 34
-
     original_path = ds.path
     ds.path = "new/test/path"
     with pytest.raises(FlexilimsError) as err:
@@ -358,6 +356,36 @@ def test_update_flexilims(flm_sess):
     )
     assert reloaded_ds.extra_attributes["di_names"] == list(params["di_names"])
     assert reloaded_ds.extra_attributes["verbose"] == params["verbose"]
+    # test weirded datatypes
+    ds = Dataset.from_origin(
+        project, origin_id=ds.origin_id, flexilims_session=flm_sess,
+        dataset_type='microscopy'
+    )
+    ds.extra_attributes = dict(
+        nf64=np.float64(1),
+        nf32=np.float32(1),
+        ni64=np.int64(1),
+        ni32=np.int32(1),
+        pathobj=pathlib.Path("some/path"),
+        list=[1, 2, 3],
+        tuple=(1, 2, 3),
+        f=float(1.0),
+        i=int(34),
+    )
+    ds.update_flexilims(mode="overwrite")
+    reloaded_ds = Dataset.from_flexilims(
+        project, name=ds.full_name, flexilims_session=flm_sess
+    )
+    assert reloaded_ds.extra_attributes["nf64"] == 1.0
+    assert reloaded_ds.extra_attributes["nf32"] == 1.0
+    assert reloaded_ds.extra_attributes["ni64"] == 1
+    assert reloaded_ds.extra_attributes["ni32"] == 1
+    assert reloaded_ds.extra_attributes["pathobj"] == str(pathlib.Path("some/path"))
+    assert reloaded_ds.extra_attributes["list"] == [1, 2, 3]
+    assert reloaded_ds.extra_attributes["tuple"] == list((1, 2, 3))
+    assert reloaded_ds.extra_attributes["f"] == 1.0
+    assert reloaded_ds.extra_attributes["i"] == 34
+    flm_sess.delete(ds.id)
 
 
 def test_dataset_paths(flm_sess):

From e8c002e82882afe43af9e1b9b88a9a9ec26aff7a Mon Sep 17 00:00:00 2001
From: yiran25
Date: Fri, 21 Jul 2023 11:51:56 +0100
Subject: [PATCH 16/18] change update_flexilims to overwrite if conflicts is overwrite

---
 flexiznam/camp/sync_data.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index 7051587..d7d331e 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -171,6 +171,7 @@ def upload_yaml(
         list of names of entities created/updated
 
     """
+
     output = []
     # if there are errors, I cannot safely parse the yaml
     errors = find_xxerrorxx(yml_file=source_yaml)
@@ -232,12 +233,17 @@ def upload_yaml(
         root_id = mouse["id"]
 
     # session datasets
+    # use "overwrite" as mode if conflicts is "overwrite", otherwise use "safe" mode
+    if conflicts == "overwrite":
+        mode = "overwrite"
+    else:
+        mode = "safe"
     for ds_name, ds in session_data.get("datasets", {}).items():
         ds.genealogy = [mouse["name"], session_data["session"], ds_name]
         ds.project = session_data["project"]
         ds.origin_id = root_id
         ds.flexilims_session = flexilims_session
-        ds.update_flexilims(mode="safe")
+        ds.update_flexilims(mode=mode)
         output.append(ds.full_name)
 
     # now deal with recordings
@@ -276,7 +282,7 @@ def upload_yaml(
             ds.project = session_data["project"]
             ds.origin_id = rec_rep["id"]
             ds.flexilims_session = flexilims_session
-
ds.update_flexilims(mode="safe") + ds.update_flexilims(mode=mode) output.append(ds.full_name) # now deal with samples From 68a694fa2568b34bcc5bdf725eae13b3967052ed Mon Sep 17 00:00:00 2001 From: Antonin Blot Date: Wed, 26 Jul 2023 19:31:52 +0100 Subject: [PATCH 17/18] [bugfix] Add lock file to token creation This should prevent race condition when running a bunch of jobs at once. --- CHANGELOG.md | 4 ++ flexiznam/config/config_tools.py | 42 ++++++++----- flexiznam/main.py | 35 +++++++---- setup.py | 1 + tests/test_components/test_main.py | 34 +++++++--- .../tests_schema/test_datasets.py | 62 ++++++++----------- tests/tests_resources/data_for_testing.py | 3 +- 7 files changed, 110 insertions(+), 71 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cc28e69..9de84ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,12 +16,16 @@ of crashing) - `update_flexilims` correctly uploads tuples parameters - `update_flexilims` correctly uploads floats and np.float/np.int parameters +- Add filelock for token creation to avoid concurrent access and move token to their own + file ### Minor - `harp_dataset.from_folder` will now match csv even if there is nothing before or after `harpmessage` in the file name (i.e. the file is `harpmessage.bin`, and all csvs in the folder will be matched) +- private function `config_tools._find_files` has now a `create_if_missing` argument to + create the file if it does not exist ## v0.3.7 diff --git a/flexiznam/config/config_tools.py b/flexiznam/config/config_tools.py index d1d4357..bf2d373 100644 --- a/flexiznam/config/config_tools.py +++ b/flexiznam/config/config_tools.py @@ -10,7 +10,7 @@ from getpass import getpass -def _find_file(file_name, config_folder=None): +def _find_file(file_name, config_folder=None, create_if_missing=False): """Find a file by looking at various places Only in config_folder (if provided) @@ -19,25 +19,37 @@ def _find_file(file_name, config_folder=None): - then in the ~/.config folder - then in the folder contain the file defining this function - then in sys.path + + Args: + file_name (str): name of the file to find + config_folder (str): folder to look for the file + create_if_missing (bool): if True, create the file in the config folder if it + does not exist. 
If False, raise an error if the file is not found.
     """
     if config_folder is not None:
         in_config_folder = Path(config_folder) / file_name
         if in_config_folder.is_file():
             return in_config_folder
-        raise ConfigurationError("Cannot find %s in %s" % (file_name, config_folder))
-    local = Path.cwd() / file_name
-    if local.is_file():
-        return local
-    config = Path(__file__).parent.absolute() / "config" / file_name
-    home = Path.home() / ".flexiznam"
-    if home.is_dir() and (home / file_name).is_file():
-        return home / file_name
-    if config.is_file():
-        return config
-    for directory in sys.path:
-        fname = Path(directory) / file_name
-        if fname.is_file():
-            return fname
+        missing_file = in_config_folder
+    else:
+        local = Path.cwd() / file_name
+        if local.is_file():
+            return local
+        config = Path(__file__).parent.absolute() / "config" / file_name
+        home = Path.home() / ".flexiznam"
+        if home.is_dir() and (home / file_name).is_file():
+            return home / file_name
+        if config.is_file():
+            return config
+        for directory in sys.path:
+            fname = Path(directory) / file_name
+            if fname.is_file():
+                return fname
+        missing_file = home / file_name
+    if create_if_missing:
+        with open(missing_file, "w") as f:
+            f.write("")
+        return missing_file
     raise ConfigurationError("Cannot find %s" % file_name)
diff --git a/flexiznam/main.py b/flexiznam/main.py
index b5e6544..2ec0867 100755
--- a/flexiznam/main.py
+++ b/flexiznam/main.py
@@ -1,11 +1,13 @@
 import datetime
 import re
+import portalocker
 import warnings
 import pandas as pd
 import flexilims as flm
 from pathlib import Path
 from flexilims.utils import SPECIAL_CHARACTERS
 import flexiznam
+import yaml
 from flexiznam import mcms
 from flexiznam.config import PARAMETERS, get_password, add_password
 from flexiznam.errors import NameNotUniqueError, FlexilimsError, ConfigurationError
@@ -68,7 +70,11 @@ def lookup_project(project_id, prm=None):
 
 
 def get_flexilims_session(
-    project_id=None, username=None, password=None, reuse_token=True
+    project_id=None,
+    username=None,
+    password=None,
+    reuse_token=True,
+    timeout=10,
 ):
     """Open a new flexilims session by creating a new authentication token.
 
@@ -80,6 +86,8 @@ def get_flexilims_session(
         password (str): (optional) flexilims password. If not provided, it is read
             from the secrets file, or failing that triggers an input prompt.
         reuse_token (bool): (optional) if True, try to reuse an existing token
+        timeout (int): (optional) timeout in seconds for the portalocker lock.
+            Defaults to 10.
 
     Returns:
         :py:class:`flexilims.Flexilims`: Flexilims session object.
@@ -96,21 +104,26 @@ def get_flexilims_session(
     if reuse_token:
         today = datetime.datetime.now().strftime("%Y-%m-%d")
-        try:
-            token = get_password(app="flexilims", username="token", allow_input=False)
-            token, date = token.split("_")
+        token_file = flexiznam.config.config_tools._find_file(
+            "flexilims_token.yml", create_if_missing=True
+        )
+        with portalocker.Lock(token_file, "r+", timeout=timeout) as file_handle:
+            tokinfo = yaml.safe_load(file_handle) or {}
+            token = tokinfo.get("token", None)
+            date = tokinfo.get("date", None)
             if date != today:
                 token = None
             else:
                 token = dict(Authorization=f"Bearer {token}")
-        except ConfigurationError:
-            token = None
+            session = flm.Flexilims(
+                username, password, project_id=project_id, token=token
+            )
+            if token is None:
+                # we need to update the token
+                token = session.session.headers["Authorization"].split(" ")[-1]
+                yaml.dump(dict(token=token, date=today), file_handle)
     else:
-        token = None
-    session = flm.Flexilims(username, password, project_id=project_id, token=token)
-    if reuse_token:
-        token = session.session.headers["Authorization"].split(" ")[-1]
-        add_password("flexilims", "token", f"{token}_{today}")
+        session = flm.Flexilims(username, password, project_id=project_id, token=None)
     return session
diff --git a/setup.py b/setup.py
index afd0c32..003ab18 100755
--- a/setup.py
+++ b/setup.py
@@ -13,6 +13,7 @@
     install_requires=[
         "Click",
         "pandas",
+        "portalocker",
         "webbot",
         "pyyaml",
         "flexilims @ git+ssh://git@github.com/znamlab/flexilims.git#egg=flexilims",
diff --git a/tests/test_components/test_main.py b/tests/test_components/test_main.py
index c6aa785..252a588 100644
--- a/tests/test_components/test_main.py
+++ b/tests/test_components/test_main.py
@@ -2,8 +2,10 @@
 import pathlib
 from pathlib import Path
 import pandas as pd
+import portalocker
 import pytest
-import flexiznam.main as flz
+import flexiznam as flz
+import yaml
 from flexiznam.config import PARAMETERS, get_password
 from flexiznam.errors import FlexilimsError, NameNotUniqueError
 from tests.tests_resources.data_for_testing import MOUSE_ID, SESSION
@@ -26,7 +28,7 @@ def test_get_path():
     p = flz.get_data_root(which="raw", project="example", flexilims_session=None)
     assert p == Path("/camp/project/example_project/raw")
-    with pytest.raises(AttributeError):
+    with pytest.raises(AssertionError):
         flz.get_data_root(which="processed", project=None, flexilims_session=None)
     with pytest.raises(ValueError):
         flz.get_data_root(which="crap", project="test", flexilims_session=None)
     with pytest.raises(AssertionError):
@@ -41,13 +43,28 @@ def test_get_flexilims_session():
     assert sess.username == PARAMETERS["flexilims_username"]
     sess = flz.get_flexilims_session(project_id=None, reuse_token=True)
     assert sess.username == PARAMETERS["flexilims_username"]
-    token, date = get_password("flexilims", "token").split("_")
+    token_file = flz.config.config_tools._find_file("flexilims_token.yml")
+    tokinfo = yaml.safe_load(token_file.read_text())
+    token = tokinfo.get("token", None)
+    date = tokinfo.get("date", None)
     today = datetime.datetime.now().strftime("%Y-%m-%d")
     assert date == today
     assert sess.session.headers["Authorization"].split(" ")[1] == token
     sess = flz.get_flexilims_session(project_id=None, reuse_token=True)
     assert sess.session.headers["Authorization"].split(" ")[1] == token
 
+    # manually lock the token file to test timeout
+    with portalocker.Lock(token_file, "r+", timeout=10) as file_handle:
+        with pytest.raises(portalocker.exceptions.LockException):
+            sess =
flz.get_flexilims_session( + project_id=None, reuse_token=True, timeout=0.1 + ) + # but fine without the reuse_token flag + sess = flz.get_flexilims_session(project_id=None, reuse_token=False) + assert sess.session.headers["Authorization"].split(" ")[1] != token + sess = flz.get_flexilims_session(project_id=None, reuse_token=True, timeout=0.1) + assert sess.session.headers["Authorization"].split(" ")[1] == token + def test_format_results(): exmple_res = { @@ -177,7 +194,7 @@ def test_get_datasets(flm_sess): filter_datasets=dict(acq_uid="overview_zoom1_00001"), allow_multiple=True, ) - assert isinstance(ds, pd.DataFrame) + assert isinstance(ds, pd.DataFrame) ds = flz.get_datasets( origin_name=SESSION, flexilims_session=flm_sess, @@ -185,7 +202,7 @@ def test_get_datasets(flm_sess): filter_datasets=dict(acq_uid="overview_zoom1_00001"), allow_multiple=False, ) - assert isinstance(ds, pd.Series) + assert isinstance(ds, pd.Series) rec = flz.get_children( parent_name=SESSION, flexilims_session=flm_sess, children_datatype="recording" @@ -247,10 +264,13 @@ def test_get_datasets_recursively(flm_sess): origin_name=SESSION, return_paths=False, dataset_type="harp", - filter_datasets=dict(binary_file="harpmessage.bin"), + filter_datasets=dict( + binary_file="PZAD9.4d_S20211102_R173917_SpheresPermTube_harpmessage.bin" + ), ) assert len(ds_dict) == 1 - assert ds_dict.values().__iter__().__next__()[0].dataset_name == "harpmessage" + ds = ds_dict.values().__iter__().__next__()[0] + assert isinstance(ds, HarpData) ds_dict = flz.get_datasets_recursively( flexilims_session=flm_sess, diff --git a/tests/test_components/tests_schema/test_datasets.py b/tests/test_components/tests_schema/test_datasets.py index 15e1420..38c5ab9 100644 --- a/tests/test_components/tests_schema/test_datasets.py +++ b/tests/test_components/tests_schema/test_datasets.py @@ -101,64 +101,47 @@ def test_dataset_flexilims_integration(flm_sess): extra_attributes={}, created="", flexilims_session=flm_sess, - genealogy=( - "mouse_physio_2p", - "S20211102", - "R165821_SpheresPermTube", - "wf_camera", - ), + genealogy=("PZAA15.1a", "temporary_dataset"), ) st = ds.flexilims_status() + assert st == "not online" + ds.update_flexilims() + st = ds.flexilims_status() + assert st == "up-to-date" + ds.extra_attributes = dict(test="test") + ds.path = "new/path" + st = ds.flexilims_status() assert st == "different" rep = ds.flexilims_report() expected = pd.DataFrame( dict( - offline={ - "is_raw": "no", - "path": "fake/path", - "created": "", - "metadata_file": "NA", - "timestamp_file": "NA", - "video_file": "NA", - }, - flexilims={ - "is_raw": "yes", - "created": "2021-11-02 17:03:17", - "path": "demo_project/mouse_physio_2p/" - "S20211102/R165821_SpheresPermTube", - "origin_id": "61ebf94120d82a35f724490d", - "timestamp_file": "wf_camera_timestamps.csv", - "video_file": "wf_camera_data.bin", - "metadata_file": "wf_camera_metadata.txt", - }, + offline={"path": "fake/path", "test": "test"}, + flexilims={"path": "new/path", "test": "NA"}, ) ) assert all(rep.sort_index() == expected.sort_index()) - ds_name = "mouse_physio_2p_S20211102_R165821_SpheresPermTube_wf_camera" + ds_name = "PZAA15.1a_temporary_dataset" + assert ds.format().name == ds_name fmt = { - "path": "fake/path", "created": "", "dataset_type": "camera", + "genealogy": ("PZAA15.1a", "temporary_dataset"), "is_raw": "no", - "name": ds_name, + "name": "PZAA15.1a_temporary_dataset", + "path": "new/path", "project": "610989f9a651ff0b6237e0f6", + "test": "test", "type": "dataset", - "genealogy": ( - 
"mouse_physio_2p", - "S20211102", - "R165821_SpheresPermTube", - "wf_camera", - ), } - assert ds.format().name == ds_name assert all( ds.format().drop("origin_id").sort_index() == pd.Series(data=fmt, name=ds_name).sort_index() ) # same with yaml mode - fmt["extra_attributes"] = {} + fmt["extra_attributes"] = {"test": "test"} fmt["genealogy"] = list(fmt["genealogy"]) + fmt.pop("test") ds_yaml = ds.format(mode="yaml") try: del ds_yaml["origin_id"] @@ -166,6 +149,9 @@ def test_dataset_flexilims_integration(flm_sess): pass assert ds_yaml == fmt + flm_sess.delete(ds.id) + assert ds.flexilims_status() == "not online" + ds = Dataset( path="fake/path", is_raw="no", @@ -358,8 +344,10 @@ def test_update_flexilims(flm_sess): assert reloaded_ds.extra_attributes["verbose"] == params["verbose"] # test weirded datatypes ds = Dataset.from_origin( - project, origin_id=ds.origin_id, flexilims_session=flm_sess, - dataset_type='microscopy' + project, + origin_id=ds.origin_id, + flexilims_session=flm_sess, + dataset_type="microscopy", ) ds.extra_attributes = dict( nf64=np.float64(1), diff --git a/tests/tests_resources/data_for_testing.py b/tests/tests_resources/data_for_testing.py index 78d5e34..dd6bb0a 100644 --- a/tests/tests_resources/data_for_testing.py +++ b/tests/tests_resources/data_for_testing.py @@ -4,7 +4,8 @@ from flexiznam.config import PARAMETERS -MOUSE_ID = "6437dcb13ded9c65df142a12" +MOUSE_ID = "6437dcb13ded9c65df142a12" # actual physio2p mouse +MOUSE_TEMP = "647a1aec7ddb34517470d3e6" # some random mouse where I can change data TEST_PROJECT = "demo_project" PROJECT_ID = "610989f9a651ff0b6237e0f6" SESSION = "mouse_physio_2p_S20211102" From d8cbad702096dbb07416feb46c230b3da2fdde59 Mon Sep 17 00:00:00 2001 From: Antonin Blot Date: Thu, 27 Jul 2023 18:11:52 +0100 Subject: [PATCH 18/18] [docs] Add changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9de84ce..053d5f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ of crashing) - `update_flexilims` correctly uploads tuples parameters - `update_flexilims` correctly uploads floats and np.float/np.int parameters +- `update_flexilims` can overwrite existing datasets (if `conflicts='overwrite'`) - Add filelock for token creation to avoid concurrent access and move token to their own file